1
2
3
4
5
6
7
8 """
9 Functions to find and load NLTK X{resource files}, such as corpora,
10 grammars, and saved processing objects. Resource files are identified
11 using URLs, such as "C{nltk:corpora/abc/rural.txt}" or
12 "C{http://nltk.org/sample/toy.cfg}". The following URL protocols are
13 supported:
14
15 - "C{file:I{path}}": Specifies the file whose path is C{I{path}}.
16 Both relative and absolute paths may be used.
17
18 - "C{http://I{host}/I{path}}": Specifies the file stored on the web
19 server C{I{host}} at path C{I{path}}.
20
21 - "C{nltk:I{path}}": Specifies the file stored in the NLTK data
22 package at C{I{path}}. NLTK will search for these files in the
23 directories specified by L{nltk.data.path}.
24
25 If no protocol is specified, then the default protocol "C{nltk:}" will
26 be used.
27
28 This module provides two functions that can be used to access a
29 resource file, given its URL: L{load()} loads a given resource, and
30 adds it to a resource cache; and L{retrieve()} copies a given resource
31 to a local file.
32 """
33
34 import sys
35 import os, os.path
36 import textwrap
37 import weakref
38 import yaml
39 import re
40 import urllib2
41 import zipfile
42 import codecs
43
44 from gzip import GzipFile, READ as GZ_READ, WRITE as GZ_WRITE
45
46 try:
47 from zlib import Z_SYNC_FLUSH as FLUSH
48 except:
49 from zlib import Z_FINISH as FLUSH
50
51 try:
52 import cPickle as pickle
53 except:
54 import pickle
55
56 try:
57 from cStringIO import StringIO
58 except:
59 from StringIO import StringIO
60
61 import nltk
62
63
64
65
66
path = []
"""A list of directories where the NLTK data package might reside.
These directories will be checked in order when looking for a
resource in the data package. Note that this allows users to
substitute in their own versions of resources, if they have them
(e.g., in their home directory under ~/nltk_data)."""

# Directories named in the NLTK_DATA environment variable are searched
# first; empty entries (e.g. from an unset variable) are dropped.
path.extend([d for d in os.environ.get('NLTK_DATA', '').split(os.pathsep)
             if d])

# Add the per-user directory, but only if '~' can actually be expanded
# (expanduser returns its argument unchanged when it cannot).
if os.path.expanduser('~/') != '~/':
    path.append(os.path.expanduser('~/nltk_data'))

# Finally, add the platform-specific system-wide locations.
if sys.platform.startswith('win'):
    path.extend([
        r'C:\nltk_data', r'D:\nltk_data', r'E:\nltk_data',
        os.path.join(sys.prefix, 'nltk_data'),
        os.path.join(sys.prefix, 'lib', 'nltk_data'),
        os.path.join(os.environ.get('APPDATA', 'C:\\'), 'nltk_data'),
    ])
else:
    path.extend([
        '/usr/share/nltk_data',
        '/usr/local/share/nltk_data',
        '/usr/lib/nltk_data',
        '/usr/local/lib/nltk_data',
    ])
92
93
94
95
96
98 """
99 An abstract base class for 'path pointers,' used by NLTK's data
100 package to identify specific paths. Two subclasses exist:
101 L{FileSystemPathPointer} identifies a file that can be accessed
102 directly via a given absolute path. L{ZipFilePathPointer}
103 identifies a file contained within a zipfile, that can be accessed
104 by reading that zipfile.
105 """
106 - def open(self, encoding=None):
107 """
108 Return a seekable read-only stream that can be used to read
109 the contents of the file identified by this path pointer.
110
111 @raise IOError: If the path specified by this pointer does
112 not contain a readable file.
113 """
114 raise NotImplementedError('abstract base class')
115
117 """
118 Return the size of the file pointed to by this path pointer,
119 in bytes.
120
121 @raise IOError: If the path specified by this pointer does
122 not contain a readable file.
123 """
124 raise NotImplementedError('abstract base class')
125
126 - def join(self, fileid):
127 """
128 Return a new path pointer formed by starting at the path
129 identified by this pointer, and then following the relative
130 path given by C{fileid}. The path components of C{fileid}
131 should be separated by forward slashes (C{/}), regardless of
132 the underlying file system's path separator character.
133 """
134 raise NotImplementedError('abstract base class')
135
136
138 """
139 A path pointer that identifies a file which can be accessed
140 directly via a given absolute path. C{FileSystemPathPointer} is a
141 subclass of C{str} for backwards compatibility purposes --
142 this allows old code that expected C{nltk.data.find()} to return a
143 string to usually work (assuming the resource is not found in a
144 zipfile). It also permits open() to work on a FileSystemPathPointer.
145
146 """
148 """
149 Create a new path pointer for the given absolute path.
150
151 @raise IOError: If the given path does not exist.
152 """
153 path = os.path.abspath(path)
154 if not os.path.exists(path):
155 raise IOError('No such file or directory: %r' % path)
156 self._path = path
157
158
159
160
161 path = property(lambda self: self._path, doc="""
162 The absolute path identified by this path pointer.""")
163
164 - def open(self, encoding=None):
169
171 return os.stat(self._path).st_size
172
173 - def join(self, fileid):
176
178 return 'FileSystemPathPointer(%r)' % self._path
179
182
183
185 """
186 A C{GzipFile} subclass that buffers calls to L{read()} and L{write()}.
187 This allows faster reads and writes of data to and from gzip-compressed
188 files at the cost of using more memory.
189
190 The default buffer size is 2mb.
191
192 C{BufferedGzipFile} is useful for loading large gzipped pickle objects
193 as well as writing large encoded feature files for classifier training.
194 """
195 SIZE = 2 * 2**20
196
197 - def __init__(self, filename=None, mode=None, compresslevel=9,
198 fileobj=None, **kwargs):
199 """
200 @return: a buffered gzip file object
201 @rtype: C{BufferedGzipFile}
202 @param filename: a filesystem path
203 @type filename: C{str}
204 @param mode: a file mode which can be any of 'r', 'rb', 'a', 'ab',
205 'w', or 'wb'
206 @type mode: C{str}
207 @param compresslevel: The compresslevel argument is an integer from 1
208 to 9 controlling the level of compression; 1 is fastest and
209 produces the least compression, and 9 is slowest and produces the
210 most compression. The default is 9.
211 @type compresslevel: C{int}
212 @param fileobj: a StringIO stream to read from instead of a file.
213 @type fileobj: C{StringIO}
214 @kwparam size: number of bytes to buffer during calls to
215 L{read()} and L{write()}
216 @type size: C{int}
217 """
218 GzipFile.__init__(self, filename, mode, compresslevel, fileobj)
219 self._size = kwargs.get('size', self.SIZE)
220 self._buffer = StringIO()
221
222 self._len = 0
223
225
226
227 self._buffer = StringIO()
228 self._len = 0
229
235
242
249
250 - def flush(self, lib_mode=FLUSH):
251 self._buffer.flush()
252 GzipFile.flush(self, lib_mode)
253
def read(self, size=None):
    """
    Read decompressed data from the gzip file.

    @param size: The number of bytes to read.  If omitted (or otherwise
        false), the rest of the file is read, C{self._size} bytes at a
        time, and returned as a single string.
    @type size: C{int}
    @return: The decompressed data; an empty string at end of file.
    """
    if size:
        # An explicit size is a plain pass-through to GzipFile.
        return GzipFile.read(self, size)

    # No size given: drain the file in buffer-sized blocks.
    chunk_size = self._size
    collected = StringIO()
    while True:
        block = GzipFile.read(self, chunk_size)
        if not block:
            collected.flush()
            break
        collected.write(block)
    return collected.getvalue()
267
def write(self, data, size=-1):
    """
    Write C{data}, routing it either to C{_write_buffer} or to
    C{_write_gzip} depending on how much has been buffered so far.

    @param data: C{str} to write to file or buffer
    @type data: C{str}
    @param size: buffer at least size bytes before writing to file.
        A false value (C{0} or C{None}) selects this object's own
        buffer size.
    @type size: C{int}
    """
    # NOTE(review): with the default size of -1 the comparison below can
    # never be true, so every call takes the _write_gzip path -- confirm
    # that this is the intended default.
    limit = size or self._size
    fits_in_buffer = self._len + len(data) <= limit
    if fits_in_buffer:
        self._write_buffer(data)
    else:
        self._write_gzip(data)
281
282
284 """
285 A subclass of C{FileSystemPathPointer} that identifies a gzip-compressed
286 file located at a given absolute path. C{GzipFileSystemPathPointer} is
287 appropriate for loading large gzip-compressed pickle objects efficiently.
288 """
289 - def open(self, encoding=None):
294
295
297 """
298 A path pointer that identifies a file contained within a zipfile,
299 which can be accessed by reading that zipfile.
300 """
332
333 zipfile = property(lambda self: self._zipfile, doc="""
334 The C{zipfile.ZipFile} object used to access the zip file
335 containing the entry identified by this path pointer.""")
336 entry = property(lambda self: self._entry, doc="""
337 The name of the file within C{zipfile} that this path
338 pointer points to.""")
339
340 - def open(self, encoding=None):
348
351
352 - def join(self, fileid):
355
357 return 'ZipFilePathPointer(%r, %r)' % (
358 self._zipfile.filename, self._entry)
359
360
361
362
363
364
365
366 _resource_cache = {}
367 """A dictionary used to cache resources so that they won't
368 need to be loaded more than once."""
369
def find(resource_name):
    """
    Locate a resource by searching the directories and zip files
    listed in L{nltk.data.path}, and return a path pointer to it.  If
    the resource cannot be found, raise a C{LookupError} whose message
    points the user at the NLTK downloader.

    Zip File Handling:

      - If C{resource_name} contains a component with a C{.zip}
        extension, that component is taken to be a zipfile, and the
        remaining path components are looked up inside it.

      - Any element of C{nltk.data.path} that is a file with a C{.zip}
        extension is treated as a zipfile.

      - If a resource name with no zipfile component is not found on
        the first pass, C{find()} makes a second attempt, replacing
        each component I{p} of the path in turn with I{p.zip/p}.  For
        example, this lets C{find()} map the resource name
        C{corpora/chat80/cities.pl} to a zip file path pointer to
        C{corpora/chat80.zip/chat80/cities.pl}.

      - When using C{find()} to locate a directory contained in a
        zipfile, the resource name I{must} end with the C{'/'}
        character.  Otherwise, C{find()} will not locate the
        directory.

    @type resource_name: C{str}
    @param resource_name: The name of the resource to search for.
        Resource names are posix-style relative path names, such as
        C{'corpora/brown'}.  In particular, directory names should
        always be separated by the C{'/'} character, which will be
        automatically converted to a platform-appropriate path
        separator.
    @return: A C{FileSystemPathPointer}, C{GzipFileSystemPathPointer}
        (for names ending in C{.gz}) or C{ZipFilePathPointer}.
    @raise LookupError: If the resource is not found anywhere.
    """
    # Split off a leading '<something>.zip' component, if present.  The
    # empty alternative after '|' lets the pattern match any name, in
    # which case both groups are None.
    zip_name, zip_entry = re.match('(.*\.zip)/?(.*)$|',
                                   resource_name).groups()
    has_zip_component = zip_name is not None

    # First pass: look for the resource under each search-path entry.
    for location in path:
        if os.path.isfile(location) and location.endswith('.zip'):
            # The search-path entry is itself a zipfile.
            try:
                return ZipFilePathPointer(location, resource_name)
            except IOError:
                continue

        if not os.path.isdir(location):
            continue

        if has_zip_component:
            candidate = os.path.join(location, *zip_name.split('/'))
            if os.path.exists(candidate):
                try:
                    return ZipFilePathPointer(candidate, zip_entry)
                except IOError:
                    continue
        else:
            candidate = os.path.join(location, *resource_name.split('/'))
            if os.path.exists(candidate):
                if candidate.endswith('.gz'):
                    return GzipFileSystemPathPointer(candidate)
                return FileSystemPathPointer(candidate)

    # Second pass: retry with each component p rewritten as p.zip/p.
    if not has_zip_component:
        pieces = resource_name.split('/')
        for i, piece in enumerate(pieces):
            modified_name = '/'.join(pieces[:i] + [piece + '.zip'] +
                                     pieces[i:])
            try:
                return find(modified_name)
            except LookupError:
                pass

    # Nothing matched: build the boxed "not found" message.
    msg = textwrap.fill(
        'Resource %r not found. Please use the NLTK Downloader to '
        'obtain the resource: >>> nltk.download().' %
        (resource_name,), initial_indent=' ', subsequent_indent=' ',
        width=66)
    msg += '\n Searched in:' + ''.join('\n - %r' % d for d in path)
    sep = '*'*70
    raise LookupError('\n%s\n%s\n%s' % (sep, msg, sep))
456
def retrieve(resource_url, filename=None, verbose=True):
    """
    Copy the given resource to a local file.  If no filename is
    specified, then use the URL's filename.  If there is already a
    file named C{filename}, then raise a C{ValueError}.

    @type resource_url: C{str}
    @param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is C{"nltk:"}, which searches
        for the file in the NLTK data package.
    @type filename: C{str}
    @param filename: The local file to write to.  Defaults to the last
        path component of C{resource_url}.
    @type verbose: C{bool}
    @param verbose: If true, print a message naming the source and the
        destination before copying.
    @raise ValueError: If a file named C{filename} already exists.
    """
    if filename is None:
        if resource_url.startswith('file:'):
            # Take the name from the URL's path.  (This branch used to
            # split C{filename}, which is always None here.)
            local_path = resource_url[len('file:'):]
            filename = os.path.split(local_path)[-1]
        else:
            filename = re.sub(r'(^\w+:)?.*/', '', resource_url)
    if os.path.exists(filename):
        filename = os.path.abspath(filename)
        raise ValueError("File %r already exists!" % filename)

    if verbose:
        # A single parenthesised argument prints identically under the
        # Python 2 print statement.
        print('Retrieving %r, saving to %r' % (resource_url, filename))

    # Open the source first, so that a missing resource does not leave
    # an empty output file behind; close both streams even on error.
    infile = _open(resource_url)
    try:
        outfile = open(filename, 'wb')
        try:
            while True:
                chunk = infile.read(1024*64)
                if not chunk:
                    break
                outfile.write(chunk)
        finally:
            outfile.close()
    finally:
        infile.close()
493
494
495
496
497 FORMATS = {
498 'pickle': "A serialized python object, stored using the pickle module.",
499 'yaml': "A serialized python object, stored using the yaml module.",
500 'cfg': "A context free grammar, parsed by nltk.parse_cfg().",
501 'pcfg': "A probabilistic CFG, parsed by nltk.parse_pcfg().",
502 'fcfg': "A feature CFG, parsed by nltk.parse_fcfg().",
503 'fol': "A list of first order logic expressions, parsed by "
504 "nltk.sem.parse_fol() using nltk.sem.logic.LogicParser.",
505 'logic': "A list of first order logic expressions, parsed by "
506 "nltk.sem.parse_logic(). Requires an additional logic_parser "
507 "parameter",
508 'val': "A semantic valuation, parsed by nltk.sem.parse_valuation().",
509 'raw': "The raw (byte string) contents of a file.",
510 }
511
512
513
514
515 AUTO_FORMATS = {
516 'pickle': 'pickle',
517 'yaml': 'yaml',
518 'cfg': 'cfg',
519 'pcfg': 'pcfg',
520 'fcfg': 'fcfg',
521 'fol': 'fol',
522 'logic': 'logic',
523 'val': 'val'}
524
def load(resource_url, format='auto', cache=True, verbose=False,
         logic_parser=None, fstruct_parser=None):
    """
    Load a given resource from the NLTK data package.  The following
    resource formats are currently supported:
      - C{'pickle'}
      - C{'yaml'}
      - C{'cfg'} (context free grammars)
      - C{'pcfg'} (probabilistic CFGs)
      - C{'fcfg'} (feature-based CFGs)
      - C{'fol'} (formulas of First Order Logic)
      - C{'logic'} (Logical formulas to be parsed by the given logic_parser)
      - C{'val'} (valuation of First Order Logic model)
      - C{'raw'}

    If no format is specified, C{load()} will attempt to determine a
    format based on the resource name's file extension (looking past a
    trailing C{.gz}).  If that fails, C{load()} will raise a
    C{ValueError} exception.

    @type resource_url: C{str}
    @param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is C{"nltk:"}, which searches
        for the file in the NLTK data package.
    @type format: C{str}
    @param format: One of the format names listed above, or C{'auto'}.
    @type cache: C{bool}
    @param cache: If true, add this resource to a cache.  If C{load}
        finds a resource in its cache, then it will return it from the
        cache rather than loading it.  Entries are keyed on the URL
        together with the format, so the same URL loaded in two
        formats yields two distinct values.  Cached values are kept
        until the cache is cleared (see L{clear_cache()}).

    @type verbose: C{bool}
    @param verbose: If true, print a message when loading a resource,
        and a different message when it is served from the cache.

    @type logic_parser: C{LogicParser}
    @param logic_parser: The parser that will be used to parse logical
        expressions.
    @type fstruct_parser: C{FeatStructParser}
    @param fstruct_parser: The parser that will be used to parse the
        feature structure of an fcfg.
    @raise ValueError: If the format cannot be determined from the
        file extension, or is not a known format.
    """
    # Resolve 'auto' first, so the cache key always names a concrete
    # format.
    if format == 'auto':
        resource_url_parts = resource_url.split('.')
        ext = resource_url_parts[-1]
        # Look past a '.gz' suffix, but only if something precedes it.
        if ext == 'gz' and len(resource_url_parts) > 1:
            ext = resource_url_parts[-2]
        format = AUTO_FORMATS.get(ext)
        if format is None:
            raise ValueError('Could not determine format for %s based '
                             'on its file\nextension; use the "format" '
                             'argument to specify the format explicitly.'
                             % resource_url)
    elif format not in FORMATS:
        # Reject unknown formats before touching the file system/network.
        raise ValueError('Unknown format type!')

    # If we've cached this resource in this format, return it.
    cache_key = (resource_url, format)
    if cache:
        resource_val = _resource_cache.get(cache_key)
        if resource_val is not None:
            if verbose:
                print('<<Using cached copy of %s>>' % (resource_url,))
            return resource_val

    # Let the user know what's going on.
    if verbose:
        print('<<Loading %s>>' % (resource_url,))

    # Load the resource, making sure the stream is closed afterwards.
    stream = _open(resource_url)
    try:
        if format == 'pickle':
            # NOTE(security): unpickling can execute arbitrary code;
            # only load pickles from sources you trust.
            resource_val = pickle.load(stream)
        elif format == 'yaml':
            # NOTE(security): yaml.load can construct arbitrary Python
            # objects; only load YAML from sources you trust.
            resource_val = yaml.load(stream)
        elif format == 'cfg':
            resource_val = nltk.grammar.parse_cfg(stream.read())
        elif format == 'pcfg':
            resource_val = nltk.grammar.parse_pcfg(stream.read())
        elif format == 'fcfg':
            resource_val = nltk.grammar.parse_fcfg(
                stream.read(), logic_parser=logic_parser,
                fstruct_parser=fstruct_parser)
        elif format == 'fol':
            resource_val = nltk.sem.parse_logic(
                stream.read(), logic_parser=nltk.sem.logic.LogicParser())
        elif format == 'logic':
            resource_val = nltk.sem.parse_logic(
                stream.read(), logic_parser=logic_parser)
        elif format == 'val':
            resource_val = nltk.sem.parse_valuation(stream.read())
        elif format == 'raw':
            resource_val = stream.read()
        else:
            # Only reachable if FORMATS lists a format that has no
            # loader branch above.
            raise ValueError('Unknown format type!')
    finally:
        stream.close()

    # If requested, add it to the cache.
    if cache:
        _resource_cache[cache_key] = resource_val

    return resource_val
629
def show_cfg(resource_url, escape='##'):
    """
    Print a grammar file to standard output, leaving out empty lines
    and lines that start with the escape prefix.

    @type resource_url: C{str}
    @param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is C{"nltk:"}, which searches
        for the file in the NLTK data package.
    @type escape: C{str}
    @param escape: Prepended string that signals lines to be ignored
    """
    # The grammar is fetched as raw text and deliberately not cached.
    grammar_text = load(resource_url, format='raw', cache=False)
    for line in grammar_text.splitlines():
        if line.startswith(escape) or re.match('^$', line):
            continue
        print(line)
646
647
649 """
650 Remove all objects from the resource cache.
651 @see: L{load()}
652 """
653 _resource_cache.clear()
654
656 """
657 Helper function that returns an open file object for a resource,
658 given its resource URL. If the given resource URL uses the 'nltk'
659 protocol, or uses no protocol, then use L{nltk.data.find} to find
660 its path, and open it with the given mode; if the resource URL
661 uses the 'file' protocol, then open the file with the given mode;
662 otherwise, delegate to C{urllib2.urlopen}.
663
664 @type resource_url: C{str}
665 @param resource_url: A URL specifying where the resource should be
666 loaded from. The default protocol is C{"nltk:"}, which searches
667 for the file in the the NLTK data package.
668 """
669
670 protocol, path = re.match('(?:(\w+):)?(.*)', resource_url).groups()
671
672 if protocol is None or protocol.lower() == 'nltk':
673 return find(path).open()
674 elif protocol.lower() == 'file':
675
676 return open(path, 'rb')
677 else:
678 return urllib2.urlopen(resource_url)
679
680
681
682
683
687
689 resource = load(self.__path)
690
691
692
693 self.__dict__ = resource.__dict__
694 self.__class__ = resource.__class__
695
697 self.__load()
698
699
700 return getattr(self, attr)
701
703 self.__load()
704
705
706 return '%r' % self
707
708
709
710
711
713 """
714 A subclass of C{zipfile.ZipFile} that closes its file pointer
715 whenever it is not using it; and re-opens it when it needs to read
716 data from the zipfile. This is useful for reducing the number of
717 open file handles when many zip files are being accessed at once.
718 C{OpenOnDemandZipFile} must be constructed from a filename, not a
719 file-like object (to allow re-opening). C{OpenOnDemandZipFile} is
720 read-only (i.e., C{write} and C{writestr} are disabled).
721 """
728
729 - def read(self, name):
735
736 - def write(self, *args, **kwargs):
737 """@raise NotImplementedError: OpenOnDemandZipfile is read-only"""
738 raise NotImplementedError('OpenOnDemandZipfile is read-only')
739
741 """@raise NotImplementedError: OpenOnDemandZipfile is read-only"""
742 raise NotImplementedError('OpenOnDemandZipfile is read-only')
743
745 return 'OpenOnDemandZipFile(%r)' % self.filename
746
747
748
749
750
752 """
753 A stream reader that automatically decodes the source byte stream
754 into unicode (like C{codecs.StreamReader}); but still supports the
755 C{seek()} and C{tell()} operations correctly. This is in contrast
756 to C{codecs.StreamReader}, which provides *broken* C{seek()} and
757 C{tell()} methods.
758
759 This class was motivated by L{StreamBackedCorpusView}, which
760 makes extensive use of C{seek()} and C{tell()}, and needs to be
761 able to handle unicode-encoded files.
762
763 Note: this class requires stateless decoders. To my knowledge,
764 this shouldn't cause a problem with any of python's builtin
765 unicode encodings.
766 """
767 DEBUG = True
768
769 - def __init__(self, stream, encoding, errors='strict'):
770
771 stream.seek(0)
772
773 self.stream = stream
774 """The underlying stream."""
775
776 self.encoding = encoding
777 """The name of the encoding that should be used to decode the
778 underlying stream."""
779
780 self.errors = errors
781 """The error mode that should be used when decoding data from
782 the underlying stream. Can be 'strict', 'ignore', or
783 'replace'."""
784
785 self.decode = codecs.getdecoder(encoding)
786 """The function that is used to decode byte strings into
787 unicode strings."""
788
789 self.bytebuffer = ''
790 """A buffer to hold bytes that have been read but have not yet
791 been decoded. This is only used when the final bytes from
792 a read do not form a complete encoding for a character."""
793
794 self.linebuffer = None
795 """A buffer used by L{readline()} to hold characters that have
796 been read, but have not yet been returned by L{read()} or
797 L{readline()}. This buffer consists of a list of unicode
798 strings, where each string corresponds to a single line.
799 The final element of the list may or may not be a complete
800 line. Note that the existence of a linebuffer makes the
801 L{tell()} operation more complex, because it must backtrack
802 to the beginning of the buffer to determine the correct
803 file position in the underlying byte stream."""
804
805 self._rewind_checkpoint = 0
806 """The file position at which the most recent read on the
807 underlying stream began. This is used, together with
808 L{_rewind_numchars}, to backtrack to the beginning of
809 L{linebuffer} (which is required by L{tell()})."""
810
811 self._rewind_numchars = None
812 """The number of characters that have been returned since the
813 read that started at L{_rewind_checkpoint}. This is used,
814 together with L{_rewind_checkpoint}, to backtrack to the
815 beginning of L{linebuffer} (which is required by
816 L{tell()})."""
817
818 self._bom = self._check_bom()
819 """The length of the byte order marker at the beginning of
820 the stream (or C{None} for no byte order marker)."""
821
822
823
824
825
def read(self, size=None):
    """
    Read up to C{size} bytes, decode them using this reader's
    encoding, and return the resulting unicode string.  Any text
    still held in the line buffer is returned in front of the newly
    decoded text, and the line buffer is then discarded.

    @param size: The maximum number of bytes to read.  If not
        specified, then read as many bytes as possible.

    @rtype: C{unicode}
    """
    decoded = self._read(size)

    pending_lines = self.linebuffer
    if not pending_lines:
        return decoded

    # Hand back the buffered lines first, then drop the buffer along
    # with the rewind bookkeeping that described it.
    self.linebuffer = None
    self._rewind_numchars = None
    return ''.join(pending_lines) + decoded
845
847 """
848 Read a line of text, decode it using this reader's encoding,
849 and return the resulting unicode string.
850
851 @param size: The maximum number of bytes to read. If no
852 newline is encountered before C{size} bytes have been
853 read, then the returned value may not be a complete line
854 of text.
855 """
856
857
858
859 if self.linebuffer and len(self.linebuffer) > 1:
860 line = self.linebuffer.pop(0)
861 self._rewind_numchars += len(line)
862 return line
863
864 readsize = size or 72
865 chars = ''
866
867
868 if self.linebuffer:
869 chars += self.linebuffer.pop()
870 self.linebuffer = None
871
872 while True:
873 startpos = self.stream.tell() - len(self.bytebuffer)
874 new_chars = self._read(readsize)
875
876
877
878 if new_chars and new_chars.endswith('\r'):
879 new_chars += self._read(1)
880
881 chars += new_chars
882 lines = chars.splitlines(True)
883 if len(lines) > 1:
884 line = lines[0]
885 self.linebuffer = lines[1:]
886 self._rewind_numchars = len(new_chars)-(len(chars)-len(line))
887 self._rewind_checkpoint = startpos
888 break
889 elif len(lines) == 1:
890 line0withend = lines[0]
891 line0withoutend = lines[0].splitlines(False)[0]
892 if line0withend != line0withoutend:
893 line = line0withend
894 break
895
896 if not new_chars or size is not None:
897 line = chars
898 break
899
900
901 if readsize < 8000:
902 readsize *= 2
903
904 return line
905
906 - def readlines(self, sizehint=None, keepends=True):
907 """
908 Read this file's contents, decode them using this reader's
909 encoding, and return it as a list of unicode lines.
910
911 @rtype: C{list} of C{unicode}
912 @param sizehint: Ignored.
913 @param keepends: If false, then strip newlines.
914 """
915 return self.read().splitlines(keepends)
916
918 """Return the next decoded line from the underlying stream."""
919 line = self.readline()
920 if line: return line
921 else: raise StopIteration
922
924 """Return self"""
925 return self
926
928 """Return self"""
929 return self
930
931
932
933
934
935 closed = property(lambda self: self.stream.closed, doc="""
936 True if the underlying stream is closed.""")
937
938 name = property(lambda self: self.stream.name, doc="""
939 The name of the underlying stream.""")
940
941 mode = property(lambda self: self.stream.mode, doc="""
942 The mode of the underlying stream.""")
943
945 """
946 Close the underlying stream.
947 """
948 self.stream.close()
949
950
951
952
953
def seek(self, offset, whence=0):
    """
    Move the stream to a new file position.  If the reader is
    maintaining any buffers, then they will be cleared.

    @param offset: A byte count offset.
    @param whence: If C{whence} is 0, then the offset is from the
        start of the file (offset should be positive).  If C{whence}
        is 2, then the offset is from the end of the file (offset
        should typically be negative).  A C{whence} of 1 (relative
        to the current position) is rejected.
    @raise ValueError: If C{whence} is 1.
    """
    relative_seek = (whence == 1)
    if relative_seek:
        raise ValueError('Relative seek is not supported for '
                         'SeekableUnicodeStreamReader -- consider '
                         'using char_seek_forward() instead.')

    underlying = self.stream
    underlying.seek(offset, whence)

    # Everything buffered refers to the old position, so discard it
    # and restart the rewind bookkeeping from the new position.
    self.bytebuffer = ''
    self.linebuffer = None
    self._rewind_numchars = None
    self._rewind_checkpoint = underlying.tell()
976
978 """
979 Move the read pointer forward by C{offset} characters.
980 """
981 if offset < 0:
982 raise ValueError('Negative offsets are not supported')
983
984 self.seek(self.tell())
985
986 self._char_seek_forward(offset)
987
989 """
990 Move the file position forward by C{offset} characters,
991 ignoring all buffers.
992
993 @param est_bytes: A hint, giving an estimate of the number of
994 bytes that will be needed to move forward by C{offset} chars.
995 Defaults to C{offset}.
996 """
997 if est_bytes is None: est_bytes = offset
998 bytes = ''
999
1000 while True:
1001
1002 newbytes = self.stream.read(est_bytes-len(bytes))
1003 bytes += newbytes
1004
1005
1006 chars, bytes_decoded = self._incr_decode(bytes)
1007
1008
1009
1010 if len(chars) == offset:
1011 self.stream.seek(-len(bytes)+bytes_decoded, 1)
1012 return
1013
1014
1015
1016 if len(chars) > offset:
1017 while len(chars) > offset:
1018
1019 est_bytes += offset-len(chars)
1020 chars, bytes_decoded = self._incr_decode(bytes[:est_bytes])
1021 self.stream.seek(-len(bytes)+bytes_decoded, 1)
1022 return
1023
1024
1025 est_bytes += offset - len(chars)
1026
1028 """
1029 Return the current file position on the underlying byte
1030 stream. If this reader is maintaining any buffers, then the
1031 returned file position will be the position of the beginning
1032 of those buffers.
1033 """
1034
1035 if self.linebuffer is None:
1036 return self.stream.tell() - len(self.bytebuffer)
1037
1038
1039
1040
1041
1042 orig_filepos = self.stream.tell()
1043
1044
1045 bytes_read = ( (orig_filepos-len(self.bytebuffer)) -
1046 self._rewind_checkpoint )
1047 buf_size = sum([len(line) for line in self.linebuffer])
1048 est_bytes = (bytes_read * self._rewind_numchars /
1049 (self._rewind_numchars + buf_size))
1050
1051 self.stream.seek(self._rewind_checkpoint)
1052 self._char_seek_forward(self._rewind_numchars, est_bytes)
1053 filepos = self.stream.tell()
1054
1055
1056 if self.DEBUG:
1057 self.stream.seek(filepos)
1058 check1 = self._incr_decode(self.stream.read(50))[0]
1059 check2 = ''.join(self.linebuffer)
1060 assert check1.startswith(check2) or check2.startswith(check1)
1061
1062
1063
1064 self.stream.seek(orig_filepos)
1065
1066
1067 return filepos
1068
1069
1070
1071
1072
1073 - def _read(self, size=None):
1074 """
1075 Read up to C{size} bytes from the underlying stream, decode
1076 them using this reader's encoding, and return the resulting
1077 unicode string. C{linebuffer} is *not* included in the
1078 result.
1079 """
1080 if size == 0: return u''
1081
1082
1083 if self._bom and self.stream.tell() == 0:
1084 self.stream.read(self._bom)
1085
1086
1087 if size is None:
1088 new_bytes = self.stream.read()
1089 else:
1090 new_bytes = self.stream.read(size)
1091 bytes = self.bytebuffer + new_bytes
1092
1093
1094 chars, bytes_decoded = self._incr_decode(bytes)
1095
1096
1097 if (size is not None) and (not chars) and (len(new_bytes) > 0):
1098 while not chars:
1099 new_bytes = self.stream.read(1)
1100 if not new_bytes: break
1101 bytes += new_bytes
1102 chars, bytes_decoded = self._incr_decode(bytes)
1103
1104
1105 self.bytebuffer = bytes[bytes_decoded:]
1106
1107
1108 return chars
1109
1111 """
1112 Decode the given byte string into a unicode string, using this
1113 reader's encoding. If an exception is encountered that
1114 appears to be caused by a truncation error, then just decode
1115 the byte string without the bytes that cause the truncation
1116 error.
1117
1118 @return: A tuple C{(chars, num_consumed)}, where C{chars} is
1119 the decoded unicode string, and C{num_consumed} is the
1120 number of bytes that were consumed.
1121 """
1122 while True:
1123 try:
1124 return self.decode(bytes, 'strict')
1125 except UnicodeDecodeError, exc:
1126
1127
1128 if exc.end == len(bytes):
1129 return self.decode(bytes[:exc.start], self.errors)
1130
1131
1132 elif self.errors == 'strict':
1133 raise
1134
1135
1136
1137 else:
1138 return self.decode(bytes, self.errors)
1139
1140 _BOM_TABLE = {
1141 'utf8': [(codecs.BOM_UTF8, None)],
1142 'utf16': [(codecs.BOM_UTF16_LE, 'utf16-le'),
1143 (codecs.BOM_UTF16_BE, 'utf16-be')],
1144 'utf16le': [(codecs.BOM_UTF16_LE, None)],
1145 'utf16be': [(codecs.BOM_UTF16_BE, None)],
1146 'utf32': [(codecs.BOM_UTF32_LE, 'utf32-le'),
1147 (codecs.BOM_UTF32_BE, 'utf32-be')],
1148 'utf32le': [(codecs.BOM_UTF32_LE, None)],
1149 'utf32be': [(codecs.BOM_UTF32_BE, None)],
1150 }
1151
1153
1154 enc = re.sub('[ -]', '', self.encoding.lower())
1155
1156
1157 bom_info = self._BOM_TABLE.get(enc)
1158
1159 if bom_info:
1160
1161 bytes = self.stream.read(16)
1162 self.stream.seek(0)
1163
1164
1165 for (bom, new_encoding) in bom_info:
1166 if bytes.startswith(bom):
1167 if new_encoding: self.encoding = new_encoding
1168 return len(bom)
1169
1170 return None
1171
# Names exported by ``from nltk.data import *``.  Each name is listed
# once, and ZipFilePathPointer (the zipfile counterpart of
# FileSystemPathPointer, returned by find()) is exported alongside the
# other path pointer classes.
__all__ = ['path', 'PathPointer', 'FileSystemPathPointer', 'BufferedGzipFile',
           'GzipFileSystemPathPointer', 'ZipFilePathPointer',
           'find', 'retrieve', 'FORMATS', 'AUTO_FORMATS', 'load',
           'show_cfg', 'clear_cache', 'LazyLoader', 'OpenOnDemandZipFile',
           'SeekableUnicodeStreamReader']
1177