1
2
3
4
5
6
7
8
9 import os
10 import sys
11 import bisect
12 import re
13 import tempfile
14 try: import cPickle as pickle
15 except ImportError: import pickle
16 from itertools import islice
17
18
19 try: from xml.etree import cElementTree as ElementTree
20 except ImportError: from nltk.etree import ElementTree
21
22 from nltk.tokenize import wordpunct_tokenize
23 from nltk.internals import slice_bounds
24 from nltk.data import PathPointer, FileSystemPathPointer, ZipFilePathPointer
25 from nltk.data import SeekableUnicodeStreamReader
26 from nltk.sourcedstring import SourcedStringStream
27 from nltk.util import AbstractLazySequence, LazySubsequence, LazyConcatenation
34 """
35 A 'view' of a corpus file, which acts like a sequence of tokens:
36 it can be accessed by index, iterated over, etc. However, the
37 tokens are only constructed as-needed -- the entire corpus is
38 never stored in memory at once.
39
40 The constructor to C{StreamBackedCorpusView} takes two arguments:
41 a corpus fileid (specified as a string or as a L{PathPointer});
42 and a block reader. A X{block reader} is a function that reads
43 zero or more tokens from a stream, and returns them as a list. A
44 very simple example of a block reader is:
45
46 >>> def simple_block_reader(stream):
47 ... return stream.readline().split()
48
49 This simple block reader reads a single line at a time, and
50 returns a single token (consisting of a string) for each
51 whitespace-separated substring on the line.
52
53 When deciding how to define the block reader for a given
54 corpus, careful consideration should be given to the size of
55 blocks handled by the block reader. Smaller block sizes will
56 increase the memory requirements of the corpus view's internal
57 data structures (by 2 integers per block). On the other hand,
58 larger block sizes may decrease performance for random access to
59 the corpus. (But note that larger block sizes will I{not}
60 decrease performance for iteration.)
61
62 Internally, C{CorpusView} maintains a partial mapping from token
63 index to file position, with one entry per block. When a token
64 with a given index M{i} is requested, the C{CorpusView} constructs
65 it as follows:
66
67 1. First, it searches the toknum/filepos mapping for the token
68 index closest to (but less than or equal to) M{i}.
69
70 2. Then, starting at the file position corresponding to that
71 index, it reads one block at a time using the block reader
72 until it reaches the requested token.
73
74 The toknum/filepos mapping is created lazily: it is initially
75 empty, but every time a new block is read, the block's
76 initial token is added to the mapping. (Thus, the toknum/filepos
77 map has one entry per block.)
78
79 In order to increase efficiency for random access patterns that
80 have high degrees of locality, the corpus view may cache one or
81 more blocks.
82
83 @note: Each C{CorpusView} object internally maintains an open file
84 object for its underlying corpus file. This file should be
85 automatically closed when the C{CorpusView} is garbage collected,
86 but if you wish to close it manually, use the L{close()}
87 method. If you access a C{CorpusView}'s items after it has been
88 closed, the file object will be automatically re-opened.
89
90 @warning: If the contents of the file are modified during the
91 lifetime of the C{CorpusView}, then the C{CorpusView}'s behavior
92 is undefined.
93
94 @warning: If a unicode encoding is specified when constructing a
95 C{CorpusView}, then the block reader may only call
96 C{stream.seek()} with offsets that have been returned by
97 C{stream.tell()}; in particular, calling C{stream.seek()} with
98 relative offsets, or with offsets based on string lengths, may
99 lead to incorrect behavior.
100
101 @ivar _block_reader: The function used to read
102 a single block from the underlying file stream.
103 @ivar _toknum: A list containing the token index of each block
104 that has been processed. In particular, C{_toknum[i]} is the
105 token index of the first token in block C{i}. Together
106 with L{_filepos}, this forms a partial mapping between token
107 indices and file positions.
108 @ivar _filepos: A list containing the file position of each block
109 that has been processed. In particular, C{_filepos[i]} is the
110 file position of the first character in block C{i}. Together
111 with L{_toknum}, this forms a partial mapping between token
112 indices and file positions.
113 @ivar _stream: The stream used to access the underlying corpus file.
114 @ivar _len: The total number of tokens in the corpus, if known;
115 or C{None}, if the number of tokens is not yet known.
116 @ivar _eofpos: The character position of the last character in the
117 file. This is calculated when the corpus view is initialized,
118 and is used to decide when the end of file has been reached.
119 @ivar _cache: A cache of the most recently read block. It
120 is encoded as a tuple (start_toknum, end_toknum, tokens), where
121 start_toknum is the token index of the first token in the block;
122 end_toknum is the token index of the first token not in the
123 block; and tokens is a list of the tokens in the block.
124 """
125 - def __init__(self, fileid, block_reader=None, startpos=0,
126 encoding=None, source=None):
127 """
128 Create a new corpus view, based on the file C{fileid}, and
129 read with C{block_reader}. See the class documentation
130 for more information.
131
132 @param fileid: The path to the file that is read by this
133 corpus view. C{fileid} can either be a string or a
134 L{PathPointer}.
135
136 @param startpos: The file position at which the view will
137 start reading. This can be used to skip over preface
138 sections.
139
140 @param encoding: The unicode encoding that should be used to
141 read the file's contents. If no encoding is specified,
142 then the file's contents will be read as a non-unicode
143 string (i.e., a C{str}).
144
145 @param source: If specified, then use an L{SourcedStringStream}
146 to annotate all strings read from the file with
147 information about their start offset, end ofset,
148 and docid. The value of ``source`` will be used as the docid.
149 """
150 if block_reader:
151 self.read_block = block_reader
152
153 self._toknum = [0]
154 self._filepos = [startpos]
155 self._encoding = encoding
156 self._source = source
157
158 self._len = None
159
160 self._fileid = fileid
161 self._stream = None
162
163 self._current_toknum = None
164 """This variable is set to the index of the next token that
165 will be read, immediately before L{self.read_block()} is
166 called. This is provided for the benefit of the block
167 reader, which under rare circumstances may need to know
168 the current token number."""
169
170 self._current_blocknum = None
171 """This variable is set to the index of the next block that
172 will be read, immediately before L{self.read_block()} is
173 called. This is provided for the benefit of the block
174 reader, which under rare circumstances may need to know
175 the current block number."""
176
177
178 try:
179 if isinstance(self._fileid, PathPointer):
180 self._eofpos = self._fileid.file_size()
181 else:
182 self._eofpos = os.stat(self._fileid).st_size
183 except Exception, exc:
184 raise ValueError('Unable to open or access %r -- %s' %
185 (fileid, exc))
186
187
188
189 self._cache = (-1, -1, None)
190
191 fileid = property(lambda self: self._fileid, doc="""
192 The fileid of the file that is accessed by this view.
193
194 @type: C{str} or L{PathPointer}""")
195
197 """
198 Read a block from the input stream.
199
200 @return: a block of tokens from the input stream
201 @rtype: list of any
202 @param stream: an input stream
203 @type stream: stream
204 """
205 raise NotImplementedError('Abstract Method')
206
208 """
209 Open the file stream associated with this corpus view. This
210 will be performed if any value is read from the view
211 while its file stream is closed.
212 """
213 if isinstance(self._fileid, PathPointer):
214 self._stream = self._fileid.open(self._encoding)
215 elif self._encoding:
216 self._stream = SeekableUnicodeStreamReader(
217 open(self._fileid, 'rb'), self._encoding)
218 else:
219 self._stream = open(self._fileid, 'rb')
220 if self._source is not None:
221 self._stream = SourcedStringStream(self._stream, self._source)
222
224 """
225 Close the file stream associated with this corpus view. This
226 can be useful if you are worried about running out of file
227 handles (although the stream should automatically be closed
228 upon garbage collection of the corpus view). If the corpus
229 view is accessed after it is closed, it will be automatically
230 re-opened.
231 """
232 if self._stream is not None:
233 self._stream.close()
234 self._stream = None
235
237 if self._len is None:
238
239
240 for tok in self.iterate_from(self._toknum[-1]): pass
241 return self._len
242
265
266
267
269
270 if self._cache[0] <= start_tok < self._cache[1]:
271 for tok in self._cache[2][start_tok-self._cache[0]:]:
272 yield tok
273 start_tok += 1
274
275
276
277
278 if start_tok < self._toknum[-1]:
279 block_index = bisect.bisect_right(self._toknum, start_tok)-1
280 toknum = self._toknum[block_index]
281 filepos = self._filepos[block_index]
282 else:
283 block_index = len(self._toknum)-1
284 toknum = self._toknum[-1]
285 filepos = self._filepos[-1]
286
287
288 if self._stream is None:
289 self._open()
290
291
292
293 while filepos < self._eofpos:
294
295 self._stream.seek(filepos)
296 self._current_toknum = toknum
297 self._current_blocknum = block_index
298 tokens = self.read_block(self._stream)
299 assert isinstance(tokens, (tuple, list, AbstractLazySequence)), (
300 'block reader %s() should return list or tuple.' %
301 self.read_block.__name__)
302 num_toks = len(tokens)
303 new_filepos = self._stream.tell()
304 assert new_filepos > filepos, (
305 'block reader %s() should consume at least 1 byte (filepos=%d)' %
306 (self.read_block.__name__, filepos))
307
308
309 self._cache = (toknum, toknum+num_toks, list(tokens))
310
311
312 assert toknum <= self._toknum[-1]
313 if num_toks > 0:
314 block_index += 1
315 if toknum == self._toknum[-1]:
316 assert new_filepos > self._filepos[-1]
317 self._filepos.append(new_filepos)
318 self._toknum.append(toknum+num_toks)
319 else:
320
321 assert new_filepos == self._filepos[block_index], (
322 'inconsistent block reader (num chars read)')
323 assert toknum+num_toks == self._toknum[block_index], (
324 'inconsistent block reader (num tokens returned)')
325
326
327 if new_filepos == self._eofpos:
328 self._len = toknum + num_toks
329
330
331
332 for tok in tokens[max(0, start_tok-toknum):]:
333 yield tok
334
335 assert new_filepos <= self._eofpos
336 if new_filepos == self._eofpos:
337 break
338
339 toknum += num_toks
340 filepos = new_filepos
341
342
343 assert self._len is not None
344
345
346
348 return concat([self, other])
350 return concat([other, self])
355
357 """
358 A 'view' of a corpus file that joins together one or more
359 L{StreamBackedCorpusViews<StreamBackedCorpusView>}. At most
360 one file handle is left open at any time.
361 """
363 self._pieces = corpus_views
364 """A list of the corpus subviews that make up this
365 concatenation."""
366
367 self._offsets = [0]
368 """A list of offsets, indicating the index at which each
369 subview begins. In particular::
370 offsets[i] = sum([len(p) for p in pieces[:i]])"""
371
372 self._open_piece = None
373 """The most recently accessed corpus subview (or C{None}).
374 Before a new subview is accessed, this subview will be closed."""
375
377 if len(self._offsets) <= len(self._pieces):
378
379 for tok in self.iterate_from(self._offsets[-1]): pass
380
381 return self._offsets[-1]
382
384 for piece in self._pieces:
385 piece.close()
386
388 piecenum = bisect.bisect_right(self._offsets, start_tok)-1
389
390 while piecenum < len(self._pieces):
391 offset = self._offsets[piecenum]
392 piece = self._pieces[piecenum]
393
394
395 if self._open_piece is not piece:
396 if self._open_piece is not None:
397 self._open_piece.close()
398 self._open_piece = piece
399
400
401 for tok in piece.iterate_from(max(0, start_tok-offset)):
402 yield tok
403
404
405 if piecenum+1 == len(self._offsets):
406 self._offsets.append(self._offsets[-1] + len(piece))
407
408
409 piecenum += 1
410
412 """
413 Concatenate together the contents of multiple documents from a
414 single corpus, using an appropriate concatenation function. This
415 utility function is used by corpus readers when the user requests
416 more than one document at a time.
417 """
418 if len(docs) == 1:
419 return docs[0]
420 if len(docs) == 0:
421 raise ValueError('concat() expects at least one object!')
422
423 types = set([d.__class__ for d in docs])
424
425
426 if types.issubset([str, unicode, basestring]):
427 return reduce((lambda a,b:a+b), docs, '')
428
429
430 for typ in types:
431 if not issubclass(typ, (StreamBackedCorpusView,
432 ConcatenatedCorpusView)):
433 break
434 else:
435 return ConcatenatedCorpusView(docs)
436
437
438 for typ in types:
439 if not issubclass(typ, AbstractLazySequence):
440 break
441 else:
442 return LazyConcatenation(docs)
443
444
445 if len(types) == 1:
446 typ = list(types)[0]
447
448 if issubclass(typ, list):
449 return reduce((lambda a,b:a+b), docs, [])
450
451 if issubclass(typ, tuple):
452 return reduce((lambda a,b:a+b), docs, ())
453
454 if ElementTree.iselement(typ):
455 xmltree = ElementTree.Element('documents')
456 for doc in docs: xmltree.append(doc)
457 return xmltree
458
459
460 raise ValueError("Don't know how to concatenate types: %r" % types)
461
467 """
468 A stream backed corpus view for corpus files that consist of
469 sequences of serialized Python objects (serialized using
470 C{pickle.dump}). One use case for this class is to store the
471 result of running feature detection on a corpus to disk. This can
472 be useful when performing feature detection is expensive (so we
473 don't want to repeat it); but the corpus is too large to store in
474 memory. The following example illustrates this technique:
475
476 >>> feature_corpus = LazyMap(detect_features, corpus)
477 >>> PickleCorpusView.write(feature_corpus, some_fileid)
478 >>> pcv = PickleCorpusView(some_fileid)
479 """
480 BLOCK_SIZE = 100
481 PROTOCOL = -1
482
def __init__(self, fileid, delete_on_gc=False):
    """
    Create a new corpus view that reads the pickle corpus stored
    in C{fileid}.

    @param fileid: The pickle corpus file; it is handed on
        unchanged to C{StreamBackedCorpusView.__init__}.
    @param delete_on_gc: If true, then C{fileid} will be deleted
        whenever this object gets garbage-collected.
    """
    # The flag is stored before the base-class constructor runs.
    self._delete_on_gc = delete_on_gc
    StreamBackedCorpusView.__init__(self, fileid)
493
500
502 """
503 If C{delete_on_gc} was set to true when this
504 C{PickleCorpusView} was created, then delete the corpus view's
505 fileid. (This method is called whenever a
506 C{PickleCorpusView} is garbage-collected.)
507 """
508 if getattr(self, '_delete_on_gc'):
509 if os.path.exists(self._fileid):
510 try: os.remove(self._fileid)
511 except (OSError, IOError): pass
512 self.__dict__.clear()
513
@classmethod
def write(cls, sequence, output_file):
    """
    Serialize every item of C{sequence} to C{output_file}, one
    C{pickle.dump} call per item, using protocol C{cls.PROTOCOL}.

    @param sequence: An iterable of picklable objects.
    @param output_file: Either a filename or a file object that is
        open for binary writing.  A filename is opened in C{'wb'}
        mode and is closed again before this method returns (even
        if pickling fails).  A file object supplied by the caller
        is written to but left open; closing it remains the
        caller's responsibility.
    """
    # ``basestring`` only exists on Python 2; fall back to ``str``
    # so that filenames are still recognised elsewhere.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    opened_here = isinstance(output_file, string_types)
    if opened_here:
        output_file = open(output_file, 'wb')
    try:
        for item in sequence:
            pickle.dump(item, output_file, cls.PROTOCOL)
    finally:
        # Only close what this method opened itself; otherwise the
        # handle would leak and buffered data might not be flushed.
        if opened_here:
            output_file.close()
520
521 @classmethod
523 """
524 Write the given sequence to a temporary file as a pickle
525 corpus; and then return a C{PickleCorpusView} view for that
526 temporary corpus file.
527
528 @param delete_on_gc: If true, then the temporary file will be
529 deleted whenever this object gets garbage-collected.
530 """
531 try:
532 fd, output_file_name = tempfile.mkstemp('.pcv', 'nltk-')
533 output_file = os.fdopen(fd, 'wb')
534 cls.write(sequence, output_file)
535 output_file.close()
536 return PickleCorpusView(output_file_name, delete_on_gc)
537 except (OSError, IOError), e:
538 raise ValueError('Error while creating temp file: %s' % e)
539
551
557
565
567 s = ''
568 while True:
569 line = stream.readline()
570
571 if not line:
572 if s: return [s]
573 else: return []
574
575 elif line and not line.strip():
576 if s: return [s]
577
578 else:
579 s += line
580
582 s = ''
583 while True:
584 line = stream.readline()
585 if line[0] == '=' or line[0] == '\n' or line[:2] == '\r\n':
586 continue
587
588 if not line:
589 if s: return [s]
590 else: return []
591
592 else:
593 s += line
594 if re.match('^[0-9]-[0-9]', line) != None:
595 return [s]
596
598 """
599 Read a sequence of tokens from a stream, where tokens begin with
600 lines that match C{start_re}. If C{end_re} is specified, then
601 tokens end with lines that match C{end_re}; otherwise, tokens end
602 whenever the next line matching C{start_re} or EOF is found.
603 """
604
605 while True:
606 line = stream.readline()
607 if not line: return []
608 if re.match(start_re, line): break
609
610
611 lines = [line]
612 while True:
613 oldpos = stream.tell()
614 line = stream.readline()
615
616 if not line:
617 return [''.join(lines)]
618
619 if end_re is not None and re.match(end_re, line):
620 return [''.join(lines)]
621
622
623 if end_re is None and re.match(start_re, line):
624 stream.seek(oldpos)
625 return [''.join(lines)]
626
627 lines.append(line)
628
630 """
631 Read a sequence of s-expressions from the stream, and leave the
632 stream's file position at the end of the last complete s-expression
633 read. This function will always return at least one s-expression,
634 unless there are no more s-expressions in the file.
635
636 If the file ends in the middle of an s-expression, then that
637 incomplete s-expression is returned when the end of the file is
638 reached.
639
640 @param block_size: The default block size for reading. If an
641 s-expression is longer than one block, then more than one
642 block will be read.
643 @param comment_char: A character that marks comments. Any lines
644 that begin with this character will be stripped out.
645 (If spaces or tabs precede the comment character, then the
646 line will not be stripped.)
647 """
648 start = stream.tell()
649 block = stream.read(block_size)
650 encoding = getattr(stream, 'encoding', None)
651 assert encoding is not None or isinstance(block, str)
652 if encoding not in (None, 'utf-8'):
653 import warnings
654 warnings.warn('Parsing may fail, depending on the properties '
655 'of the %s encoding!' % encoding)
656
657
658
659 if comment_char:
660 COMMENT = re.compile('(?m)^%s.*$' % re.escape(comment_char))
661 while True:
662 try:
663
664
665
666
667 if comment_char:
668 block += stream.readline()
669 block = re.sub(COMMENT, _sub_space, block)
670
671 tokens, offset = _parse_sexpr_block(block)
672
673 offset = re.compile(r'\s*').search(block, offset).end()
674
675
676 if encoding is None:
677 stream.seek(start+offset)
678 else:
679 stream.seek(start+len(block[:offset].encode(encoding)))
680
681
682 return tokens
683 except ValueError, e:
684 if e.args[0] == 'Block too small':
685 next_block = stream.read(block_size)
686 if next_block:
687 block += next_block
688 continue
689 else:
690
691 return [block.strip()]
692 else: raise
693
695 """Helper function: given a regexp match, return a string of
696 spaces that's the same length as the matched string."""
697 return ' '*(m.end()-m.start())
698
735
768
778
784
785 para = ''
786 while True:
787 line = stream.readline()
788
789 if re.match('======+\s*$', line):
790 if para.strip(): return [para]
791
792 elif line == '':
793 if para.strip(): return [para]
794 else: return []
795
796 else:
797 para += line
798