Package nltk :: Package corpus :: Package reader :: Module util
[hide private]
[frames | no frames]

Source Code for Module nltk.corpus.reader.util

  1  # Natural Language Toolkit: Corpus Reader Utilities 
  2  # 
  3  # Copyright (C) 2001-2011 NLTK Project 
  4  # Author: Steven Bird <sb@ldc.upenn.edu> 
  5  #         Edward Loper <edloper@gradient.cis.upenn.edu> 
  6  # URL: <http://www.nltk.org/> 
  7  # For license information, see LICENSE.TXT 
  8   
  9  import os 
 10  import sys 
 11  import bisect 
 12  import re 
 13  import tempfile 
 14  try: import cPickle as pickle 
 15  except ImportError: import pickle 
 16  from itertools import islice 
 17   
 18  # Use the c version of ElementTree, which is faster, if possible: 
 19  try: from xml.etree import cElementTree as ElementTree 
 20  except ImportError: from nltk.etree import ElementTree 
 21   
 22  from nltk.tokenize import wordpunct_tokenize 
 23  from nltk.internals import slice_bounds 
 24  from nltk.data import PathPointer, FileSystemPathPointer, ZipFilePathPointer 
 25  from nltk.data import SeekableUnicodeStreamReader 
 26  from nltk.sourcedstring import SourcedStringStream 
 27  from nltk.util import AbstractLazySequence, LazySubsequence, LazyConcatenation 
######################################################################
#{ Corpus View
######################################################################

class StreamBackedCorpusView(AbstractLazySequence):
    """
    A 'view' of a corpus file, which acts like a sequence of tokens:
    it can be accessed by index, iterated over, etc.  However, the
    tokens are only constructed as-needed -- the entire corpus is
    never stored in memory at once.

    The constructor to C{StreamBackedCorpusView} takes two arguments:
    a corpus fileid (specified as a string or as a L{PathPointer});
    and a block reader.  A X{block reader} is a function that reads
    zero or more tokens from a stream, and returns them as a list.  A
    very simple example of a block reader is:

        >>> def simple_block_reader(stream):
        ...     return stream.readline().split()

    This simple block reader reads a single line at a time, and
    returns a single token (consisting of a string) for each
    whitespace-separated substring on the line.

    When deciding how to define the block reader for a given
    corpus, careful consideration should be given to the size of
    blocks handled by the block reader.  Smaller block sizes will
    increase the memory requirements of the corpus view's internal
    data structures (by 2 integers per block).  On the other hand,
    larger block sizes may decrease performance for random access to
    the corpus.  (But note that larger block sizes will I{not}
    decrease performance for iteration.)

    Internally, C{CorpusView} maintains a partial mapping from token
    index to file position, with one entry per block.  When a token
    with a given index M{i} is requested, the C{CorpusView} constructs
    it as follows:

      1. First, it searches the toknum/filepos mapping for the token
         index closest to (but less than or equal to) M{i}.

      2. Then, starting at the file position corresponding to that
         index, it reads one block at a time using the block reader
         until it reaches the requested token.

    The toknum/filepos mapping is created lazily: it is initially
    empty, but every time a new block is read, the block's
    initial token is added to the mapping.  (Thus, the toknum/filepos
    map has one entry per block.)

    In order to increase efficiency for random access patterns that
    have high degrees of locality, the corpus view may cache one or
    more blocks.

    @note: Each C{CorpusView} object internally maintains an open file
        object for its underlying corpus file.  This file should be
        automatically closed when the C{CorpusView} is garbage collected,
        but if you wish to close it manually, use the L{close()}
        method.  If you access a C{CorpusView}'s items after it has been
        closed, the file object will be automatically re-opened.

    @warning: If the contents of the file are modified during the
        lifetime of the C{CorpusView}, then the C{CorpusView}'s behavior
        is undefined.

    @warning: If a unicode encoding is specified when constructing a
        C{CorpusView}, then the block reader may only call
        C{stream.seek()} with offsets that have been returned by
        C{stream.tell()}; in particular, calling C{stream.seek()} with
        relative offsets, or with offsets based on string lengths, may
        lead to incorrect behavior.

    @ivar _block_reader: The function used to read
        a single block from the underlying file stream.
    @ivar _toknum: A list containing the token index of each block
        that has been processed.  In particular, C{_toknum[i]} is the
        token index of the first token in block C{i}.  Together
        with L{_filepos}, this forms a partial mapping between token
        indices and file positions.
    @ivar _filepos: A list containing the file position of each block
        that has been processed.  In particular, C{_filepos[i]} is the
        file position of the first character in block C{i}.  Together
        with L{_toknum}, this forms a partial mapping between token
        indices and file positions.
    @ivar _stream: The stream used to access the underlying corpus file.
    @ivar _len: The total number of tokens in the corpus, if known;
        or C{None}, if the number of tokens is not yet known.
    @ivar _eofpos: The character position of the last character in the
        file.  This is calculated when the corpus view is initialized,
        and is used to decide when the end of file has been reached.
    @ivar _cache: A cache of the most recently read block.  It
        is encoded as a tuple (start_toknum, end_toknum, tokens), where
        start_toknum is the token index of the first token in the block;
        end_toknum is the token index of the first token not in the
        block; and tokens is a list of the tokens in the block.
    """
    def __init__(self, fileid, block_reader=None, startpos=0,
                 encoding=None, source=None):
        """
        Create a new corpus view, based on the file C{fileid}, and
        read with C{block_reader}.  See the class documentation
        for more information.

        @param fileid: The path to the file that is read by this
            corpus view.  C{fileid} can either be a string or a
            L{PathPointer}.

        @param block_reader: A function that takes a stream and
            returns a list of tokens.  If given (and true), it
            replaces this instance's L{read_block()} method;
            otherwise L{read_block()} must be overridden by a
            subclass.

        @param startpos: The file position at which the view will
            start reading.  This can be used to skip over preface
            sections.

        @param encoding: The unicode encoding that should be used to
            read the file's contents.  If no encoding is specified,
            then the file's contents will be read as a non-unicode
            string (i.e., a C{str}).

        @param source: If specified, then use an L{SourcedStringStream}
            to annotate all strings read from the file with
            information about their start offset, end offset,
            and docid.  The value of ``source`` will be used as the docid.

        @raise ValueError: If the size of C{fileid} cannot be
            determined (e.g. the file does not exist).
        """
        if block_reader:
            self.read_block = block_reader
        # Initialize our toknum/filepos mapping.
        self._toknum = [0]
        self._filepos = [startpos]
        self._encoding = encoding
        self._source = source
        # We don't know our length (number of tokens) yet.
        self._len = None

        self._fileid = fileid
        self._stream = None

        self._current_toknum = None
        """This variable is set to the index of the next token that
           will be read, immediately before L{self.read_block()} is
           called.  This is provided for the benefit of the block
           reader, which under rare circumstances may need to know
           the current token number."""

        self._current_blocknum = None
        """This variable is set to the index of the next block that
           will be read, immediately before L{self.read_block()} is
           called.  This is provided for the benefit of the block
           reader, which under rare circumstances may need to know
           the current block number."""

        # Find the length of the file.
        try:
            if isinstance(self._fileid, PathPointer):
                self._eofpos = self._fileid.file_size()
            else:
                self._eofpos = os.stat(self._fileid).st_size
        except Exception as exc:
            raise ValueError('Unable to open or access %r -- %s' %
                             (fileid, exc))

        # Maintain a cache of the most recently read block, to
        # increase efficiency of random access.
        self._cache = (-1, -1, None)

    fileid = property(lambda self: self._fileid, doc="""
        The fileid of the file that is accessed by this view.

        @type: C{str} or L{PathPointer}""")

    def read_block(self, stream):
        """
        Read a block from the input stream.

        @return: a block of tokens from the input stream
        @rtype: list of any
        @param stream: an input stream
        @type stream: stream
        """
        raise NotImplementedError('Abstract Method')

    def _open(self):
        """
        Open the file stream associated with this corpus view.  This
        will be performed if any value is read from the view
        while its file stream is closed.
        """
        if isinstance(self._fileid, PathPointer):
            self._stream = self._fileid.open(self._encoding)
        elif self._encoding:
            self._stream = SeekableUnicodeStreamReader(
                open(self._fileid, 'rb'), self._encoding)
        else:
            self._stream = open(self._fileid, 'rb')
        if self._source is not None:
            self._stream = SourcedStringStream(self._stream, self._source)

    def close(self):
        """
        Close the file stream associated with this corpus view.  This
        can be useful if you are worried about running out of file
        handles (although the stream should automatically be closed
        upon garbage collection of the corpus view).  If the corpus
        view is accessed after it is closed, it will be automatically
        re-opened.
        """
        if self._stream is not None:
            self._stream.close()
        self._stream = None

    def __len__(self):
        """
        Return the number of tokens in the view, reading the rest of
        the file (from the last block seen so far) if the length is
        not known yet.
        """
        if self._len is None:
            # iterate_from() sets self._len when it reaches the end
            # of the file:
            for tok in self.iterate_from(self._toknum[-1]): pass
        return self._len

    def __getitem__(self, i):
        """
        Return the token at index C{i}, or (for a slice) the
        corresponding subsequence.  Slices that fall entirely inside
        the cached block are returned as a list; other slices are
        returned as a L{LazySubsequence}.

        @raise IndexError: If an integer index is out of range.
        """
        if isinstance(i, slice):
            start, stop = slice_bounds(self, i)
            # Check if it's in the cache.
            offset = self._cache[0]
            if offset <= start and stop <= self._cache[1]:
                return self._cache[2][start-offset:stop-offset]
            # Construct & return the result.
            return LazySubsequence(self, start, stop)
        else:
            # Handle negative indices
            if i < 0: i += len(self)
            if i < 0: raise IndexError('index out of range')
            # Check if it's in the cache.
            offset = self._cache[0]
            if offset <= i < self._cache[1]:
                return self._cache[2][i-offset]
            # Use iterate_from to extract it.  (The next() builtin
            # works on both Python 2.6+ and Python 3.)
            try:
                return next(self.iterate_from(i))
            except StopIteration:
                raise IndexError('index out of range')

    # If we wanted to be thread-safe, then this method would need to
    # do some locking.
    def iterate_from(self, start_tok):
        """
        Return a generator over the tokens of this view, starting at
        token index C{start_tok}.  As a side effect, the
        toknum/filepos mapping, the block cache and (once the end of
        the file is reached) C{self._len} are updated.
        """
        # Start by feeding from the cache, if possible.
        if self._cache[0] <= start_tok < self._cache[1]:
            for tok in self._cache[2][start_tok-self._cache[0]:]:
                yield tok
                start_tok += 1

        # Decide where in the file we should start.  If `start` is in
        # our mapping, then we can jump straight to the correct block;
        # otherwise, start at the last block we've processed.
        if start_tok < self._toknum[-1]:
            block_index = bisect.bisect_right(self._toknum, start_tok)-1
            toknum = self._toknum[block_index]
            filepos = self._filepos[block_index]
        else:
            block_index = len(self._toknum)-1
            toknum = self._toknum[-1]
            filepos = self._filepos[-1]

        # Open the stream, if it's not open already.
        if self._stream is None:
            self._open()

        # If there is nothing left to read and the length is still
        # unknown (an empty file, or a startpos at/after the end of
        # the file), the loop below never runs; record the length
        # here so that the final assertion -- and len() -- hold.
        if self._len is None and filepos >= self._eofpos:
            self._len = toknum

        # Each iteration through this loop, we read a single block
        # from the stream.
        while filepos < self._eofpos:
            # The view may have been close()d between yields; honour
            # the documented "automatically re-opened" contract.
            if self._stream is None:
                self._open()
            # Read the next block.
            self._stream.seek(filepos)
            self._current_toknum = toknum
            self._current_blocknum = block_index
            tokens = self.read_block(self._stream)
            assert isinstance(tokens, (tuple, list, AbstractLazySequence)), (
                'block reader %s() should return list or tuple.' %
                self.read_block.__name__)
            num_toks = len(tokens)
            new_filepos = self._stream.tell()
            assert new_filepos > filepos, (
                'block reader %s() should consume at least 1 byte (filepos=%d)' %
                (self.read_block.__name__, filepos))

            # Update our cache.
            self._cache = (toknum, toknum+num_toks, list(tokens))

            # Update our mapping.
            assert toknum <= self._toknum[-1]
            if num_toks > 0:
                block_index += 1
                if toknum == self._toknum[-1]:
                    assert new_filepos > self._filepos[-1] # monotonic!
                    self._filepos.append(new_filepos)
                    self._toknum.append(toknum+num_toks)
                else:
                    # Check for consistency:
                    assert new_filepos == self._filepos[block_index], (
                        'inconsistent block reader (num chars read)')
                    assert toknum+num_toks == self._toknum[block_index], (
                        'inconsistent block reader (num tokens returned)')

            # If we reached the end of the file, then update self._len
            if new_filepos == self._eofpos:
                self._len = toknum + num_toks
            # Generate the tokens in this block (but skip any tokens
            # before start_tok).  Note that between yields, our state
            # may be modified.
            for tok in tokens[max(0, start_tok-toknum):]:
                yield tok
            # If we're at the end of the file, then we're done.
            assert new_filepos <= self._eofpos
            if new_filepos == self._eofpos:
                break
            # Update our indices
            toknum += num_toks
            filepos = new_filepos

        # If we reach this point, then we should know our length.
        assert self._len is not None

    # Use concat for these, so we can use a ConcatenatedCorpusView
    # when possible.
    def __add__(self, other):
        return concat([self, other])
    def __radd__(self, other):
        return concat([other, self])
    def __mul__(self, count):
        return concat([self] * count)
    def __rmul__(self, count):
        return concat([self] * count)
class ConcatenatedCorpusView(AbstractLazySequence):
    """
    A 'view' of a corpus file that joins together one or more
    L{StreamBackedCorpusViews<StreamBackedCorpusView>}.  At most
    one file handle is left open at any time.
    """
    def __init__(self, corpus_views):
        """
        @param corpus_views: The corpus subviews to join, in order.
        """
        self._pieces = corpus_views
        """The corpus subviews making up this concatenation, in order."""

        self._offsets = [0]
        """Starting token index of each subview whose position is
           already known::
               offsets[i] = sum([len(p) for p in pieces[:i]])
           One extra trailing entry (the total length) is present once
           every subview has been measured."""

        self._open_piece = None
        """The subview accessed most recently (or C{None}).  It is
           closed before a different subview is accessed."""

    def __len__(self):
        # The total is only known once the offset table holds one
        # entry more than there are pieces; otherwise run through the
        # remainder of the corpus so that iterate_from() fills it in.
        if len(self._pieces) >= len(self._offsets):
            for _ in self.iterate_from(self._offsets[-1]):
                pass

        return self._offsets[-1]

    def close(self):
        """Close every subview of this concatenation."""
        for view in self._pieces:
            view.close()

    def iterate_from(self, start_tok):
        """
        Return a generator over the tokens of the concatenation,
        starting at token index C{start_tok}; the offset table is
        extended as subviews are exhausted.
        """
        index = bisect.bisect_right(self._offsets, start_tok) - 1

        while index < len(self._pieces):
            piece = self._pieces[index]
            base = self._offsets[index]

            # Keep at most one subview open: close the previous one
            # before switching to a different piece.
            previous = self._open_piece
            if previous is not piece:
                if previous is not None:
                    previous.close()
                self._open_piece = piece

            # Emit everything this piece has from the requested point.
            local_start = max(0, start_tok - base)
            for tok in piece.iterate_from(local_start):
                yield tok

            # The piece is exhausted, so its length is known: record
            # where the following piece starts, if not yet recorded.
            if len(self._offsets) == index + 1:
                self._offsets.append(self._offsets[-1] + len(piece))

            # On to the following piece.
            index += 1
410
def concat(docs):
    """
    Concatenate together the contents of multiple documents from a
    single corpus, using an appropriate concatenation function.  This
    utility function is used by corpus readers when the user requests
    more than one document at a time.

    @param docs: A non-empty list of documents (strings, corpus
        views, lazy sequences, lists, tuples or ElementTree elements).
    @return: C{docs[0]} itself if there is exactly one document;
        otherwise a single object joining all the documents.
    @raise ValueError: If C{docs} is empty, or if no concatenation
        method is known for the documents' types.
    """
    if len(docs) == 1:
        return docs[0]
    if len(docs) == 0:
        raise ValueError('concat() expects at least one object!')

    types = set([d.__class__ for d in docs])

    # If they're all strings, use string concatenation.  (join is
    # linear, unlike repeated '+'; the exact-type check above means
    # no str subclass with custom behaviour can reach this branch.)
    if types.issubset([str, unicode, basestring]):
        return ''.join(docs)

    # If they're all corpus views, then use ConcatenatedCorpusView.
    for typ in types:
        if not issubclass(typ, (StreamBackedCorpusView,
                                ConcatenatedCorpusView)):
            break
    else:
        return ConcatenatedCorpusView(docs)

    # If they're all lazy sequences, use a lazy concatenation
    for typ in types:
        if not issubclass(typ, AbstractLazySequence):
            break
    else:
        return LazyConcatenation(docs)

    # Otherwise, see what we can do:
    if len(types) == 1:
        typ = list(types)[0]

        # '+' is kept on purpose for lists and tuples: subclasses may
        # override (or forbid) addition, and that must be respected.
        if issubclass(typ, list):
            return reduce((lambda a,b:a+b), docs, [])

        if issubclass(typ, tuple):
            return reduce((lambda a,b:a+b), docs, ())

        # iselement() expects an element *instance*, not its class;
        # every doc has the same class here, so probe the first one.
        if ElementTree.iselement(docs[0]):
            xmltree = ElementTree.Element('documents')
            for doc in docs: xmltree.append(doc)
            return xmltree

    # No method found!
    raise ValueError("Don't know how to concatenate types: %r" % types)
461
######################################################################
#{ Corpus View for Pickled Sequences
######################################################################

class PickleCorpusView(StreamBackedCorpusView):
    """
    A stream backed corpus view for corpus files that consist of
    sequences of serialized Python objects (serialized using
    C{pickle.dump}).  One use case for this class is to store the
    result of running feature detection on a corpus to disk.  This can
    be useful when performing feature detection is expensive (so we
    don't want to repeat it); but the corpus is too large to store in
    memory.  The following example illustrates this technique:

        >>> feature_corpus = LazyMap(detect_features, corpus)
        >>> PickleCorpusView.write(feature_corpus, some_fileid)
        >>> pcv = PickleCorpusView(some_fileid)

    @warning: Unpickling can execute arbitrary code.  Only use this
        view on pickle files that come from a trusted source.
    """
    BLOCK_SIZE = 100
    PROTOCOL = -1

    def __init__(self, fileid, delete_on_gc=False):
        """
        Create a new corpus view that reads the pickle corpus
        C{fileid}.

        @param delete_on_gc: If true, then C{fileid} will be deleted
            whenever this object gets garbage-collected.
        """
        self._delete_on_gc = delete_on_gc
        StreamBackedCorpusView.__init__(self, fileid)

    def read_block(self, stream):
        """
        Read up to C{BLOCK_SIZE} pickled objects from C{stream}.

        @return: The list of objects read; shorter than C{BLOCK_SIZE}
            (possibly empty) if the end of the stream was reached.
        """
        result = []
        for i in range(self.BLOCK_SIZE):
            # NOTE: pickle.load() on untrusted data is unsafe.
            try: result.append(pickle.load(stream))
            except EOFError: break
        return result

    def __del__(self):
        """
        If C{delete_on_gc} was set to true when this
        C{PickleCorpusView} was created, then delete the corpus view's
        fileid.  (This method is called whenever a
        C{PickleCorpusView} is garbage-collected.)
        """
        # Use defaults: __del__ may run on a partially built object.
        if getattr(self, '_delete_on_gc', False):
            # Release our file handle first; some platforms refuse to
            # remove a file that is still open.  Best effort only.
            stream = getattr(self, '_stream', None)
            if stream is not None:
                try: stream.close()
                except (OSError, IOError): pass
            if os.path.exists(self._fileid):
                try: os.remove(self._fileid)
                except (OSError, IOError): pass
        self.__dict__.clear() # make the garbage collector's job easier

    @classmethod
    def write(cls, sequence, output_file):
        """
        Pickle every item of C{sequence} to C{output_file}, using
        protocol C{cls.PROTOCOL}.

        @param output_file: Either a file name, or a file object open
            for binary writing.  A file opened here from a file name
            is closed before returning; a file object passed in by
            the caller is left open.
        """
        opened_here = isinstance(output_file, basestring)
        if opened_here:
            output_file = open(output_file, 'wb')
        try:
            for item in sequence:
                pickle.dump(item, output_file, cls.PROTOCOL)
        finally:
            # Only close what we opened ourselves.
            if opened_here:
                output_file.close()

    @classmethod
    def cache_to_tempfile(cls, sequence, delete_on_gc=True):
        """
        Write the given sequence to a temporary file as a pickle
        corpus; and then return a C{PickleCorpusView} view for that
        temporary corpus file.

        @param delete_on_gc: If true, then the temporary file will be
            deleted whenever this object gets garbage-collected.
        @raise ValueError: If the temporary file cannot be created or
            written.
        """
        try:
            fd, output_file_name = tempfile.mkstemp('.pcv', 'nltk-')
            output_file = os.fdopen(fd, 'wb')
            try:
                cls.write(sequence, output_file)
            finally:
                # Close the handle even if pickling fails part-way.
                output_file.close()
            return PickleCorpusView(output_file_name, delete_on_gc)
        except (OSError, IOError) as e:
            raise ValueError('Error while creating temp file: %s' % e)
539
######################################################################
#{ Block Readers
######################################################################

def read_whitespace_block(stream):
    """
    Read up to 20 lines from C{stream} and return the list of their
    whitespace-separated tokens (an empty list at end of file).
    """
    # One readline() per outer iteration; blank or missing lines
    # simply contribute no tokens.
    return [token
            for _ in range(20)
            for token in stream.readline().split()]
551
def read_wordpunct_block(stream):
    """
    Read up to 20 lines from C{stream} and return the list of tokens
    produced by applying C{wordpunct_tokenize} to each line.
    """
    # One readline() per outer iteration (20 lines at a time).
    return [token
            for _ in range(20)
            for token in wordpunct_tokenize(stream.readline())]
557
def read_line_block(stream):
    """
    Read up to 20 lines from C{stream} and return them as a list of
    strings with trailing newline characters removed.  Reading stops
    early at end of file.
    """
    lines = []
    # Each pass appends exactly one line, so the list length doubles
    # as the line counter.
    while len(lines) < 20:
        raw = stream.readline()
        if not raw:
            break
        lines.append(raw.rstrip('\n'))
    return lines
565
def read_blankline_block(stream):
    """
    Read one blank-line-delimited paragraph from C{stream}.

    @return: A one-element list holding the paragraph text (leading
        blank lines are skipped, the terminating blank line is not
        included), or an empty list if the end of the file is reached
        before any non-blank line.
    """
    collected = []
    while True:
        line = stream.readline()
        # Content line: keep accumulating.
        if line and line.strip():
            collected.append(line)
            continue
        # Blank line or end of file: either one finishes a paragraph
        # that has already been started.
        if collected:
            return [''.join(collected)]
        # Nothing collected: only end of file stops the scan.
        if not line:
            return []
580
def read_alignedsent_block(stream):
    """
    Read one aligned-sentence record from C{stream}.  Lines starting
    with C{'='} and empty lines are skipped; the record ends with the
    first line that starts with an alignment pair (digit, hyphen,
    digit).

    @return: A one-element list holding the text of the record, or,
        at end of file, the partial record read so far (an empty
        list if there is none).
    """
    collected = []
    while True:
        line = stream.readline()
        # End of file.  This must be tested before looking at the
        # line's first character: indexing '' raises IndexError.
        if not line:
            if collected:
                return [''.join(collected)]
            return []
        # Separator or empty line: ignore it.
        if line.startswith(('=', '\n', '\r\n')):
            continue
        # Other line:
        collected.append(line)
        if re.match('^[0-9]-[0-9]', line) is not None:
            return [''.join(collected)]
596
def read_regexp_block(stream, start_re, end_re=None):
    """
    Read a single token from C{stream}, where a token begins with a
    line matching C{start_re}.  When C{end_re} is given, the token
    stops just before the first line matching C{end_re} (that line is
    consumed but not returned); otherwise it stops just before the
    next line matching C{start_re} (which is left unread), or at EOF.

    @return: A one-element list with the token's text, or an empty
        list if no line matching C{start_re} is found before EOF.
    """
    # Skip forward to the first line that opens a token.
    line = stream.readline()
    while line and not re.match(start_re, line):
        line = stream.readline()
    if not line:
        return [] # end of file.

    # Gather lines until the token is terminated.
    collected = [line]
    while True:
        mark = stream.tell()
        line = stream.readline()
        if not line:
            # End of file.
            break
        if end_re is not None:
            if re.match(end_re, line):
                # End of token.
                break
        elif re.match(start_re, line):
            # The next token starts here: rewind so that the next
            # call sees this line again.
            stream.seek(mark)
            break
        # Anything else belongs to the current token.
        collected.append(line)
    return [''.join(collected)]
628
def read_sexpr_block(stream, block_size=16384, comment_char=None):
    """
    Read a sequence of s-expressions from the stream, and leave the
    stream's file position at the end of the last complete
    s-expression read.  At least one s-expression is always returned,
    unless there are no more s-expressions in the file.

    If the file ends in the middle of an s-expression, then that
    incomplete s-expression is returned when the end of the file is
    reached.

    @param block_size: The default block size for reading.  If an
        s-expression is longer than one block, then more than one
        block will be read.
    @param comment_char: A character that marks comments.  Any lines
        that begin with this character will be stripped out.
        (If spaces or tabs precede the comment character, then the
        line will not be stripped.)
    """
    origin = stream.tell()
    text = stream.read(block_size)
    encoding = getattr(stream, 'encoding', None)
    assert encoding is not None or isinstance(text, str)
    if encoding not in (None, 'utf-8'):
        import warnings
        warnings.warn('Parsing may fail, depending on the properties '
                      'of the %s encoding!' % encoding)
        # (e.g., the utf-16 encoding does not work because it insists
        # on adding BOMs to the beginning of encoded strings.)

    if comment_char:
        comment_re = re.compile('(?m)^%s.*$' % re.escape(comment_char))
    whitespace_re = re.compile(r'\s*')

    while True:
        try:
            # When stripping comments, extend the text to a line
            # boundary and blank the comments out with spaces.
            # (Removing them outright would shift our offsets.)
            if comment_char:
                text += stream.readline()
                text = re.sub(comment_re, _sub_space, text)
            # Parse as many complete s-expressions as we can.
            tokens, consumed = _parse_sexpr_block(text)
            # Also swallow the whitespace that follows them.
            consumed = whitespace_re.search(text, consumed).end()

            # Reposition the stream just past what was consumed.
            # With an encoding, the offset is measured on the
            # re-encoded prefix rather than on characters.
            if encoding is None:
                target = origin + consumed
            else:
                target = origin + len(text[:consumed].encode(encoding))
            stream.seek(target)

            # Hand back the tokens we processed.
            return tokens
        except ValueError as e:
            if e.args[0] != 'Block too small':
                raise
            more = stream.read(block_size)
            if not more:
                # The file ended mid-sexpr -- return what we got.
                return [text.strip()]
            text += more
693
694 -def _sub_space(m):
695 """Helper function: given a regexp match, return a string of 696 spaces that's the same length as the matched string.""" 697 return ' '*(m.end()-m.start())
698
699 -def _parse_sexpr_block(block):
700 tokens = [] 701 start = end = 0 702 703 while end < len(block): 704 m = re.compile(r'\S').search(block, end) 705 if not m: 706 return tokens, end 707 708 start = m.start() 709 710 # Case 1: sexpr is not parenthesized. 711 if m.group() != '(': 712 m2 = re.compile(r'[\s(]').search(block, start) 713 if m2: 714 end = m2.start() 715 else: 716 if tokens: return tokens, end 717 raise ValueError('Block too small') 718 719 # Case 2: parenthesized sexpr. 720 else: 721 nesting = 0 722 for m in re.compile(r'[()]').finditer(block, start): 723 if m.group()=='(': nesting += 1 724 else: nesting -= 1 725 if nesting == 0: 726 end = m.end() 727 break 728 else: 729 if tokens: return tokens, end 730 raise ValueError('Block too small') 731 732 tokens.append(block[start:end]) 733 734 return tokens, end
735
######################################################################
#{ Finding Corpus Items
######################################################################

def find_corpus_fileids(root, regexp):
    """
    Return the sorted list of fileids under C{root} that match
    C{regexp}.  The regexp is matched against the whole fileid, i.e.
    against its '/'-separated path relative to C{root}.

    @param root: A L{PathPointer} (zipfile or filesystem based).
    @param regexp: The regular expression source fileids must match.
    @raise TypeError: If C{root} is not a L{PathPointer}.
    @raise AssertionError: If C{root} is a kind of L{PathPointer}
        that this function does not know how to list.
    """
    if not isinstance(root, PathPointer):
        raise TypeError('find_corpus_fileids: expected a PathPointer')
    pattern = regexp + '$'

    # Zipfile: scan the archive's name list, dropping entries that
    # end in '/' (they're directories).
    if isinstance(root, ZipFilePathPointer):
        matches = []
        for name in root.zipfile.namelist():
            if name.endswith('/'):
                continue
            fileid = name[len(root.entry):]
            if re.match(pattern, fileid):
                matches.append(fileid)
        return sorted(matches)

    # Directory: walk every subdirectory and match relative paths.
    if isinstance(root, FileSystemPathPointer):
        matches = []
        for dirname, subdirs, filenames in os.walk(root.path,
                                                   followlinks=True):
            prefix = ''.join('%s/' % part
                             for part in _path_from(root.path, dirname))
            for filename in filenames:
                fileid = prefix + filename
                if re.match(pattern, fileid):
                    matches.append(fileid)
            # Don't visit svn directories:
            if '.svn' in subdirs:
                subdirs.remove('.svn')
        return sorted(matches)

    raise AssertionError("Don't know how to handle %r" % root)
768
769 -def _path_from(parent, child):
770 if os.path.split(parent)[1] == '': 771 parent = os.path.split(parent)[0] 772 path = [] 773 while parent != child: 774 child, dirname = os.path.split(child) 775 path.insert(0, dirname) 776 assert os.path.split(child)[0] != child 777 return path
778
######################################################################
#{ Paragraph structure in Treebank files
######################################################################

def tagged_treebank_para_block_reader(stream):
    """
    Read the next paragraph from C{stream}, where paragraphs are
    separated by lines made of six or more C{'='} characters.

    @return: A one-element list holding the paragraph text, or an
        empty list if the end of the file is reached before any
        non-whitespace content.
    """
    pieces = []
    while True:
        line = stream.readline()
        at_eof = (line == '')
        if at_eof or re.match(r'======+\s*$', line):
            # A separator or the end of the file closes the paragraph,
            # provided it has some non-whitespace content.
            para = ''.join(pieces)
            if para.strip():
                return [para]
            if at_eof:
                return []
        else:
            # Content line:
            pieces.append(line)
798