Package nltk :: Module data
[hide private]
[frames] | [no frames]

Source Code for Module nltk.data

   1  # Natural Language Toolkit: Utility functions 
   2  # 
   3  # Copyright (C) 2001-2011 NLTK Project 
   4  # Author: Edward Loper <edloper@gradient.cis.upenn.edu> 
   5  # URL: <http://www.nltk.org/> 
   6  # For license information, see LICENSE.TXT 
   7   
   8  """ 
   9  Functions to find and load NLTK X{resource files}, such as corpora, 
  10  grammars, and saved processing objects.  Resource files are identified 
   11  using URLs, such as "C{nltk:corpora/abc/rural.txt}" or 
  12  "C{http://nltk.org/sample/toy.cfg}".  The following URL protocols are 
  13  supported: 
  14   
  15    - "C{file:I{path}}": Specifies the file whose path is C{I{path}}. 
  16      Both relative and absolute paths may be used. 
  17       
  18    - "C{http://I{host}/{path}}": Specifies the file stored on the web 
  19      server C{I{host}} at path C{I{path}}. 
  20       
  21    - "C{nltk:I{path}}": Specifies the file stored in the NLTK data 
  22      package at C{I{path}}.  NLTK will search for these files in the 
  23      directories specified by L{nltk.data.path}. 
  24   
  25  If no protocol is specified, then the default protocol "C{nltk:}" will 
  26  be used. 
  27    
   28  This module provides two functions that can be used to access a 
  29  resource file, given its URL: L{load()} loads a given resource, and 
  30  adds it to a resource cache; and L{retrieve()} copies a given resource 
  31  to a local file. 
  32  """ 
  33   
  34  import sys 
  35  import os, os.path 
  36  import textwrap 
  37  import weakref 
  38  import yaml 
  39  import re 
  40  import urllib2 
  41  import zipfile 
  42  import codecs 
  43   
  44  from gzip import GzipFile, READ as GZ_READ, WRITE as GZ_WRITE 
  45   
  46  try: 
  47      from zlib import Z_SYNC_FLUSH as FLUSH 
  48  except: 
  49      from zlib import Z_FINISH as FLUSH 
  50   
  51  try: 
  52      import cPickle as pickle 
  53  except: 
  54      import pickle 
  55       
  56  try: 
  57      from cStringIO import StringIO 
  58  except: 
  59      from StringIO import StringIO 
  60   
  61  import nltk 
  62   
  63  ###################################################################### 
  64  # Search Path 
  65  ###################################################################### 
  66   
# The NLTK data search path.  Resource lookups (see find()) try each
# directory in order, so entries added earlier take precedence.
path = []
"""A list of directories where the NLTK data package might reside.
   These directories will be checked in order when looking for a
   resource in the data package.  Note that this allows users to
   substitute in their own versions of resources, if they have them
   (e.g., in their home directory under ~/nltk_data)."""

# User-specified locations:
path += [d for d in os.environ.get('NLTK_DATA', '').split(os.pathsep) if d]
# expanduser() returns its argument unchanged when no home directory
# can be determined; only add ~/nltk_data when expansion succeeded.
if os.path.expanduser('~/') != '~/': path += [
    os.path.expanduser('~/nltk_data')]

# Common locations on Windows:
if sys.platform.startswith('win'): path += [
    r'C:\nltk_data', r'D:\nltk_data', r'E:\nltk_data',
    os.path.join(sys.prefix, 'nltk_data'),
    os.path.join(sys.prefix, 'lib', 'nltk_data'),
    os.path.join(os.environ.get('APPDATA', 'C:\\'), 'nltk_data')]

# Common locations on UNIX & OS X:
else: path += [
    '/usr/share/nltk_data',
    '/usr/local/share/nltk_data',
    '/usr/lib/nltk_data',
    '/usr/local/lib/nltk_data']
  93  ###################################################################### 
  94  # Path Pointers 
  95  ###################################################################### 
  96   
class PathPointer(object):
    """
    Abstract base class for ``path pointers'', which NLTK's data
    package uses to identify specific paths.  Two subclasses exist:
    L{FileSystemPathPointer} identifies a file that can be accessed
    directly via a given absolute path.  L{ZipFilePathPointer}
    identifies a file contained within a zipfile, that can be accessed
    by reading that zipfile.
    """
    def open(self, encoding=None):
        """
        Return a seekable read-only stream over the contents of the
        file that this path pointer identifies.

        @raise IOError: If the path specified by this pointer does
            not contain a readable file.
        """
        raise NotImplementedError('abstract base class')

    def file_size(self):
        """
        Return the size (in bytes) of the file that this path pointer
        identifies.

        @raise IOError: If the path specified by this pointer does
            not contain a readable file.
        """
        raise NotImplementedError('abstract base class')

    def join(self, fileid):
        """
        Return a new path pointer reached by starting at this pointer's
        path and following the relative path C{fileid}.  Components of
        C{fileid} are separated by forward slashes (C{/}), regardless
        of the underlying file system's path separator character.
        """
        raise NotImplementedError('abstract base class')
class FileSystemPathPointer(PathPointer, str):
    """
    A path pointer that identifies a file which can be accessed
    directly via a given absolute path.  C{FileSystemPathPointer} is a
    subclass of C{str} for backwards compatibility purposes --
    this allows old code that expected C{nltk.data.find()} to expect a
    string to usually work (assuming the resource is not found in a
    zipfile).  It also permits open() to work on a FileSystemPathPointer.
    """
    def __init__(self, path):
        """
        Create a new path pointer for the given absolute path.

        @raise IOError: If the given path does not exist.
        """
        abspath = os.path.abspath(path)
        if not os.path.exists(abspath):
            raise IOError('No such file or directory: %r' % abspath)
        self._path = abspath
        # No str.__init__() call is needed: str does all of its setup
        # work in __new__, so __init__ is a no-op for it.

    path = property(lambda self: self._path, doc="""
        The absolute path identified by this path pointer.""")

    def open(self, encoding=None):
        # A raw binary stream by default; wrap it in a decoding reader
        # only when an encoding was requested.
        byte_stream = open(self._path, 'rb')
        if encoding is None:
            return byte_stream
        return SeekableUnicodeStreamReader(byte_stream, encoding)

    def file_size(self):
        return os.stat(self._path).st_size

    def join(self, fileid):
        subpath = os.path.join(self._path, *fileid.split('/'))
        return FileSystemPathPointer(subpath)

    def __repr__(self):
        return 'FileSystemPathPointer(%r)' % self._path

    def __str__(self):
        return self._path
class BufferedGzipFile(GzipFile):
    """
    A C{GzipFile} subclass that buffers calls to L{read()} and L{write()}.
    This allows faster reads and writes of data to and from gzip-compressed
    files at the cost of using more memory.

    The default buffer size is 2mb.

    C{BufferedGzipFile} is useful for loading large gzipped pickle objects
    as well as writing large encoded feature files for classifier training.
    """
    SIZE = 2 * 2**20  # default buffer size: 2MB

    def __init__(self, filename=None, mode=None, compresslevel=9,
                 fileobj=None, **kwargs):
        """
        @return: a buffered gzip file object
        @rtype: C{BufferedGzipFile}
        @param filename: a filesystem path
        @type filename: C{str}
        @param mode: a file mode which can be any of 'r', 'rb', 'a', 'ab',
            'w', or 'wb'
        @type mode: C{str}
        @param compresslevel: The compresslevel argument is an integer from 1
            to 9 controlling the level of compression; 1 is fastest and
            produces the least compression, and 9 is slowest and produces the
            most compression.  The default is 9.
        @type compresslevel: C{int}
        @param fileobj: a StringIO stream to read from instead of a file.
        @type fileobj: C{StringIO}
        @kwparam size: number of bytes to buffer during calls to
            L{read()} and L{write()}
        @type size: C{int}
        """
        GzipFile.__init__(self, filename, mode, compresslevel, fileobj)
        self._size = kwargs.get('size', self.SIZE)
        self._buffer = StringIO()
        # cStringIO objects do not support len(), so track the buffered
        # byte count ourselves.
        self._len = 0

    def _reset_buffer(self):
        # For some reason calling StringIO.truncate() here will lead to
        # inconsistent writes so just set _buffer to a new StringIO object.
        self._buffer = StringIO()
        self._len = 0

    def _write_buffer(self, data):
        # Simply write to the buffer and increment the buffer size.
        if data is not None:
            self._buffer.write(data)
            self._len += len(data)

    def _write_gzip(self, data):
        # Write the current buffer to the GzipFile.
        GzipFile.write(self, self._buffer.getvalue())
        # Then reset the buffer and write the new data to the buffer.
        self._reset_buffer()
        self._write_buffer(data)

    def close(self):
        # GzipFile.close() doesn't actually close anything; flush any
        # pending buffered data first when writing.
        if self.mode == GZ_WRITE:
            self._write_gzip(None)
            self._reset_buffer()
        return GzipFile.close(self)

    def flush(self, lib_mode=FLUSH):
        self._buffer.flush()
        GzipFile.flush(self, lib_mode)

    def read(self, size=None):
        """
        Read C{size} bytes, or (if C{size} is not given) read the whole
        file in C{self._size}-byte chunks and return its contents.
        """
        if not size:
            size = self._size
            contents = StringIO()
            while True:
                blocks = GzipFile.read(self, size)
                if not blocks:
                    contents.flush()
                    break
                contents.write(blocks)
            return contents.getvalue()
        else:
            return GzipFile.read(self, size)

    def write(self, data, size=-1):
        """
        @param data: C{str} to write to file or buffer
        @type data: C{str}
        @param size: buffer at least size bytes before writing to file;
            if zero or negative, the default buffer size is used
        @type size: C{int}
        """
        # BUGFIX: the old test was ``if not size:``, but the default
        # size of -1 is truthy, so the default buffer size was never
        # substituted and ``self._len + len(data) <= -1`` was never
        # true -- every default write() bypassed the buffer entirely.
        if size < 1:
            size = self._size
        if self._len + len(data) <= size:
            self._write_buffer(data)
        else:
            self._write_gzip(data)
class GzipFileSystemPathPointer(FileSystemPathPointer):
    """
    A subclass of C{FileSystemPathPointer} that identifies a gzip-compressed
    file located at a given absolute path.  C{GzipFileSystemPathPointer} is
    appropriate for loading large gzip-compressed pickle objects efficiently.
    """
    def open(self, encoding=None):
        # Decompress through a buffered gzip stream; wrap it in a
        # decoding reader only when a (non-empty) encoding is given.
        gzip_stream = BufferedGzipFile(self._path, 'rb')
        if not encoding:
            return gzip_stream
        return SeekableUnicodeStreamReader(gzip_stream, encoding)
class ZipFilePathPointer(PathPointer):
    """
    A path pointer that identifies a file contained within a zipfile,
    which can be accessed by reading that zipfile.
    """
    def __init__(self, zipfile, entry=''):
        """
        Create a new path pointer pointing at the specified entry
        in the given zipfile.

        @raise IOError: If the given zipfile does not exist, or if it
            does not contain the specified entry.
        """
        if isinstance(zipfile, basestring):
            zipfile = OpenOnDemandZipFile(os.path.abspath(zipfile))

        # Normalize the entry string (collapse repeated slashes):
        entry = re.sub('(^|/)/+', r'\1', entry)

        # Check that the entry exists:
        if entry:
            try:
                zipfile.getinfo(entry)
            # BUGFIX: this was a bare ``except:``, which also swallowed
            # KeyboardInterrupt/SystemExit; ZipFile.getinfo() signals a
            # missing member by raising KeyError.
            except KeyError:
                # Sometimes directories aren't explicitly listed in
                # the zip file.  So if `entry` is a directory name,
                # then check if the zipfile contains any files that
                # are under the given directory.
                if (entry.endswith('/') and
                    [n for n in zipfile.namelist() if n.startswith(entry)]):
                    pass # zipfile contains a file in that directory.
                else:
                    # Otherwise, complain.
                    raise IOError('Zipfile %r does not contain %r' %
                                  (zipfile.filename, entry))
        self._zipfile = zipfile
        self._entry = entry

    zipfile = property(lambda self: self._zipfile, doc="""
        The C{zipfile.ZipFile} object used to access the zip file
        containing the entry identified by this path pointer.""")
    entry = property(lambda self: self._entry, doc="""
        The name of the file within C{zipfile} that this path
        pointer points to.""")

    def open(self, encoding=None):
        # Read the whole entry into memory, since zipfile members are
        # not seekable streams.
        data = self._zipfile.read(self._entry)
        stream = StringIO(data)
        if self._entry.endswith('.gz'):
            stream = BufferedGzipFile(self._entry, fileobj=stream)
        elif encoding is not None:
            stream = SeekableUnicodeStreamReader(stream, encoding)
        return stream

    def file_size(self):
        return self._zipfile.getinfo(self._entry).file_size

    def join(self, fileid):
        entry = '%s/%s' % (self._entry, fileid)
        return ZipFilePathPointer(self._zipfile, entry)

    def __repr__(self):
        return 'ZipFilePathPointer(%r, %r)' % (
            self._zipfile.filename, self._entry)
######################################################################
# Access Functions
######################################################################

# Don't use a weak dictionary, because in the common case this
# causes a lot more reloading than necessary.
_resource_cache = {}
"""A dictionary used to cache resources so that they won't
   need to be loaded more than once."""
def find(resource_name):
    """
    Find the given resource by searching through the directories and
    zip files in L{nltk.data.path}, and return a corresponding path
    name.  If the given resource is not found, raise a C{LookupError},
    whose message gives a pointer to the installation instructions for
    the NLTK downloader.

    Zip File Handling:

      - If C{resource_name} contains a component with a C{.zip}
        extension, then it is assumed to be a zipfile; and the
        remaining path components are used to look inside the zipfile.

      - If any element of C{nltk.data.path} has a C{.zip} extension,
        then it is assumed to be a zipfile.

      - If a given resource name that does not contain any zipfile
        component is not found initially, then C{find()} will make a
        second attempt to find that resource, by replacing each
        component I{p} in the path with I{p.zip/p}.  For example, this
        allows C{find()} to map the resource name
        C{corpora/chat80/cities.pl} to a zip file path pointer to
        C{corpora/chat80.zip/chat80/cities.pl}.

      - When using C{find()} to locate a directory contained in a
        zipfile, the resource name I{must} end with the C{'/'}
        character.  Otherwise, C{find()} will not locate the
        directory.

    @type resource_name: C{str}
    @param resource_name: The name of the resource to search for.
        Resource names are posix-style relative path names, such as
        C{'corpora/brown'}.  In particular, directory names should
        always be separated by the C{'/'} character, which will be
        automatically converted to a platform-appropriate path
        separator.
    @rtype: C{str}
    """
    # Check if the resource name includes a zipfile name.  The empty
    # alternative at the end of the pattern ('|') makes the match
    # always succeed, so m is never None; when there is no '.zip'
    # component, both groups are None.
    m = re.match('(.*\.zip)/?(.*)$|', resource_name)
    zipfile, zipentry = m.groups()

    # Check each item in our path
    for path_item in path:

        # Is the path item a zipfile?
        if os.path.isfile(path_item) and path_item.endswith('.zip'):
            try: return ZipFilePathPointer(path_item, resource_name)
            except IOError: continue # resource not in zipfile

        # Is the path item a directory?
        elif os.path.isdir(path_item):
            if zipfile is None:
                p = os.path.join(path_item, *resource_name.split('/'))
                if os.path.exists(p):
                    if p.endswith('.gz'):
                        return GzipFileSystemPathPointer(p)
                    else:
                        return FileSystemPathPointer(p)
            else:
                p = os.path.join(path_item, *zipfile.split('/'))
                if os.path.exists(p):
                    try: return ZipFilePathPointer(p, zipentry)
                    except IOError: continue # resource not in zipfile

    # Fallback: if the path doesn't include a zip file, then try
    # again, assuming that one of the path components is inside a
    # zipfile of the same name.  (Recurses via find() with the
    # modified name; LookupErrors from unsuccessful guesses are
    # suppressed so that every position is tried.)
    if zipfile is None:
        pieces = resource_name.split('/')
        for i in range(len(pieces)):
            modified_name = '/'.join(pieces[:i]+[pieces[i]+'.zip']+pieces[i:])
            try: return find(modified_name)
            except LookupError: pass

    # Display a friendly error message if the resource wasn't found:
    msg = textwrap.fill(
        'Resource %r not found.  Please use the NLTK Downloader to '
        'obtain the resource: >>> nltk.download().' %
        (resource_name,), initial_indent='  ', subsequent_indent='  ',
        width=66)
    msg += '\n  Searched in:' + ''.join('\n    - %r' % d for d in path)
    sep = '*'*70
    resource_not_found = '\n%s\n%s\n%s' % (sep, msg, sep)
    raise LookupError(resource_not_found)
456
def retrieve(resource_url, filename=None, verbose=True):
    """
    Copy the given resource to a local file.  If no filename is
    specified, then use the URL's filename.  If there is already a
    file with the chosen name, then raise a C{ValueError}.

    @type resource_url: C{str}
    @param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is C{"nltk:"}, which searches
        for the file in the NLTK data package.
    """
    if filename is None:
        if resource_url.startswith('file:'):
            # BUGFIX: this used to be os.path.split(filename)[-1], but
            # filename is always None on this branch (we're inside
            # ``if filename is None``); split the URL instead.
            filename = os.path.split(resource_url)[-1]
        else:
            # Strip the protocol prefix and any directory components.
            filename = re.sub(r'(^\w+:)?.*/', '', resource_url)
    if os.path.exists(filename):
        filename = os.path.abspath(filename)
        raise ValueError("File %r already exists!" % filename)

    if verbose:
        print('Retrieving %r, saving to %r' % (resource_url, filename))

    # Open the input & output streams.
    infile = _open(resource_url)
    outfile = open(filename, 'wb')
    try:
        # Copy infile -> outfile, using 64k blocks.
        while True:
            s = infile.read(1024*64) # 64k blocks.
            outfile.write(s)
            if not s: break
    finally:
        # Close both files, even if the copy fails partway through.
        infile.close()
        outfile.close()
#: A dictionary describing the formats that are supported by NLTK's
#: L{load()} method.  Keys are format names, and values are format
#: descriptions.
FORMATS = {
    'pickle': "A serialized python object, stored using the pickle module.",
    'yaml': "A serialized python object, stored using the yaml module.",
    'cfg': "A context free grammar, parsed by nltk.parse_cfg().",
    'pcfg': "A probabilistic CFG, parsed by nltk.parse_pcfg().",
    'fcfg': "A feature CFG, parsed by nltk.parse_fcfg().",
    'fol': "A list of first order logic expressions, parsed by "
           "nltk.sem.parse_fol() using nltk.sem.logic.LogicParser.",
    'logic': "A list of first order logic expressions, parsed by "
             "nltk.sem.parse_logic().  Requires an additional logic_parser "
             "parameter",
    'val': "A semantic valuation, parsed by nltk.sem.parse_valuation().",
    'raw': "The raw (byte string) contents of a file.",
    }

#: A dictionary mapping from file extensions to format names, used
#: by L{load()} when C{format="auto"} to decide the format for a
#: given resource url.  Note that 'raw' deliberately has no entry
#: here: it must always be requested explicitly.
AUTO_FORMATS = {
    'pickle': 'pickle',
    'yaml': 'yaml',
    'cfg': 'cfg',
    'pcfg': 'pcfg',
    'fcfg': 'fcfg',
    'fol': 'fol',
    'logic': 'logic',
    'val': 'val'}
def load(resource_url, format='auto', cache=True, verbose=False,
         logic_parser=None, fstruct_parser=None):
    """
    Load a given resource from the NLTK data package.  The following
    resource formats are currently supported:
      - C{'pickle'}
      - C{'yaml'}
      - C{'cfg'} (context free grammars)
      - C{'pcfg'} (probabilistic CFGs)
      - C{'fcfg'} (feature-based CFGs)
      - C{'fol'} (formulas of First Order Logic)
      - C{'logic'} (Logical formulas to be parsed by the given logic_parser)
      - C{'val'} (valuation of First Order Logic model)
      - C{'raw'}

    If no format is specified, C{load()} will attempt to determine a
    format based on the resource name's file extension.  If that
    fails, C{load()} will raise a C{ValueError} exception.

    @type resource_url: C{str}
    @param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is C{"nltk:"}, which searches
        for the file in the NLTK data package.
    @type cache: C{bool}
    @param cache: If true, add this resource to a cache.  If C{load}
        finds a resource in its cache, then it will return it from the
        cache rather than loading it.  Note that the cache is a plain
        module-level dictionary (L{_resource_cache}), so cached
        resources remain alive until L{clear_cache()} is called.

    @type verbose: C{bool}
    @param verbose: If true, print a message when loading a resource.
        Messages are not displayed when a resource is retrieved from
        the cache.

    @type logic_parser: C{LogicParser}
    @param logic_parser: The parser that will be used to parse logical
        expressions.
    @type fstruct_parser: C{FeatStructParser}
    @param fstruct_parser: The parser that will be used to parse the
        feature structure of an fcfg.
    """
    # If we've cached the resource, then just return it.
    if cache:
        resource_val = _resource_cache.get(resource_url)
        if resource_val is not None:
            if verbose:
                print '<<Using cached copy of %s>>' % (resource_url,)
            return resource_val

    # Let the user know what's going on.
    if verbose:
        print '<<Loading %s>>' % (resource_url,)

    # Determine the format of the resource.
    if format == 'auto':
        resource_url_parts = resource_url.split('.')
        ext = resource_url_parts[-1]
        # For 'foo.pickle.gz' etc., the format is given by the
        # extension *before* the '.gz'.
        if ext == 'gz':
            ext = resource_url_parts[-2]
        format = AUTO_FORMATS.get(ext)
        if format is None:
            raise ValueError('Could not determine format for %s based '
                             'on its file\nextension; use the "format" '
                             'argument to specify the format explicitly.'
                             % resource_url)

    # Load the resource.
    # NOTE(review): pickle.load and yaml.load can execute arbitrary
    # code / construct arbitrary objects -- only load trusted
    # resource files through these formats.
    if format == 'pickle':
        resource_val = pickle.load(_open(resource_url))
    elif format == 'yaml':
        resource_val = yaml.load(_open(resource_url))
    elif format == 'cfg':
        resource_val = nltk.grammar.parse_cfg(_open(resource_url).read())
    elif format == 'pcfg':
        resource_val = nltk.grammar.parse_pcfg(_open(resource_url).read())
    elif format == 'fcfg':
        resource_val = nltk.grammar.parse_fcfg(_open(resource_url).read(),
                                          logic_parser=logic_parser,
                                          fstruct_parser=fstruct_parser)
    elif format == 'fol':
        resource_val = nltk.sem.parse_logic(_open(resource_url).read(),
                             logic_parser=nltk.sem.logic.LogicParser())
    elif format == 'logic':
        resource_val = nltk.sem.parse_logic(_open(resource_url).read(),
                                            logic_parser=logic_parser)
    elif format == 'val':
        resource_val = nltk.sem.parse_valuation(_open(resource_url).read())
    elif format == 'raw':
        resource_val = _open(resource_url).read()
    else:
        assert format not in FORMATS
        raise ValueError('Unknown format type!')

    # If requested, add it to the cache.
    if cache:
        try:
            _resource_cache[resource_url] = resource_val
        except TypeError:
            # (Vestigial: this guard dates from when the cache was a
            # weak-value dictionary, which cannot hold strings and
            # tuples; a plain dict never raises TypeError here.)
            pass

    return resource_val
629
def show_cfg(resource_url, escape='##'):
    """
    Write out a grammar file, ignoring escaped and empty lines.

    @type resource_url: C{str}
    @param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is C{"nltk:"}, which searches
        for the file in the NLTK data package.
    @type escape: C{str}
    @param escape: Prepended string that signals lines to be ignored
    """
    resource_val = load(resource_url, format='raw', cache=False)
    for line in resource_val.splitlines():
        if line.startswith(escape):
            continue
        # Skip empty lines.  (This replaces the original
        # re.match('^$', l) test, which matched exactly the same
        # lines but paid a regex call per line.)
        if not line:
            continue
        print(line)
646 647
def clear_cache():
    """
    Remove all objects from the resource cache (so subsequent
    L{load()} calls will re-read their resources from disk).
    @see: L{load()}
    """
    _resource_cache.clear()
654
def _open(resource_url):
    """
    Helper function that returns an open file object for a resource,
    given its resource URL.  If the given resource URL uses the 'nltk'
    protocol, or uses no protocol, then use L{nltk.data.find} to find
    its path, and open it for reading; if the resource URL uses the
    'file' protocol, then open the file directly in binary mode;
    otherwise, delegate to C{urllib2.urlopen}.

    @type resource_url: C{str}
    @param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is C{"nltk:"}, which searches
        for the file in the NLTK data package.
    """
    # Divide the resource name into "<protocol>:<path>".  (The local
    # name `path` deliberately shadows the module-level search-path
    # list here; find() below still sees the module-level one.)
    protocol, path = re.match('(?:(\w+):)?(.*)', resource_url).groups()

    if protocol is None or protocol.lower() == 'nltk':
        return find(path).open()
    elif protocol.lower() == 'file':
        # urllib might not use mode='rb', so handle this one ourselves:
        return open(path, 'rb')
    else:
        return urllib2.urlopen(resource_url)
679 680 ###################################################################### 681 # Lazy Resource Loader 682 ###################################################################### 683
class LazyLoader(object):
    """
    A proxy object that stands in for an NLTK resource, and loads the
    real resource (via L{load()}) the first time it is accessed; upon
    loading, it transforms *itself* into the loaded object by
    replacing its own C{__dict__} and C{__class__}.
    """
    def __init__(self, path):
        # Stored name-mangled (as _LazyLoader__path), so it does not
        # collide with attributes of the loaded resource.
        self.__path = path

    def __load(self):
        resource = load(self.__path)
        # This is where the magic happens!  Transform ourselves into
        # the object by modifying our own __dict__ and __class__ to
        # match that of `resource`.
        self.__dict__ = resource.__dict__
        self.__class__ = resource.__class__

    def __getattr__(self, attr):
        self.__load()
        # This looks circular, but its not, since __load() changes our
        # __class__ to something new:
        return getattr(self, attr)

    def __repr__(self):
        self.__load()
        # This looks circular, but its not, since __load() changes our
        # __class__ to something new:
        return '%r' % self
707 708 ###################################################################### 709 # Open-On-Demand ZipFile 710 ###################################################################### 711
class OpenOnDemandZipFile(zipfile.ZipFile):
    """
    A subclass of C{zipfile.ZipFile} that closes its file pointer
    whenever it is not using it; and re-opens it when it needs to read
    data from the zipfile.  This is useful for reducing the number of
    open file handles when many zip files are being accessed at once.
    C{OpenOnDemandZipFile} must be constructed from a filename, not a
    file-like object (to allow re-opening).  C{OpenOnDemandZipFile} is
    read-only (i.e., C{write()} and C{writestr()} are disabled).
    """
    def __init__(self, filename):
        if not isinstance(filename, basestring):
            raise TypeError('ReopenableZipFile filename must be a string')
        zipfile.ZipFile.__init__(self, filename)
        assert self.filename == filename
        # Release the file handle immediately; read() re-opens it on
        # demand.
        self.close()

    def read(self, name):
        # Re-open the underlying file just long enough to read `name`,
        # then release the handle again.
        assert self.fp is None
        self.fp = open(self.filename, 'rb')
        value = zipfile.ZipFile.read(self, name)
        self.close()
        return value

    def write(self, *args, **kwargs):
        """@raise NotImplementedError: OpenOnDemandZipfile is read-only"""
        raise NotImplementedError('OpenOnDemandZipfile is read-only')

    def writestr(self, *args, **kwargs):
        """@raise NotImplementedError: OpenOnDemandZipfile is read-only"""
        raise NotImplementedError('OpenOnDemandZipfile is read-only')

    def __repr__(self):
        return 'OpenOnDemandZipFile(%r)' % self.filename
746 747 ###################################################################### 748 #{ Seekable Unicode Stream Reader 749 ###################################################################### 750
751 -class SeekableUnicodeStreamReader(object):
752 """ 753 A stream reader that automatically encodes the source byte stream 754 into unicode (like C{codecs.StreamReader}); but still supports the 755 C{seek()} and C{tell()} operations correctly. This is in contrast 756 to C{codecs.StreamReader}, which provide *broken* C{seek()} and 757 C{tell()} methods. 758 759 This class was motivated by L{StreamBackedCorpusView}, which 760 makes extensive use of C{seek()} and C{tell()}, and needs to be 761 able to handle unicode-encoded files. 762 763 Note: this class requires stateless decoders. To my knowledge, 764 this shouldn't cause a problem with any of python's builtin 765 unicode encodings. 766 """ 767 DEBUG = True #: If true, then perform extra sanity checks. 768
    def __init__(self, stream, encoding, errors='strict'):
        # Rewind the stream to its beginning.
        stream.seek(0)

        self.stream = stream
        """The underlying stream."""

        self.encoding = encoding
        """The name of the encoding that should be used to encode the
           underlying stream."""

        self.errors = errors
        """The error mode that should be used when decoding data from
           the underlying stream.  Can be 'strict', 'ignore', or
           'replace'."""

        self.decode = codecs.getdecoder(encoding)
        """The function that is used to decode byte strings into
           unicode strings."""

        self.bytebuffer = ''
        """A buffer to use bytes that have been read but have not yet
           been decoded.  This is only used when the final bytes from
           a read do not form a complete encoding for a character."""

        self.linebuffer = None
        """A buffer used by L{readline()} to hold characters that have
           been read, but have not yet been returned by L{read()} or
           L{readline()}.  This buffer consists of a list of unicode
           strings, where each string corresponds to a single line.
           The final element of the list may or may not be a complete
           line.  Note that the existence of a linebuffer makes the
           L{tell()} operation more complex, because it must backtrack
           to the beginning of the buffer to determine the correct
           file position in the underlying byte stream."""

        self._rewind_checkpoint = 0
        """The file position at which the most recent read on the
           underlying stream began.  This is used, together with
           L{_rewind_numchars}, to backtrack to the beginning of
           L{linebuffer} (which is required by L{tell()})."""

        self._rewind_numchars = None
        """The number of characters that have been returned since the
           read that started at L{_rewind_checkpoint}.  This is used,
           together with L{_rewind_checkpoint}, to backtrack to the
           beginning of L{linebuffer} (which is required by
           L{tell()})."""

        self._bom = self._check_bom()
        """The length of the byte order marker at the beginning of
           the stream (or C{None} for no byte order marker)."""
821 822 #///////////////////////////////////////////////////////////////// 823 # Read methods 824 #///////////////////////////////////////////////////////////////// 825
826 - def read(self, size=None):
827 """ 828 Read up to C{size} bytes, decode them using this reader's 829 encoding, and return the resulting unicode string. 830 831 @param size: The maximum number of bytes to read. If not 832 specified, then read as many bytes as possible. 833 834 @rtype: C{unicode} 835 """ 836 chars = self._read(size) 837 838 # If linebuffer is not empty, then include it in the result 839 if self.linebuffer: 840 chars = ''.join(self.linebuffer) + chars 841 self.linebuffer = None 842 self._rewind_numchars = None 843 844 return chars
845
    def readline(self, size=None):
        """
        Read a line of text, decode it using this reader's encoding,
        and return the resulting unicode string.

        @param size: The maximum number of bytes to read.  If no
            newline is encountered before C{size} bytes have been
            read, then the returned value may not be a complete line
            of text.
        """
        # If we have a non-empty linebuffer, then return the first
        # line from it.  (Note that the last element of linebuffer may
        # not be a complete line; so let _read() deal with it.)
        if self.linebuffer and len(self.linebuffer) > 1:
            line = self.linebuffer.pop(0)
            self._rewind_numchars += len(line)
            return line

        readsize = size or 72
        chars = ''

        # If there's a remaining incomplete line in the buffer, add it.
        if self.linebuffer:
            chars += self.linebuffer.pop()
            self.linebuffer = None

        while True:
            # Byte offset where this read started, excluding any bytes
            # still pending (undecoded) in bytebuffer.
            startpos = self.stream.tell() - len(self.bytebuffer)
            new_chars = self._read(readsize)

            # If we're at a '\r', then read one extra character, since
            # it might be a '\n', to get the proper line ending.
            if new_chars and new_chars.endswith('\r'):
                new_chars += self._read(1)

            chars += new_chars
            lines = chars.splitlines(True)
            if len(lines) > 1:
                # More than one line: return the first, buffer the rest,
                # and record rewind info so tell() can backtrack.
                line = lines[0]
                self.linebuffer = lines[1:]
                self._rewind_numchars = len(new_chars)-(len(chars)-len(line))
                self._rewind_checkpoint = startpos
                break
            elif len(lines) == 1:
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # complete line
                    line = line0withend
                    break

            # End of stream, or caller limited the read size: return
            # whatever we have, complete line or not.
            if not new_chars or size is not None:
                line = chars
                break

            # Read successively larger blocks of text.
            if readsize < 8000:
                readsize *= 2

        return line
905
def readlines(self, sizehint=None, keepends=True):
    """
    Decode this reader's entire remaining contents using its
    encoding, and return them as a list of unicode lines.

    @rtype: C{list} of C{unicode}
    @param sizehint: Ignored.
    @param keepends: If false, then strip newlines.
    """
    contents = self.read()
    return contents.splitlines(keepends)
916
def next(self):
    """
    Return the next decoded line from the underlying stream.

    @raise StopIteration: If no more lines are available.
    """
    line = self.readline()
    if not line:
        raise StopIteration
    return line
922
923 - def __iter__(self):
924 """Return self""" 925 return self
926
def xreadlines(self):
    """Return self, which can be iterated to yield decoded lines."""
    return self
#/////////////////////////////////////////////////////////////////
# Pass-through methods & properties
#/////////////////////////////////////////////////////////////////

# Read-only attributes that delegate directly to the wrapped
# byte stream.

closed = property(lambda self: self.stream.closed, doc="""
    True if the underlying stream is closed.""")

name = property(lambda self: self.stream.name, doc="""
    The name of the underlying stream.""")

mode = property(lambda self: self.stream.mode, doc="""
    The mode of the underlying stream.""")
def close(self):
    """Close the underlying byte stream."""
    underlying = self.stream
    underlying.close()
949 950 #///////////////////////////////////////////////////////////////// 951 # Seek and tell 952 #///////////////////////////////////////////////////////////////// 953
def seek(self, offset, whence=0):
    """
    Move the stream to a new file position.  If the reader is
    maintaining any buffers, then they will be cleared.

    @param offset: A byte count offset.
    @param whence: If C{whence} is 0, then the offset is from the
        start of the file (offset should be positive); if 2, then
        the offset is from the end of the file (offset should
        typically be negative).  Relative seeks (C{whence} of 1)
        are not supported.
    @raise ValueError: If C{whence} is 1.
    """
    if whence == 1:
        raise ValueError('Relative seek is not supported for '
                         'SeekableUnicodeStreamReader -- consider '
                         'using char_seek_forward() instead.')
    # Every buffer describes the *old* position, so discard all
    # of them before repositioning.
    self.linebuffer = None
    self.bytebuffer = ''
    self._rewind_numchars = None
    self.stream.seek(offset, whence)
    self._rewind_checkpoint = self.stream.tell()
976
def char_seek_forward(self, offset):
    """
    Move the read pointer forward by C{offset} characters.

    @raise ValueError: If C{offset} is negative.
    """
    if offset < 0:
        raise ValueError('Negative offsets are not supported')
    # Seeking to our own tell() position flushes every buffer
    # without changing the effective read position.
    self.seek(self.tell())
    # Walk forward the requested number of characters.
    self._char_seek_forward(offset)
987
def _char_seek_forward(self, offset, est_bytes=None):
    """
    Move the file position forward by C{offset} characters,
    ignoring all buffers.

    @param est_bytes: A hint, giving an estimate of the number of
        bytes that will be needed to move forward by C{offset} chars.
        Defaults to C{offset}.
    """
    if est_bytes is None: est_bytes = offset
    bytes = ''

    # NOTE(review): if the stream ends before offset characters
    # are available, read() keeps returning '' and this loop
    # appears not to terminate -- confirm callers never overshoot
    # the end of the stream.
    while True:
        # Read in a block of bytes.
        newbytes = self.stream.read(est_bytes-len(bytes))
        bytes += newbytes

        # Decode the bytes to characters.
        chars, bytes_decoded = self._incr_decode(bytes)

        # If we got the right number of characters, then seek
        # backwards over any truncated characters, and return.
        if len(chars) == offset:
            self.stream.seek(-len(bytes)+bytes_decoded, 1)
            return

        # If we went too far, then we can back-up until we get it
        # right, using the bytes we've already read.
        if len(chars) > offset:
            while len(chars) > offset:
                # Assume at least one byte/char: shrink the slice
                # by the number of surplus characters.
                est_bytes += offset-len(chars)
                chars, bytes_decoded = self._incr_decode(bytes[:est_bytes])
            self.stream.seek(-len(bytes)+bytes_decoded, 1)
            return

        # Otherwise, we haven't read enough bytes yet; loop again.
        est_bytes += offset - len(chars)
1026
def tell(self):
    """
    Return the current file position on the underlying byte
    stream.  If this reader is maintaining any buffers, then the
    returned file position will be the position of the beginning
    of those buffers.
    """
    # If nothing's buffered, then just return our current filepos:
    if self.linebuffer is None:
        return self.stream.tell() - len(self.bytebuffer)

    # Otherwise, we'll need to backtrack the filepos until we
    # reach the beginning of the buffer.

    # Store our original file position, so we can return here.
    orig_filepos = self.stream.tell()

    # Calculate an estimate of where we think the newline is.
    # bytes_read: bytes decoded since the rewind checkpoint;
    # the ratio numchars/(numchars+buf_size) scales that down to
    # an estimate of the bytes *preceding* the buffered lines.
    bytes_read = ( (orig_filepos-len(self.bytebuffer)) -
                   self._rewind_checkpoint )
    buf_size = sum([len(line) for line in self.linebuffer])
    est_bytes = (bytes_read * self._rewind_numchars /
                 (self._rewind_numchars + buf_size))

    # Re-decode forward from the checkpoint to find the exact
    # byte position where the buffered text begins.
    self.stream.seek(self._rewind_checkpoint)
    self._char_seek_forward(self._rewind_numchars, est_bytes)
    filepos = self.stream.tell()

    # Sanity check: the text decoded at the computed position
    # should agree with the buffered lines.
    if self.DEBUG:
        self.stream.seek(filepos)
        check1 = self._incr_decode(self.stream.read(50))[0]
        check2 = ''.join(self.linebuffer)
        assert check1.startswith(check2) or check2.startswith(check1)

    # Return to our original filepos (so we don't have to throw
    # out our buffer.)
    self.stream.seek(orig_filepos)

    # Return the calculated filepos
    return filepos
1068 1069 #///////////////////////////////////////////////////////////////// 1070 # Helper methods 1071 #///////////////////////////////////////////////////////////////// 1072
1073 - def _read(self, size=None):
1074 """ 1075 Read up to C{size} bytes from the underlying stream, decode 1076 them using this reader's encoding, and return the resulting 1077 unicode string. C{linebuffer} is *not* included in the 1078 result. 1079 """ 1080 if size == 0: return u'' 1081 1082 # Skip past the byte order marker, if present. 1083 if self._bom and self.stream.tell() == 0: 1084 self.stream.read(self._bom) 1085 1086 # Read the requested number of bytes. 1087 if size is None: 1088 new_bytes = self.stream.read() 1089 else: 1090 new_bytes = self.stream.read(size) 1091 bytes = self.bytebuffer + new_bytes 1092 1093 # Decode the bytes into unicode characters 1094 chars, bytes_decoded = self._incr_decode(bytes) 1095 1096 # If we got bytes but couldn't decode any, then read further. 1097 if (size is not None) and (not chars) and (len(new_bytes) > 0): 1098 while not chars: 1099 new_bytes = self.stream.read(1) 1100 if not new_bytes: break # end of file. 1101 bytes += new_bytes 1102 chars, bytes_decoded = self._incr_decode(bytes) 1103 1104 # Record any bytes we didn't consume. 1105 self.bytebuffer = bytes[bytes_decoded:] 1106 1107 # Return the result 1108 return chars
1109
1110 - def _incr_decode(self, bytes):
1111 """ 1112 Decode the given byte string into a unicode string, using this 1113 reader's encoding. If an exception is encountered that 1114 appears to be caused by a truncation error, then just decode 1115 the byte string without the bytes that cause the trunctaion 1116 error. 1117 1118 @return: A tuple C{(chars, num_consumed)}, where C{chars} is 1119 the decoded unicode string, and C{num_consumed} is the 1120 number of bytes that were consumed. 1121 """ 1122 while True: 1123 try: 1124 return self.decode(bytes, 'strict') 1125 except UnicodeDecodeError, exc: 1126 # If the exception occurs at the end of the string, 1127 # then assume that it's a truncation error. 1128 if exc.end == len(bytes): 1129 return self.decode(bytes[:exc.start], self.errors) 1130 1131 # Otherwise, if we're being strict, then raise it. 1132 elif self.errors == 'strict': 1133 raise 1134 1135 # If we're not strcit, then re-process it with our 1136 # errors setting. This *may* raise an exception. 1137 else: 1138 return self.decode(bytes, self.errors)
# Maps a normalized encoding name to a list of (BOM, replacement
# encoding) pairs.  If the stream starts with one of the listed
# BOMs, the reader switches to the replacement encoding -- or
# keeps the current encoding when the replacement is None (i.e.,
# when the encoding already fixes the byte order).
_BOM_TABLE = {
    'utf8': [(codecs.BOM_UTF8, None)],
    'utf16': [(codecs.BOM_UTF16_LE, 'utf16-le'),
              (codecs.BOM_UTF16_BE, 'utf16-be')],
    'utf16le': [(codecs.BOM_UTF16_LE, None)],
    'utf16be': [(codecs.BOM_UTF16_BE, None)],
    'utf32': [(codecs.BOM_UTF32_LE, 'utf32-le'),
              (codecs.BOM_UTF32_BE, 'utf32-be')],
    'utf32le': [(codecs.BOM_UTF32_LE, None)],
    'utf32be': [(codecs.BOM_UTF32_BE, None)],
    }
1152 - def _check_bom(self):
1153 # Normalize our encoding name 1154 enc = re.sub('[ -]', '', self.encoding.lower()) 1155 1156 # Look up our encoding in the BOM table. 1157 bom_info = self._BOM_TABLE.get(enc) 1158 1159 if bom_info: 1160 # Read a prefix, to check against the BOM(s) 1161 bytes = self.stream.read(16) 1162 self.stream.seek(0) 1163 1164 # Check for each possible BOM. 1165 for (bom, new_encoding) in bom_info: 1166 if bytes.startswith(bom): 1167 if new_encoding: self.encoding = new_encoding 1168 return len(bom) 1169 1170 return None
# Public API of this module.
# NOTE(review): the original list repeated 'GzipFileSystemPathPointer'
# three times; the duplicates have been removed (membership is
# unchanged, so ``from nltk.data import *`` exports the same names).
__all__ = ['path', 'PathPointer', 'FileSystemPathPointer', 'BufferedGzipFile',
           'GzipFileSystemPathPointer',
           'find', 'retrieve', 'FORMATS', 'AUTO_FORMATS', 'load',
           'show_cfg', 'clear_cache', 'LazyLoader', 'OpenOnDemandZipFile',
           'SeekableUnicodeStreamReader']