Package nltk :: Module util
[hide private]
[frames] | no frames]

Source Code for Module nltk.util

   1  # Natural Language Toolkit: Utility functions 
   2  # 
   3  # Copyright (C) 2001-2011 NLTK Project 
   4  # Author: Steven Bird <sb@csse.unimelb.edu.au> 
   5  # URL: <http://www.nltk.org/> 
   6  # For license information, see LICENSE.TXT 
   7   
   8  import locale 
   9  import re 
  10  import types 
  11  import textwrap 
  12  import pydoc 
  13  import bisect 
  14  import os 
  15   
  16  from itertools import islice, chain 
  17  from pprint import pprint 
  18  from nltk.compat import defaultdict 
  19   
  20  from nltk.internals import slice_bounds 
  21   
  22  ###################################################################### 
  23  # Short usage message 
  24  ###################################################################### 
  25   
26 -def usage(obj, selfname='self'):
27 import inspect 28 str(obj) # In case it's lazy, this will load it. 29 30 if not isinstance(obj, (types.TypeType, types.ClassType)): 31 obj = obj.__class__ 32 33 print '%s supports the following operations:' % obj.__name__ 34 for (name, method) in sorted(pydoc.allmethods(obj).items()): 35 if name.startswith('_'): continue 36 if getattr(method, '__deprecated__', False): continue 37 38 args, varargs, varkw, defaults = inspect.getargspec(method) 39 if (args and args[0]=='self' and 40 (defaults is None or len(args)>len(defaults))): 41 args = args[1:] 42 name = '%s.%s' % (selfname, name) 43 argspec = inspect.formatargspec( 44 args, varargs, varkw, defaults) 45 print textwrap.fill('%s%s' % (name, argspec), 46 initial_indent=' - ', 47 subsequent_indent=' '*(len(name)+5))
48 49 ########################################################################## 50 # IDLE 51 ########################################################################## 52
53 -def in_idle():
54 """ 55 @rtype: C{boolean} 56 @return: true if this function is run within idle. Tkinter 57 programs that are run in idle should never call C{Tk.mainloop}; so 58 this function should be used to gate all calls to C{Tk.mainloop}. 59 60 @warning: This function works by checking C{sys.stdin}. If the 61 user has modified C{sys.stdin}, then it may return incorrect 62 results. 63 """ 64 import sys, types 65 return (type(sys.stdin) == types.InstanceType and \ 66 sys.stdin.__class__.__name__ == 'PyShell')
67 68 ########################################################################## 69 # PRETTY PRINTING 70 ########################################################################## 71
72 -def pr(data, start=0, end=None):
73 """ 74 Pretty print a sequence of data items 75 76 @param data: the data stream to print 77 @type data: C{sequence} or C{iterator} 78 @param start: the start position 79 @type start: C{int} 80 @param end: the end position 81 @type end: C{int} 82 """ 83 pprint(list(islice(data, start, end)))
84 95
96 -def tokenwrap(tokens, separator=" ", width=70):
97 """ 98 Pretty print a list of text tokens, breaking lines on whitespace 99 100 @param tokens: the tokens to print 101 @type tokens: C{list} 102 @param separator: the string to use to separate tokens 103 @type separator: C{str} 104 @param width: the display width (default=70) 105 @type width: C{int} 106 """ 107 return '\n'.join(textwrap.wrap(separator.join(tokens), width=width))
108 109 110 ########################################################################## 111 # Indexing 112 ########################################################################## 113
114 -class Index(defaultdict):
115
116 - def __init__(self, pairs):
117 defaultdict.__init__(self, list) 118 for key, value in pairs: 119 self[key].append(value)
120 121 122 ###################################################################### 123 ## Regexp display (thanks to David Mertz) 124 ###################################################################### 125
126 -def re_show(regexp, string, left="{", right="}"):
127 """ 128 Search C{string} for substrings matching C{regexp} and wrap 129 the matches with braces. This is convenient for learning about 130 regular expressions. 131 132 @param regexp: The regular expression. 133 @type regexp: C{string} 134 @param string: The string being matched. 135 @type string: C{string} 136 @param left: The left delimiter (printed before the matched substring) 137 @type left: C{string} 138 @param right: The right delimiter (printed after the matched substring) 139 @type right: C{string} 140 @rtype: C{string} 141 @return: A string with markers surrounding the matched substrings. 142 """ 143 print re.compile(regexp, re.M).sub(left + r"\g<0>" + right, string.rstrip())
144 145 146 ########################################################################## 147 # READ FROM FILE OR STRING 148 ########################################################################## 149 150 # recipe from David Mertz
151 -def filestring(f):
152 if hasattr(f, 'read'): 153 return f.read() 154 elif isinstance(f, basestring): 155 return open(f).read() 156 else: 157 raise ValueError, "Must be called with a filename or file-like object"
158 159 ########################################################################## 160 # Breadth-First Search 161 ########################################################################## 162
163 -def breadth_first(tree, children=iter, depth=-1, queue=None):
164 """Traverse the nodes of a tree in breadth-first order. 165 (No need to check for cycles.) 166 The first argument should be the tree root; 167 children should be a function taking as argument a tree node 168 and returning an iterator of the node's children. 169 """ 170 if queue == None: 171 queue = [] 172 queue.append(tree) 173 174 while queue: 175 node = queue.pop(0) 176 yield node 177 if depth != 0: 178 try: 179 queue += children(node) 180 depth -= 1 181 except: 182 pass
183 184 ########################################################################## 185 # Guess Character Encoding 186 ########################################################################## 187 188 # adapted from io.py in the docutils extension module (http://docutils.sourceforge.net) 189 # http://www.pyzine.com/Issue008/Section_Articles/article_Encodings.html 190
191 -def guess_encoding(data):
192 """ 193 Given a byte string, attempt to decode it. 194 Tries the standard 'UTF8' and 'latin-1' encodings, 195 Plus several gathered from locale information. 196 197 The calling program *must* first call:: 198 199 locale.setlocale(locale.LC_ALL, '') 200 201 If successful it returns C{(decoded_unicode, successful_encoding)}. 202 If unsuccessful it raises a C{UnicodeError}. 203 """ 204 successful_encoding = None 205 # we make 'utf-8' the first encoding 206 encodings = ['utf-8'] 207 # 208 # next we add anything we can learn from the locale 209 try: 210 encodings.append(locale.nl_langinfo(locale.CODESET)) 211 except AttributeError: 212 pass 213 try: 214 encodings.append(locale.getlocale()[1]) 215 except (AttributeError, IndexError): 216 pass 217 try: 218 encodings.append(locale.getdefaultlocale()[1]) 219 except (AttributeError, IndexError): 220 pass 221 # 222 # we try 'latin-1' last 223 encodings.append('latin-1') 224 for enc in encodings: 225 # some of the locale calls 226 # may have returned None 227 if not enc: 228 continue 229 try: 230 decoded = unicode(data, enc) 231 successful_encoding = enc 232 233 except (UnicodeError, LookupError): 234 pass 235 else: 236 break 237 if not successful_encoding: 238 raise UnicodeError( 239 'Unable to decode input data. Tried the following encodings: %s.' 240 % ', '.join([repr(enc) for enc in encodings if enc])) 241 else: 242 return (decoded, successful_encoding)
243 244 245 ########################################################################## 246 # Invert a dictionary 247 ########################################################################## 248
249 -def invert_dict(d):
250 from nltk.compat import defaultdict 251 inverted_dict = defaultdict(list) 252 for key in d: 253 if hasattr(d[key], '__iter__'): 254 for term in d[key]: 255 inverted_dict[term].append(key) 256 else: 257 inverted_dict[d[key]] = key 258 return inverted_dict
259 260 261 ########################################################################## 262 # Utilities for directed graphs: transitive closure, and inversion 263 # The graph is represented as a dictionary of sets 264 ########################################################################## 265
266 -def transitive_closure(graph, reflexive=False):
267 """ 268 Calculate the transitive closure of a directed graph, 269 optionally the reflexive transitive closure. 270 271 The algorithm is a slight modification of the "Marking Algorithm" of 272 Ioannidis & Ramakrishnan (1998) "Efficient Transitive Closure Algorithms". 273 274 @param graph: the initial graph, represented as a dictionary of sets 275 @type graph: C{dict} of C{set}s 276 @param reflexive: if set, also make the closure reflexive 277 @type reflexive: C{bool} 278 @return: the (reflexive) transitive closure of the graph 279 @rtype: C{dict} of C{set}s 280 """ 281 if reflexive: 282 base_set = lambda k: set([k]) 283 else: 284 base_set = lambda k: set() 285 # The graph U_i in the article: 286 agenda_graph = dict((k, v.copy()) for (k,v) in graph.iteritems()) 287 # The graph M_i in the article: 288 closure_graph = dict((k, base_set(k)) for k in graph) 289 for i in graph: 290 agenda = agenda_graph[i] 291 closure = closure_graph[i] 292 while agenda: 293 j = agenda.pop() 294 closure.add(j) 295 closure |= closure_graph.setdefault(j, base_set(j)) 296 agenda |= agenda_graph.get(j, base_set(j)) 297 agenda -= closure 298 return closure_graph
299 300
301 -def invert_graph(graph):
302 """ 303 Inverts a directed graph. 304 305 @param graph: the graph, represented as a dictionary of sets 306 @type graph: C{dict} of C{set}s 307 @return: the inverted graph 308 @rtype: C{dict} of C{set}s 309 """ 310 inverted = {} 311 for key, values in graph.iteritems(): 312 for value in values: 313 inverted.setdefault(value, set()).add(key) 314 return inverted
315 316 317 318 ########################################################################## 319 # HTML Cleaning 320 ########################################################################## 321
322 -def clean_html(html):
323 """ 324 Remove HTML markup from the given string. 325 326 @param html: the HTML string to be cleaned 327 @type html: C{string} 328 @rtype: C{string} 329 """ 330 331 # First we remove inline JavaScript/CSS: 332 cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip()) 333 # Then we remove html comments. This has to be done before removing regular 334 # tags since comments can contain '>' characters. 335 cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned) 336 # Next we can remove the remaining tags: 337 cleaned = re.sub(r"(?s)<.*?>", " ", cleaned) 338 # Finally, we deal with whitespace 339 cleaned = re.sub(r"&nbsp;", " ", cleaned) 340 cleaned = re.sub(r" ", " ", cleaned) 341 cleaned = re.sub(r" ", " ", cleaned) 342 return cleaned.strip()
343
344 -def clean_url(url):
345 from urllib import urlopen 346 html = urlopen(url).read() 347 return clean_html(html)
348 349 ########################################################################## 350 # FLATTEN LISTS 351 ########################################################################## 352
353 -def flatten(*args):
354 """Flatten a list. 355 356 >>> flatten(1, 2, ['b', 'a' , ['c', 'd']], 3) 357 [1, 2, 'a', 'b', 'c', 'd', 3] 358 359 @param *args: items and lists to be combined into a single list 360 @rtype: C{list} 361 """ 362 363 x = [] 364 for l in args: 365 if not isinstance(l, (list, tuple)): l = [l] 366 for item in l: 367 if isinstance(item, (list, tuple)): 368 x.extend(flatten(item)) 369 else: 370 x.append(item) 371 return x
372 373 ########################################################################## 374 # Ngram iteration 375 ########################################################################## 376 377 # add a flag to pad the sequence so we get peripheral ngrams? 378
379 -def ngrams(sequence, n, pad_left=False, pad_right=False, pad_symbol=None):
380 """ 381 A utility that produces a sequence of ngrams from a sequence of items. 382 For example: 383 384 >>> ngrams([1,2,3,4,5], 3) 385 [(1, 2, 3), (2, 3, 4), (3, 4, 5)] 386 387 Use ingram for an iterator version of this function. Set pad_left 388 or pad_right to true in order to get additional ngrams: 389 390 >>> ngrams([1,2,3,4,5], 2, pad_right=True) 391 [(1, 2), (2, 3), (3, 4), (4, 5), (5, None)] 392 393 @param sequence: the source data to be converted into ngrams 394 @type sequence: C{sequence} or C{iterator} 395 @param n: the degree of the ngrams 396 @type n: C{int} 397 @param pad_left: whether the ngrams should be left-padded 398 @type pad_left: C{boolean} 399 @param pad_right: whether the ngrams should be right-padded 400 @type pad_right: C{boolean} 401 @param pad_symbol: the symbol to use for padding (default is None) 402 @type pad_symbol: C{any} 403 @return: The ngrams 404 @rtype: C{list} of C{tuple}s 405 """ 406 407 if pad_left: 408 sequence = chain((pad_symbol,) * (n-1), sequence) 409 if pad_right: 410 sequence = chain(sequence, (pad_symbol,) * (n-1)) 411 sequence = list(sequence) 412 413 count = max(0, len(sequence) - n + 1) 414 return [tuple(sequence[i:i+n]) for i in range(count)]
415
416 -def bigrams(sequence, **kwargs):
417 """ 418 A utility that produces a sequence of bigrams from a sequence of items. 419 For example: 420 421 >>> bigrams([1,2,3,4,5]) 422 [(1, 2), (2, 3), (3, 4), (4, 5)] 423 424 Use ibigrams for an iterator version of this function. 425 426 @param sequence: the source data to be converted into bigrams 427 @type sequence: C{sequence} or C{iterator} 428 @return: The bigrams 429 @rtype: C{list} of C{tuple}s 430 """ 431 return ngrams(sequence, 2, **kwargs)
432
433 -def trigrams(sequence, **kwargs):
434 """ 435 A utility that produces a sequence of trigrams from a sequence of items. 436 For example: 437 438 >>> trigrams([1,2,3,4,5]) 439 [(1, 2, 3), (2, 3, 4), (3, 4, 5)] 440 441 Use itrigrams for an iterator version of this function. 442 443 @param sequence: the source data to be converted into trigrams 444 @type sequence: C{sequence} or C{iterator} 445 @return: The trigrams 446 @rtype: C{list} of C{tuple}s 447 """ 448 return ngrams(sequence, 3, **kwargs)
449
450 -def ingrams(sequence, n, pad_left=False, pad_right=False, pad_symbol=None):
451 """ 452 A utility that produces an iterator over ngrams generated from a sequence of items. 453 454 For example: 455 456 >>> list(ingrams([1,2,3,4,5], 3)) 457 [(1, 2, 3), (2, 3, 4), (3, 4, 5)] 458 459 Use ngrams for a list version of this function. Set pad_left 460 or pad_right to true in order to get additional ngrams: 461 462 >>> list(ingrams([1,2,3,4,5], 2, pad_right=True)) 463 [(1, 2), (2, 3), (3, 4), (4, 5), (5, None)] 464 465 @param sequence: the source data to be converted into ngrams 466 @type sequence: C{sequence} or C{iterator} 467 @param n: the degree of the ngrams 468 @type n: C{int} 469 @param pad_left: whether the ngrams should be left-padded 470 @type pad_left: C{boolean} 471 @param pad_right: whether the ngrams should be right-padded 472 @type pad_right: C{boolean} 473 @param pad_symbol: the symbol to use for padding (default is None) 474 @type pad_symbol: C{any} 475 @return: The ngrams 476 @rtype: C{iterator} of C{tuple}s 477 """ 478 479 sequence = iter(sequence) 480 if pad_left: 481 sequence = chain((pad_symbol,) * (n-1), sequence) 482 if pad_right: 483 sequence = chain(sequence, (pad_symbol,) * (n-1)) 484 485 history = [] 486 while n > 1: 487 history.append(sequence.next()) 488 n -= 1 489 for item in sequence: 490 history.append(item) 491 yield tuple(history) 492 del history[0]
493
494 -def ibigrams(sequence, **kwargs):
495 """ 496 A utility that produces an iterator over bigrams generated from a sequence of items. 497 498 For example: 499 500 >>> list(ibigrams([1,2,3,4,5])) 501 [(1, 2), (2, 3), (3, 4), (4, 5)] 502 503 Use bigrams for a list version of this function. 504 505 @param sequence: the source data to be converted into bigrams 506 @type sequence: C{sequence} or C{iterator} 507 @return: The bigrams 508 @rtype: C{iterator} of C{tuple}s 509 """ 510 511 for item in ingrams(sequence, 2, **kwargs): 512 yield item
513
514 -def itrigrams(sequence, **kwargs):
515 """ 516 A utility that produces an iterator over trigrams generated from a sequence of items. 517 518 For example: 519 520 >>> list(itrigrams([1,2,3,4,5]) 521 [(1, 2, 3), (2, 3, 4), (3, 4, 5)] 522 523 Use trigrams for a list version of this function. 524 525 @param sequence: the source data to be converted into trigrams 526 @type sequence: C{sequence} or C{iterator} 527 @return: The trigrams 528 @rtype: C{iterator} of C{tuple}s 529 """ 530 531 for item in ingrams(sequence, 3, **kwargs): 532 yield item
533 534 ########################################################################## 535 # Ordered Dictionary 536 ########################################################################## 537
538 -class OrderedDict(dict):
539 - def __init__(self, data=None, **kwargs):
540 self._keys = self.keys(data, kwargs.get('keys')) 541 self._default_factory = kwargs.get('default_factory') 542 if data is None: 543 dict.__init__(self) 544 else: 545 dict.__init__(self, data)
546
547 - def __delitem__(self, key):
548 dict.__delitem__(self, key) 549 self._keys.remove(key)
550
551 - def __getitem__(self, key):
552 try: 553 return dict.__getitem__(self, key) 554 except KeyError: 555 return self.__missing__(key)
556
557 - def __iter__(self):
558 return (key for key in self.keys())
559
560 - def __missing__(self, key):
561 if not self._default_factory and key not in self._keys: 562 raise KeyError() 563 else: 564 return self._default_factory()
565
566 - def __setitem__(self, key, item):
567 dict.__setitem__(self, key, item) 568 if key not in self._keys: 569 self._keys.append(key)
570
571 - def clear(self):
572 dict.clear(self) 573 self._keys.clear()
574
575 - def copy(self):
576 d = dict.copy(self) 577 d._keys = self._keys 578 return d
579
580 - def items(self):
581 return zip(self.keys(), self.values())
582
583 - def keys(self, data=None, keys=None):
584 if data: 585 if keys: 586 assert isinstance(keys, list) 587 assert len(data) == len(keys) 588 return keys 589 else: 590 assert isinstance(data, dict) or \ 591 isinstance(data, OrderedDict) or \ 592 isinstance(data, list) 593 if isinstance(data, dict) or isinstance(data, OrderedDict): 594 return data.keys() 595 elif isinstance(data, list): 596 return [key for (key, value) in data] 597 elif '_keys' in self.__dict__: 598 return self._keys 599 else: 600 return []
601
602 - def popitem(self):
603 if self._keys: 604 key = self._keys.pop() 605 value = self[key] 606 del self[key] 607 return (key, value) 608 else: 609 raise KeyError()
610
611 - def setdefault(self, key, failobj=None):
612 dict.setdefault(self, key, failobj) 613 if key not in self._keys: 614 self._keys.append(key)
615
616 - def update(self, data):
617 dict.update(self, data) 618 for key in self.keys(data): 619 if key not in self._keys: 620 self._keys.append(key)
621
622 - def values(self):
623 return map(self.get, self._keys)
624 625 ###################################################################### 626 # Lazy Sequences 627 ###################################################################### 628
629 -class AbstractLazySequence(object):
630 """ 631 An abstract base class for read-only sequences whose values are 632 computed as needed. Lazy sequences act like tuples -- they can be 633 indexed, sliced, and iterated over; but they may not be modified. 634 635 The most common application of lazy sequences in NLTK is for 636 I{corpus view} objects, which provide access to the contents of a 637 corpus without loading the entire corpus into memory, by loading 638 pieces of the corpus from disk as needed. 639 640 The result of modifying a mutable element of a lazy sequence is 641 undefined. In particular, the modifications made to the element 642 may or may not persist, depending on whether and when the lazy 643 sequence caches that element's value or reconstructs it from 644 scratch. 645 646 Subclasses are required to define two methods: 647 648 - L{__len__()} 649 - L{iterate_from()}. 650 """
651 - def __len__(self):
652 """ 653 Return the number of tokens in the corpus file underlying this 654 corpus view. 655 """ 656 raise NotImplementedError('should be implemented by subclass')
657
658 - def iterate_from(self, start):
659 """ 660 Return an iterator that generates the tokens in the corpus 661 file underlying this corpus view, starting at the token number 662 C{start}. If C{start>=len(self)}, then this iterator will 663 generate no tokens. 664 """ 665 raise NotImplementedError('should be implemented by subclass')
666
667 - def __getitem__(self, i):
668 """ 669 Return the C{i}th token in the corpus file underlying this 670 corpus view. Negative indices and spans are both supported. 671 """ 672 if isinstance(i, slice): 673 start, stop = slice_bounds(self, i) 674 return LazySubsequence(self, start, stop) 675 else: 676 # Handle negative indices 677 if i < 0: i += len(self) 678 if i < 0: raise IndexError('index out of range') 679 # Use iterate_from to extract it. 680 try: 681 return self.iterate_from(i).next() 682 except StopIteration: 683 raise IndexError('index out of range')
684
685 - def __iter__(self):
686 """Return an iterator that generates the tokens in the corpus 687 file underlying this corpus view.""" 688 return self.iterate_from(0)
689
690 - def count(self, value):
691 """Return the number of times this list contains C{value}.""" 692 return sum(1 for elt in self if elt==value)
693
694 - def index(self, value, start=None, stop=None):
695 """Return the index of the first occurance of C{value} in this 696 list that is greater than or equal to C{start} and less than 697 C{stop}. Negative start & stop values are treated like negative 698 slice bounds -- i.e., they count from the end of the list.""" 699 start, stop = slice_bounds(self, slice(start, stop)) 700 for i, elt in enumerate(islice(self, start, stop)): 701 if elt == value: return i+start 702 raise ValueError('index(x): x not in list')
703
704 - def __contains__(self, value):
705 """Return true if this list contains C{value}.""" 706 return bool(self.count(value))
707
708 - def __add__(self, other):
709 """Return a list concatenating self with other.""" 710 return LazyConcatenation([self, other])
711
712 - def __radd__(self, other):
713 """Return a list concatenating other with self.""" 714 return LazyConcatenation([other, self])
715
716 - def __mul__(self, count):
717 """Return a list concatenating self with itself C{count} times.""" 718 return LazyConcatenation([self] * count)
719
720 - def __rmul__(self, count):
721 """Return a list concatenating self with itself C{count} times.""" 722 return LazyConcatenation([self] * count)
723 724 _MAX_REPR_SIZE = 60
725 - def __repr__(self):
726 """ 727 @return: A string representation for this corpus view that is 728 similar to a list's representation; but if it would be more 729 than 60 characters long, it is truncated. 730 """ 731 pieces = [] 732 length = 5 733 for elt in self: 734 pieces.append(repr(elt)) 735 length += len(pieces[-1]) + 2 736 if length > self._MAX_REPR_SIZE and len(pieces) > 2: 737 return '[%s, ...]' % ', '.join(pieces[:-1]) 738 else: 739 return '[%s]' % ', '.join(pieces)
740
741 - def __cmp__(self, other):
742 """ 743 Return a number indicating how C{self} relates to other. 744 745 - If C{other} is not a corpus view or a C{list}, return -1. 746 - Otherwise, return C{cmp(list(self), list(other))}. 747 748 Note: corpus views do not compare equal to tuples containing 749 equal elements. Otherwise, transitivity would be violated, 750 since tuples do not compare equal to lists. 751 """ 752 if not isinstance(other, (AbstractLazySequence, list)): return -1 753 return cmp(list(self), list(other))
754
755 - def __hash__(self):
756 """ 757 @raise ValueError: Corpus view objects are unhashable. 758 """ 759 raise ValueError('%s objects are unhashable' % 760 self.__class__.__name__)
761 762
763 -class LazySubsequence(AbstractLazySequence):
764 """ 765 A subsequence produced by slicing a lazy sequence. This slice 766 keeps a reference to its source sequence, and generates its values 767 by looking them up in the source sequence. 768 """ 769 770 MIN_SIZE = 100 771 """The minimum size for which lazy slices should be created. If 772 C{LazySubsequence()} is called with a subsequence that is 773 shorter than C{MIN_SIZE}, then a tuple will be returned 774 instead.""" 775
776 - def __new__(cls, source, start, stop):
777 """ 778 Construct a new slice from a given underlying sequence. The 779 C{start} and C{stop} indices should be absolute indices -- 780 i.e., they should not be negative (for indexing from the back 781 of a list) or greater than the length of C{source}. 782 """ 783 # If the slice is small enough, just use a tuple. 784 if stop-start < cls.MIN_SIZE: 785 return list(islice(source.iterate_from(start), stop-start)) 786 else: 787 return object.__new__(cls)
788
789 - def __init__(self, source, start, stop):
790 self._source = source 791 self._start = start 792 self._stop = stop
793
794 - def __len__(self):
795 return self._stop - self._start
796
797 - def iterate_from(self, start):
798 return islice(self._source.iterate_from(start+self._start), 799 max(0, len(self)-start))
800 801
802 -class LazyConcatenation(AbstractLazySequence):
803 """ 804 A lazy sequence formed by concatenating a list of lists. This 805 underlying list of lists may itself be lazy. C{LazyConcatenation} 806 maintains an index that it uses to keep track of the relationship 807 between offsets in the concatenated lists and offsets in the 808 sublists. 809 """
810 - def __init__(self, list_of_lists):
811 self._list = list_of_lists 812 self._offsets = [0]
813
814 - def __len__(self):
815 if len(self._offsets) <= len(self._list): 816 for tok in self.iterate_from(self._offsets[-1]): pass 817 return self._offsets[-1]
818
819 - def iterate_from(self, start_index):
820 if start_index < self._offsets[-1]: 821 sublist_index = bisect.bisect_right(self._offsets, start_index)-1 822 else: 823 sublist_index = len(self._offsets)-1 824 825 index = self._offsets[sublist_index] 826 827 # Construct an iterator over the sublists. 828 if isinstance(self._list, AbstractLazySequence): 829 sublist_iter = self._list.iterate_from(sublist_index) 830 else: 831 sublist_iter = islice(self._list, sublist_index, None) 832 833 for sublist in sublist_iter: 834 if sublist_index == (len(self._offsets)-1): 835 assert index+len(sublist) >= self._offsets[-1], ( 836 'offests not monotonic increasing!') 837 self._offsets.append(index+len(sublist)) 838 else: 839 assert self._offsets[sublist_index+1] == index+len(sublist), ( 840 'inconsistent list value (num elts)') 841 842 for value in sublist[max(0, start_index-index):]: 843 yield value 844 845 index += len(sublist) 846 sublist_index += 1
847 848
849 -class LazyMap(AbstractLazySequence):
850 """ 851 A lazy sequence whose elements are formed by applying a given 852 function to each element in one or more underlying lists. The 853 function is applied lazily -- i.e., when you read a value from the 854 list, C{LazyMap} will calculate that value by applying its 855 function to the underlying lists' value(s). C{LazyMap} is 856 essentially a lazy version of the Python primitive function 857 C{map}. In particular, the following two expressions are 858 equivalent: 859 860 >>> map(f, sequences...) 861 >>> list(LazyMap(f, sequences...)) 862 863 Like the Python C{map} primitive, if the source lists do not have 864 equal size, then the value C{None} will be supplied for the 865 'missing' elements. 866 867 Lazy maps can be useful for conserving memory, in cases where 868 individual values take up a lot of space. This is especially true 869 if the underlying list's values are constructed lazily, as is the 870 case with many corpus readers. 871 872 A typical example of a use case for this class is performing 873 feature detection on the tokens in a corpus. Since featuresets 874 are encoded as dictionaries, which can take up a lot of memory, 875 using a C{LazyMap} can significantly reduce memory usage when 876 training and running classifiers. 877 """
878 - def __init__(self, function, *lists, **config):
879 """ 880 @param function: The function that should be applied to 881 elements of C{lists}. It should take as many arguments 882 as there are C{lists}. 883 @param lists: The underlying lists. 884 @kwparam cache_size: Determines the size of the cache used 885 by this lazy map. (default=5) 886 """ 887 if not lists: 888 raise TypeError('LazyMap requires at least two args') 889 890 self._lists = lists 891 self._func = function 892 self._cache_size = config.get('cache_size', 5) 893 if self._cache_size > 0: 894 self._cache = {} 895 else: 896 self._cache = None 897 898 # If you just take bool() of sum() here _all_lazy will be true just 899 # in case n >= 1 list is an AbstractLazySequence. Presumably this 900 # isn't what's intended. 901 self._all_lazy = sum(isinstance(lst, AbstractLazySequence) 902 for lst in lists) == len(lists)
903
904 - def iterate_from(self, index):
905 # Special case: one lazy sublist 906 if len(self._lists) == 1 and self._all_lazy: 907 for value in self._lists[0].iterate_from(index): 908 yield self._func(value) 909 return 910 911 # Special case: one non-lazy sublist 912 elif len(self._lists) == 1: 913 while True: 914 try: yield self._func(self._lists[0][index]) 915 except IndexError: return 916 index += 1 917 918 # Special case: n lazy sublists 919 elif self._all_lazy: 920 iterators = [lst.iterate_from(index) for lst in self._lists] 921 while True: 922 elements = [] 923 for iterator in iterators: 924 try: elements.append(iterator.next()) 925 except: elements.append(None) 926 if elements == [None] * len(self._lists): 927 return 928 yield self._func(*elements) 929 index += 1 930 931 # general case 932 else: 933 while True: 934 try: elements = [lst[index] for lst in self._lists] 935 except IndexError: 936 elements = [None] * len(self._lists) 937 for i, lst in enumerate(self._lists): 938 try: elements[i] = lst[index] 939 except IndexError: pass 940 if elements == [None] * len(self._lists): 941 return 942 yield self._func(*elements) 943 index += 1
944
945 - def __getitem__(self, index):
946 if isinstance(index, slice): 947 sliced_lists = [lst[index] for lst in self._lists] 948 return LazyMap(self._func, *sliced_lists) 949 else: 950 # Handle negative indices 951 if index < 0: index += len(self) 952 if index < 0: raise IndexError('index out of range') 953 # Check the cache 954 if self._cache is not None and index in self._cache: 955 return self._cache[index] 956 # Calculate the value 957 try: val = self.iterate_from(index).next() 958 except StopIteration: 959 raise IndexError('index out of range') 960 # Update the cache 961 if self._cache is not None: 962 if len(self._cache) > self._cache_size: 963 self._cache.popitem() # discard random entry 964 self._cache[index] = val 965 # Return the value 966 return val
967
968 - def __len__(self):
969 return max(len(lst) for lst in self._lists)
970 971
972 -class LazyZip(LazyMap):
973 """ 974 A lazy sequence whose elements are tuples, each containing the i-th 975 element from each of the argument sequences. The returned list is 976 truncated in length to the length of the shortest argument sequence. The 977 tuples are constructed lazily -- i.e., when you read a value from the 978 list, C{LazyZip} will calculate that value by forming a C{tuple} from 979 the i-th element of each of the argument sequences. 980 981 C{LazyZip} is essentially a lazy version of the Python primitive function 982 C{zip}. In particular, the following two expressions are equivalent: 983 984 >>> zip(sequences...) 985 >>> list(LazyZip(sequences...)) 986 987 Lazy zips can be useful for conserving memory in cases where the argument 988 sequences are particularly long. 989 990 A typical example of a use case for this class is combining long sequences 991 of gold standard and predicted values in a classification or tagging task 992 in order to calculate accuracy. By constructing tuples lazily and 993 avoiding the creation of an additional long sequence, memory usage can be 994 significantly reduced. 995 """
996 - def __init__(self, *lists):
997 """ 998 @param lists: the underlying lists 999 @type lists: C{list} of C{list} 1000 """ 1001 LazyMap.__init__(self, lambda *elts: elts, *lists)
1002
1003 - def iterate_from(self, index):
1004 iterator = LazyMap.iterate_from(self, index) 1005 while index < len(self): 1006 yield iterator.next() 1007 index += 1 1008 return
1009
1010 - def __len__(self):
1011 return min(len(lst) for lst in self._lists)
1012 1013
1014 -class LazyEnumerate(LazyZip):
1015 """ 1016 A lazy sequence whose elements are tuples, each ontaining a count (from 1017 zero) and a value yielded by underlying sequence. C{LazyEnumerate} is 1018 useful for obtaining an indexed list. The tuples are constructed lazily 1019 -- i.e., when you read a value from the list, C{LazyEnumerate} will 1020 calculate that value by forming a C{tuple} from the count of the i-th 1021 element and the i-th element of the underlying sequence. 1022 1023 C{LazyEnumerate} is essentially a lazy version of the Python primitive 1024 function C{enumerate}. In particular, the following two expressions are 1025 equivalent: 1026 1027 >>> enumerate(sequence) 1028 >>> list(LazyEnumerate(sequence)) 1029 1030 Lazy enumerations can be useful for conserving memory in cases where the 1031 argument sequences are particularly long. 1032 1033 A typical example of a use case for this class is obtaining an indexed 1034 list for a long sequence of values. By constructing tuples lazily and 1035 avoiding the creation of an additional long sequence, memory usage can be 1036 significantly reduced. 1037 """ 1038
1039 - def __init__(self, lst):
1040 """ 1041 @param lst: the underlying list 1042 @type lst: C{list} 1043 """ 1044 LazyZip.__init__(self, xrange(len(lst)), lst)
1045 1046 1047 ###################################################################### 1048 # Binary Search in a File 1049 ###################################################################### 1050 1051 # inherited from pywordnet, by Oliver Steele
1052 -def binary_search_file(file, key, cache={}, cacheDepth=-1):
1053 """ 1054 Searches through a sorted file using the binary search algorithm. 1055 1056 @type file: file 1057 @param file: the file to be searched through. 1058 @type key: {string} 1059 @param key: the identifier we are searching for. 1060 @return: The line from the file with first word key. 1061 """ 1062 1063 key = key + ' ' 1064 keylen = len(key) 1065 start = 0 1066 currentDepth = 0 1067 1068 if hasattr(file, 'name'): 1069 end = os.stat(file.name).st_size - 1 1070 else: 1071 file.seek(0, 2) 1072 end = file.tell() - 1 1073 file.seek(0) 1074 1075 while start < end: 1076 lastState = start, end 1077 middle = (start + end) / 2 1078 1079 if cache.get(middle): 1080 offset, line = cache[middle] 1081 1082 else: 1083 line = "" 1084 while True: 1085 file.seek(max(0, middle - 1)) 1086 if middle > 0: 1087 file.readline() 1088 offset = file.tell() 1089 line = file.readline() 1090 if line != "": break 1091 # at EOF; try to find start of the last line 1092 middle = (start + middle)/2 1093 if middle == end -1: 1094 return None 1095 if currentDepth < cacheDepth: 1096 cache[middle] = (offset, line) 1097 1098 if offset > end: 1099 assert end != middle - 1, "infinite loop" 1100 end = middle - 1 1101 elif line[:keylen] == key: 1102 return line 1103 elif line > key: 1104 assert end != middle - 1, "infinite loop" 1105 end = middle - 1 1106 elif line < key: 1107 start = offset + len(line) - 1 1108 1109 currentDepth += 1 1110 thisState = start, end 1111 1112 if lastState == thisState: 1113 # Detects the condition where we're searching past the end 1114 # of the file, which is otherwise difficult to detect 1115 return None 1116 1117 return None
1118 1119 ###################################################################### 1120 # Proxy configuration 1121 ###################################################################### 1122
1123 -def set_proxy(proxy, (user, password)=(None, '')):
1124 """ 1125 Set the HTTP proxy for Python to download through. 1126 1127 If C{proxy} is None then tries to set proxy from enviroment or system 1128 settings. 1129 1130 @param proxy: The HTTP proxy server to use. For example: 1131 'http://proxy.example.com:3128/' 1132 @param user: The username to authenticate with. Use C{None} to disable 1133 authentication. 1134 @param password: The password to authenticate with. 1135 """ 1136 import urllib 1137 import urllib2 1138 1139 if proxy is None: 1140 # Try and find the system proxy settings 1141 try: 1142 proxy = urllib.getproxies()['http'] 1143 except KeyError: 1144 raise ValueError('Could not detect default proxy settings') 1145 1146 # Set up the proxy handler 1147 proxy_handler = urllib2.ProxyHandler({'http': proxy}) 1148 opener = urllib2.build_opener(proxy_handler) 1149 1150 if user is not None: 1151 # Set up basic proxy authentication if provided 1152 password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm() 1153 password_manager.add_password(realm=None, uri=proxy, user=user, 1154 passwd=password) 1155 opener.add_handler(urllib2.ProxyBasicAuthHandler(password_manager)) 1156 opener.add_handler(urllib2.ProxyDigestAuthHandler(password_manager)) 1157 1158 # Overide the existing url opener 1159 urllib2.install_opener(opener)
1160