1
2
3
4
5
6
7
8 import locale
9 import re
10 import types
11 import textwrap
12 import pydoc
13 import bisect
14 import os
15
16 from itertools import islice, chain
17 from pprint import pprint
18 from nltk.compat import defaultdict
19
20 from nltk.internals import slice_bounds
21
22
23
24
25
26 -def usage(obj, selfname='self'):
27 import inspect
28 str(obj)
29
30 if not isinstance(obj, (types.TypeType, types.ClassType)):
31 obj = obj.__class__
32
33 print '%s supports the following operations:' % obj.__name__
34 for (name, method) in sorted(pydoc.allmethods(obj).items()):
35 if name.startswith('_'): continue
36 if getattr(method, '__deprecated__', False): continue
37
38 args, varargs, varkw, defaults = inspect.getargspec(method)
39 if (args and args[0]=='self' and
40 (defaults is None or len(args)>len(defaults))):
41 args = args[1:]
42 name = '%s.%s' % (selfname, name)
43 argspec = inspect.formatargspec(
44 args, varargs, varkw, defaults)
45 print textwrap.fill('%s%s' % (name, argspec),
46 initial_indent=' - ',
47 subsequent_indent=' '*(len(name)+5))
48
49
50
51
52
54 """
55 @rtype: C{boolean}
56 @return: true if this function is run within idle. Tkinter
57 programs that are run in idle should never call C{Tk.mainloop}; so
58 this function should be used to gate all calls to C{Tk.mainloop}.
59
60 @warning: This function works by checking C{sys.stdin}. If the
61 user has modified C{sys.stdin}, then it may return incorrect
62 results.
63 """
64 import sys, types
65 return (type(sys.stdin) == types.InstanceType and \
66 sys.stdin.__class__.__name__ == 'PyShell')
67
68
69
70
71
72 -def pr(data, start=0, end=None):
73 """
74 Pretty print a sequence of data items
75
76 @param data: the data stream to print
77 @type data: C{sequence} or C{iterator}
78 @param start: the start position
79 @type start: C{int}
80 @param end: the end position
81 @type end: C{int}
82 """
83 pprint(list(islice(data, start, end)))
84
86 """
87 Pretty print a string, breaking lines on whitespace
88
89 @param s: the string to print, consisting of words and spaces
90 @type s: C{string}
91 @param width: the display width
92 @type width: C{int}
93 """
94 print '\n'.join(textwrap.wrap(s, width=width))
95
96 -def tokenwrap(tokens, separator=" ", width=70):
97 """
98 Pretty print a list of text tokens, breaking lines on whitespace
99
100 @param tokens: the tokens to print
101 @type tokens: C{list}
102 @param separator: the string to use to separate tokens
103 @type separator: C{str}
104 @param width: the display width (default=70)
105 @type width: C{int}
106 """
107 return '\n'.join(textwrap.wrap(separator.join(tokens), width=width))
108
109
110
111
112
113
114 -class Index(defaultdict):
120
121
122
123
124
125
126 -def re_show(regexp, string, left="{", right="}"):
127 """
128 Search C{string} for substrings matching C{regexp} and wrap
129 the matches with braces. This is convenient for learning about
130 regular expressions.
131
132 @param regexp: The regular expression.
133 @type regexp: C{string}
134 @param string: The string being matched.
135 @type string: C{string}
136 @param left: The left delimiter (printed before the matched substring)
137 @type left: C{string}
138 @param right: The right delimiter (printed after the matched substring)
139 @type right: C{string}
140 @rtype: C{string}
141 @return: A string with markers surrounding the matched substrings.
142 """
143 print re.compile(regexp, re.M).sub(left + r"\g<0>" + right, string.rstrip())
144
145
146
147
148
149
150
152 if hasattr(f, 'read'):
153 return f.read()
154 elif isinstance(f, basestring):
155 return open(f).read()
156 else:
157 raise ValueError, "Must be called with a filename or file-like object"
158
159
160
161
162
164 """Traverse the nodes of a tree in breadth-first order.
165 (No need to check for cycles.)
166 The first argument should be the tree root;
167 children should be a function taking as argument a tree node
168 and returning an iterator of the node's children.
169 """
170 if queue == None:
171 queue = []
172 queue.append(tree)
173
174 while queue:
175 node = queue.pop(0)
176 yield node
177 if depth != 0:
178 try:
179 queue += children(node)
180 depth -= 1
181 except:
182 pass
183
184
185
186
187
188
189
190
192 """
193 Given a byte string, attempt to decode it.
194 Tries the standard 'UTF8' and 'latin-1' encodings,
195 Plus several gathered from locale information.
196
197 The calling program *must* first call::
198
199 locale.setlocale(locale.LC_ALL, '')
200
201 If successful it returns C{(decoded_unicode, successful_encoding)}.
202 If unsuccessful it raises a C{UnicodeError}.
203 """
204 successful_encoding = None
205
206 encodings = ['utf-8']
207
208
209 try:
210 encodings.append(locale.nl_langinfo(locale.CODESET))
211 except AttributeError:
212 pass
213 try:
214 encodings.append(locale.getlocale()[1])
215 except (AttributeError, IndexError):
216 pass
217 try:
218 encodings.append(locale.getdefaultlocale()[1])
219 except (AttributeError, IndexError):
220 pass
221
222
223 encodings.append('latin-1')
224 for enc in encodings:
225
226
227 if not enc:
228 continue
229 try:
230 decoded = unicode(data, enc)
231 successful_encoding = enc
232
233 except (UnicodeError, LookupError):
234 pass
235 else:
236 break
237 if not successful_encoding:
238 raise UnicodeError(
239 'Unable to decode input data. Tried the following encodings: %s.'
240 % ', '.join([repr(enc) for enc in encodings if enc]))
241 else:
242 return (decoded, successful_encoding)
243
244
245
246
247
248
259
260
261
262
263
264
265
267 """
268 Calculate the transitive closure of a directed graph,
269 optionally the reflexive transitive closure.
270
271 The algorithm is a slight modification of the "Marking Algorithm" of
272 Ioannidis & Ramakrishnan (1998) "Efficient Transitive Closure Algorithms".
273
274 @param graph: the initial graph, represented as a dictionary of sets
275 @type graph: C{dict} of C{set}s
276 @param reflexive: if set, also make the closure reflexive
277 @type reflexive: C{bool}
278 @return: the (reflexive) transitive closure of the graph
279 @rtype: C{dict} of C{set}s
280 """
281 if reflexive:
282 base_set = lambda k: set([k])
283 else:
284 base_set = lambda k: set()
285
286 agenda_graph = dict((k, v.copy()) for (k,v) in graph.iteritems())
287
288 closure_graph = dict((k, base_set(k)) for k in graph)
289 for i in graph:
290 agenda = agenda_graph[i]
291 closure = closure_graph[i]
292 while agenda:
293 j = agenda.pop()
294 closure.add(j)
295 closure |= closure_graph.setdefault(j, base_set(j))
296 agenda |= agenda_graph.get(j, base_set(j))
297 agenda -= closure
298 return closure_graph
299
300
302 """
303 Inverts a directed graph.
304
305 @param graph: the graph, represented as a dictionary of sets
306 @type graph: C{dict} of C{set}s
307 @return: the inverted graph
308 @rtype: C{dict} of C{set}s
309 """
310 inverted = {}
311 for key, values in graph.iteritems():
312 for value in values:
313 inverted.setdefault(value, set()).add(key)
314 return inverted
315
316
317
318
319
320
321
323 """
324 Remove HTML markup from the given string.
325
326 @param html: the HTML string to be cleaned
327 @type html: C{string}
328 @rtype: C{string}
329 """
330
331
332 cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
333
334
335 cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
336
337 cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
338
339 cleaned = re.sub(r" ", " ", cleaned)
340 cleaned = re.sub(r" ", " ", cleaned)
341 cleaned = re.sub(r" ", " ", cleaned)
342 return cleaned.strip()
343
345 from urllib import urlopen
346 html = urlopen(url).read()
347 return clean_html(html)
348
349
350
351
352
354 """Flatten a list.
355
356 >>> flatten(1, 2, ['b', 'a' , ['c', 'd']], 3)
357 [1, 2, 'a', 'b', 'c', 'd', 3]
358
359 @param *args: items and lists to be combined into a single list
360 @rtype: C{list}
361 """
362
363 x = []
364 for l in args:
365 if not isinstance(l, (list, tuple)): l = [l]
366 for item in l:
367 if isinstance(item, (list, tuple)):
368 x.extend(flatten(item))
369 else:
370 x.append(item)
371 return x
372
373
374
375
376
377
378
379 -def ngrams(sequence, n, pad_left=False, pad_right=False, pad_symbol=None):
380 """
381 A utility that produces a sequence of ngrams from a sequence of items.
382 For example:
383
384 >>> ngrams([1,2,3,4,5], 3)
385 [(1, 2, 3), (2, 3, 4), (3, 4, 5)]
386
387 Use ingram for an iterator version of this function. Set pad_left
388 or pad_right to true in order to get additional ngrams:
389
390 >>> ngrams([1,2,3,4,5], 2, pad_right=True)
391 [(1, 2), (2, 3), (3, 4), (4, 5), (5, None)]
392
393 @param sequence: the source data to be converted into ngrams
394 @type sequence: C{sequence} or C{iterator}
395 @param n: the degree of the ngrams
396 @type n: C{int}
397 @param pad_left: whether the ngrams should be left-padded
398 @type pad_left: C{boolean}
399 @param pad_right: whether the ngrams should be right-padded
400 @type pad_right: C{boolean}
401 @param pad_symbol: the symbol to use for padding (default is None)
402 @type pad_symbol: C{any}
403 @return: The ngrams
404 @rtype: C{list} of C{tuple}s
405 """
406
407 if pad_left:
408 sequence = chain((pad_symbol,) * (n-1), sequence)
409 if pad_right:
410 sequence = chain(sequence, (pad_symbol,) * (n-1))
411 sequence = list(sequence)
412
413 count = max(0, len(sequence) - n + 1)
414 return [tuple(sequence[i:i+n]) for i in range(count)]
415
417 """
418 A utility that produces a sequence of bigrams from a sequence of items.
419 For example:
420
421 >>> bigrams([1,2,3,4,5])
422 [(1, 2), (2, 3), (3, 4), (4, 5)]
423
424 Use ibigrams for an iterator version of this function.
425
426 @param sequence: the source data to be converted into bigrams
427 @type sequence: C{sequence} or C{iterator}
428 @return: The bigrams
429 @rtype: C{list} of C{tuple}s
430 """
431 return ngrams(sequence, 2, **kwargs)
432
434 """
435 A utility that produces a sequence of trigrams from a sequence of items.
436 For example:
437
438 >>> trigrams([1,2,3,4,5])
439 [(1, 2, 3), (2, 3, 4), (3, 4, 5)]
440
441 Use itrigrams for an iterator version of this function.
442
443 @param sequence: the source data to be converted into trigrams
444 @type sequence: C{sequence} or C{iterator}
445 @return: The trigrams
446 @rtype: C{list} of C{tuple}s
447 """
448 return ngrams(sequence, 3, **kwargs)
449
450 -def ingrams(sequence, n, pad_left=False, pad_right=False, pad_symbol=None):
451 """
452 A utility that produces an iterator over ngrams generated from a sequence of items.
453
454 For example:
455
456 >>> list(ingrams([1,2,3,4,5], 3))
457 [(1, 2, 3), (2, 3, 4), (3, 4, 5)]
458
459 Use ngrams for a list version of this function. Set pad_left
460 or pad_right to true in order to get additional ngrams:
461
462 >>> list(ingrams([1,2,3,4,5], 2, pad_right=True))
463 [(1, 2), (2, 3), (3, 4), (4, 5), (5, None)]
464
465 @param sequence: the source data to be converted into ngrams
466 @type sequence: C{sequence} or C{iterator}
467 @param n: the degree of the ngrams
468 @type n: C{int}
469 @param pad_left: whether the ngrams should be left-padded
470 @type pad_left: C{boolean}
471 @param pad_right: whether the ngrams should be right-padded
472 @type pad_right: C{boolean}
473 @param pad_symbol: the symbol to use for padding (default is None)
474 @type pad_symbol: C{any}
475 @return: The ngrams
476 @rtype: C{iterator} of C{tuple}s
477 """
478
479 sequence = iter(sequence)
480 if pad_left:
481 sequence = chain((pad_symbol,) * (n-1), sequence)
482 if pad_right:
483 sequence = chain(sequence, (pad_symbol,) * (n-1))
484
485 history = []
486 while n > 1:
487 history.append(sequence.next())
488 n -= 1
489 for item in sequence:
490 history.append(item)
491 yield tuple(history)
492 del history[0]
493
495 """
496 A utility that produces an iterator over bigrams generated from a sequence of items.
497
498 For example:
499
500 >>> list(ibigrams([1,2,3,4,5]))
501 [(1, 2), (2, 3), (3, 4), (4, 5)]
502
503 Use bigrams for a list version of this function.
504
505 @param sequence: the source data to be converted into bigrams
506 @type sequence: C{sequence} or C{iterator}
507 @return: The bigrams
508 @rtype: C{iterator} of C{tuple}s
509 """
510
511 for item in ingrams(sequence, 2, **kwargs):
512 yield item
513
515 """
516 A utility that produces an iterator over trigrams generated from a sequence of items.
517
518 For example:
519
520 >>> list(itrigrams([1,2,3,4,5])
521 [(1, 2, 3), (2, 3, 4), (3, 4, 5)]
522
523 Use trigrams for a list version of this function.
524
525 @param sequence: the source data to be converted into trigrams
526 @type sequence: C{sequence} or C{iterator}
527 @return: The trigrams
528 @rtype: C{iterator} of C{tuple}s
529 """
530
531 for item in ingrams(sequence, 3, **kwargs):
532 yield item
533
534
535
536
537
539 - def __init__(self, data=None, **kwargs):
546
550
556
559
561 if not self._default_factory and key not in self._keys:
562 raise KeyError()
563 else:
564 return self._default_factory()
565
570
574
579
582
583 - def keys(self, data=None, keys=None):
601
603 if self._keys:
604 key = self._keys.pop()
605 value = self[key]
606 del self[key]
607 return (key, value)
608 else:
609 raise KeyError()
610
615
621
624
625
626
627
628
630 """
631 An abstract base class for read-only sequences whose values are
632 computed as needed. Lazy sequences act like tuples -- they can be
633 indexed, sliced, and iterated over; but they may not be modified.
634
635 The most common application of lazy sequences in NLTK is for
636 I{corpus view} objects, which provide access to the contents of a
637 corpus without loading the entire corpus into memory, by loading
638 pieces of the corpus from disk as needed.
639
640 The result of modifying a mutable element of a lazy sequence is
641 undefined. In particular, the modifications made to the element
642 may or may not persist, depending on whether and when the lazy
643 sequence caches that element's value or reconstructs it from
644 scratch.
645
646 Subclasses are required to define two methods:
647
648 - L{__len__()}
649 - L{iterate_from()}.
650 """
652 """
653 Return the number of tokens in the corpus file underlying this
654 corpus view.
655 """
656 raise NotImplementedError('should be implemented by subclass')
657
659 """
660 Return an iterator that generates the tokens in the corpus
661 file underlying this corpus view, starting at the token number
662 C{start}. If C{start>=len(self)}, then this iterator will
663 generate no tokens.
664 """
665 raise NotImplementedError('should be implemented by subclass')
666
668 """
669 Return the C{i}th token in the corpus file underlying this
670 corpus view. Negative indices and spans are both supported.
671 """
672 if isinstance(i, slice):
673 start, stop = slice_bounds(self, i)
674 return LazySubsequence(self, start, stop)
675 else:
676
677 if i < 0: i += len(self)
678 if i < 0: raise IndexError('index out of range')
679
680 try:
681 return self.iterate_from(i).next()
682 except StopIteration:
683 raise IndexError('index out of range')
684
686 """Return an iterator that generates the tokens in the corpus
687 file underlying this corpus view."""
688 return self.iterate_from(0)
689
691 """Return the number of times this list contains C{value}."""
692 return sum(1 for elt in self if elt==value)
693
694 - def index(self, value, start=None, stop=None):
695 """Return the index of the first occurance of C{value} in this
696 list that is greater than or equal to C{start} and less than
697 C{stop}. Negative start & stop values are treated like negative
698 slice bounds -- i.e., they count from the end of the list."""
699 start, stop = slice_bounds(self, slice(start, stop))
700 for i, elt in enumerate(islice(self, start, stop)):
701 if elt == value: return i+start
702 raise ValueError('index(x): x not in list')
703
705 """Return true if this list contains C{value}."""
706 return bool(self.count(value))
707
709 """Return a list concatenating self with other."""
710 return LazyConcatenation([self, other])
711
713 """Return a list concatenating other with self."""
714 return LazyConcatenation([other, self])
715
717 """Return a list concatenating self with itself C{count} times."""
718 return LazyConcatenation([self] * count)
719
721 """Return a list concatenating self with itself C{count} times."""
722 return LazyConcatenation([self] * count)
723
724 _MAX_REPR_SIZE = 60
726 """
727 @return: A string representation for this corpus view that is
728 similar to a list's representation; but if it would be more
729 than 60 characters long, it is truncated.
730 """
731 pieces = []
732 length = 5
733 for elt in self:
734 pieces.append(repr(elt))
735 length += len(pieces[-1]) + 2
736 if length > self._MAX_REPR_SIZE and len(pieces) > 2:
737 return '[%s, ...]' % ', '.join(pieces[:-1])
738 else:
739 return '[%s]' % ', '.join(pieces)
740
742 """
743 Return a number indicating how C{self} relates to other.
744
745 - If C{other} is not a corpus view or a C{list}, return -1.
746 - Otherwise, return C{cmp(list(self), list(other))}.
747
748 Note: corpus views do not compare equal to tuples containing
749 equal elements. Otherwise, transitivity would be violated,
750 since tuples do not compare equal to lists.
751 """
752 if not isinstance(other, (AbstractLazySequence, list)): return -1
753 return cmp(list(self), list(other))
754
756 """
757 @raise ValueError: Corpus view objects are unhashable.
758 """
759 raise ValueError('%s objects are unhashable' %
760 self.__class__.__name__)
761
762
764 """
765 A subsequence produced by slicing a lazy sequence. This slice
766 keeps a reference to its source sequence, and generates its values
767 by looking them up in the source sequence.
768 """
769
770 MIN_SIZE = 100
771 """The minimum size for which lazy slices should be created. If
772 C{LazySubsequence()} is called with a subsequence that is
773 shorter than C{MIN_SIZE}, then a tuple will be returned
774 instead."""
775
776 - def __new__(cls, source, start, stop):
777 """
778 Construct a new slice from a given underlying sequence. The
779 C{start} and C{stop} indices should be absolute indices --
780 i.e., they should not be negative (for indexing from the back
781 of a list) or greater than the length of C{source}.
782 """
783
784 if stop-start < cls.MIN_SIZE:
785 return list(islice(source.iterate_from(start), stop-start))
786 else:
787 return object.__new__(cls)
788
789 - def __init__(self, source, start, stop):
793
795 return self._stop - self._start
796
800
801
803 """
804 A lazy sequence formed by concatenating a list of lists. This
805 underlying list of lists may itself be lazy. C{LazyConcatenation}
806 maintains an index that it uses to keep track of the relationship
807 between offsets in the concatenated lists and offsets in the
808 sublists.
809 """
811 self._list = list_of_lists
812 self._offsets = [0]
813
815 if len(self._offsets) <= len(self._list):
816 for tok in self.iterate_from(self._offsets[-1]): pass
817 return self._offsets[-1]
818
820 if start_index < self._offsets[-1]:
821 sublist_index = bisect.bisect_right(self._offsets, start_index)-1
822 else:
823 sublist_index = len(self._offsets)-1
824
825 index = self._offsets[sublist_index]
826
827
828 if isinstance(self._list, AbstractLazySequence):
829 sublist_iter = self._list.iterate_from(sublist_index)
830 else:
831 sublist_iter = islice(self._list, sublist_index, None)
832
833 for sublist in sublist_iter:
834 if sublist_index == (len(self._offsets)-1):
835 assert index+len(sublist) >= self._offsets[-1], (
836 'offests not monotonic increasing!')
837 self._offsets.append(index+len(sublist))
838 else:
839 assert self._offsets[sublist_index+1] == index+len(sublist), (
840 'inconsistent list value (num elts)')
841
842 for value in sublist[max(0, start_index-index):]:
843 yield value
844
845 index += len(sublist)
846 sublist_index += 1
847
848
849 -class LazyMap(AbstractLazySequence):
850 """
851 A lazy sequence whose elements are formed by applying a given
852 function to each element in one or more underlying lists. The
853 function is applied lazily -- i.e., when you read a value from the
854 list, C{LazyMap} will calculate that value by applying its
855 function to the underlying lists' value(s). C{LazyMap} is
856 essentially a lazy version of the Python primitive function
857 C{map}. In particular, the following two expressions are
858 equivalent:
859
860 >>> map(f, sequences...)
861 >>> list(LazyMap(f, sequences...))
862
863 Like the Python C{map} primitive, if the source lists do not have
864 equal size, then the value C{None} will be supplied for the
865 'missing' elements.
866
867 Lazy maps can be useful for conserving memory, in cases where
868 individual values take up a lot of space. This is especially true
869 if the underlying list's values are constructed lazily, as is the
870 case with many corpus readers.
871
872 A typical example of a use case for this class is performing
873 feature detection on the tokens in a corpus. Since featuresets
874 are encoded as dictionaries, which can take up a lot of memory,
875 using a C{LazyMap} can significantly reduce memory usage when
876 training and running classifiers.
877 """
878 - def __init__(self, function, *lists, **config):
879 """
880 @param function: The function that should be applied to
881 elements of C{lists}. It should take as many arguments
882 as there are C{lists}.
883 @param lists: The underlying lists.
884 @kwparam cache_size: Determines the size of the cache used
885 by this lazy map. (default=5)
886 """
887 if not lists:
888 raise TypeError('LazyMap requires at least two args')
889
890 self._lists = lists
891 self._func = function
892 self._cache_size = config.get('cache_size', 5)
893 if self._cache_size > 0:
894 self._cache = {}
895 else:
896 self._cache = None
897
898
899
900
901 self._all_lazy = sum(isinstance(lst, AbstractLazySequence)
902 for lst in lists) == len(lists)
903
905
906 if len(self._lists) == 1 and self._all_lazy:
907 for value in self._lists[0].iterate_from(index):
908 yield self._func(value)
909 return
910
911
912 elif len(self._lists) == 1:
913 while True:
914 try: yield self._func(self._lists[0][index])
915 except IndexError: return
916 index += 1
917
918
919 elif self._all_lazy:
920 iterators = [lst.iterate_from(index) for lst in self._lists]
921 while True:
922 elements = []
923 for iterator in iterators:
924 try: elements.append(iterator.next())
925 except: elements.append(None)
926 if elements == [None] * len(self._lists):
927 return
928 yield self._func(*elements)
929 index += 1
930
931
932 else:
933 while True:
934 try: elements = [lst[index] for lst in self._lists]
935 except IndexError:
936 elements = [None] * len(self._lists)
937 for i, lst in enumerate(self._lists):
938 try: elements[i] = lst[index]
939 except IndexError: pass
940 if elements == [None] * len(self._lists):
941 return
942 yield self._func(*elements)
943 index += 1
944
946 if isinstance(index, slice):
947 sliced_lists = [lst[index] for lst in self._lists]
948 return LazyMap(self._func, *sliced_lists)
949 else:
950
951 if index < 0: index += len(self)
952 if index < 0: raise IndexError('index out of range')
953
954 if self._cache is not None and index in self._cache:
955 return self._cache[index]
956
957 try: val = self.iterate_from(index).next()
958 except StopIteration:
959 raise IndexError('index out of range')
960
961 if self._cache is not None:
962 if len(self._cache) > self._cache_size:
963 self._cache.popitem()
964 self._cache[index] = val
965
966 return val
967
969 return max(len(lst) for lst in self._lists)
970
971
973 """
974 A lazy sequence whose elements are tuples, each containing the i-th
975 element from each of the argument sequences. The returned list is
976 truncated in length to the length of the shortest argument sequence. The
977 tuples are constructed lazily -- i.e., when you read a value from the
978 list, C{LazyZip} will calculate that value by forming a C{tuple} from
979 the i-th element of each of the argument sequences.
980
981 C{LazyZip} is essentially a lazy version of the Python primitive function
982 C{zip}. In particular, the following two expressions are equivalent:
983
984 >>> zip(sequences...)
985 >>> list(LazyZip(sequences...))
986
987 Lazy zips can be useful for conserving memory in cases where the argument
988 sequences are particularly long.
989
990 A typical example of a use case for this class is combining long sequences
991 of gold standard and predicted values in a classification or tagging task
992 in order to calculate accuracy. By constructing tuples lazily and
993 avoiding the creation of an additional long sequence, memory usage can be
994 significantly reduced.
995 """
997 """
998 @param lists: the underlying lists
999 @type lists: C{list} of C{list}
1000 """
1001 LazyMap.__init__(self, lambda *elts: elts, *lists)
1002
1009
1011 return min(len(lst) for lst in self._lists)
1012
1013
1015 """
1016 A lazy sequence whose elements are tuples, each ontaining a count (from
1017 zero) and a value yielded by underlying sequence. C{LazyEnumerate} is
1018 useful for obtaining an indexed list. The tuples are constructed lazily
1019 -- i.e., when you read a value from the list, C{LazyEnumerate} will
1020 calculate that value by forming a C{tuple} from the count of the i-th
1021 element and the i-th element of the underlying sequence.
1022
1023 C{LazyEnumerate} is essentially a lazy version of the Python primitive
1024 function C{enumerate}. In particular, the following two expressions are
1025 equivalent:
1026
1027 >>> enumerate(sequence)
1028 >>> list(LazyEnumerate(sequence))
1029
1030 Lazy enumerations can be useful for conserving memory in cases where the
1031 argument sequences are particularly long.
1032
1033 A typical example of a use case for this class is obtaining an indexed
1034 list for a long sequence of values. By constructing tuples lazily and
1035 avoiding the creation of an additional long sequence, memory usage can be
1036 significantly reduced.
1037 """
1038
1040 """
1041 @param lst: the underlying list
1042 @type lst: C{list}
1043 """
1044 LazyZip.__init__(self, xrange(len(lst)), lst)
1045
1046
1047
1048
1049
1050
1051
1053 """
1054 Searches through a sorted file using the binary search algorithm.
1055
1056 @type file: file
1057 @param file: the file to be searched through.
1058 @type key: {string}
1059 @param key: the identifier we are searching for.
1060 @return: The line from the file with first word key.
1061 """
1062
1063 key = key + ' '
1064 keylen = len(key)
1065 start = 0
1066 currentDepth = 0
1067
1068 if hasattr(file, 'name'):
1069 end = os.stat(file.name).st_size - 1
1070 else:
1071 file.seek(0, 2)
1072 end = file.tell() - 1
1073 file.seek(0)
1074
1075 while start < end:
1076 lastState = start, end
1077 middle = (start + end) / 2
1078
1079 if cache.get(middle):
1080 offset, line = cache[middle]
1081
1082 else:
1083 line = ""
1084 while True:
1085 file.seek(max(0, middle - 1))
1086 if middle > 0:
1087 file.readline()
1088 offset = file.tell()
1089 line = file.readline()
1090 if line != "": break
1091
1092 middle = (start + middle)/2
1093 if middle == end -1:
1094 return None
1095 if currentDepth < cacheDepth:
1096 cache[middle] = (offset, line)
1097
1098 if offset > end:
1099 assert end != middle - 1, "infinite loop"
1100 end = middle - 1
1101 elif line[:keylen] == key:
1102 return line
1103 elif line > key:
1104 assert end != middle - 1, "infinite loop"
1105 end = middle - 1
1106 elif line < key:
1107 start = offset + len(line) - 1
1108
1109 currentDepth += 1
1110 thisState = start, end
1111
1112 if lastState == thisState:
1113
1114
1115 return None
1116
1117 return None
1118
1119
1120
1121
1122
1123 -def set_proxy(proxy, (user, password)=(None, '')):
1124 """
1125 Set the HTTP proxy for Python to download through.
1126
1127 If C{proxy} is None then tries to set proxy from enviroment or system
1128 settings.
1129
1130 @param proxy: The HTTP proxy server to use. For example:
1131 'http://proxy.example.com:3128/'
1132 @param user: The username to authenticate with. Use C{None} to disable
1133 authentication.
1134 @param password: The password to authenticate with.
1135 """
1136 import urllib
1137 import urllib2
1138
1139 if proxy is None:
1140
1141 try:
1142 proxy = urllib.getproxies()['http']
1143 except KeyError:
1144 raise ValueError('Could not detect default proxy settings')
1145
1146
1147 proxy_handler = urllib2.ProxyHandler({'http': proxy})
1148 opener = urllib2.build_opener(proxy_handler)
1149
1150 if user is not None:
1151
1152 password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
1153 password_manager.add_password(realm=None, uri=proxy, user=user,
1154 passwd=password)
1155 opener.add_handler(urllib2.ProxyBasicAuthHandler(password_manager))
1156 opener.add_handler(urllib2.ProxyDigestAuthHandler(password_manager))
1157
1158
1159 urllib2.install_opener(opener)
1160