1
2
3
4
5
6
7
8
9
10 import subprocess
11 import os
12 import os.path
13 import re
14 import warnings
15 import textwrap
16 import types
17 import sys
18 import stat
19
20 from nltk import __file__
21
22
23 try: from xml.etree import cElementTree as ElementTree
24 except ImportError: from nltk.etree import ElementTree
25
26
27
28
29
def convert_regexp_to_nongrouping(pattern):
    """
    Convert all grouping parentheses in the given regexp pattern to
    non-grouping parentheses, and return the result.  E.g.:

        >>> convert_regexp_to_nongrouping('ab(c(x+)(z*))?d')
        'ab(?:c(?:x+)(?:z*))?d'

    Named groups (C{(?P<name>...)}) are converted to plain non-grouping
    parentheses as well.  Backslash-escaped characters and other
    C{(?...)} extension groups are left untouched.

    @type pattern: C{str}
    @rtype: C{str}
    @raise ValueError: If C{pattern} contains a back-reference (a
        backslash followed by a digit, or C{(?P=...)}).
    """
    # Back-references refer to groups; once the groups are gone they
    # can no longer be honoured, so refuse them up front.
    for s in re.findall(r'\\.|\(\?P=', pattern):
        if s[1] in '0123456789' or s == '(?P=':
            raise ValueError('Regular expressions with back-references '
                             'are not supported: %r' % pattern)

    # Rewrites a single token found by the scanner below.  Only a bare
    # "(" or a complete named-group opener matches this anchored
    # pattern; escaped characters and "(?" fall through unchanged.
    def subfunc(m):
        return re.sub(r'^\((\?P<[^>]*>)?$', '(?:', m.group())

    # Scan the pattern token by token.  The alternatives are ordered so
    # that the longest, most specific form is recognised first.
    return re.sub(r'''(?x)
        \\.           |  # Backslashed character
        \(\?P<[^>]*>  |  # Named group
        \(\?          |  # Extension group
        \(               # Grouping parenthasis''', subfunc, pattern)
62
63
64
65
66
67
# Module-level java configuration, filled in by config_java().
_java_bin = None
_java_options = []

def config_java(bin=None, options=None, verbose=True):
    """
    Configure nltk's java interface, by letting nltk know where it can
    find the C{java} binary, and what extra options (if any) should be
    passed to java when it is run.

    @param bin: The full path to the C{java} binary.  If not specified,
        then nltk will search the system for a C{java} binary; and if
        one is not found, it will raise a C{LookupError} exception.
    @type bin: C{string}
    @param options: A list of options that should be passed to the
        C{java} binary when it is called.  A common value is
        C{['-Xmx512m']}, which tells the C{java} binary to increase
        the maximum heap size to 512 megabytes.  A single string is
        split on whitespace.  If no options are specified, then do not
        modify the options list.
    @type options: C{list} of C{string}
    @param verbose: Passed through to L{find_binary()}.
    """
    global _java_bin, _java_options

    # Locate the binary first: a failed lookup raises before any
    # option is touched.
    _java_bin = find_binary('java', bin,
                            env_vars=['JAVAHOME', 'JAVA_HOME'],
                            verbose=verbose)

    if options is None:
        return
    if isinstance(options, basestring):
        options = options.split()
    _java_options = list(options)
95
def java(cmd, classpath=None, stdin=None, stdout=None, stderr=None,
         blocking=True):
    """
    Execute the given java command, by opening a subprocess that calls
    C{java}.  If java has not yet been configured, it will be configured
    by calling L{config_java()} with no arguments.

    @param cmd: The java command that should be called, formatted as
        a list of strings.  Typically, the first string will be the name
        of the java class; and the remaining strings will be arguments
        for that java class.
    @type cmd: C{list} of C{string}

    @param classpath: A list of directories, JAR archives, and ZIP
        archives to search for class files, separated by the
        platform's path separator (C{os.pathsep}, i.e. C{':'} on
        posix).  The nltk jar is always appended to it.
    @type classpath: C{string}

    @param stdin, stdout, stderr: Specify the executed programs'
        standard input, standard output and standard error file
        handles, respectively.  Valid values are C{subprocess.PIPE},
        the string C{'pipe'} (treated as C{subprocess.PIPE}), an
        existing file descriptor (a positive integer), an existing
        file object, and C{None}.  C{subprocess.PIPE} indicates that a
        new pipe to the child should be created.  With C{None}, no
        redirection will occur; the child's file handles will be
        inherited from the parent.  Additionally, stderr can be
        C{subprocess.STDOUT}, which indicates that the stderr data
        from the applications should be captured into the same file
        handle as for stdout.

    @param blocking: If C{false}, then return immediately after
        spawning the subprocess.  In this case, the return value is
        the C{Popen} object, and not a C{(stdout, stderr)} tuple.

    @return: If C{blocking=True}, then return a tuple C{(stdout,
        stderr)}, containing the stdout and stderr outputs generated
        by the java command if the C{stdout} and C{stderr} parameters
        were set to C{subprocess.PIPE}; or C{None} otherwise.  If
        C{blocking=False}, then return a C{subprocess.Popen} object.

    @raise TypeError: If C{cmd} is a single string rather than a list.
    @raise OSError: If the java command returns a nonzero return code.
    """
    if stdin == 'pipe': stdin = subprocess.PIPE
    if stdout == 'pipe': stdout = subprocess.PIPE
    if stderr == 'pipe': stderr = subprocess.PIPE
    if isinstance(cmd, basestring):
        raise TypeError('cmd should be a list of strings')

    # Make sure we know where a java binary is.
    if _java_bin is None:
        config_java()

    # Always make the nltk jar reachable.  Join with os.pathsep rather
    # than a hard-coded ':' so the classpath is also valid on platforms
    # whose separator differs.
    if classpath is None:
        classpath = NLTK_JAR
    else:
        classpath += os.pathsep + NLTK_JAR

    # Full command line: binary, configured options, classpath, command.
    cmd = [_java_bin] + _java_options + ['-cp', classpath] + list(cmd)

    p = subprocess.Popen(cmd, stdin=stdin, stdout=stdout, stderr=stderr)
    if not blocking: return p
    (stdout, stderr) = p.communicate()

    # Report failures; stderr is None here unless it was piped.
    if p.returncode != 0:
        print(stderr)
        raise OSError('Java command failed!')

    return (stdout, stderr)
169
170
171
172
# Absolute path of the 'nltk.jar' file expected to sit in the same
# directory as the file named by __file__.
NLTK_JAR = os.path.abspath(
    os.path.join(os.path.dirname(__file__), 'nltk.jar'))
175
if 0:
    # Disabled example: the guard is always false, so this call is
    # never executed.  It shows how java() can be invoked with an
    # extra classpath entry.
    (a,b) = java(['weka.classifiers.bayes.NaiveBayes',
                  '-l', '/tmp/names.model', '-T', '/tmp/test.arff',
                  '-p', '0'],
                 classpath='/Users/edloper/Desktop/weka/weka.jar')
187
188
189
190
191
192
class ParseError(ValueError):
    """
    Exception raised by parse_* functions when they fail.

    @param position: The index in the input string where an error occurred.
    @param expected: What was expected when an error occurred.
    """
    def __init__(self, expected, position):
        # Both values are also handed to ValueError so that they show
        # up in C{args}.
        super(ParseError, self).__init__(expected, position)
        self.position = position
        self.expected = expected

    def __str__(self):
        details = (self.expected, self.position)
        return 'Expected %s at %s' % details
205
# Optional u/r prefixes followed by an opening quote mark (group 1).
_STRING_START_RE = re.compile(r"[uU]?[rR]?(\"\"\"|\'\'\'|\"|\')")
def parse_str(s, start_position):
    """
    If a Python string literal begins at the specified position in the
    given string, then return a tuple C{(val, end_position)}
    containing the value of the string literal and the position where
    it ends.  Otherwise, raise a L{ParseError}.

    @param s: The string to read the literal from.
    @param start_position: The index in C{s} where the literal starts.
    @raise ParseError: If there is no opening quote at
        C{start_position}, if no matching closing quote is found, or
        if the delimited text is not a valid string literal.
    """
    # Read the open quote, and any modifiers.
    m = _STRING_START_RE.match(s, start_position)
    if not m: raise ParseError('open quote', start_position)
    quotemark = m.group(1)

    # Find the close quote.  A backslash escapes the character that
    # follows it, so skip over both and keep searching.
    string_end_re = re.compile(r'\\|%s' % quotemark)
    position = m.end()
    while True:
        match = string_end_re.search(s, position)
        if not match: raise ParseError('close quote', position)
        if match.group(0) == '\\': position = match.end()+1
        else: break

    # Let Python decode the literal (escapes, raw/unicode prefixes).
    # NOTE(review): this is eval() on caller-supplied text.  The text
    # is delimited by the quote scan above, which should confine it to
    # a single string literal -- confirm before using on untrusted
    # input.
    try:
        return eval(s[start_position:match.end()]), match.end()
    except (ValueError, SyntaxError):
        # Fetch the exception this way so the code parses on both
        # Python 2 and Python 3.
        e = sys.exc_info()[1]
        raise ParseError('valid string (%s)' % e, start_position)
234
# An optional minus sign followed by one or more digits.
_PARSE_INT_RE = re.compile(r'-?\d+')
def parse_int(s, start_position):
    """
    If an integer begins at the specified position in the given
    string, then return a tuple C{(val, end_position)} containing the
    value of the integer and the position where it ends.  Otherwise,
    raise a L{ParseError}.

    @param s: The string to read the integer from.
    @param start_position: The index in C{s} where the integer starts.
    """
    found = _PARSE_INT_RE.match(s, start_position)
    if found:
        return int(found.group()), found.end()
    raise ParseError('integer', start_position)
246
# Group 1: digits before the decimal point; group 2: the decimal point
# (if any) together with the digits after it.
_PARSE_NUMBER_VALUE = re.compile(r'-?(\d*)([.]?\d*)?')
def parse_number(s, start_position):
    """
    If an integer or float begins at the specified position in the
    given string, then return a tuple C{(val, end_position)}
    containing the value of the number and the position where it ends.
    Otherwise, raise a L{ParseError}.

    A number containing a decimal point is returned as a C{float};
    any other number is returned as an C{int}.

    @param s: The string to read the number from.
    @param start_position: The index in C{s} where the number starts.
    """
    m = _PARSE_NUMBER_VALUE.match(s, start_position)
    whole, fraction = '', ''
    if m:
        whole, fraction = m.group(1), m.group(2) or ''
    # Every part of the pattern is optional, so demand at least one
    # digit: either before the point, or after it (a fraction of
    # length one is a lone '.').  Without this, '.' and '-.' would
    # reach float() and escape as a plain ValueError.
    if not (whole or len(fraction) > 1):
        raise ParseError('number', start_position)
    if fraction: return float(m.group()), m.end()
    else: return int(m.group()), m.end()
260
261
262
263
264
265
266
def overridden(method):
    """
    @return: True if C{method} overrides some method with the same
    name in a base class.  This is typically used when defining
    abstract base classes or interfaces, to allow subclasses to define
    either of two related methods:

        >>> class EaterI:
        ...     '''Subclass must define eat() or batch_eat().'''
        ...     def eat(self, food):
        ...         if overridden(self.batch_eat):
        ...             return self.batch_eat([food])[0]
        ...         else:
        ...             raise NotImplementedError()
        ...     def batch_eat(self, foods):
        ...         return [self.eat(food) for food in foods]

    @type method: instance method
    @raise TypeError: If C{method} is not an instance method.
    """
    is_method = isinstance(method, types.MethodType)
    if not (is_method and method.im_class is not None):
        raise TypeError('Expected an instance method.')

    # Count the classes along the resolution order that define an
    # attribute with this name; more than one means an override.
    name = method.__name__
    definitions = 0
    for cls in _mro(method.im_class):
        if name in cls.__dict__:
            definitions += 1
    return definitions > 1
295
def _mro(cls):
    """
    Return the I{method resolution order} for C{cls} -- i.e., a list
    containing C{cls} and all its base classes, in the order in which
    they would be checked by C{getattr}.  For new-style classes, this
    is just cls.__mro__.  For classic classes, this can be obtained by
    a depth-first left-to-right traversal of C{__bases__}.
    """
    if isinstance(cls, type):
        return cls.__mro__

    # Classic class: the class itself, then each base's order in turn.
    order = [cls]
    for parent in cls.__bases__:
        order += _mro(parent)
    return order
310
311
312
313
314
315
def _add_epytext_field(obj, field, message):
    """Add an epytext @field to a given object's docstring."""
    doc = obj.__doc__
    margin = ''

    if doc:
        # Separate the new field from the existing text by a blank
        # line, and reuse the smallest indentation found on the
        # docstring's lines after the first one.
        doc = doc.rstrip()+'\n\n'
        margins = re.findall(r'(?<=\n)[ ]+(?!\s)', doc.expandtabs())
        if margins:
            margin = min(margins)
    else:
        # No docstring (or an empty one): start from scratch.
        doc = ''

    field_text = textwrap.fill('@%s: %s' % (field, message),
                               initial_indent=margin,
                               subsequent_indent=margin+' ')
    obj.__doc__ = doc + field_text
332
def deprecated(message):
    """
    A decorator used to mark functions as deprecated.  This will cause
    a warning to be issued when the function is used.  Usage:

        >>> @deprecated('Use foo() instead')
        ... def bar(x):
        ...     print x/10

    @param message: Text appended to the deprecation warning and to
        the C{@deprecated} field added to the function's docstring.
    """
    def decorator(func):
        # The warning text is built once, at decoration time.
        warning_text = '\n' + textwrap.fill(
            "Function %s() has been deprecated. %s"
            % (func.__name__, message),
            initial_indent=' ', subsequent_indent=' ')

        def newFunc(*args, **kwargs):
            warnings.warn(warning_text, category=DeprecationWarning,
                          stacklevel=2)
            return func(*args, **kwargs)

        # Make the wrapper look like the wrapped function, and mark it.
        newFunc.__dict__.update(func.__dict__)
        newFunc.__doc__ = func.__doc__
        newFunc.__name__ = func.__name__
        newFunc.__deprecated__ = True

        # Record the deprecation in the wrapper's docstring as well.
        _add_epytext_field(newFunc, 'deprecated', message)
        return newFunc
    return decorator
360
class Deprecated(object):
    """
    A base class used to mark deprecated classes.  A typical usage is to
    alert users that the name of a class has changed:

        >>> class OldClassName(Deprecated, NewClassName):
        ...     "Use NewClassName instead."

    The docstring of the deprecated class will be used in the
    deprecation warning message.
    """
    def __new__(cls, *args, **kwargs):
        # Figure out which class is the deprecated one: the first class
        # in the resolution order that lists Deprecated as a direct
        # base.  Every subclass of Deprecated is a new-style class, so
        # __mro__ is always available here.
        dep_cls = None
        for base in cls.__mro__:
            if Deprecated in base.__bases__:
                dep_cls = base; break
        assert dep_cls, 'Unable to determine which base is deprecated.'

        # Construct an appropriate warning.  The parentheses matter:
        # without them only the empty literal would be stripped.
        doc = (dep_cls.__doc__ or '').strip()
        # If there's a @deprecated field, strip off the field marker.
        doc = re.sub(r'\A\s*@deprecated:', r'', doc)
        # Strip off any indentation.
        doc = re.sub(r'(?m)^\s*', '', doc)
        # Construct a 'name' string.
        name = 'Class %s' % dep_cls.__name__
        if cls != dep_cls:
            name += ' (base class for %s)' % cls.__name__
        # Put it all together.
        msg = '%s has been deprecated. %s' % (name, doc)
        # Wrap it.
        msg = '\n' + textwrap.fill(msg, initial_indent=' ',
                                   subsequent_indent=' ')
        warnings.warn(msg, category=DeprecationWarning, stacklevel=2)

        # object.__new__ rejects extra arguments when __new__ is
        # overridden, so do not forward them; __init__ still receives
        # them from the normal instantiation machinery.
        return object.__new__(cls)
398
399
400
401
402
class Counter:
    """
    A counter that auto-increments each time its value is read.
    """
    def __init__(self, initial_value=0):
        self._value = initial_value

    def get(self):
        """
        Increment the counter by one and return the new value.
        """
        self._value = self._value + 1
        return self._value
412
413
414
415
416
def _find_binary_under(root, binary_names):
    """
    Return the first existing file among C{root} itself and, for each
    name in C{binary_names}, C{root/name} and C{root/bin/name} (tried
    in that order); or C{None} if none of them is a file.
    """
    if os.path.isfile(root):
        return root
    for bin_name in binary_names:
        for candidate in (os.path.join(root, bin_name),
                          os.path.join(root, 'bin', bin_name)):
            if os.path.isfile(candidate):
                return candidate
    return None


def find_binary(name, path_to_bin=None, env_vars=(),
                searchpath=(), binary_names=None, url=None,
                verbose=True):
    """
    Search for the binary for a program that is used by nltk.

    The locations are tried in this order: the user-supplied
    C{path_to_bin}; the environment variables in C{env_vars}; the
    directories in C{searchpath}; and finally, on posix systems, the
    C{which} command.

    @param name: The name of the program
    @param path_to_bin: The user-supplied binary location, or None.
        It may name the binary itself, or a directory that contains
        it directly or in a C{bin} subdirectory.
    @param env_vars: A list of environment variable names to check
        (a single whitespace-separated string is also accepted).
    @param binary_names: A list of alternative binary names to check.
        Defaults to C{[name]}.
    @param searchpath: List of directories to search.
    @param url: Optional URL included in the error message.
    @param verbose: If true, print a message when the binary is found
        through an environment variable or through C{which}.
    @return: The path to the binary.
    @raise ValueError: If C{path_to_bin} is given but no binary is
        found there.
    @raise LookupError: If the binary can not be found anywhere.
    """
    # basestring only exists on Python 2.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    if binary_names is None: binary_names = [name]
    assert isinstance(name, string_types)
    assert not isinstance(binary_names, string_types)
    assert not isinstance(searchpath, string_types)
    if isinstance(env_vars, string_types):
        env_vars = env_vars.split()

    # If an explicit location was given, then check it, and return it
    # if it's present; otherwise, complain.
    if path_to_bin is not None:
        found = _find_binary_under(path_to_bin, binary_names)
        if found is None:
            raise ValueError('Could not find %s binary at %s' %
                             (name, path_to_bin))
        return found

    # Check environment variables.
    for env_var in env_vars:
        if env_var in os.environ:
            found = _find_binary_under(os.environ[env_var], binary_names)
            if found is not None:
                if verbose: print('[Found %s: %s]' % (name, found))
                return found

    # Check the path list.
    for directory in searchpath:
        for bin_name in binary_names:
            candidate = os.path.join(directory, bin_name)
            if os.path.isfile(candidate):
                return candidate

    # If we're on a posix system, then try using the 'which' command
    # to find the binary.  This is best-effort: a missing or failing
    # 'which' just means we move on.
    if os.name == 'posix':
        for bin_name in binary_names:
            try:
                # universal_newlines makes the output text rather than
                # bytes on Python 3.
                p = subprocess.Popen(['which', bin_name],
                                     stdout=subprocess.PIPE,
                                     universal_newlines=True)
                stdout, stderr = p.communicate()
            except (OSError, ValueError):
                continue
            path = stdout.strip()
            if path.endswith(bin_name) and os.path.exists(path):
                if verbose: print('[Found %s: %s]' % (name, path))
                return path

    # Nothing worked: build a descriptive error message.
    msg = ("NLTK was unable to find the %s executable! Use "
           "config_%s()" % (name, name))
    if env_vars: msg += ' or set the %s environment variable' % env_vars[0]
    msg = textwrap.fill(msg+'.', initial_indent=' ',
                        subsequent_indent=' ')
    msg += "\n\n >>> config_%s('/path/to/%s')" % (name, name)
    if searchpath:
        msg += '\n\n Searched in:'
        msg += ''.join('\n - %s' % d for d in searchpath)
    if url: msg += ('\n\n For more information, on %s, see:\n <%s>' %
                    (name, url))
    div = '='*75
    raise LookupError('\n\n%s\n%s\n%s' % (div, msg, div))
505
506
507
508
509
510
def find_jar(name, path_to_jar=None, env_vars=(),
             searchpath=(), url=None, verbose=True):
    """
    Search for a jar that is used by nltk.

    The locations are tried in this order: the user-supplied
    C{path_to_jar}; the entries of the C{CLASSPATH} environment
    variable; the other environment variables in C{env_vars}; and the
    directories in C{searchpath}.

    @param name: The name of the jar file
    @param path_to_jar: The user-supplied jar location, or None.
    @param env_vars: A list of environment variable names to check
                     in addition to the CLASSPATH variable which is
                     checked by default.  A single whitespace-separated
                     string is also accepted.
    @param searchpath: List of directories to search.
    @param url: Optional URL included in the error message.
    @param verbose: If true, print a message when the jar is found
        through an environment variable or the search path.
    @return: The path to the jar file.
    @raise ValueError: If C{path_to_jar} is given but is not a file.
    @raise LookupError: If the jar can not be found anywhere.
    """
    # basestring only exists on Python 2.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    assert isinstance(name, string_types)
    assert not isinstance(searchpath, string_types)
    if isinstance(env_vars, string_types):
        env_vars = env_vars.split()

    # Make sure we check the CLASSPATH first.
    env_vars = ['CLASSPATH'] + list(env_vars)

    # If an explicit location was given, then check it, and return it
    # if it's present; otherwise, complain.
    if path_to_jar is not None:
        if os.path.isfile(path_to_jar):
            return path_to_jar
        raise ValueError('Could not find %s jar file at %s' %
                         (name, path_to_jar))

    # Check environment variables.
    for env_var in env_vars:
        if env_var not in os.environ:
            continue
        if env_var == 'CLASSPATH':
            # CLASSPATH holds several entries joined by the platform's
            # path separator, which is not ':' everywhere.
            candidates = os.environ['CLASSPATH'].split(os.pathsep)
        else:
            candidates = [os.environ[env_var]]
        for candidate in candidates:
            if (os.path.isfile(candidate) and
                    os.path.basename(candidate) == name):
                if verbose: print('[Found %s: %s]' % (name, candidate))
                return candidate

    # Check the path list.
    for directory in searchpath:
        path_to_jar = os.path.join(directory, name)
        if os.path.isfile(path_to_jar):
            if verbose: print('[Found %s: %s]' % (name, path_to_jar))
            return path_to_jar

    # Nothing worked: build a descriptive error message.  env_vars
    # always starts with CLASSPATH at this point.
    msg = ("NLTK was unable to find %s!" % name)
    if env_vars: msg += ' Set the %s environment variable' % env_vars[0]
    msg = textwrap.fill(msg+'.', initial_indent=' ',
                        subsequent_indent=' ')
    if searchpath:
        msg += '\n\n Searched in:'
        msg += ''.join('\n - %s' % d for d in searchpath)
    if url: msg += ('\n\n For more information, on %s, see:\n <%s>' %
                    (name, url))
    div = '='*75
    raise LookupError('\n\n%s\n%s\n%s' % (div, msg, div))
574
575
576
577
578
def import_from_stdlib(module):
    """
    When python is run from within the nltk/ directory tree, the
    current directory is included at the beginning of the search path.
    Unfortunately, that means that modules within nltk can sometimes
    shadow standard library modules.  As an example, the stdlib
    'inspect' module will attempt to import the stdlib 'tokenize'
    module, but will end up importing NLTK's 'tokenize' module
    instead (causing the import to fail).

    This function imports C{module} with the current-directory
    entries (C{''} and C{'.'}) temporarily removed from C{sys.path}.

    @param module: The name of the module to import.
    @return: The module object returned by C{__import__}.
    """
    old_path = sys.path
    sys.path = [d for d in sys.path if d not in ('', '.')]
    try:
        return __import__(module)
    finally:
        # Restore the caller's path object even if the import fails.
        sys.path = old_path
594
595
596
597
598
def abstract(func):
    """
    A decorator used to mark methods as abstract.  I.e., methods that
    are marked by this decorator must be overridden by subclasses.  If
    an abstract method is called (either in the base class or in a
    subclass that does not override the base class method), it will
    raise C{NotImplementedError}.

    @param func: The function to mark as abstract.
    @return: A replacement function with the same name, docstring,
        argument list and default values as C{func}, whose body
        raises C{NotImplementedError}.
    """
    # Load inspect through import_from_stdlib, which imports with the
    # current-directory entries removed from sys.path.
    inspect = import_from_stdlib('inspect')

    # Read the argument specification of the original function.
    args, varargs, varkw, defaults = inspect.getargspec(func)

    # Define a replacement with the same argument list.  Defaults are
    # left out of the generated signature (the empty tuple) and copied
    # over below.
    msg = '%s is an abstract method.' % func.__name__
    signature = inspect.formatargspec(args, varargs, varkw, ())
    exec ('def newfunc%s: raise NotImplementedError(%r)' % (signature, msg))

    # Copy the default values from the original function.
    newfunc.func_defaults = func.func_defaults

    # Copy the name and docstring, mark the function as abstract, and
    # note that in its docstring.
    newfunc.__name__ = func.__name__
    newfunc.__doc__ = func.__doc__
    newfunc.__abstract__ = True
    _add_epytext_field(newfunc, "note", "This method is abstract.")

    # Return the function.
    return newfunc
631
632
633
634
635
637 """
638 A wrapper around ElementTree Element objects whose main purpose is
639 to provide nicer __repr__ and __str__ methods. In addition, any
640 of the wrapped Element's methods that return other Element objects
641 are overridden to wrap those values before returning them.
642
643 This makes Elements more convenient to work with in
644 interactive sessions and doctests, at the expense of some
645 efficiency.
646 """
647
648
650 """
651 Create and return a wrapper around a given Element object.
652 If C{etree} is an C{ElementWrapper}, then C{etree} is
653 returned as-is.
654 """
655 if isinstance(etree, ElementWrapper):
656 return etree
657 else:
658 return object.__new__(ElementWrapper, etree)
659
661 """
662 Initialize a new Element wrapper for C{etree}. If
663 C{etree} is a string, then it will be converted to an
664 Element object using C{ElementTree.fromstring()} first.
665 """
666 if isinstance(etree, basestring):
667 etree = ElementTree.fromstring(etree)
668 self.__dict__['_etree'] = etree
669
671 """
672 Return the Element object wrapped by this wrapper.
673 """
674 return self._etree
675
676
677
678
679
681 s = ElementTree.tostring(self._etree)
682 if len(s) > 60:
683 e = s.rfind('<')
684 if (len(s)-e) > 30: e = -20
685 s = '%s...%s' % (s[:30], s[e:])
686 return '<Element %r>' % s
687
689 """
690 @return: the result of applying C{ElementTree.tostring()} to
691 the wrapped Element object.
692 """
693 return ElementTree.tostring(self._etree).rstrip()
694
695
696
697
698
700 return getattr(self._etree, attrib)
701
703 return setattr(self._etree, attr, value)
704
706 return delattr(self._etree, attr)
707
709 self._etree[index] = element
710
712 del self._etree[index]
713
716
719
721 return len(self._etree)
722
723
724
725
726
729
732
735
739
742
743 - def find(self, path):
747
750
751
752
753
754
def slice_bounds(sequence, slice_obj, allow_step=False):
    """
    Given a slice, return the corresponding (start, stop) bounds,
    taking into account None indices and negative indices.  The
    following guarantees are made for the returned start and stop values:

      - 0 <= start <= len(sequence)
      - 0 <= stop <= len(sequence)
      - start <= stop

    @raise ValueError: If C{slice_obj.step} is neither C{None} nor 1
        and C{allow_step} is false.
    @param allow_step: If true, then the slice object may have a
        non-None step.  In that case the return value is a tuple
        (start, stop, step), where a C{None} step is reported as 1.
    """
    start, stop = (slice_obj.start, slice_obj.stop)

    # If allow_step is true, then include the step in our return
    # value tuple.
    if allow_step:
        # Slice attributes are read-only, so keep the normalised step
        # in a local variable instead of assigning to slice_obj.
        step = slice_obj.step
        if step is None: step = 1
        # Use a recursive call without allow_step to find the slice
        # bounds.  If step is negative, then the roles of start and
        # stop are swapped.
        if step < 0:
            start, stop = slice_bounds(sequence, slice(stop, start))
        else:
            start, stop = slice_bounds(sequence, slice(start, stop))
        return start, stop, step

    # Otherwise, make sure that no non-default step value is used.
    elif slice_obj.step not in (None, 1):
        raise ValueError('slices with steps are not supported by %s' %
                         sequence.__class__.__name__)

    # Supply default offsets.
    if start is None: start = 0
    if stop is None: stop = len(sequence)

    # Handle negative indices.
    if start < 0: start = max(0, len(sequence)+start)
    if stop < 0: stop = max(0, len(sequence)+stop)

    # Make sure stop doesn't go past the end of the sequence.  The
    # last index is probed directly, so len() is only computed when
    # that probe fails -- presumably to spare sequences whose length
    # is costly to compute.
    if stop > 0:
        try: sequence[stop-1]
        except IndexError: stop = len(sequence)

    # Make sure start isn't past stop.
    start = min(start, stop)

    # That's all folks!
    return start, stop
810
811
812
813
814
816
def is_writable(path):
    """
    Return C{True} if C{path} exists and its permission bits grant
    write access to the current user, and C{False} otherwise.

    On platforms where C{os.getuid} is not available, any existing
    path is reported as writable.

    @param path: The file or directory to check.
    @rtype: C{bool}
    """
    # Ensure that it exists.
    if not os.path.exists(path):
        return False

    # Without posix-style user ids, just assume the path is writable.
    if not hasattr(os, 'getuid'):
        return True

    statdata = os.stat(path)
    perm = stat.S_IMODE(statdata.st_mode)

    # Is it world-writable?
    if perm & stat.S_IWOTH:
        return True
    # Do we own it, with the owner-write bit set?
    if statdata.st_uid == os.getuid() and (perm & stat.S_IWUSR):
        return True
    # Is it in our group, with the group-write bit set?
    if statdata.st_gid == os.getgid() and (perm & stat.S_IWGRP):
        return True
    return False
840