1
2
3
4
5
6
7
8
9
10 """
11 Class for representing hierarchical language structures, such as
12 syntax trees and morphological trees.
13 """
14
15
16
17 import re
18 import string
19
20 from grammar import Production, Nonterminal
21 from probability import ProbabilisticMixIn
22 from util import slice_bounds
23
24
25
26
27
28 -class Tree(list):
29 """
30 A hierarchical structure.
31
32 Each C{Tree} represents a single hierarchical grouping of
33 leaves and subtrees. For example, each constituent in a syntax
34 tree is represented by a single C{Tree}.
35
36 A tree's children are encoded as a C{list} of leaves and subtrees,
37 where a X{leaf} is a basic (non-tree) value; and a X{subtree} is a
38 nested C{Tree}.
39
40 Any other properties that a C{Tree} defines are known as
41 X{node properties}, and are used to add information about
42 individual hierarchical groupings. For example, syntax trees use a
43 NODE property to label syntactic constituents with phrase tags,
44 such as \"NP\" and\"VP\".
45
46 Several C{Tree} methods use X{tree positions} to specify
47 children or descendants of a tree. Tree positions are defined as
48 follows:
49
50 - The tree position M{i} specifies a C{Tree}'s M{i}th child.
51 - The tree position C{()} specifies the C{Tree} itself.
52 - If C{M{p}} is the tree position of descendant M{d}, then
53 C{M{p}+(M{i})} specifies the C{M{i}}th child of M{d}.
54
55 I.e., every tree position is either a single index C{M{i}},
56 specifying C{self[M{i}]}; or a sequence C{(M{i1}, M{i2}, ...,
57 M{iN})}, specifying
58 C{self[M{i1}][M{i2}]...[M{iN}]}.
59 """
60 - def __new__(cls, node_or_str=None, children=None):
61 if node_or_str is None:
62 return list.__new__(cls)
63 if children is None:
64 if not isinstance(node_or_str, basestring):
65 raise TypeError("%s: Expected a node value and child list "
66 "or a single string" % cls.__name__)
67 return cls.parse(node_or_str)
68 else:
69 if (isinstance(children, basestring) or
70 not hasattr(children, '__iter__')):
71 raise TypeError("%s() argument 2 should be a list, not a "
72 "string" % cls.__name__)
73 return list.__new__(cls, node_or_str, children)
74
75 - def __init__(self, node_or_str, children=None):
76 """
77 Construct a new tree. This constructor can be called in one
78 of two ways:
79
80 - C{Tree(node, children)} constructs a new tree with the
81 specified node value and list of children.
82
83 - C{Tree(s)} constructs a new tree by parsing the string
84 C{s}. It is equivalent to calling the class method
85 C{Tree.parse(s)}.
86 """
87
88
89
90
91
92
93 if children is None: return
94
95 list.__init__(self, children)
96 self.node = node_or_str
97
98
99
100
101
103 if not isinstance(other, Tree): return False
104 return self.node == other.node and list.__eq__(self, other)
106 return not (self == other)
108 if not isinstance(other, Tree): return False
109 return self.node < other.node or list.__lt__(self, other)
111 if not isinstance(other, Tree): return False
112 return self.node <= other.node or list.__le__(self, other)
117 if not isinstance(other, Tree): return False
118 return self.node >= other.node or list.__ge__(self, other)
119
120
121
122
123
125 raise TypeError('Tree does not support multiplication')
127 raise TypeError('Tree does not support multiplication')
129 raise TypeError('Tree does not support addition')
131 raise TypeError('Tree does not support addition')
132
133
134
135
136
147
149 if isinstance(index, (int, slice)):
150 return list.__setitem__(self, index, value)
151 else:
152 if len(index) == 0:
153 raise IndexError('The tree position () may not be '
154 'assigned to.')
155 elif len(index) == 1:
156 self[index[0]] = value
157 else:
158 self[index[0]][index[1:]] = value
159
161 if isinstance(index, (int, slice)):
162 return list.__delitem__(self, index)
163 else:
164 if len(index) == 0:
165 raise IndexError('The tree position () may not be deleted.')
166 elif len(index) == 1:
167 del self[index[0]]
168 else:
169 del self[index[0]][index[1:]]
170
171
172
173
174
176 """
177 @return: a list containing this tree's leaves.
178 The order reflects the order of the
179 leaves in the tree's hierarchical structure.
180 @rtype: C{list}
181 """
182 leaves = []
183 for child in self:
184 if isinstance(child, Tree):
185 leaves.extend(child.leaves())
186 else:
187 leaves.append(child)
188 return leaves
189
191 """
192 @return: a tree consisting of this tree's root connected directly to
193 its leaves, omitting all intervening non-terminal nodes.
194 @rtype: C{Tree}
195 """
196 return Tree(self.node, self.leaves())
197
199 """
200 @return: The height of this tree. The height of a tree
201 containing no children is 1; the height of a tree
202 containing only leaves is 2; and the height of any other
203 tree is one plus the maximum of its children's
204 heights.
205 @rtype: C{int}
206 """
207 max_child_height = 0
208 for child in self:
209 if isinstance(child, Tree):
210 max_child_height = max(max_child_height, child.height())
211 else:
212 max_child_height = max(max_child_height, 1)
213 return 1 + max_child_height
214
216 """
217 @param order: One of: C{preorder}, C{postorder}, C{bothorder},
218 C{leaves}.
219 """
220 positions = []
221 if order in ('preorder', 'bothorder'): positions.append( () )
222 for i, child in enumerate(self):
223 if isinstance(child, Tree):
224 childpos = child.treepositions(order)
225 positions.extend((i,)+p for p in childpos)
226 else:
227 positions.append( (i,) )
228 if order in ('postorder', 'bothorder'): positions.append( () )
229 return positions
230
232 """
233 Generate all the subtrees of this tree, optionally restricted
234 to trees matching the filter function.
235 @type filter: C{function}
236 @param filter: the function to filter all local trees
237 """
238 if not filter or filter(self):
239 yield self
240 for child in self:
241 if isinstance(child, Tree):
242 for subtree in child.subtrees(filter):
243 yield subtree
244
246 """
247 Generate the productions that correspond to the non-terminal nodes of the tree.
248 For each subtree of the form (P: C1 C2 ... Cn) this produces a production of the
249 form P -> C1 C2 ... Cn.
250
251 @rtype: list of C{Production}s
252 """
253
254 if not isinstance(self.node, basestring):
255 raise TypeError, 'Productions can only be generated from trees having node labels that are strings'
256
257 prods = [Production(Nonterminal(self.node), _child_names(self))]
258 for child in self:
259 if isinstance(child, Tree):
260 prods += child.productions()
261 return prods
262
264 """
265 @return: a list of tuples containing leaves and pre-terminals (part-of-speech tags).
266 The order reflects the order of the
267 leaves in the tree's hierarchical structure.
268 @rtype: C{list} of C{tuples}
269 """
270 pos = []
271 for child in self:
272 if isinstance(child, Tree):
273 pos.extend(child.pos())
274 else:
275 pos.append((child, self.node))
276 return pos
277
279 """
280 @return: The tree position of the C{index}-th leaf in this
281 tree. I.e., if C{tp=self.leaf_treeposition(i)}, then
282 C{self[tp]==self.leaves()[i]}.
283
284 @raise IndexError: If this tree contains fewer than C{index+1}
285 leaves, or if C{index<0}.
286 """
287 if index < 0: raise IndexError('index must be non-negative')
288
289 stack = [(self, ())]
290 while stack:
291 value, treepos = stack.pop()
292 if not isinstance(value, Tree):
293 if index == 0: return treepos
294 else: index -= 1
295 else:
296 for i in range(len(value)-1, -1, -1):
297 stack.append( (value[i], treepos+(i,)) )
298
299 raise IndexError('index must be less than or equal to len(self)')
300
302 """
303 @return: The tree position of the lowest descendant of this
304 tree that dominates C{self.leaves()[start:end]}.
305 @raise ValueError: if C{end <= start}
306 """
307 if end <= start:
308 raise ValueError('end must be greater than start')
309
310
311 start_treepos = self.leaf_treeposition(start)
312 end_treepos = self.leaf_treeposition(end-1)
313
314 for i in range(len(start_treepos)):
315 if i == len(end_treepos) or start_treepos[i] != end_treepos[i]:
316 return start_treepos[:i]
317 return start_treepos
318
319
320
321
322
349
371
372 - def collapse_unary(self, collapsePOS = False, collapseRoot = False, joinChar = "+"):
373 """
374 Collapse subtrees with a single child (ie. unary productions)
375 into a new non-terminal (Tree node) joined by 'joinChar'.
376 This is useful when working with algorithms that do not allow
377 unary productions, and completely removing the unary productions
378 would require loss of useful information. The Tree is modified
379 directly (since it is passed by reference) and no value is returned.
380
381 @param collapsePOS: 'False' (default) will not collapse the parent of leaf nodes (ie.
382 Part-of-Speech tags) since they are always unary productions
383 @type collapsePOS: C{boolean}
384 @param collapseRoot: 'False' (default) will not modify the root production
385 if it is unary. For the Penn WSJ treebank corpus, this corresponds
386 to the TOP -> productions.
387 @type collapseRoot: C{boolean}
388 @param joinChar: A string used to connect collapsed node values (default = "+")
389 @type joinChar: C{string}
390 """
391 from treetransforms import collapse_unary
392 collapse_unary(self, collapsePOS, collapseRoot, joinChar)
393
394
395
396
397
398
400 """
401 Convert a tree between different subtypes of Tree. C{cls} determines
402 which class will be used to encode the new tree.
403
404 @type val: L{Tree}
405 @param val: The tree that should be converted.
406 @return: The new C{Tree}.
407 """
408 if isinstance(val, Tree):
409 children = [cls.convert(child) for child in val]
410 return cls(val.node, children)
411 else:
412 return val
413 convert = classmethod(convert)
414
415 - def copy(self, deep=False):
416 if not deep: return self.__class__(self.node, self)
417 else: return self.__class__.convert(self)
418
420 - def freeze(self, leaf_freezer=None):
421 frozen_class = self._frozen_class()
422 if leaf_freezer is None:
423 newcopy = frozen_class.convert(self)
424 else:
425 newcopy = self.copy(deep=True)
426 for pos in newcopy.treepositions('leaves'):
427 newcopy[pos] = leaf_freezer(newcopy[pos])
428 newcopy = frozen_class.convert(newcopy)
429 hash(newcopy)
430 return newcopy
431
432
433
434
435
436 @classmethod
437 - def parse(cls, s, brackets='()', parse_node=None, parse_leaf=None,
438 node_pattern=None, leaf_pattern=None,
439 remove_empty_top_bracketing=False):
440 """
441 Parse a bracketed tree string and return the resulting tree.
442 Trees are represented as nested brackettings, such as::
443
444 (S (NP (NNP John)) (VP (V runs)))
445
446 @type s: C{str}
447 @param s: The string to parse
448
449 @type brackets: length-2 C{str}
450 @param brackets: The bracket characters used to mark the
451 beginning and end of trees and subtrees.
452
453 @type parse_node: C{function}
454 @type parse_leaf: C{function}
455 @param parse_node, parse_leaf: If specified, these functions
456 are applied to the substrings of C{s} corresponding to
457 nodes and leaves (respectively) to obtain the values for
458 those nodes and leaves. They should have the following
459 signature:
460
461 >>> parse_node(str) -> value
462
463 For example, these functions could be used to parse nodes
464 and leaves whose values should be some type other than
465 string (such as L{FeatStruct <nltk.featstruct.FeatStruct>}).
466 Note that by default, node strings and leaf strings are
467 delimited by whitespace and brackets; to override this
468 default, use the C{node_pattern} and C{leaf_pattern}
469 arguments.
470
471 @type node_pattern: C{str}
472 @type leaf_pattern: C{str}
473 @param node_pattern, leaf_pattern: Regular expression patterns
474 used to find node and leaf substrings in C{s}. By
475 default, both nodes patterns are defined to match any
476 sequence of non-whitespace non-bracket characters.
477
478 @type remove_empty_top_bracketing: C{bool}
479 @param remove_empty_top_bracketing: If the resulting tree has
480 an empty node label, and is length one, then return its
481 single child instead. This is useful for treebank trees,
482 which sometimes contain an extra level of bracketing.
483
484 @return: A tree corresponding to the string representation C{s}.
485 If this class method is called using a subclass of C{Tree},
486 then it will return a tree of that type.
487 @rtype: C{Tree}
488 """
489 if not isinstance(brackets, basestring) or len(brackets) != 2:
490 raise TypeError('brackets must be a length-2 string')
491 if re.search('\s', brackets):
492 raise TypeError('whitespace brackets not allowed')
493
494 open_b, close_b = brackets
495 open_pattern, close_pattern = (re.escape(open_b), re.escape(close_b))
496 if node_pattern is None:
497 node_pattern = '[^\s%s%s]+' % (open_pattern, close_pattern)
498 if leaf_pattern is None:
499 leaf_pattern = '[^\s%s%s]+' % (open_pattern, close_pattern)
500 token_re = re.compile('%s\s*(%s)?|%s|(%s)' % (
501 open_pattern, node_pattern, close_pattern, leaf_pattern))
502
503 stack = [(None, [])]
504 for match in token_re.finditer(s):
505 token = match.group()
506
507 if token[0] == open_b:
508 if len(stack) == 1 and len(stack[0][1]) > 0:
509 cls._parse_error(s, match, 'end-of-string')
510 node = token[1:].lstrip()
511 if parse_node is not None: node = parse_node(node)
512 stack.append((node, []))
513
514 elif token == close_b:
515 if len(stack) == 1:
516 if len(stack[0][1]) == 0:
517 cls._parse_error(s, match, open_b)
518 else:
519 cls._parse_error(s, match, 'end-of-string')
520 node, children = stack.pop()
521 stack[-1][1].append(cls(node, children))
522
523 else:
524 if len(stack) == 1:
525 cls._parse_error(s, match, open_b)
526 if parse_leaf is not None: token = parse_leaf(token)
527 stack[-1][1].append(token)
528
529
530 if len(stack) > 1:
531 cls._parse_error(s, 'end-of-string', close_b)
532 elif len(stack[0][1]) == 0:
533 cls._parse_error(s, 'end-of-string', open_b)
534 else:
535 assert stack[0][0] is None
536 assert len(stack[0][1]) == 1
537 tree = stack[0][1][0]
538
539
540
541 if remove_empty_top_bracketing and tree.node == '' and len(tree) == 1:
542 tree = tree[0]
543
544 return tree
545
546 @classmethod
548 """
549 Display a friendly error message when parsing a tree string fails.
550 @param s: The string we're parsing.
551 @param match: regexp match of the problem token.
552 @param expecting: what we expected to see instead.
553 """
554
555 if match == 'end-of-string':
556 pos, token = len(s), 'end-of-string'
557 else:
558 pos, token = match.start(), match.group()
559 msg = '%s.parse(): expected %r but got %r\n%sat index %d.' % (
560 cls.__name__, expecting, token, ' '*12, pos)
561
562 s = s.replace('\n', ' ').replace('\t', ' ')
563 offset = pos
564 if len(s) > pos+10:
565 s = s[:pos+10]+'...'
566 if pos > 10:
567 s = '...'+s[pos-10:]
568 offset = 13
569 msg += '\n%s"%s"\n%s^' % (' '*16, s, ' '*(17+offset))
570 raise ValueError(msg)
571
572
573
574
581
583 childstr = ", ".join(repr(c) for c in self)
584 return '%s(%r, [%s])' % (self.__class__.__name__, self.node, childstr)
585
588
589 - def pprint(self, margin=70, indent=0, nodesep='', parens='()', quotes=False):
590 """
591 @return: A pretty-printed string representation of this tree.
592 @rtype: C{string}
593 @param margin: The right margin at which to do line-wrapping.
594 @type margin: C{int}
595 @param indent: The indentation level at which printing
596 begins. This number is used to decide how far to indent
597 subsequent lines.
598 @type indent: C{int}
599 @param nodesep: A string that is used to separate the node
600 from the children. E.g., the default value C{':'} gives
601 trees like C{(S: (NP: I) (VP: (V: saw) (NP: it)))}.
602 """
603
604
605 s = self._pprint_flat(nodesep, parens, quotes)
606 if len(s)+indent < margin:
607 return s
608
609
610 if isinstance(self.node, basestring):
611 s = '%s%s%s' % (parens[0], self.node, nodesep)
612 else:
613 s = '%s%r%s' % (parens[0], self.node, nodesep)
614 for child in self:
615 if isinstance(child, Tree):
616 s += '\n'+' '*(indent+2)+child.pprint(margin, indent+2,
617 nodesep, parens, quotes)
618 elif isinstance(child, tuple):
619 s += '\n'+' '*(indent+2)+ "/".join(child)
620 elif isinstance(child, basestring) and not quotes:
621 s += '\n'+' '*(indent+2)+ '%s' % child
622 else:
623 s += '\n'+' '*(indent+2)+ '%r' % child
624 return s+parens[1]
625
627 r"""
628 Returns a representation of the tree compatible with the
629 LaTeX qtree package. This consists of the string C{\Tree}
630 followed by the parse tree represented in bracketed notation.
631
632 For example, the following result was generated from a parse tree of
633 the sentence C{The announcement astounded us}::
634
635 \Tree [.I'' [.N'' [.D The ] [.N' [.N announcement ] ] ]
636 [.I' [.V'' [.V' [.V astounded ] [.N'' [.N' [.N us ] ] ] ] ] ] ]
637
638 See U{http://www.ling.upenn.edu/advice/latex.html} for the LaTeX
639 style file for the qtree package.
640
641 @return: A latex qtree representation of this tree.
642 @rtype: C{string}
643 """
644 return r'\Tree ' + self.pprint(indent=6, nodesep='', parens=('[.', ' ]'))
645
647 childstrs = []
648 for child in self:
649 if isinstance(child, Tree):
650 childstrs.append(child._pprint_flat(nodesep, parens, quotes))
651 elif isinstance(child, tuple):
652 childstrs.append("/".join(child))
653 elif isinstance(child, basestring) and not quotes:
654 childstrs.append('%s' % child)
655 else:
656 childstrs.append('%r' % child)
657 if isinstance(self.node, basestring):
658 return '%s%s%s %s%s' % (parens[0], self.node, nodesep,
659 string.join(childstrs), parens[1])
660 else:
661 return '%s%r%s %s%s' % (parens[0], self.node, nodesep,
662 string.join(childstrs), parens[1])
663
665 - def __init__(self, node_or_str, children=None):
666 if children is None: return
667 super(ImmutableTree, self).__init__(node_or_str, children)
668
669
670 try:
671 self._hash = hash( (self.node, tuple(self)) )
672 except (TypeError, ValueError):
673 raise ValueError("ImmutableTree's node value and children "
674 "must be immutable")
676 raise ValueError, 'ImmutableTrees may not be modified'
678 raise ValueError, 'ImmutableTrees may not be modified'
680 raise ValueError, 'ImmutableTrees may not be modified'
682 raise ValueError, 'ImmutableTrees may not be modified'
684 raise ValueError, 'ImmutableTrees may not be modified'
686 raise ValueError, 'ImmutableTrees may not be modified'
688 raise ValueError, 'ImmutableTrees may not be modified'
690 raise ValueError, 'ImmutableTrees may not be modified'
691 - def pop(self, v=None):
692 raise ValueError, 'ImmutableTrees may not be modified'
694 raise ValueError, 'ImmutableTrees may not be modified'
696 raise ValueError, 'ImmutableTrees may not be modified'
698 raise ValueError, 'ImmutableTrees may not be modified'
701
703 """Set self._node. This will only succeed the first time the
704 node value is set, which should occur in Tree.__init__()."""
705 if hasattr(self, 'node'):
706 raise ValueError, 'ImmutableTrees may not be modified'
707 self._node = node
710 node = property(_get_node, _set_node)
711
717 """
718 An abstract base class for L{Tree}s that automatically maintain
719 pointers to their parents. These parent pointers are updated
720 whenever any change is made to a tree's structure. Two subclasses
721 are currently defined:
722
723 - L{ParentedTree} is used for tree structures where each subtree
724 has at most one parent. This class should be used in cases
725 where there is no"sharing" of subtrees.
726
727 - L{MultiParentedTree} is used for tree structures where a
728 subtree may have zero or more parents. This class should be
729 used in cases where subtrees may be shared.
730
731 Subclassing
732 ===========
733 The C{AbstractParentedTree} class redefines all operations that
734 modify a tree's structure to call two methods, which are used by
735 subclasses to update parent information:
736
737 - L{_setparent()} is called whenever a new child is added.
738 - L{_delparent()} is called whenever a child is removed.
739 """
740 - def __init__(self, node_or_str, children=None):
751
752
753
754
755
756 - def _setparent(self, child, index, dry_run=False):
757 """
758 Update C{child}'s parent pointer to point to self. This
759 method is only called if C{child}'s type is L{Tree}; i.e., it
760 is not called when adding a leaf to a tree. This method is
761 always called before the child is actually added to C{self}'s
762 child list.
763
764 @type child: L{Tree}
765 @type index: C{int}
766 @param index: The index of C{child} in C{self}.
767 @raise TypeError: If C{child} is a tree with an impropriate
768 type. Typically, if C{child} is a tree, then its type needs
769 to match C{self}'s type. This prevents mixing of
770 different tree types (single-parented, multi-parented, and
771 non-parented).
772 @param dry_run: If true, the don't actually set the child's
773 parent pointer; just check for any error conditions, and
774 raise an exception if one is found.
775 """
776 raise AssertionError('Abstract base class')
777
779 """
780 Update C{child}'s parent pointer to not point to self. This
781 method is only called if C{child}'s type is L{Tree}; i.e., it
782 is not called when removing a leaf from a tree. This method
783 is always called before the child is actually removed from
784 C{self}'s child list.
785
786 @type child: L{Tree}
787 @type index: C{int}
788 @param index: The index of C{child} in C{self}.
789 """
790 raise AssertionError('Abstract base class')
791
792
793
794
795
796
797
799
800 if isinstance(index, slice):
801 start, stop = slice_bounds(self, index)
802
803 for i in xrange(start, stop):
804 if isinstance(self[i], Tree):
805 self._delparent(self[i], i)
806
807 super(AbstractParentedTree, self).__delitem__(index)
808
809
810 elif isinstance(index, int):
811 if index < 0: index += len(self)
812 if index < 0: raise IndexError('index out of range')
813
814 if isinstance(self[index], Tree):
815 self._delparent(self[index], index)
816
817 super(AbstractParentedTree, self).__delitem__(index)
818
819
820 elif len(index) == 0:
821 raise IndexError('The tree position () may not be deleted.')
822
823
824 elif len(index) == 1:
825 del self[index[0]]
826
827
828 else:
829 del self[index[0]][index[1:]]
830
832
833 if isinstance(index, slice):
834 start, stop = slice_bounds(self, index)
835
836 if not isinstance(value, (list, tuple)):
837 value = list(value)
838
839
840 for i, child in enumerate(value):
841 if isinstance(child, Tree):
842 self._setparent(child, start+i, dry_run=True)
843
844 for i in xrange(start, stop):
845 if isinstance(self[i], Tree):
846 self._delparent(self[i], i)
847
848
849
850 for i, child in enumerate(value):
851 if isinstance(child, Tree):
852 self._setparent(child, start+i)
853
854 super(AbstractParentedTree, self).__setitem__(index, value)
855
856
857 elif isinstance(index, int):
858 if index < 0: index += len(self)
859 if index < 0: raise IndexError('index out of range')
860
861 if value is self[index]:
862 return
863
864 if isinstance(value, Tree):
865 self._setparent(value, index)
866
867 if isinstance(self[index], Tree):
868 self._delparent(self[index], index)
869
870 super(AbstractParentedTree, self).__setitem__(index, value)
871
872
873 elif len(index) == 0:
874 raise IndexError('The tree position () may not be assigned to.')
875
876
877 elif len(index) == 1:
878 self[index[0]] = value
879
880
881 else:
882 self[index[0]][index[1:]] = value
883
888
894
895 - def insert(self, index, child):
905
906 - def pop(self, index=-1):
912
913
914
920
921
922
923
924
925
926
927 if hasattr(list, '__getslice__'):
934
936 """
937 A L{Tree} that automatically maintains parent pointers for
938 single-parented trees. The following read-only property values
939 are automatically updated whenever the structure of a parented
940 tree is modified: L{parent}, L{parent_index}, L{left_sibling},
941 L{right_sibling}, L{root}, L{treeposition}.
942
943 Each C{ParentedTree} may have at most one parent. In
944 particular, subtrees may not be shared. Any attempt to reuse a
945 single C{ParentedTree} as a child of more than one parent (or
946 as multiple children of the same parent) will cause a
947 C{ValueError} exception to be raised.
948
949 C{ParentedTrees} should never be used in the same tree as C{Trees}
950 or C{MultiParentedTrees}. Mixing tree implementations may result
951 in incorrect parent pointers and in C{TypeError} exceptions.
952 """
953 - def __init__(self, node_or_str, children=None):
954 if children is None: return
955
956 self._parent = None
957 """The parent of this Tree, or C{None} if it has no parent."""
958
959 super(ParentedTree, self).__init__(node_or_str, children)
960
962
963
964
965
966
968 if self._parent is None: return None
969 for i, child in enumerate(self._parent):
970 if child is self: return i
971 assert False, 'expected to find self in self._parent!'
972
978
984
989
991 if self._parent is None: return self
992 else: return self._parent._get_root()
993
994 parent = property(lambda self: self._parent, doc="""
995 The parent of this tree, or C{None} if it has no parent.""")
996
997 parent_index = property(_get_parent_index, doc="""
998 The index of this tree in its parent. I.e.,
999 C{ptree.parent[ptree.parent_index] is ptree}. Note that
1000 C{ptree.parent_index} is not necessarily equal to
1001 C{ptree.parent.index(ptree)}, since the C{index()} method
1002 returns the first child that is I{equal} to its argument.""")
1003
1004 left_sibling = property(_get_left_sibling, doc="""
1005 The left sibling of this tree, or C{None} if it has none.""")
1006
1007 right_sibling = property(_get_right_sibling, doc="""
1008 The right sibling of this tree, or C{None} if it has none.""")
1009
1010 root = property(_get_root, doc="""
1011 The root of this tree. I.e., the unique ancestor of this tree
1012 whose parent is C{None}. If C{ptree.parent} is C{None}, then
1013 C{ptree} is its own root.""")
1014
1015 treeposition = property(_get_treeposition, doc="""
1016 The tree position of this tree, relative to the root of the
1017 tree. I.e., C{ptree.root[ptree.treeposition] is ptree}.""")
1018 treepos = treeposition
1019
1020
1021
1022
1023
1032
1033 - def _setparent(self, child, index, dry_run=False):
1034
1035 if not isinstance(child, ParentedTree):
1036 raise TypeError('Can not insert a non-ParentedTree '+
1037 'into a ParentedTree')
1038
1039
1040 if child._parent is not None:
1041 raise ValueError('Can not insert a subtree that already '
1042 'has a parent.')
1043
1044
1045 if not dry_run:
1046 child._parent = self
1047
1049 """
1050 A L{Tree} that automatically maintains parent pointers for
1051 multi-parented trees. The following read-only property values are
1052 automatically updated whenever the structure of a multi-parented
1053 tree is modified: L{parents}, L{parent_indices}, L{left_siblings},
1054 L{right_siblings}, L{roots}, L{treepositions}.
1055
1056 Each C{MultiParentedTree} may have zero or more parents. In
1057 particular, subtrees may be shared. If a single
1058 C{MultiParentedTree} is used as multiple children of the same
1059 parent, then that parent will appear multiple times in its
1060 C{parents} property.
1061
1062 C{MultiParentedTrees} should never be used in the same tree as
1063 C{Trees} or C{ParentedTrees}. Mixing tree implementations may
1064 result in incorrect parent pointers and in C{TypeError} exceptions.
1065 """
1066 - def __init__(self, node_or_str, children=None):
1067 if children is None: return
1068
1069 self._parents = []
1070 """A list of this tree's parents. This list should not
1071 contain duplicates, even if a parent contains this tree
1072 multiple times."""
1073
1074 super(MultiParentedTree, self).__init__(node_or_str, children)
1075
1077
1078
1079
1080
1081
1087
1092
1097
1100
1108
1109 parents = property(lambda self: list(self._parents), doc="""
1110 The set of parents of this tree. If this tree has no parents,
1111 then C{parents} is the empty set. To check if a tree is used
1112 as multiple children of the same parent, use the
1113 L{parent_indices} property.
1114
1115 @type: C{list} of L{MultiParentedTree}""")
1116
1117 left_siblings = property(_get_left_siblings, doc="""
1118 A list of all left siblings of this tree, in any of its parent
1119 trees. A tree may be its own left sibling if it is used as
1120 multiple contiguous children of the same parent. A tree may
1121 appear multiple times in this list if it is the left sibling
1122 of this tree with respect to multiple parents.
1123
1124 @type: C{list} of L{MultiParentedTree}""")
1125
1126 right_siblings = property(_get_right_siblings, doc="""
1127 A list of all right siblings of this tree, in any of its parent
1128 trees. A tree may be its own right sibling if it is used as
1129 multiple contiguous children of the same parent. A tree may
1130 appear multiple times in this list if it is the right sibling
1131 of this tree with respect to multiple parents.
1132
1133 @type: C{list} of L{MultiParentedTree}""")
1134
1135 roots = property(_get_roots, doc="""
1136 The set of all roots of this tree. This set is formed by
1137 tracing all possible parent paths until trees with no parents
1138 are found.
1139
1140 @type: C{list} of L{MultiParentedTree}""")
1141
1143 """
1144 Return a list of the indices where this tree occurs as a child
1145 of C{parent}. If this child does not occur as a child of
1146 C{parent}, then the empty list is returned. The following is
1147 always true::
1148
1149 for parent_index in ptree.parent_indices(parent):
1150 parent[parent_index] is ptree
1151 """
1152 if parent not in self._parents: return []
1153 else: return [index for (index, child) in enumerate(parent)
1154 if child is self]
1155
1157 """
1158 Return a list of all tree positions that can be used to reach
1159 this multi-parented tree starting from C{root}. I.e., the
1160 following is always true::
1161
1162 for treepos in ptree.treepositions(root):
1163 root[treepos] is ptree
1164 """
1165 if self is root:
1166 return [()]
1167 else:
1168 return [treepos+(index,)
1169 for parent in self._parents
1170 for treepos in parent.treepositions(root)
1171 for (index, child) in enumerate(parent) if child is self]
1172
1173
1174
1175
1176
1177
1190
1191 - def _setparent(self, child, index, dry_run=False):
1192
1193 if not isinstance(child, MultiParentedTree):
1194 raise TypeError('Can not insert a non-MultiParentedTree '+
1195 'into a MultiParentedTree')
1196
1197
1198 if not dry_run:
1199 for parent in child._parents:
1200 if parent is self: break
1201 else:
1202 child._parents.append(self)
1203
1205 - def __init__(self, node_or_str, children=None):
1208
1210 - def __init__(self, node_or_str, children=None):
1213
1218 - def __new__(cls, node_or_str, children=None, **prob_kwargs):
1221 - def __init__(self, node_or_str, children=None, **prob_kwargs):
1225
1226
1231 return '%s (p=%s)' % (self.pprint(margin=60), self.prob())
1237 if not isinstance(other, Tree): return False
1238 return Tree.__eq__(self, other) and self.prob()==other.prob()
1240 return not (self == other)
1241 - def copy(self, deep=False):
1242 if not deep: return self.__class__(self.node, self, prob=self.prob())
1243 else: return self.__class__.convert(self)
1253 convert = classmethod(convert)
1254
1256 - def __new__(cls, node_or_str, children=None, **prob_kwargs):
1259 - def __init__(self, node_or_str, children=None, **prob_kwargs):
1263
1264
1269 return '%s [%s]' % (self.pprint(margin=60), self.prob())
1275 if not isinstance(other, Tree): return False
1276 return Tree.__eq__(self, other) and self.prob()==other.prob()
1278 return not (self == other)
1279 - def copy(self, deep=False):
1280 if not deep: return self.__class__(self.node, self, prob=self.prob())
1281 else: return self.__class__.convert(self)
1291 convert = classmethod(convert)
1292
1302
1308 """
1309 Use Tree.parse(s, remove_top_empty_bracketing=True) instead.
1310 """
1311 raise NameError("Use Tree.parse(s, remove_top_empty_bracketing=True) instead.")
1312
1314 """
1315 Parse a Sinica Treebank string and return a tree. Trees are represented as nested brackettings,
1316 as shown in the following example (X represents a Chinese character):
1317 S(goal:NP(Head:Nep:XX)|theme:NP(Head:Nhaa:X)|quantity:Dab:X|Head:VL2:X)#0(PERIODCATEGORY)
1318
1319 @return: A tree corresponding to the string representation.
1320 @rtype: C{tree}
1321 @param s: The string to be converted
1322 @type s: C{string}
1323 """
1324 tokens = re.split(r'([()| ])', s)
1325 for i in range(len(tokens)):
1326 if tokens[i] == '(':
1327 tokens[i-1], tokens[i] = tokens[i], tokens[i-1]
1328 elif ':' in tokens[i]:
1329 fields = tokens[i].split(':')
1330 if len(fields) == 2:
1331 tokens[i] = fields[1]
1332 else:
1333 tokens[i] = "(" + fields[-2] + " " + fields[-1] + ")"
1334 elif tokens[i] == '|':
1335 tokens[i] = ''
1336
1337 treebank_string = string.join(tokens)
1338 return bracket_parse(treebank_string)
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349 -def demo():
1350 """
1351 A demonstration showing how C{Tree}s and C{Tree}s can be
1352 used. This demonstration creates a C{Tree}, and loads a
1353 C{Tree} from the L{treebank<nltk.corpus.treebank>} corpus,
1354 and shows the results of calling several of their methods.
1355 """
1356
1357 from nltk import tree
1358
1359
1360 s = '(S (NP (DT the) (NN cat)) (VP (VBD ate) (NP (DT a) (NN cookie))))'
1361 t = Tree(s)
1362 print "Convert bracketed string into tree:"
1363 print t
1364 print t.__repr__()
1365
1366 print "Display tree properties:"
1367 print t.node
1368 print t[0]
1369 print t[1]
1370 print t.height()
1371 print t.leaves()
1372 print t[1]
1373 print t[1,1]
1374 print t[1,1,0]
1375
1376
1377 the_cat = t[0]
1378 the_cat.insert(1, tree.bracket_parse('(JJ big)'))
1379 print "Tree modification:"
1380 print t
1381 t[1,1,1] = tree.bracket_parse('(NN cake)')
1382 print t
1383 print
1384
1385
1386 print "Collapse unary:"
1387 t.collapse_unary()
1388 print t
1389 print "Chomsky normal form:"
1390 t.chomsky_normal_form()
1391 print t
1392 print
1393
1394
1395 pt = tree.ProbabilisticTree('x', ['y', 'z'], prob=0.5)
1396 print "Probabilistic Tree:"
1397 print pt
1398 print
1399
1400
1401 t = tree.bracket_parse(t.pprint())
1402 print "Convert tree to bracketed string and back again:"
1403 print t
1404 print
1405
1406
1407 print "LaTeX output:"
1408 print t.pprint_latex_qtree()
1409 print
1410
1411
1412 print "Production output:"
1413 print t.productions()
1414 print
1415
1416
1417 t.node = ('test', 3)
1418 print t
1419
1420 if __name__ == '__main__':
1421 demo()
1422
1423 __all__ = ['ImmutableProbabilisticTree', 'ImmutableTree', 'ProbabilisticMixIn',
1424 'ProbabilisticTree', 'Tree', 'bracket_parse',
1425 'sinica_parse', 'ParentedTree', 'MultiParentedTree',
1426 'ImmutableParentedTree', 'ImmutableMultiParentedTree']
1427