Package nltk :: Module tree
[hide private]
[frames] | no frames]

Source Code for Module nltk.tree

   1  # Natural Language Toolkit: Text Trees 
   2  # 
   3  # Copyright (C) 2001-2011 NLTK Project 
   4  # Author: Edward Loper <edloper@gradient.cis.upenn.edu> 
   5  #         Steven Bird <sb@csse.unimelb.edu.au> 
   6  #         Nathan Bodenstab <bodenstab@cslu.ogi.edu> (tree transforms) 
   7  # URL: <http://www.nltk.org/> 
   8  # For license information, see LICENSE.TXT 
   9   
  10  """ 
  11  Class for representing hierarchical language structures, such as 
  12  syntax trees and morphological trees. 
  13  """ 
  14   
  15  # TODO: add LabelledTree (can be used for dependency trees) 
  16   
  17  import re 
  18  import string 
  19   
  20  from grammar import Production, Nonterminal 
  21  from probability import ProbabilisticMixIn 
  22  from util import slice_bounds 
23 24 ###################################################################### 25 ## Trees 26 ###################################################################### 27 28 -class Tree(list):
29 """ 30 A hierarchical structure. 31 32 Each C{Tree} represents a single hierarchical grouping of 33 leaves and subtrees. For example, each constituent in a syntax 34 tree is represented by a single C{Tree}. 35 36 A tree's children are encoded as a C{list} of leaves and subtrees, 37 where a X{leaf} is a basic (non-tree) value; and a X{subtree} is a 38 nested C{Tree}. 39 40 Any other properties that a C{Tree} defines are known as 41 X{node properties}, and are used to add information about 42 individual hierarchical groupings. For example, syntax trees use a 43 NODE property to label syntactic constituents with phrase tags, 44 such as \"NP\" and\"VP\". 45 46 Several C{Tree} methods use X{tree positions} to specify 47 children or descendants of a tree. Tree positions are defined as 48 follows: 49 50 - The tree position M{i} specifies a C{Tree}'s M{i}th child. 51 - The tree position C{()} specifies the C{Tree} itself. 52 - If C{M{p}} is the tree position of descendant M{d}, then 53 C{M{p}+(M{i})} specifies the C{M{i}}th child of M{d}. 54 55 I.e., every tree position is either a single index C{M{i}}, 56 specifying C{self[M{i}]}; or a sequence C{(M{i1}, M{i2}, ..., 57 M{iN})}, specifying 58 C{self[M{i1}][M{i2}]...[M{iN}]}. 59 """
60 - def __new__(cls, node_or_str=None, children=None):
61 if node_or_str is None: 62 return list.__new__(cls) # used by copy.deepcopy 63 if children is None: 64 if not isinstance(node_or_str, basestring): 65 raise TypeError("%s: Expected a node value and child list " 66 "or a single string" % cls.__name__) 67 return cls.parse(node_or_str) 68 else: 69 if (isinstance(children, basestring) or 70 not hasattr(children, '__iter__')): 71 raise TypeError("%s() argument 2 should be a list, not a " 72 "string" % cls.__name__) 73 return list.__new__(cls, node_or_str, children)
74
75 - def __init__(self, node_or_str, children=None):
76 """ 77 Construct a new tree. This constructor can be called in one 78 of two ways: 79 80 - C{Tree(node, children)} constructs a new tree with the 81 specified node value and list of children. 82 83 - C{Tree(s)} constructs a new tree by parsing the string 84 C{s}. It is equivalent to calling the class method 85 C{Tree.parse(s)}. 86 """ 87 # Because __new__ may delegate to Tree.parse(), the __init__ 88 # method may end up getting called more than once (once when 89 # constructing the return value for Tree.parse; and again when 90 # __new__ returns). We therefore check if `children` is None 91 # (which will cause __new__ to call Tree.parse()); if so, then 92 # __init__ has already been called once, so just return. 93 if children is None: return 94 95 list.__init__(self, children) 96 self.node = node_or_str
97 98 #//////////////////////////////////////////////////////////// 99 # Comparison operators 100 #//////////////////////////////////////////////////////////// 101
102 - def __eq__(self, other):
103 if not isinstance(other, Tree): return False 104 return self.node == other.node and list.__eq__(self, other)
105 - def __ne__(self, other):
106 return not (self == other)
107 - def __lt__(self, other):
108 if not isinstance(other, Tree): return False 109 return self.node < other.node or list.__lt__(self, other)
110 - def __le__(self, other):
111 if not isinstance(other, Tree): return False 112 return self.node <= other.node or list.__le__(self, other)
113 - def __gt__(self, other):
114 if not isinstance(other, Tree): return True 115 return self.node > other.node or list.__gt__(self, other)
116 - def __ge__(self, other):
117 if not isinstance(other, Tree): return False 118 return self.node >= other.node or list.__ge__(self, other)
119 120 #//////////////////////////////////////////////////////////// 121 # Disabled list operations 122 #//////////////////////////////////////////////////////////// 123
124 - def __mul__(self, v):
125 raise TypeError('Tree does not support multiplication')
126 - def __rmul__(self, v):
127 raise TypeError('Tree does not support multiplication')
128 - def __add__(self, v):
129 raise TypeError('Tree does not support addition')
130 - def __radd__(self, v):
131 raise TypeError('Tree does not support addition')
132 133 #//////////////////////////////////////////////////////////// 134 # Indexing (with support for tree positions) 135 #//////////////////////////////////////////////////////////// 136
137 - def __getitem__(self, index):
138 if isinstance(index, (int, slice)): 139 return list.__getitem__(self, index) 140 else: 141 if len(index) == 0: 142 return self 143 elif len(index) == 1: 144 return self[int(index[0])] 145 else: 146 return self[int(index[0])][index[1:]]
147
148 - def __setitem__(self, index, value):
149 if isinstance(index, (int, slice)): 150 return list.__setitem__(self, index, value) 151 else: 152 if len(index) == 0: 153 raise IndexError('The tree position () may not be ' 154 'assigned to.') 155 elif len(index) == 1: 156 self[index[0]] = value 157 else: 158 self[index[0]][index[1:]] = value
159
160 - def __delitem__(self, index):
161 if isinstance(index, (int, slice)): 162 return list.__delitem__(self, index) 163 else: 164 if len(index) == 0: 165 raise IndexError('The tree position () may not be deleted.') 166 elif len(index) == 1: 167 del self[index[0]] 168 else: 169 del self[index[0]][index[1:]]
170 171 #//////////////////////////////////////////////////////////// 172 # Basic tree operations 173 #//////////////////////////////////////////////////////////// 174
175 - def leaves(self):
176 """ 177 @return: a list containing this tree's leaves. 178 The order reflects the order of the 179 leaves in the tree's hierarchical structure. 180 @rtype: C{list} 181 """ 182 leaves = [] 183 for child in self: 184 if isinstance(child, Tree): 185 leaves.extend(child.leaves()) 186 else: 187 leaves.append(child) 188 return leaves
189
190 - def flatten(self):
191 """ 192 @return: a tree consisting of this tree's root connected directly to 193 its leaves, omitting all intervening non-terminal nodes. 194 @rtype: C{Tree} 195 """ 196 return Tree(self.node, self.leaves())
197
198 - def height(self):
199 """ 200 @return: The height of this tree. The height of a tree 201 containing no children is 1; the height of a tree 202 containing only leaves is 2; and the height of any other 203 tree is one plus the maximum of its children's 204 heights. 205 @rtype: C{int} 206 """ 207 max_child_height = 0 208 for child in self: 209 if isinstance(child, Tree): 210 max_child_height = max(max_child_height, child.height()) 211 else: 212 max_child_height = max(max_child_height, 1) 213 return 1 + max_child_height
214
215 - def treepositions(self, order='preorder'):
216 """ 217 @param order: One of: C{preorder}, C{postorder}, C{bothorder}, 218 C{leaves}. 219 """ 220 positions = [] 221 if order in ('preorder', 'bothorder'): positions.append( () ) 222 for i, child in enumerate(self): 223 if isinstance(child, Tree): 224 childpos = child.treepositions(order) 225 positions.extend((i,)+p for p in childpos) 226 else: 227 positions.append( (i,) ) 228 if order in ('postorder', 'bothorder'): positions.append( () ) 229 return positions
230
231 - def subtrees(self, filter=None):
232 """ 233 Generate all the subtrees of this tree, optionally restricted 234 to trees matching the filter function. 235 @type filter: C{function} 236 @param filter: the function to filter all local trees 237 """ 238 if not filter or filter(self): 239 yield self 240 for child in self: 241 if isinstance(child, Tree): 242 for subtree in child.subtrees(filter): 243 yield subtree
244
245 - def productions(self):
246 """ 247 Generate the productions that correspond to the non-terminal nodes of the tree. 248 For each subtree of the form (P: C1 C2 ... Cn) this produces a production of the 249 form P -> C1 C2 ... Cn. 250 251 @rtype: list of C{Production}s 252 """ 253 254 if not isinstance(self.node, basestring): 255 raise TypeError, 'Productions can only be generated from trees having node labels that are strings' 256 257 prods = [Production(Nonterminal(self.node), _child_names(self))] 258 for child in self: 259 if isinstance(child, Tree): 260 prods += child.productions() 261 return prods
262
263 - def pos(self):
264 """ 265 @return: a list of tuples containing leaves and pre-terminals (part-of-speech tags). 266 The order reflects the order of the 267 leaves in the tree's hierarchical structure. 268 @rtype: C{list} of C{tuples} 269 """ 270 pos = [] 271 for child in self: 272 if isinstance(child, Tree): 273 pos.extend(child.pos()) 274 else: 275 pos.append((child, self.node)) 276 return pos
277
278 - def leaf_treeposition(self, index):
279 """ 280 @return: The tree position of the C{index}-th leaf in this 281 tree. I.e., if C{tp=self.leaf_treeposition(i)}, then 282 C{self[tp]==self.leaves()[i]}. 283 284 @raise IndexError: If this tree contains fewer than C{index+1} 285 leaves, or if C{index<0}. 286 """ 287 if index < 0: raise IndexError('index must be non-negative') 288 289 stack = [(self, ())] 290 while stack: 291 value, treepos = stack.pop() 292 if not isinstance(value, Tree): 293 if index == 0: return treepos 294 else: index -= 1 295 else: 296 for i in range(len(value)-1, -1, -1): 297 stack.append( (value[i], treepos+(i,)) ) 298 299 raise IndexError('index must be less than or equal to len(self)')
300
301 - def treeposition_spanning_leaves(self, start, end):
302 """ 303 @return: The tree position of the lowest descendant of this 304 tree that dominates C{self.leaves()[start:end]}. 305 @raise ValueError: if C{end <= start} 306 """ 307 if end <= start: 308 raise ValueError('end must be greater than start') 309 # Find the tree positions of the start & end leaves, and 310 # take the longest common subsequence. 311 start_treepos = self.leaf_treeposition(start) 312 end_treepos = self.leaf_treeposition(end-1) 313 # Find the first index where they mismatch: 314 for i in range(len(start_treepos)): 315 if i == len(end_treepos) or start_treepos[i] != end_treepos[i]: 316 return start_treepos[:i] 317 return start_treepos
318 319 #//////////////////////////////////////////////////////////// 320 # Transforms 321 #//////////////////////////////////////////////////////////// 322
323 - def chomsky_normal_form(self, factor = "right", horzMarkov = None, vertMarkov = 0, childChar = "|", parentChar = "^"):
324 """ 325 This method can modify a tree in three ways: 326 327 1. Convert a tree into its Chomsky Normal Form (CNF) 328 equivalent -- Every subtree has either two non-terminals 329 or one terminal as its children. This process requires 330 the creation of more"artificial" non-terminal nodes. 331 2. Markov (vertical) smoothing of children in new artificial 332 nodes 333 3. Horizontal (parent) annotation of nodes 334 335 @param factor: Right or left factoring method (default = "right") 336 @type factor: C{string} = [left|right] 337 @param horzMarkov: Markov order for sibling smoothing in artificial nodes (None (default) = include all siblings) 338 @type horzMarkov: C{int} | None 339 @param vertMarkov: Markov order for parent smoothing (0 (default) = no vertical annotation) 340 @type vertMarkov: C{int} | None 341 @param childChar: A string used in construction of the artificial nodes, separating the head of the 342 original subtree from the child nodes that have yet to be expanded (default = "|") 343 @type childChar: C{string} 344 @param parentChar: A string used to separate the node representation from its vertical annotation 345 @type parentChar: C{string} 346 """ 347 from treetransforms import chomsky_normal_form 348 chomsky_normal_form(self, factor, horzMarkov, vertMarkov, childChar, parentChar)
349
350 - def un_chomsky_normal_form(self, expandUnary = True, childChar = "|", parentChar = "^", unaryChar = "+"):
351 """ 352 This method modifies the tree in three ways: 353 354 1. Transforms a tree in Chomsky Normal Form back to its 355 original structure (branching greater than two) 356 2. Removes any parent annotation (if it exists) 357 3. (optional) expands unary subtrees (if previously 358 collapsed with collapseUnary(...) ) 359 360 @param expandUnary: Flag to expand unary or not (default = True) 361 @type expandUnary: C{boolean} 362 @param childChar: A string separating the head node from its children in an artificial node (default = "|") 363 @type childChar: C{string} 364 @param parentChar: A sting separating the node label from its parent annotation (default = "^") 365 @type parentChar: C{string} 366 @param unaryChar: A string joining two non-terminals in a unary production (default = "+") 367 @type unaryChar: C{string} 368 """ 369 from treetransforms import un_chomsky_normal_form 370 un_chomsky_normal_form(self, expandUnary, childChar, parentChar, unaryChar)
371
372 - def collapse_unary(self, collapsePOS = False, collapseRoot = False, joinChar = "+"):
373 """ 374 Collapse subtrees with a single child (ie. unary productions) 375 into a new non-terminal (Tree node) joined by 'joinChar'. 376 This is useful when working with algorithms that do not allow 377 unary productions, and completely removing the unary productions 378 would require loss of useful information. The Tree is modified 379 directly (since it is passed by reference) and no value is returned. 380 381 @param collapsePOS: 'False' (default) will not collapse the parent of leaf nodes (ie. 382 Part-of-Speech tags) since they are always unary productions 383 @type collapsePOS: C{boolean} 384 @param collapseRoot: 'False' (default) will not modify the root production 385 if it is unary. For the Penn WSJ treebank corpus, this corresponds 386 to the TOP -> productions. 387 @type collapseRoot: C{boolean} 388 @param joinChar: A string used to connect collapsed node values (default = "+") 389 @type joinChar: C{string} 390 """ 391 from treetransforms import collapse_unary 392 collapse_unary(self, collapsePOS, collapseRoot, joinChar)
393 394 #//////////////////////////////////////////////////////////// 395 # Convert, copy 396 #//////////////////////////////////////////////////////////// 397 398 # [classmethod]
399 - def convert(cls, val):
400 """ 401 Convert a tree between different subtypes of Tree. C{cls} determines 402 which class will be used to encode the new tree. 403 404 @type val: L{Tree} 405 @param val: The tree that should be converted. 406 @return: The new C{Tree}. 407 """ 408 if isinstance(val, Tree): 409 children = [cls.convert(child) for child in val] 410 return cls(val.node, children) 411 else: 412 return val
413 convert = classmethod(convert) 414
415 - def copy(self, deep=False):
416 if not deep: return self.__class__(self.node, self) 417 else: return self.__class__.convert(self)
418
419 - def _frozen_class(self): return ImmutableTree
420 - def freeze(self, leaf_freezer=None):
421 frozen_class = self._frozen_class() 422 if leaf_freezer is None: 423 newcopy = frozen_class.convert(self) 424 else: 425 newcopy = self.copy(deep=True) 426 for pos in newcopy.treepositions('leaves'): 427 newcopy[pos] = leaf_freezer(newcopy[pos]) 428 newcopy = frozen_class.convert(newcopy) 429 hash(newcopy) # Make sure the leaves are hashable. 430 return newcopy
431 432 #//////////////////////////////////////////////////////////// 433 # Parsing 434 #//////////////////////////////////////////////////////////// 435 436 @classmethod
437 - def parse(cls, s, brackets='()', parse_node=None, parse_leaf=None, 438 node_pattern=None, leaf_pattern=None, 439 remove_empty_top_bracketing=False):
440 """ 441 Parse a bracketed tree string and return the resulting tree. 442 Trees are represented as nested brackettings, such as:: 443 444 (S (NP (NNP John)) (VP (V runs))) 445 446 @type s: C{str} 447 @param s: The string to parse 448 449 @type brackets: length-2 C{str} 450 @param brackets: The bracket characters used to mark the 451 beginning and end of trees and subtrees. 452 453 @type parse_node: C{function} 454 @type parse_leaf: C{function} 455 @param parse_node, parse_leaf: If specified, these functions 456 are applied to the substrings of C{s} corresponding to 457 nodes and leaves (respectively) to obtain the values for 458 those nodes and leaves. They should have the following 459 signature: 460 461 >>> parse_node(str) -> value 462 463 For example, these functions could be used to parse nodes 464 and leaves whose values should be some type other than 465 string (such as L{FeatStruct <nltk.featstruct.FeatStruct>}). 466 Note that by default, node strings and leaf strings are 467 delimited by whitespace and brackets; to override this 468 default, use the C{node_pattern} and C{leaf_pattern} 469 arguments. 470 471 @type node_pattern: C{str} 472 @type leaf_pattern: C{str} 473 @param node_pattern, leaf_pattern: Regular expression patterns 474 used to find node and leaf substrings in C{s}. By 475 default, both nodes patterns are defined to match any 476 sequence of non-whitespace non-bracket characters. 477 478 @type remove_empty_top_bracketing: C{bool} 479 @param remove_empty_top_bracketing: If the resulting tree has 480 an empty node label, and is length one, then return its 481 single child instead. This is useful for treebank trees, 482 which sometimes contain an extra level of bracketing. 483 484 @return: A tree corresponding to the string representation C{s}. 485 If this class method is called using a subclass of C{Tree}, 486 then it will return a tree of that type. 487 @rtype: C{Tree} 488 """ 489 if not isinstance(brackets, basestring) or len(brackets) != 2: 490 raise TypeError('brackets must be a length-2 string') 491 if re.search('\s', brackets): 492 raise TypeError('whitespace brackets not allowed') 493 # Construct a regexp that will tokenize the string. 494 open_b, close_b = brackets 495 open_pattern, close_pattern = (re.escape(open_b), re.escape(close_b)) 496 if node_pattern is None: 497 node_pattern = '[^\s%s%s]+' % (open_pattern, close_pattern) 498 if leaf_pattern is None: 499 leaf_pattern = '[^\s%s%s]+' % (open_pattern, close_pattern) 500 token_re = re.compile('%s\s*(%s)?|%s|(%s)' % ( 501 open_pattern, node_pattern, close_pattern, leaf_pattern)) 502 # Walk through each token, updating a stack of trees. 503 stack = [(None, [])] # list of (node, children) tuples 504 for match in token_re.finditer(s): 505 token = match.group() 506 # Beginning of a tree/subtree 507 if token[0] == open_b: 508 if len(stack) == 1 and len(stack[0][1]) > 0: 509 cls._parse_error(s, match, 'end-of-string') 510 node = token[1:].lstrip() 511 if parse_node is not None: node = parse_node(node) 512 stack.append((node, [])) 513 # End of a tree/subtree 514 elif token == close_b: 515 if len(stack) == 1: 516 if len(stack[0][1]) == 0: 517 cls._parse_error(s, match, open_b) 518 else: 519 cls._parse_error(s, match, 'end-of-string') 520 node, children = stack.pop() 521 stack[-1][1].append(cls(node, children)) 522 # Leaf node 523 else: 524 if len(stack) == 1: 525 cls._parse_error(s, match, open_b) 526 if parse_leaf is not None: token = parse_leaf(token) 527 stack[-1][1].append(token) 528 529 # check that we got exactly one complete tree. 530 if len(stack) > 1: 531 cls._parse_error(s, 'end-of-string', close_b) 532 elif len(stack[0][1]) == 0: 533 cls._parse_error(s, 'end-of-string', open_b) 534 else: 535 assert stack[0][0] is None 536 assert len(stack[0][1]) == 1 537 tree = stack[0][1][0] 538 539 # If the tree has an extra level with node='', then get rid of 540 # it. E.g.: "((S (NP ...) (VP ...)))" 541 if remove_empty_top_bracketing and tree.node == '' and len(tree) == 1: 542 tree = tree[0] 543 # return the tree. 544 return tree
545 546 @classmethod
547 - def _parse_error(cls, s, match, expecting):
548 """ 549 Display a friendly error message when parsing a tree string fails. 550 @param s: The string we're parsing. 551 @param match: regexp match of the problem token. 552 @param expecting: what we expected to see instead. 553 """ 554 # Construct a basic error message 555 if match == 'end-of-string': 556 pos, token = len(s), 'end-of-string' 557 else: 558 pos, token = match.start(), match.group() 559 msg = '%s.parse(): expected %r but got %r\n%sat index %d.' % ( 560 cls.__name__, expecting, token, ' '*12, pos) 561 # Add a display showing the error token itsels: 562 s = s.replace('\n', ' ').replace('\t', ' ') 563 offset = pos 564 if len(s) > pos+10: 565 s = s[:pos+10]+'...' 566 if pos > 10: 567 s = '...'+s[pos-10:] 568 offset = 13 569 msg += '\n%s"%s"\n%s^' % (' '*16, s, ' '*(17+offset)) 570 raise ValueError(msg)
571 #//////////////////////////////////////////////////////////// 572 # Visualization & String Representation 573 #//////////////////////////////////////////////////////////// 574
575 - def draw(self):
576 """ 577 Open a new window containing a graphical diagram of this tree. 578 """ 579 from nltk.draw.tree import draw_trees 580 draw_trees(self)
581
582 - def __repr__(self):
583 childstr = ", ".join(repr(c) for c in self) 584 return '%s(%r, [%s])' % (self.__class__.__name__, self.node, childstr)
585
586 - def __str__(self):
587 return self.pprint()
588
589 - def pprint(self, margin=70, indent=0, nodesep='', parens='()', quotes=False):
590 """ 591 @return: A pretty-printed string representation of this tree. 592 @rtype: C{string} 593 @param margin: The right margin at which to do line-wrapping. 594 @type margin: C{int} 595 @param indent: The indentation level at which printing 596 begins. This number is used to decide how far to indent 597 subsequent lines. 598 @type indent: C{int} 599 @param nodesep: A string that is used to separate the node 600 from the children. E.g., the default value C{':'} gives 601 trees like C{(S: (NP: I) (VP: (V: saw) (NP: it)))}. 602 """ 603 604 # Try writing it on one line. 605 s = self._pprint_flat(nodesep, parens, quotes) 606 if len(s)+indent < margin: 607 return s 608 609 # If it doesn't fit on one line, then write it on multi-lines. 610 if isinstance(self.node, basestring): 611 s = '%s%s%s' % (parens[0], self.node, nodesep) 612 else: 613 s = '%s%r%s' % (parens[0], self.node, nodesep) 614 for child in self: 615 if isinstance(child, Tree): 616 s += '\n'+' '*(indent+2)+child.pprint(margin, indent+2, 617 nodesep, parens, quotes) 618 elif isinstance(child, tuple): 619 s += '\n'+' '*(indent+2)+ "/".join(child) 620 elif isinstance(child, basestring) and not quotes: 621 s += '\n'+' '*(indent+2)+ '%s' % child 622 else: 623 s += '\n'+' '*(indent+2)+ '%r' % child 624 return s+parens[1]
625
626 - def pprint_latex_qtree(self):
627 r""" 628 Returns a representation of the tree compatible with the 629 LaTeX qtree package. This consists of the string C{\Tree} 630 followed by the parse tree represented in bracketed notation. 631 632 For example, the following result was generated from a parse tree of 633 the sentence C{The announcement astounded us}:: 634 635 \Tree [.I'' [.N'' [.D The ] [.N' [.N announcement ] ] ] 636 [.I' [.V'' [.V' [.V astounded ] [.N'' [.N' [.N us ] ] ] ] ] ] ] 637 638 See U{http://www.ling.upenn.edu/advice/latex.html} for the LaTeX 639 style file for the qtree package. 640 641 @return: A latex qtree representation of this tree. 642 @rtype: C{string} 643 """ 644 return r'\Tree ' + self.pprint(indent=6, nodesep='', parens=('[.', ' ]'))
645
646 - def _pprint_flat(self, nodesep, parens, quotes):
647 childstrs = [] 648 for child in self: 649 if isinstance(child, Tree): 650 childstrs.append(child._pprint_flat(nodesep, parens, quotes)) 651 elif isinstance(child, tuple): 652 childstrs.append("/".join(child)) 653 elif isinstance(child, basestring) and not quotes: 654 childstrs.append('%s' % child) 655 else: 656 childstrs.append('%r' % child) 657 if isinstance(self.node, basestring): 658 return '%s%s%s %s%s' % (parens[0], self.node, nodesep, 659 string.join(childstrs), parens[1]) 660 else: 661 return '%s%r%s %s%s' % (parens[0], self.node, nodesep, 662 string.join(childstrs), parens[1])
663
664 -class ImmutableTree(Tree):
665 - def __init__(self, node_or_str, children=None):
666 if children is None: return # see note in Tree.__init__() 667 super(ImmutableTree, self).__init__(node_or_str, children) 668 # Precompute our hash value. This ensures that we're really 669 # immutable. It also means we only have to calculate it once. 670 try: 671 self._hash = hash( (self.node, tuple(self)) ) 672 except (TypeError, ValueError): 673 raise ValueError("ImmutableTree's node value and children " 674 "must be immutable")
675 - def __setitem__(self):
676 raise ValueError, 'ImmutableTrees may not be modified'
677 - def __setslice__(self):
678 raise ValueError, 'ImmutableTrees may not be modified'
679 - def __delitem__(self):
680 raise ValueError, 'ImmutableTrees may not be modified'
681 - def __delslice__(self):
682 raise ValueError, 'ImmutableTrees may not be modified'
683 - def __iadd__(self):
684 raise ValueError, 'ImmutableTrees may not be modified'
685 - def __imul__(self):
686 raise ValueError, 'ImmutableTrees may not be modified'
687 - def append(self, v):
688 raise ValueError, 'ImmutableTrees may not be modified'
689 - def extend(self, v):
690 raise ValueError, 'ImmutableTrees may not be modified'
691 - def pop(self, v=None):
692 raise ValueError, 'ImmutableTrees may not be modified'
693 - def remove(self, v):
694 raise ValueError, 'ImmutableTrees may not be modified'
695 - def reverse(self):
696 raise ValueError, 'ImmutableTrees may not be modified'
697 - def sort(self):
698 raise ValueError, 'ImmutableTrees may not be modified'
699 - def __hash__(self):
700 return self._hash
701
702 - def _set_node(self, node):
703 """Set self._node. This will only succeed the first time the 704 node value is set, which should occur in Tree.__init__().""" 705 if hasattr(self, 'node'): 706 raise ValueError, 'ImmutableTrees may not be modified' 707 self._node = node
708 - def _get_node(self):
709 return self._node
710 node = property(_get_node, _set_node)
711
712 ###################################################################### 713 ## Parented trees 714 ###################################################################### 715 716 -class AbstractParentedTree(Tree):
717 """ 718 An abstract base class for L{Tree}s that automatically maintain 719 pointers to their parents. These parent pointers are updated 720 whenever any change is made to a tree's structure. Two subclasses 721 are currently defined: 722 723 - L{ParentedTree} is used for tree structures where each subtree 724 has at most one parent. This class should be used in cases 725 where there is no"sharing" of subtrees. 726 727 - L{MultiParentedTree} is used for tree structures where a 728 subtree may have zero or more parents. This class should be 729 used in cases where subtrees may be shared. 730 731 Subclassing 732 =========== 733 The C{AbstractParentedTree} class redefines all operations that 734 modify a tree's structure to call two methods, which are used by 735 subclasses to update parent information: 736 737 - L{_setparent()} is called whenever a new child is added. 738 - L{_delparent()} is called whenever a child is removed. 739 """
740 - def __init__(self, node_or_str, children=None):
741 if children is None: return # see note in Tree.__init__() 742 super(AbstractParentedTree, self).__init__(node_or_str, children) 743 # iterate over self, and *not* children, because children 744 # might be an iterator. 745 for i, child in enumerate(self): 746 if isinstance(child, Tree): 747 self._setparent(child, i, dry_run=True) 748 for i, child in enumerate(self): 749 if isinstance(child, Tree): 750 self._setparent(child, i)
751 752 #//////////////////////////////////////////////////////////// 753 # Parent management 754 #//////////////////////////////////////////////////////////// 755
756 - def _setparent(self, child, index, dry_run=False):
757 """ 758 Update C{child}'s parent pointer to point to self. This 759 method is only called if C{child}'s type is L{Tree}; i.e., it 760 is not called when adding a leaf to a tree. This method is 761 always called before the child is actually added to C{self}'s 762 child list. 763 764 @type child: L{Tree} 765 @type index: C{int} 766 @param index: The index of C{child} in C{self}. 767 @raise TypeError: If C{child} is a tree with an impropriate 768 type. Typically, if C{child} is a tree, then its type needs 769 to match C{self}'s type. This prevents mixing of 770 different tree types (single-parented, multi-parented, and 771 non-parented). 772 @param dry_run: If true, the don't actually set the child's 773 parent pointer; just check for any error conditions, and 774 raise an exception if one is found. 775 """ 776 raise AssertionError('Abstract base class')
777
778 - def _delparent(self, child, index):
779 """ 780 Update C{child}'s parent pointer to not point to self. This 781 method is only called if C{child}'s type is L{Tree}; i.e., it 782 is not called when removing a leaf from a tree. This method 783 is always called before the child is actually removed from 784 C{self}'s child list. 785 786 @type child: L{Tree} 787 @type index: C{int} 788 @param index: The index of C{child} in C{self}. 789 """ 790 raise AssertionError('Abstract base class')
791 792 #//////////////////////////////////////////////////////////// 793 # Methods that add/remove children 794 #//////////////////////////////////////////////////////////// 795 # Every method that adds or removes a child must make 796 # appropriate calls to _setparent() and _delparent(). 797
798 - def __delitem__(self, index):
799 # del ptree[start:stop] 800 if isinstance(index, slice): 801 start, stop = slice_bounds(self, index) 802 # Clear all the children pointers. 803 for i in xrange(start, stop): 804 if isinstance(self[i], Tree): 805 self._delparent(self[i], i) 806 # Delete the children from our child list. 807 super(AbstractParentedTree, self).__delitem__(index) 808 809 # del ptree[i] 810 elif isinstance(index, int): 811 if index < 0: index += len(self) 812 if index < 0: raise IndexError('index out of range') 813 # Clear the child's parent pointer. 814 if isinstance(self[index], Tree): 815 self._delparent(self[index], index) 816 # Remove the child from our child list. 817 super(AbstractParentedTree, self).__delitem__(index) 818 819 # del ptree[()] 820 elif len(index) == 0: 821 raise IndexError('The tree position () may not be deleted.') 822 823 # del ptree[(i,)] 824 elif len(index) == 1: 825 del self[index[0]] 826 827 # del ptree[i1, i2, i3] 828 else: 829 del self[index[0]][index[1:]]
830
831 - def __setitem__(self, index, value):
832 # ptree[start:stop] = value 833 if isinstance(index, slice): 834 start, stop = slice_bounds(self, index) 835 # make a copy of value, in case it's an iterator 836 if not isinstance(value, (list, tuple)): 837 value = list(value) 838 # Check for any error conditions, so we can avoid ending 839 # up in an inconsistent state if an error does occur. 840 for i, child in enumerate(value): 841 if isinstance(child, Tree): 842 self._setparent(child, start+i, dry_run=True) 843 # clear the child pointers of all parents we're removing 844 for i in xrange(start, stop): 845 if isinstance(self[i], Tree): 846 self._delparent(self[i], i) 847 # set the child pointers of the new children. We do this 848 # after clearing *all* child pointers, in case we're e.g. 849 # reversing the elements in a tree. 850 for i, child in enumerate(value): 851 if isinstance(child, Tree): 852 self._setparent(child, start+i) 853 # finally, update the content of the child list itself. 854 super(AbstractParentedTree, self).__setitem__(index, value) 855 856 # ptree[i] = value 857 elif isinstance(index, int): 858 if index < 0: index += len(self) 859 if index < 0: raise IndexError('index out of range') 860 # if the value is not changing, do nothing. 861 if value is self[index]: 862 return 863 # Set the new child's parent pointer. 864 if isinstance(value, Tree): 865 self._setparent(value, index) 866 # Remove the old child's parent pointer 867 if isinstance(self[index], Tree): 868 self._delparent(self[index], index) 869 # Update our child list. 870 super(AbstractParentedTree, self).__setitem__(index, value) 871 872 # ptree[()] = value 873 elif len(index) == 0: 874 raise IndexError('The tree position () may not be assigned to.') 875 876 # ptree[(i,)] = value 877 elif len(index) == 1: 878 self[index[0]] = value 879 880 # ptree[i1, i2, i3] = value 881 else: 882 self[index[0]][index[1:]] = value
883
884 - def append(self, child):
885 if isinstance(child, Tree): 886 self._setparent(child, len(self)) 887 super(AbstractParentedTree, self).append(child)
888
889 - def extend(self, children):
890 for child in children: 891 if isinstance(child, Tree): 892 self._setparent(child, len(self)) 893 super(AbstractParentedTree, self).append(child)
894
895 - def insert(self, index, child):
896 # Handle negative indexes. Note that if index < -len(self), 897 # we do *not* raise an IndexError, unlike __getitem__. This 898 # is done for consistency with list.__getitem__ and list.index. 899 if index < 0: index += len(self) 900 if index < 0: index = 0 901 # Set the child's parent, and update our child list. 902 if isinstance(child, Tree): 903 self._setparent(child, index) 904 super(AbstractParentedTree, self).insert(index, child)
905
906 - def pop(self, index=-1):
907 if index < 0: index += len(self) 908 if index < 0: raise IndexError('index out of range') 909 if isinstance(self[index], Tree): 910 self._delparent(self[index], index) 911 return super(AbstractParentedTree, self).pop(index)
912 913 # n.b.: like `list`, this is done by equality, not identity! 914 # To remove a specific child, use del ptree[i].
915 - def remove(self, child):
916 index = self.index(child) 917 if isinstance(self[index], Tree): 918 self._delparent(self[index], index) 919 super(AbstractParentedTree, self).remove(child)
920 921 # We need to implement __getslice__ and friends, even though 922 # they're deprecated, because otherwise list.__getslice__ will get 923 # called (since we're subclassing from list). Just delegate to 924 # __getitem__ etc., but use max(0, start) and max(0, stop) because 925 # because negative indices are already handled *before* 926 # __getslice__ is called; and we don't want to double-count them. 927 if hasattr(list, '__getslice__'):
928 - def __getslice__(self, start, stop):
929 return self.__getitem__(slice(max(0, start), max(0, stop)))
930 - def __delslice__(self, start, stop):
931 return self.__delitem__(slice(max(0, start), max(0, stop)))
932 - def __setslice__(self, start, stop, value):
933 return self.__setitem__(slice(max(0, start), max(0, stop)), value)
934
935 -class ParentedTree(AbstractParentedTree):
936 """ 937 A L{Tree} that automatically maintains parent pointers for 938 single-parented trees. The following read-only property values 939 are automatically updated whenever the structure of a parented 940 tree is modified: L{parent}, L{parent_index}, L{left_sibling}, 941 L{right_sibling}, L{root}, L{treeposition}. 942 943 Each C{ParentedTree} may have at most one parent. In 944 particular, subtrees may not be shared. Any attempt to reuse a 945 single C{ParentedTree} as a child of more than one parent (or 946 as multiple children of the same parent) will cause a 947 C{ValueError} exception to be raised. 948 949 C{ParentedTrees} should never be used in the same tree as C{Trees} 950 or C{MultiParentedTrees}. Mixing tree implementations may result 951 in incorrect parent pointers and in C{TypeError} exceptions. 952 """
953 - def __init__(self, node_or_str, children=None):
954 if children is None: return # see note in Tree.__init__() 955 956 self._parent = None 957 """The parent of this Tree, or C{None} if it has no parent.""" 958 959 super(ParentedTree, self).__init__(node_or_str, children)
960
961 - def _frozen_class(self): return ImmutableParentedTree
962 963 #///////////////////////////////////////////////////////////////// 964 # Properties 965 #///////////////////////////////////////////////////////////////// 966
967 - def _get_parent_index(self):
968 if self._parent is None: return None 969 for i, child in enumerate(self._parent): 970 if child is self: return i 971 assert False, 'expected to find self in self._parent!'
972
973 - def _get_left_sibling(self):
974 parent_index = self._get_parent_index() 975 if self._parent and parent_index > 0: 976 return self._parent[parent_index-1] 977 return None # no left sibling
978
979 - def _get_right_sibling(self):
980 parent_index = self._get_parent_index() 981 if self._parent and parent_index < (len(self._parent)-1): 982 return self._parent[parent_index+1] 983 return None # no right sibling
984
985 - def _get_treeposition(self):
986 if self._parent is None: return () 987 else: return (self._parent._get_treeposition() + 988 (self._get_parent_index(),))
989
990 - def _get_root(self):
991 if self._parent is None: return self 992 else: return self._parent._get_root()
993 994 parent = property(lambda self: self._parent, doc=""" 995 The parent of this tree, or C{None} if it has no parent.""") 996 997 parent_index = property(_get_parent_index, doc=""" 998 The index of this tree in its parent. I.e., 999 C{ptree.parent[ptree.parent_index] is ptree}. Note that 1000 C{ptree.parent_index} is not necessarily equal to 1001 C{ptree.parent.index(ptree)}, since the C{index()} method 1002 returns the first child that is I{equal} to its argument.""") 1003 1004 left_sibling = property(_get_left_sibling, doc=""" 1005 The left sibling of this tree, or C{None} if it has none.""") 1006 1007 right_sibling = property(_get_right_sibling, doc=""" 1008 The right sibling of this tree, or C{None} if it has none.""") 1009 1010 root = property(_get_root, doc=""" 1011 The root of this tree. I.e., the unique ancestor of this tree 1012 whose parent is C{None}. If C{ptree.parent} is C{None}, then 1013 C{ptree} is its own root.""") 1014 1015 treeposition = property(_get_treeposition, doc=""" 1016 The tree position of this tree, relative to the root of the 1017 tree. I.e., C{ptree.root[ptree.treeposition] is ptree}.""") 1018 treepos = treeposition # [xx] alias -- which name should we use? 1019 1020 #///////////////////////////////////////////////////////////////// 1021 # Parent Management 1022 #///////////////////////////////////////////////////////////////// 1023
1024 - def _delparent(self, child, index):
1025 # Sanity checks 1026 assert isinstance(child, ParentedTree) 1027 assert self[index] is child 1028 assert child._parent is self 1029 1030 # Delete child's parent pointer. 1031 child._parent = None
1032
1033 - def _setparent(self, child, index, dry_run=False):
1034 # If the child's type is incorrect, then complain. 1035 if not isinstance(child, ParentedTree): 1036 raise TypeError('Can not insert a non-ParentedTree '+ 1037 'into a ParentedTree') 1038 1039 # If child already has a parent, then complain. 1040 if child._parent is not None: 1041 raise ValueError('Can not insert a subtree that already ' 1042 'has a parent.') 1043 1044 # Set child's parent pointer & index. 1045 if not dry_run: 1046 child._parent = self
1047
1048 -class MultiParentedTree(AbstractParentedTree):
1049 """ 1050 A L{Tree} that automatically maintains parent pointers for 1051 multi-parented trees. The following read-only property values are 1052 automatically updated whenever the structure of a multi-parented 1053 tree is modified: L{parents}, L{parent_indices}, L{left_siblings}, 1054 L{right_siblings}, L{roots}, L{treepositions}. 1055 1056 Each C{MultiParentedTree} may have zero or more parents. In 1057 particular, subtrees may be shared. If a single 1058 C{MultiParentedTree} is used as multiple children of the same 1059 parent, then that parent will appear multiple times in its 1060 C{parents} property. 1061 1062 C{MultiParentedTrees} should never be used in the same tree as 1063 C{Trees} or C{ParentedTrees}. Mixing tree implementations may 1064 result in incorrect parent pointers and in C{TypeError} exceptions. 1065 """
1066 - def __init__(self, node_or_str, children=None):
1067 if children is None: return # see note in Tree.__init__() 1068 1069 self._parents = [] 1070 """A list of this tree's parents. This list should not 1071 contain duplicates, even if a parent contains this tree 1072 multiple times.""" 1073 1074 super(MultiParentedTree, self).__init__(node_or_str, children)
1075 1077 1078 #///////////////////////////////////////////////////////////////// 1079 # Properties 1080 #///////////////////////////////////////////////////////////////// 1081
1082 - def _get_parent_indices(self):
1083 return [(parent, index) 1084 for parent in self._parents 1085 for index, child in enumerate(parent) 1086 if child is self]
1087
1088 - def _get_left_siblings(self):
1089 return [parent[index-1] 1090 for (parent, index) in self._get_parent_indices() 1091 if index > 0]
1092
1093 - def _get_right_siblings(self):
1094 return [parent[index+1] 1095 for (parent, index) in self._get_parent_indices() 1096 if index < (len(parent)-1)]
1097
1098 - def _get_roots(self):
1099 return self._get_roots_helper({}).values()
1100
1101 - def _get_roots_helper(self, result):
1102 if self._parents: 1103 for parent in self._parents: 1104 parent._get_roots_helper(result) 1105 else: 1106 result[id(self)] = self 1107 return result
1108 1109 parents = property(lambda self: list(self._parents), doc=""" 1110 The set of parents of this tree. If this tree has no parents, 1111 then C{parents} is the empty set. To check if a tree is used 1112 as multiple children of the same parent, use the 1113 L{parent_indices} property. 1114 1115 @type: C{list} of L{MultiParentedTree}""") 1116 1117 left_siblings = property(_get_left_siblings, doc=""" 1118 A list of all left siblings of this tree, in any of its parent 1119 trees. A tree may be its own left sibling if it is used as 1120 multiple contiguous children of the same parent. A tree may 1121 appear multiple times in this list if it is the left sibling 1122 of this tree with respect to multiple parents. 1123 1124 @type: C{list} of L{MultiParentedTree}""") 1125 1126 right_siblings = property(_get_right_siblings, doc=""" 1127 A list of all right siblings of this tree, in any of its parent 1128 trees. A tree may be its own right sibling if it is used as 1129 multiple contiguous children of the same parent. A tree may 1130 appear multiple times in this list if it is the right sibling 1131 of this tree with respect to multiple parents. 1132 1133 @type: C{list} of L{MultiParentedTree}""") 1134 1135 roots = property(_get_roots, doc=""" 1136 The set of all roots of this tree. This set is formed by 1137 tracing all possible parent paths until trees with no parents 1138 are found. 1139 1140 @type: C{list} of L{MultiParentedTree}""") 1141
1142 - def parent_indices(self, parent):
1143 """ 1144 Return a list of the indices where this tree occurs as a child 1145 of C{parent}. If this child does not occur as a child of 1146 C{parent}, then the empty list is returned. The following is 1147 always true:: 1148 1149 for parent_index in ptree.parent_indices(parent): 1150 parent[parent_index] is ptree 1151 """ 1152 if parent not in self._parents: return [] 1153 else: return [index for (index, child) in enumerate(parent) 1154 if child is self]
1155
1156 - def treepositions(self, root):
1157 """ 1158 Return a list of all tree positions that can be used to reach 1159 this multi-parented tree starting from C{root}. I.e., the 1160 following is always true:: 1161 1162 for treepos in ptree.treepositions(root): 1163 root[treepos] is ptree 1164 """ 1165 if self is root: 1166 return [()] 1167 else: 1168 return [treepos+(index,) 1169 for parent in self._parents 1170 for treepos in parent.treepositions(root) 1171 for (index, child) in enumerate(parent) if child is self]
1172 1173 1174 #///////////////////////////////////////////////////////////////// 1175 # Parent Management 1176 #///////////////////////////////////////////////////////////////// 1177
1178 - def _delparent(self, child, index):
1179 # Sanity checks 1180 assert isinstance(child, MultiParentedTree) 1181 assert self[index] is child 1182 assert len([p for p in child._parents if p is self]) == 1 1183 1184 # If the only copy of child in self is at index, then delete 1185 # self from child's parent list. 1186 for i, c in enumerate(self): 1187 if c is child and i != index: break 1188 else: 1189 child._parents.remove(self)
1190
1191 - def _setparent(self, child, index, dry_run=False):
1192 # If the child's type is incorrect, then complain. 1193 if not isinstance(child, MultiParentedTree): 1194 raise TypeError('Can not insert a non-MultiParentedTree '+ 1195 'into a MultiParentedTree') 1196 1197 # Add self as a parent pointer if it's not already listed. 1198 if not dry_run: 1199 for parent in child._parents: 1200 if parent is self: break 1201 else: 1202 child._parents.append(self)
1203
1204 -class ImmutableParentedTree(ImmutableTree, ParentedTree):
1205 - def __init__(self, node_or_str, children=None):
1206 if children is None: return # see note in Tree.__init__() 1207 super(ImmutableParentedTree, self).__init__(node_or_str, children)
1208
1209 -class ImmutableMultiParentedTree(ImmutableTree, MultiParentedTree):
1210 - def __init__(self, node_or_str, children=None):
1211 if children is None: return # see note in Tree.__init__() 1212 super(ImmutableMultiParentedTree, self).__init__(node_or_str, children)
1213
1214 ###################################################################### 1215 ## Probabilistic trees 1216 ###################################################################### 1217 -class ProbabilisticTree(Tree, ProbabilisticMixIn):
1218 - def __new__(cls, node_or_str, children=None, **prob_kwargs):
1219 return super(ProbabilisticTree, cls).__new__( 1220 cls, node_or_str, children)
1221 - def __init__(self, node_or_str, children=None, **prob_kwargs):
1222 if children is None: return # see note in Tree.__init__() 1223 Tree.__init__(self, node_or_str, children) 1224 ProbabilisticMixIn.__init__(self, **prob_kwargs)
1225 1226 # We have to patch up these methods to make them work right:
1228 - def __repr__(self):
1229 return '%s (p=%s)' % (Tree.__repr__(self), self.prob())
1230 - def __str__(self):
1231 return '%s (p=%s)' % (self.pprint(margin=60), self.prob())
1232 - def __cmp__(self, other):
1233 c = Tree.__cmp__(self, other) 1234 if c != 0: return c 1235 return cmp(self.prob(), other.prob())
1236 - def __eq__(self, other):
1237 if not isinstance(other, Tree): return False 1238 return Tree.__eq__(self, other) and self.prob()==other.prob()
1239 - def __ne__(self, other):
1240 return not (self == other)
1241 - def copy(self, deep=False):
1242 if not deep: return self.__class__(self.node, self, prob=self.prob()) 1243 else: return self.__class__.convert(self)
1244 - def convert(cls, val):
1245 if isinstance(val, Tree): 1246 children = [cls.convert(child) for child in val] 1247 if isinstance(val, ProbabilisticMixIn): 1248 return cls(val.node, children, prob=val.prob()) 1249 else: 1250 return cls(val.node, children, prob=1.0) 1251 else: 1252 return val
1253 convert = classmethod(convert)
1254
1255 -class ImmutableProbabilisticTree(ImmutableTree, ProbabilisticMixIn):
1256 - def __new__(cls, node_or_str, children=None, **prob_kwargs):
1257 return super(ImmutableProbabilisticTree, cls).__new__( 1258 cls, node_or_str, children)
1259 - def __init__(self, node_or_str, children=None, **prob_kwargs):
1260 if children is None: return # see note in Tree.__init__() 1261 ImmutableTree.__init__(self, node_or_str, children) 1262 ProbabilisticMixIn.__init__(self, **prob_kwargs)
1263 1264 # We have to patch up these methods to make them work right:
1266 - def __repr__(self):
1267 return '%s [%s]' % (Tree.__repr__(self), self.prob())
1268 - def __str__(self):
1269 return '%s [%s]' % (self.pprint(margin=60), self.prob())
1270 - def __cmp__(self, other):
1271 c = Tree.__cmp__(self, other) 1272 if c != 0: return c 1273 return cmp(self.prob(), other.prob())
1274 - def __eq__(self, other):
1275 if not isinstance(other, Tree): return False 1276 return Tree.__eq__(self, other) and self.prob()==other.prob()
1277 - def __ne__(self, other):
1278 return not (self == other)
1279 - def copy(self, deep=False):
1280 if not deep: return self.__class__(self.node, self, prob=self.prob()) 1281 else: return self.__class__.convert(self)
1282 - def convert(cls, val):
1283 if isinstance(val, Tree): 1284 children = [cls.convert(child) for child in val] 1285 if isinstance(val, ProbabilisticMixIn): 1286 return cls(val.node, children, prob=val.prob()) 1287 else: 1288 return cls(val.node, children, prob=1) 1289 else: 1290 return val
1291 convert = classmethod(convert)
1292
1293 1294 -def _child_names(tree):
1295 names = [] 1296 for child in tree: 1297 if isinstance(child, Tree): 1298 names.append(Nonterminal(child.node)) 1299 else: 1300 names.append(child) 1301 return names
1302
1303 ###################################################################### 1304 ## Parsing 1305 ###################################################################### 1306 1307 -def bracket_parse(s):
1308 """ 1309 Use Tree.parse(s, remove_top_empty_bracketing=True) instead. 1310 """ 1311 raise NameError("Use Tree.parse(s, remove_top_empty_bracketing=True) instead.")
1312
1313 -def sinica_parse(s):
1314 """ 1315 Parse a Sinica Treebank string and return a tree. Trees are represented as nested brackettings, 1316 as shown in the following example (X represents a Chinese character): 1317 S(goal:NP(Head:Nep:XX)|theme:NP(Head:Nhaa:X)|quantity:Dab:X|Head:VL2:X)#0(PERIODCATEGORY) 1318 1319 @return: A tree corresponding to the string representation. 1320 @rtype: C{tree} 1321 @param s: The string to be converted 1322 @type s: C{string} 1323 """ 1324 tokens = re.split(r'([()| ])', s) 1325 for i in range(len(tokens)): 1326 if tokens[i] == '(': 1327 tokens[i-1], tokens[i] = tokens[i], tokens[i-1] # pull nonterminal inside parens 1328 elif ':' in tokens[i]: 1329 fields = tokens[i].split(':') 1330 if len(fields) == 2: # non-terminal 1331 tokens[i] = fields[1] 1332 else: 1333 tokens[i] = "(" + fields[-2] + " " + fields[-1] + ")" 1334 elif tokens[i] == '|': 1335 tokens[i] = '' 1336 1337 treebank_string = string.join(tokens) 1338 return bracket_parse(treebank_string)
1339
1340 # s = re.sub(r'^#[^\s]*\s', '', s) # remove leading identifier 1341 # s = re.sub(r'\w+:', '', s) # remove role tags 1342 1343 # return s 1344 1345 ###################################################################### 1346 ## Demonstration 1347 ###################################################################### 1348 1349 -def demo():
1350 """ 1351 A demonstration showing how C{Tree}s and C{Tree}s can be 1352 used. This demonstration creates a C{Tree}, and loads a 1353 C{Tree} from the L{treebank<nltk.corpus.treebank>} corpus, 1354 and shows the results of calling several of their methods. 1355 """ 1356 1357 from nltk import tree 1358 1359 # Demonstrate tree parsing. 1360 s = '(S (NP (DT the) (NN cat)) (VP (VBD ate) (NP (DT a) (NN cookie))))' 1361 t = Tree(s) 1362 print "Convert bracketed string into tree:" 1363 print t 1364 print t.__repr__() 1365 1366 print "Display tree properties:" 1367 print t.node # tree's constituent type 1368 print t[0] # tree's first child 1369 print t[1] # tree's second child 1370 print t.height() 1371 print t.leaves() 1372 print t[1] 1373 print t[1,1] 1374 print t[1,1,0] 1375 1376 # Demonstrate tree modification. 1377 the_cat = t[0] 1378 the_cat.insert(1, tree.bracket_parse('(JJ big)')) 1379 print "Tree modification:" 1380 print t 1381 t[1,1,1] = tree.bracket_parse('(NN cake)') 1382 print t 1383 print 1384 1385 # Tree transforms 1386 print "Collapse unary:" 1387 t.collapse_unary() 1388 print t 1389 print "Chomsky normal form:" 1390 t.chomsky_normal_form() 1391 print t 1392 print 1393 1394 # Demonstrate probabilistic trees. 1395 pt = tree.ProbabilisticTree('x', ['y', 'z'], prob=0.5) 1396 print "Probabilistic Tree:" 1397 print pt 1398 print 1399 1400 # Demonstrate parsing of treebank output format. 1401 t = tree.bracket_parse(t.pprint()) 1402 print "Convert tree to bracketed string and back again:" 1403 print t 1404 print 1405 1406 # Demonstrate LaTeX output 1407 print "LaTeX output:" 1408 print t.pprint_latex_qtree() 1409 print 1410 1411 # Demonstrate Productions 1412 print "Production output:" 1413 print t.productions() 1414 print 1415 1416 # Demonstrate tree nodes containing objects other than strings 1417 t.node = ('test', 3) 1418 print t
1419 1420 if __name__ == '__main__': 1421 demo() 1422 1423 __all__ = ['ImmutableProbabilisticTree', 'ImmutableTree', 'ProbabilisticMixIn', 1424 'ProbabilisticTree', 'Tree', 'bracket_parse', 1425 'sinica_parse', 'ParentedTree', 'MultiParentedTree', 1426 'ImmutableParentedTree', 'ImmutableMultiParentedTree'] 1427