Package nltk :: Package parse :: Module chart
[hide private]
[frames] | no frames]

Source Code for Module nltk.parse.chart

   1  # -*- coding: utf-8 -*- 
   2  # Natural Language Toolkit: A Chart Parser 
   3  # 
   4  # Copyright (C) 2001-2011 NLTK Project 
   5  # Author: Edward Loper <edloper@gradient.cis.upenn.edu> 
   6  #         Steven Bird <sb@csse.unimelb.edu.au> 
   7  #         Jean Mark Gawron <gawron@mail.sdsu.edu> 
   8  #         Peter Ljunglöf <peter.ljunglof@heatherleaf.se> 
   9  # URL: <http://www.nltk.org/> 
  10  # For license information, see LICENSE.TXT 
  11  # 
  12  # $Id: chart.py 8778 2011-04-10 07:29:10Z StevenBird1 $ 
  13   
  14  """ 
  15  Data classes and parser implementations for \"chart parsers\", which 
  16  use dynamic programming to efficiently parse a text.  A X{chart 
  17  parser} derives parse trees for a text by iteratively adding \"edges\" 
  18  to a \"chart.\"  Each X{edge} represents a hypothesis about the tree 
  19  structure for a subsequence of the text.  The X{chart} is a 
  20  \"blackboard\" for composing and combining these hypotheses. 
  21   
  22  When a chart parser begins parsing a text, it creates a new (empty) 
  23  chart, spanning the text.  It then incrementally adds new edges to the 
  24  chart.  A set of X{chart rules} specifies the conditions under which 
  25  new edges should be added to the chart.  Once the chart reaches a 
  26  stage where none of the chart rules adds any new edges, parsing is 
  27  complete. 
  28   
  29  Charts are encoded with the L{Chart} class, and edges are encoded with 
  30  the L{TreeEdge} and L{LeafEdge} classes.  The chart parser module 
  31  defines three chart parsers: 
  32   
  33    - C{ChartParser} is a simple and flexible chart parser.  Given a 
  34      set of chart rules, it will apply those rules to the chart until 
  35      no more edges are added. 
  36   
  37    - C{SteppingChartParser} is a subclass of C{ChartParser} that can 
  38      be used to step through the parsing process. 
  39  """ 
  40   
  41  from nltk.tree import Tree 
  42  from nltk.grammar import WeightedGrammar, is_nonterminal, is_terminal 
  43  from nltk.compat import defaultdict 
  44   
  45  from api import * 
  46   
  47  import re 
  48  import warnings 
  49   
  50  ######################################################################## 
  51  ##  Edges 
  52  ######################################################################## 
  53   
54 -class EdgeI(object):
55 """ 56 A hypothesis about the structure of part of a sentence. 57 Each edge records the fact that a structure is (partially) 58 consistent with the sentence. An edge contains: 59 60 - A X{span}, indicating what part of the sentence is 61 consistent with the hypothesized structure. 62 63 - A X{left-hand side}, specifying what kind of structure is 64 hypothesized. 65 66 - A X{right-hand side}, specifying the contents of the 67 hypothesized structure. 68 69 - A X{dot position}, indicating how much of the hypothesized 70 structure is consistent with the sentence. 71 72 Every edge is either X{complete} or X{incomplete}: 73 74 - An edge is X{complete} if its structure is fully consistent 75 with the sentence. 76 77 - An edge is X{incomplete} if its structure is partially 78 consistent with the sentence. For every incomplete edge, the 79 span specifies a possible prefix for the edge's structure. 80 81 There are two kinds of edge: 82 83 - C{TreeEdges<TreeEdge>} record which trees have been found to 84 be (partially) consistent with the text. 85 86 - C{LeafEdges<leafEdge>} record the tokens occur in the text. 87 88 The C{EdgeI} interface provides a common interface to both types 89 of edge, allowing chart parsers to treat them in a uniform manner. 90 """
91 - def __init__(self):
92 if self.__class__ == EdgeI: 93 raise TypeError('Edge is an abstract interface')
94 95 #//////////////////////////////////////////////////////////// 96 # Span 97 #//////////////////////////////////////////////////////////// 98
99 - def span(self):
100 """ 101 @return: A tuple C{(s,e)}, where C{subtokens[s:e]} is the 102 portion of the sentence that is consistent with this 103 edge's structure. 104 @rtype: C{(int, int)} 105 """ 106 raise AssertionError('EdgeI is an abstract interface')
107
108 - def start(self):
109 """ 110 @return: The start index of this edge's span. 111 @rtype: C{int} 112 """ 113 raise AssertionError('EdgeI is an abstract interface')
114
115 - def end(self):
116 """ 117 @return: The end index of this edge's span. 118 @rtype: C{int} 119 """ 120 raise AssertionError('EdgeI is an abstract interface')
121
122 - def length(self):
123 """ 124 @return: The length of this edge's span. 125 @rtype: C{int} 126 """ 127 raise AssertionError('EdgeI is an abstract interface')
128 129 #//////////////////////////////////////////////////////////// 130 # Left Hand Side 131 #//////////////////////////////////////////////////////////// 132
133 - def lhs(self):
134 """ 135 @return: This edge's left-hand side, which specifies what kind 136 of structure is hypothesized by this edge. 137 @see: L{TreeEdge} and L{LeafEdge} for a description of 138 the left-hand side values for each edge type. 139 """ 140 raise AssertionError('EdgeI is an abstract interface')
141 142 #//////////////////////////////////////////////////////////// 143 # Right Hand Side 144 #//////////////////////////////////////////////////////////// 145
146 - def rhs(self):
147 """ 148 @return: This edge's right-hand side, which specifies 149 the content of the structure hypothesized by this 150 edge. 151 @see: L{TreeEdge} and L{LeafEdge} for a description of 152 the right-hand side values for each edge type. 153 """ 154 raise AssertionError('EdgeI is an abstract interface')
155
156 - def dot(self):
157 """ 158 @return: This edge's dot position, which indicates how much of 159 the hypothesized structure is consistent with the 160 sentence. In particular, C{self.rhs[:dot]} is consistent 161 with C{subtoks[self.start():self.end()]}. 162 @rtype: C{int} 163 """ 164 raise AssertionError('EdgeI is an abstract interface')
165
166 - def next(self):
167 """ 168 @return: The element of this edge's right-hand side that 169 immediately follows its dot. 170 @rtype: C{Nonterminal} or X{terminal} or C{None} 171 """ 172 raise AssertionError('EdgeI is an abstract interface')
173
174 - def is_complete(self):
175 """ 176 @return: True if this edge's structure is fully consistent 177 with the text. 178 @rtype: C{boolean} 179 """ 180 raise AssertionError('EdgeI is an abstract interface')
181
182 - def is_incomplete(self):
183 """ 184 @return: True if this edge's structure is partially consistent 185 with the text. 186 @rtype: C{boolean} 187 """ 188 raise AssertionError('EdgeI is an abstract interface')
189 190 #//////////////////////////////////////////////////////////// 191 # Comparisons 192 #////////////////////////////////////////////////////////////
193 - def __cmp__(self, other):
194 raise AssertionError('EdgeI is an abstract interface')
195
196 - def __hash__(self, other):
197 raise AssertionError('EdgeI is an abstract interface')
198
199 -class TreeEdge(EdgeI):
200 """ 201 An edge that records the fact that a tree is (partially) 202 consistent with the sentence. A tree edge consists of: 203 204 - A X{span}, indicating what part of the sentence is 205 consistent with the hypothesized tree. 206 207 - A X{left-hand side}, specifying the hypothesized tree's node 208 value. 209 210 - A X{right-hand side}, specifying the hypothesized tree's 211 children. Each element of the right-hand side is either a 212 terminal, specifying a token with that terminal as its leaf 213 value; or a nonterminal, specifying a subtree with that 214 nonterminal's symbol as its node value. 215 216 - A X{dot position}, indicating which children are consistent 217 with part of the sentence. In particular, if C{dot} is the 218 dot position, C{rhs} is the right-hand size, C{(start,end)} 219 is the span, and C{sentence} is the list of subtokens in the 220 sentence, then C{subtokens[start:end]} can be spanned by the 221 children specified by C{rhs[:dot]}. 222 223 For more information about edges, see the L{EdgeI} interface. 224 """
225 - def __init__(self, span, lhs, rhs, dot=0):
226 """ 227 Construct a new C{TreeEdge}. 228 229 @type span: C{(int, int)} 230 @param span: A tuple C{(s,e)}, where C{subtokens[s:e]} is the 231 portion of the sentence that is consistent with the new 232 edge's structure. 233 @type lhs: L{Nonterminal} 234 @param lhs: The new edge's left-hand side, specifying the 235 hypothesized tree's node value. 236 @type rhs: C{list} of (L{Nonterminal} and C{string}) 237 @param rhs: The new edge's right-hand side, specifying the 238 hypothesized tree's children. 239 @type dot: C{int} 240 @param dot: The position of the new edge's dot. This position 241 specifies what prefix of the production's right hand side 242 is consistent with the text. In particular, if 243 C{sentence} is the list of subtokens in the sentence, then 244 C{subtokens[span[0]:span[1]]} can be spanned by the 245 children specified by C{rhs[:dot]}. 246 """ 247 self._lhs = lhs 248 self._rhs = tuple(rhs) 249 self._span = span 250 self._dot = dot
251 252 # [staticmethod]
253 - def from_production(production, index):
254 """ 255 @return: A new C{TreeEdge} formed from the given production. 256 The new edge's left-hand side and right-hand side will 257 be taken from C{production}; its span will be 258 C{(index,index)}; and its dot position will be C{0}. 259 @rtype: L{TreeEdge} 260 """ 261 return TreeEdge(span=(index, index), lhs=production.lhs(), 262 rhs=production.rhs(), dot=0)
263 from_production = staticmethod(from_production) 264
265 - def move_dot_forward(self, new_end):
266 """ 267 @return: A new C{TreeEdge} formed from this edge. 268 The new edge's dot position is increased by C{1}, 269 and its end index will be replaced by C{new_end}. 270 @rtype: L{TreeEdge} 271 @param new_end: The new end index. 272 @type new_end: C{int} 273 """ 274 return TreeEdge(span=(self._span[0], new_end), 275 lhs=self._lhs, rhs=self._rhs, 276 dot=self._dot+1)
277 278 # Accessors
279 - def lhs(self): return self._lhs
280 - def span(self): return self._span
281 - def start(self): return self._span[0]
282 - def end(self): return self._span[1]
283 - def length(self): return self._span[1] - self._span[0]
284 - def rhs(self): return self._rhs
285 - def dot(self): return self._dot
286 - def is_complete(self): return self._dot == len(self._rhs)
287 - def is_incomplete(self): return self._dot != len(self._rhs)
288 - def next(self):
289 if self._dot >= len(self._rhs): return None 290 else: return self._rhs[self._dot]
291 292 # Comparisons & hashing
293 - def __cmp__(self, other):
294 if self.__class__ != other.__class__: return -1 295 return cmp((self._span, self.lhs(), self.rhs(), self._dot), 296 (other._span, other.lhs(), other.rhs(), other._dot))
297 - def __hash__(self):
298 return hash((self.lhs(), self.rhs(), self._span, self._dot))
299 300 # String representation
301 - def __str__(self):
302 str = '[%s:%s] ' % (self._span[0], self._span[1]) 303 str += '%-2r ->' % (self._lhs,) 304 305 for i in range(len(self._rhs)): 306 if i == self._dot: str += ' *' 307 str += ' %r' % (self._rhs[i],) 308 if len(self._rhs) == self._dot: str += ' *' 309 return str
310
311 - def __repr__(self):
312 return '[Edge: %s]' % self
313
314 -class LeafEdge(EdgeI):
315 """ 316 An edge that records the fact that a leaf value is consistent with 317 a word in the sentence. A leaf edge consists of: 318 319 - An X{index}, indicating the position of the word. 320 - A X{leaf}, specifying the word's content. 321 322 A leaf edge's left-hand side is its leaf value, and its right hand 323 side is C{()}. Its span is C{[index, index+1]}, and its dot 324 position is C{0}. 325 """
326 - def __init__(self, leaf, index):
327 """ 328 Construct a new C{LeafEdge}. 329 330 @param leaf: The new edge's leaf value, specifying the word 331 that is recorded by this edge. 332 @param index: The new edge's index, specifying the position of 333 the word that is recorded by this edge. 334 """ 335 self._leaf = leaf 336 self._index = index
337 338 # Accessors
339 - def lhs(self): return self._leaf
340 - def span(self): return (self._index, self._index+1)
341 - def start(self): return self._index
342 - def end(self): return self._index+1
343 - def length(self): return 1
344 - def rhs(self): return ()
345 - def dot(self): return 0
346 - def is_complete(self): return True
347 - def is_incomplete(self): return False
348 - def next(self): return None
349 350 # Comparisons & hashing
351 - def __cmp__(self, other):
352 if not isinstance(other, LeafEdge): return -1 353 return cmp((self._index, self._leaf), (other._index, other._leaf))
354 - def __hash__(self):
355 return hash((self._index, self._leaf))
356 357 # String representations
358 - def __str__(self):
359 return '[%s:%s] %r' % (self._index, self._index+1, self._leaf)
360 - def __repr__(self):
361 return '[Edge: %s]' % (self)
362 363 ######################################################################## 364 ## Chart 365 ######################################################################## 366
367 -class Chart(object):
368 """ 369 A blackboard for hypotheses about the syntactic constituents of a 370 sentence. A chart contains a set of edges, and each edge encodes 371 a single hypothesis about the structure of some portion of the 372 sentence. 373 374 The L{select} method can be used to select a specific collection 375 of edges. For example C{chart.select(is_complete=True, start=0)} 376 yields all complete edges whose start indices are 0. To ensure 377 the efficiency of these selection operations, C{Chart} dynamically 378 creates and maintains an index for each set of attributes that 379 have been selected on. 380 381 In order to reconstruct the trees that are represented by an edge, 382 the chart associates each edge with a set of child pointer lists. 383 A X{child pointer list} is a list of the edges that license an 384 edge's right-hand side. 385 386 @ivar _tokens: The sentence that the chart covers. 387 @ivar _num_leaves: The number of tokens. 388 @ivar _edges: A list of the edges in the chart 389 @ivar _edge_to_cpls: A dictionary mapping each edge to a set 390 of child pointer lists that are associated with that edge. 391 @ivar _indexes: A dictionary mapping tuples of edge attributes 392 to indices, where each index maps the corresponding edge 393 attribute values to lists of edges. 394 """
395 - def __init__(self, tokens):
396 """ 397 Construct a new chart. The chart is initialized with the 398 leaf edges corresponding to the terminal leaves. 399 400 @type tokens: L{list} 401 @param tokens: The sentence that this chart will be used to parse. 402 """ 403 # Record the sentence token and the sentence length. 404 self._tokens = tuple(tokens) 405 self._num_leaves = len(self._tokens) 406 407 # Initialise the chart. 408 self.initialize()
409
410 - def initialize(self):
411 """ 412 Clear the chart. 413 """ 414 # A list of edges contained in this chart. 415 self._edges = [] 416 417 # The set of child pointer lists associated with each edge. 418 self._edge_to_cpls = {} 419 420 # Indexes mapping attribute values to lists of edges 421 # (used by select()). 422 self._indexes = {}
423 424 #//////////////////////////////////////////////////////////// 425 # Sentence Access 426 #//////////////////////////////////////////////////////////// 427
428 - def num_leaves(self):
429 """ 430 @return: The number of words in this chart's sentence. 431 @rtype: C{int} 432 """ 433 return self._num_leaves
434
435 - def leaf(self, index):
436 """ 437 @return: The leaf value of the word at the given index. 438 @rtype: C{string} 439 """ 440 return self._tokens[index]
441
442 - def leaves(self):
443 """ 444 @return: A list of the leaf values of each word in the 445 chart's sentence. 446 @rtype: C{list} of C{string} 447 """ 448 return self._tokens
449 450 #//////////////////////////////////////////////////////////// 451 # Edge access 452 #//////////////////////////////////////////////////////////// 453
454 - def edges(self):
455 """ 456 @return: A list of all edges in this chart. New edges 457 that are added to the chart after the call to edges() 458 will I{not} be contained in this list. 459 @rtype: C{list} of L{EdgeI} 460 @see: L{iteredges}, L{select} 461 """ 462 return self._edges[:]
463
464 - def iteredges(self):
465 """ 466 @return: An iterator over the edges in this chart. It is 467 I{not} guaranteed that new edges which are added to the 468 chart before the iterator is exhausted will also be 469 generated. 470 @rtype: C{iter} of L{EdgeI} 471 @see: L{edges}, L{select} 472 """ 473 return iter(self._edges)
474 475 # Iterating over the chart yields its edges. 476 __iter__ = iteredges 477
478 - def num_edges(self):
479 """ 480 @return: The number of edges contained in this chart. 481 @rtype: C{int} 482 """ 483 return len(self._edge_to_cpls)
484
485 - def select(self, **restrictions):
486 """ 487 @return: An iterator over the edges in this chart. Any 488 new edges that are added to the chart before the iterator 489 is exahusted will also be generated. C{restrictions} 490 can be used to restrict the set of edges that will be 491 generated. 492 @rtype: C{iter} of L{EdgeI} 493 494 @kwarg span: Only generate edges C{e} where C{e.span()==span} 495 @kwarg start: Only generate edges C{e} where C{e.start()==start} 496 @kwarg end: Only generate edges C{e} where C{e.end()==end} 497 @kwarg length: Only generate edges C{e} where C{e.length()==length} 498 @kwarg lhs: Only generate edges C{e} where C{e.lhs()==lhs} 499 @kwarg rhs: Only generate edges C{e} where C{e.rhs()==rhs} 500 @kwarg next: Only generate edges C{e} where C{e.next()==next} 501 @kwarg dot: Only generate edges C{e} where C{e.dot()==dot} 502 @kwarg is_complete: Only generate edges C{e} where 503 C{e.is_complete()==is_complete} 504 @kwarg is_incomplete: Only generate edges C{e} where 505 C{e.is_incomplete()==is_incomplete} 506 """ 507 # If there are no restrictions, then return all edges. 508 if restrictions=={}: return iter(self._edges) 509 510 # Find the index corresponding to the given restrictions. 511 restr_keys = restrictions.keys() 512 restr_keys.sort() 513 restr_keys = tuple(restr_keys) 514 515 # If it doesn't exist, then create it. 516 if restr_keys not in self._indexes: 517 self._add_index(restr_keys) 518 519 vals = tuple(restrictions[key] for key in restr_keys) 520 return iter(self._indexes[restr_keys].get(vals, []))
521
522 - def _add_index(self, restr_keys):
523 """ 524 A helper function for L{select}, which creates a new index for 525 a given set of attributes (aka restriction keys). 526 """ 527 # Make sure it's a valid index. 528 for key in restr_keys: 529 if not hasattr(EdgeI, key): 530 raise ValueError, 'Bad restriction: %s' % key 531 532 # Create the index. 533 index = self._indexes[restr_keys] = {} 534 535 # Add all existing edges to the index. 536 for edge in self._edges: 537 vals = tuple(getattr(edge, key)() for key in restr_keys) 538 index.setdefault(vals, []).append(edge)
539
540 - def _register_with_indexes(self, edge):
541 """ 542 A helper function for L{insert}, which registers the new 543 edge with all existing indexes. 544 """ 545 for (restr_keys, index) in self._indexes.items(): 546 vals = tuple(getattr(edge, key)() for key in restr_keys) 547 index.setdefault(vals, []).append(edge)
548 549 #//////////////////////////////////////////////////////////// 550 # Edge Insertion 551 #//////////////////////////////////////////////////////////// 552
553 - def insert_with_backpointer(self, new_edge, previous_edge, child_edge):
554 """ 555 Add a new edge to the chart, using a pointer to the previous edge. 556 """ 557 cpls = self.child_pointer_lists(previous_edge) 558 new_cpls = [cpl+(child_edge,) for cpl in cpls] 559 return self.insert(new_edge, *new_cpls)
560
561 - def insert(self, edge, *child_pointer_lists):
562 """ 563 Add a new edge to the chart. 564 565 @type edge: L{EdgeI} 566 @param edge: The new edge 567 @type child_pointer_lists: C(sequence} of C{tuple} of L{EdgeI} 568 @param child_pointer_lists: A sequence of lists of the edges that 569 were used to form this edge. This list is used to reconstruct 570 the trees (or partial trees) that are associated with C{edge}. 571 @rtype: C{bool} 572 @return: True if this operation modified the chart. In 573 particular, return true iff the chart did not already 574 contain C{edge}, or if it did not already associate 575 C{child_pointer_lists} with C{edge}. 576 """ 577 # Is it a new edge? 578 if edge not in self._edge_to_cpls: 579 # Add it to the list of edges. 580 self._append_edge(edge) 581 # Register with indexes. 582 self._register_with_indexes(edge) 583 584 # Get the set of child pointer lists for this edge. 585 cpls = self._edge_to_cpls.setdefault(edge,{}) 586 chart_was_modified = False 587 for child_pointer_list in child_pointer_lists: 588 child_pointer_list = tuple(child_pointer_list) 589 if child_pointer_list not in cpls: 590 # It's a new CPL; register it, and return true. 591 cpls[child_pointer_list] = True 592 chart_was_modified = True 593 return chart_was_modified
594
595 - def _append_edge(self, edge):
596 self._edges.append(edge)
597 598 #//////////////////////////////////////////////////////////// 599 # Tree extraction & child pointer lists 600 #//////////////////////////////////////////////////////////// 601
602 - def parses(self, root, tree_class=Tree):
603 """ 604 @return: A list of the complete tree structures that span 605 the entire chart, and whose root node is C{root}. 606 """ 607 trees = [] 608 for edge in self.select(start=0, end=self._num_leaves, lhs=root): 609 trees += self.trees(edge, tree_class=tree_class, complete=True) 610 return trees
611
612 - def trees(self, edge, tree_class=Tree, complete=False):
613 """ 614 @return: A list of the tree structures that are associated 615 with C{edge}. 616 617 If C{edge} is incomplete, then the unexpanded children will be 618 encoded as childless subtrees, whose node value is the 619 corresponding terminal or nonterminal. 620 621 @rtype: C{list} of L{Tree} 622 @note: If two trees share a common subtree, then the same 623 C{Tree} may be used to encode that subtree in 624 both trees. If you need to eliminate this subtree 625 sharing, then create a deep copy of each tree. 626 """ 627 return self._trees(edge, complete, memo={}, tree_class=tree_class)
628
629 - def _trees(self, edge, complete, memo, tree_class):
630 """ 631 A helper function for L{trees}. 632 @param memo: A dictionary used to record the trees that we've 633 generated for each edge, so that when we see an edge more 634 than once, we can reuse the same trees. 635 """ 636 # If we've seen this edge before, then reuse our old answer. 637 if edge in memo: 638 return memo[edge] 639 640 trees = [] 641 642 # when we're reading trees off the chart, don't use incomplete edges 643 if complete and edge.is_incomplete(): 644 return trees 645 646 # Until we're done computing the trees for edge, set 647 # memo[edge] to be empty. This has the effect of filtering 648 # out any cyclic trees (i.e., trees that contain themselves as 649 # descendants), because if we reach this edge via a cycle, 650 # then it will appear that the edge doesn't generate any 651 # trees. 652 memo[edge] = [] 653 654 # Leaf edges. 655 if isinstance(edge, LeafEdge): 656 leaf = self._tokens[edge.start()] 657 memo[edge] = leaf 658 return [leaf] 659 660 # Each child pointer list can be used to form trees. 661 for cpl in self.child_pointer_lists(edge): 662 # Get the set of child choices for each child pointer. 663 # child_choices[i] is the set of choices for the tree's 664 # ith child. 665 child_choices = [self._trees(cp, complete, memo, tree_class) 666 for cp in cpl] 667 668 # For each combination of children, add a tree. 669 for children in self._choose_children(child_choices): 670 lhs = edge.lhs().symbol() 671 trees.append(tree_class(lhs, children)) 672 673 # If the edge is incomplete, then extend it with "partial trees": 674 if edge.is_incomplete(): 675 unexpanded = [tree_class(elt,[]) 676 for elt in edge.rhs()[edge.dot():]] 677 for tree in trees: 678 tree.extend(unexpanded) 679 680 # Update the memoization dictionary. 681 memo[edge] = trees 682 683 # Return the list of trees. 684 return trees
685
686 - def _choose_children(self, child_choices):
687 """ 688 A helper function for L{_trees} that finds the possible sets 689 of subtrees for a new tree. 690 691 @param child_choices: A list that specifies the options for 692 each child. In particular, C{child_choices[i]} is a list of 693 tokens and subtrees that can be used as the C{i}th child. 694 """ 695 children_lists = [[]] 696 for child_choice in child_choices: 697 if hasattr(child_choice, '__iter__') and \ 698 not isinstance(child_choice, basestring): 699 # Only iterate over the child trees 700 # if child_choice is iterable and NOT a string 701 children_lists = [child_list+[child] 702 for child in child_choice 703 for child_list in children_lists] 704 else: 705 # If child_choice is a string (or non-iterable) 706 # then it is a leaf 707 children_lists = [child_list+[child_choice] 708 for child_list in children_lists] 709 return children_lists
710
711 - def child_pointer_lists(self, edge):
712 """ 713 @rtype: C{list} of C{list} of C{EdgeI} 714 @return: The set of child pointer lists for the given edge. 715 Each child pointer list is a list of edges that have 716 been used to form this edge. 717 """ 718 # Make a copy, in case they modify it. 719 return self._edge_to_cpls.get(edge, {}).keys()
720 721 #//////////////////////////////////////////////////////////// 722 # Display 723 #////////////////////////////////////////////////////////////
724 - def pp_edge(self, edge, width=None):
725 """ 726 @return: A pretty-printed string representation of a given edge 727 in this chart. 728 @rtype: C{string} 729 @param width: The number of characters allotted to each 730 index in the sentence. 731 """ 732 if width is None: width = 50/(self.num_leaves()+1) 733 (start, end) = (edge.start(), edge.end()) 734 735 str = '|' + ('.'+' '*(width-1))*start 736 737 # Zero-width edges are "#" if complete, ">" if incomplete 738 if start == end: 739 if edge.is_complete(): str += '#' 740 else: str += '>' 741 742 # Spanning complete edges are "[===]"; Other edges are 743 # "[---]" if complete, "[--->" if incomplete 744 elif edge.is_complete() and edge.span() == (0,self._num_leaves): 745 str += '['+('='*width)*(end-start-1) + '='*(width-1)+']' 746 elif edge.is_complete(): 747 str += '['+('-'*width)*(end-start-1) + '-'*(width-1)+']' 748 else: 749 str += '['+('-'*width)*(end-start-1) + '-'*(width-1)+'>' 750 751 str += (' '*(width-1)+'.')*(self._num_leaves-end) 752 return str + '| %s' % edge
753
754 - def pp_leaves(self, width=None):
755 """ 756 @return: A pretty-printed string representation of this 757 chart's leaves. This string can be used as a header 758 for calls to L{pp_edge}. 759 """ 760 if width is None: width = 50/(self.num_leaves()+1) 761 762 if self._tokens is not None and width>1: 763 header = '|.' 764 for tok in self._tokens: 765 header += tok[:width-1].center(width-1)+'.' 766 header += '|' 767 else: 768 header = '' 769 770 return header
771
772 - def pp(self, width=None):
773 """ 774 @return: A pretty-printed string representation of this chart. 775 @rtype: C{string} 776 @param width: The number of characters allotted to each 777 index in the sentence. 778 """ 779 if width is None: width = 50/(self.num_leaves()+1) 780 # sort edges: primary key=length, secondary key=start index. 781 # (and filter out the token edges) 782 edges = [(e.length(), e.start(), e) for e in self] 783 edges.sort() 784 edges = [e for (_,_,e) in edges] 785 786 return (self.pp_leaves(width) + '\n' + 787 '\n'.join(self.pp_edge(edge, width) for edge in edges))
788 789 #//////////////////////////////////////////////////////////// 790 # Display: Dot (AT&T Graphviz) 791 #//////////////////////////////////////////////////////////// 792
793 - def dot_digraph(self):
794 # Header 795 s = 'digraph nltk_chart {\n' 796 #s += ' size="5,5";\n' 797 s += ' rankdir=LR;\n' 798 s += ' node [height=0.1,width=0.1];\n' 799 s += ' node [style=filled, color="lightgray"];\n' 800 801 # Set up the nodes 802 for y in range(self.num_edges(), -1, -1): 803 if y == 0: 804 s += ' node [style=filled, color="black"];\n' 805 for x in range(self.num_leaves()+1): 806 if y == 0 or (x <= self._edges[y-1].start() or 807 x >= self._edges[y-1].end()): 808 s += ' %04d.%04d [label=""];\n' % (x,y) 809 810 # Add a spacer 811 s += ' x [style=invis]; x->0000.0000 [style=invis];\n' 812 813 # Declare ranks. 814 for x in range(self.num_leaves()+1): 815 s += ' {rank=same;' 816 for y in range(self.num_edges()+1): 817 if y == 0 or (x <= self._edges[y-1].start() or 818 x >= self._edges[y-1].end()): 819 s += ' %04d.%04d' % (x,y) 820 s += '}\n' 821 822 # Add the leaves 823 s += ' edge [style=invis, weight=100];\n' 824 s += ' node [shape=plaintext]\n' 825 s += ' 0000.0000' 826 for x in range(self.num_leaves()): 827 s += '->%s->%04d.0000' % (self.leaf(x), x+1) 828 s += ';\n\n' 829 830 # Add the edges 831 s += ' edge [style=solid, weight=1];\n' 832 for y, edge in enumerate(self): 833 for x in range(edge.start()): 834 s += (' %04d.%04d -> %04d.%04d [style="invis"];\n' % 835 (x, y+1, x+1, y+1)) 836 s += (' %04d.%04d -> %04d.%04d [label="%s"];\n' % 837 (edge.start(), y+1, edge.end(), y+1, edge)) 838 for x in range(edge.end(), self.num_leaves()): 839 s += (' %04d.%04d -> %04d.%04d [style="invis"];\n' % 840 (x, y+1, x+1, y+1)) 841 s += '}\n' 842 return s
843 844 ######################################################################## 845 ## Chart Rules 846 ######################################################################## 847
848 -class ChartRuleI(object):
849 """ 850 A rule that specifies what new edges are licensed by any given set 851 of existing edges. Each chart rule expects a fixed number of 852 edges, as indicated by the class variable L{NUM_EDGES}. In 853 particular: 854 855 - A chart rule with C{NUM_EDGES=0} specifies what new edges are 856 licensed, regardless of existing edges. 857 858 - A chart rule with C{NUM_EDGES=1} specifies what new edges are 859 licensed by a single existing edge. 860 861 - A chart rule with C{NUM_EDGES=2} specifies what new edges are 862 licensed by a pair of existing edges. 863 864 @type NUM_EDGES: C{int} 865 @cvar NUM_EDGES: The number of existing edges that this rule uses 866 to license new edges. Typically, this number ranges from zero 867 to two. 868 """
869 - def apply(self, chart, grammar, *edges):
870 """ 871 Add the edges licensed by this rule and the given edges to the 872 chart. 873 874 @type edges: C{list} of L{EdgeI} 875 @param edges: A set of existing edges. The number of edges 876 that should be passed to C{apply} is specified by the 877 L{NUM_EDGES} class variable. 878 @rtype: C{list} of L{EdgeI} 879 @return: A list of the edges that were added. 880 """ 881 raise AssertionError, 'ChartRuleI is an abstract interface'
882
883 - def apply_iter(self, chart, grammar, *edges):
884 """ 885 @return: A generator that will add edges licensed by this rule 886 and the given edges to the chart, one at a time. Each 887 time the generator is resumed, it will either add a new 888 edge and yield that edge; or return. 889 @rtype: C{iter} of L{EdgeI} 890 891 @type edges: C{list} of L{EdgeI} 892 @param edges: A set of existing edges. The number of edges 893 that should be passed to C{apply} is specified by the 894 L{NUM_EDGES} class variable. 895 """ 896 raise AssertionError, 'ChartRuleI is an abstract interface'
897
898 - def apply_everywhere(self, chart, grammar):
899 """ 900 Add all the edges licensed by this rule and the edges in the 901 chart to the chart. 902 903 @rtype: C{list} of L{EdgeI} 904 @return: A list of the edges that were added. 905 """ 906 raise AssertionError, 'ChartRuleI is an abstract interface'
907
908 - def apply_everywhere_iter(self, chart, grammar):
909 """ 910 @return: A generator that will add all edges licensed by 911 this rule, given the edges that are currently in the 912 chart, one at a time. Each time the generator is resumed, 913 it will either add a new edge and yield that edge; or 914 return. 915 @rtype: C{iter} of L{EdgeI} 916 """ 917 raise AssertionError, 'ChartRuleI is an abstract interface'
918
919 -class AbstractChartRule(ChartRuleI):
920 """ 921 An abstract base class for chart rules. C{AbstractChartRule} 922 provides: 923 - A default implementation for C{apply}, based on C{apply_iter}. 924 - A default implementation for C{apply_everywhere_iter}, 925 based on C{apply_iter}. 926 - A default implementation for C{apply_everywhere}, based on 927 C{apply_everywhere_iter}. Currently, this implementation 928 assumes that C{NUM_EDGES}<=3. 929 - A default implementation for C{__str__}, which returns a 930 name basd on the rule's class name. 931 """ 932 933 # Subclasses must define apply_iter.
934 - def apply_iter(self, chart, grammar, *edges):
935 raise AssertionError, 'AbstractChartRule is an abstract class'
936 937 # Default: loop through the given number of edges, and call 938 # self.apply() for each set of edges.
939 - def apply_everywhere_iter(self, chart, grammar):
940 if self.NUM_EDGES == 0: 941 for new_edge in self.apply_iter(chart, grammar): 942 yield new_edge 943 944 elif self.NUM_EDGES == 1: 945 for e1 in chart: 946 for new_edge in self.apply_iter(chart, grammar, e1): 947 yield new_edge 948 949 elif self.NUM_EDGES == 2: 950 for e1 in chart: 951 for e2 in chart: 952 for new_edge in self.apply_iter(chart, grammar, e1, e2): 953 yield new_edge 954 955 elif self.NUM_EDGES == 3: 956 for e1 in chart: 957 for e2 in chart: 958 for e3 in chart: 959 for new_edge in self.apply_iter(chart,grammar,e1,e2,e3): 960 yield new_edge 961 962 else: 963 raise AssertionError, 'NUM_EDGES>3 is not currently supported'
964 965 # Default: delegate to apply_iter.
966 - def apply(self, chart, grammar, *edges):
967 return list(self.apply_iter(chart, grammar, *edges))
968 969 # Default: delegate to apply_everywhere_iter.
970 - def apply_everywhere(self, chart, grammar):
972 973 # Default: return a name based on the class name.
974 - def __str__(self):
975 # Add spaces between InitialCapsWords. 976 return re.sub('([a-z])([A-Z])', r'\1 \2', self.__class__.__name__)
977 978 #//////////////////////////////////////////////////////////// 979 # Fundamental Rule 980 #//////////////////////////////////////////////////////////// 981
982 -class FundamentalRule(AbstractChartRule):
983 """ 984 A rule that joins two adjacent edges to form a single combined 985 edge. In particular, this rule specifies that any pair of edges: 986 987 - [A S{->} S{alpha} * B S{beta}][i:j] 988 - [B S{->} S{gamma} *][j:k] 989 licenses the edge: 990 - [A S{->} S{alpha} B * S{beta}][i:j] 991 """ 992 NUM_EDGES = 2
993 - def apply_iter(self, chart, grammar, left_edge, right_edge):
994 # Make sure the rule is applicable. 995 if not (left_edge.is_incomplete() and 996 right_edge.is_complete() and 997 left_edge.end() == right_edge.start() and 998 left_edge.next() == right_edge.lhs()): 999 return 1000 1001 # Construct the new edge. 1002 new_edge = left_edge.move_dot_forward(right_edge.end()) 1003 1004 # Insert it into the chart. 1005 if chart.insert_with_backpointer(new_edge, left_edge, right_edge): 1006 yield new_edge
1007
1008 -class SingleEdgeFundamentalRule(FundamentalRule):
1009 """ 1010 A rule that joins a given edge with adjacent edges in the chart, 1011 to form combined edges. In particular, this rule specifies that 1012 either of the edges: 1013 - [A S{->} S{alpha} * B S{beta}][i:j] 1014 - [B S{->} S{gamma} *][j:k] 1015 licenses the edge: 1016 - [A S{->} S{alpha} B * S{beta}][i:j] 1017 if the other edge is already in the chart. 1018 1019 @note: This is basically L{FundamentalRule}, with one edge left 1020 unspecified. 1021 """ 1022 NUM_EDGES = 1 1023
1024 - def apply_iter(self, chart, grammar, edge):
1025 if edge.is_incomplete(): 1026 for new_edge in self._apply_incomplete(chart, grammar, edge): 1027 yield new_edge 1028 else: 1029 for new_edge in self._apply_complete(chart, grammar, edge): 1030 yield new_edge
1031
1032 - def _apply_complete(self, chart, grammar, right_edge):
1033 for left_edge in chart.select(end=right_edge.start(), 1034 is_complete=False, 1035 next=right_edge.lhs()): 1036 new_edge = left_edge.move_dot_forward(right_edge.end()) 1037 if chart.insert_with_backpointer(new_edge, left_edge, right_edge): 1038 yield new_edge
1039
1040 - def _apply_incomplete(self, chart, grammar, left_edge):
1041 for right_edge in chart.select(start=left_edge.end(), 1042 is_complete=True, 1043 lhs=left_edge.next()): 1044 new_edge = left_edge.move_dot_forward(right_edge.end()) 1045 if chart.insert_with_backpointer(new_edge, left_edge, right_edge): 1046 yield new_edge
1047 1048 #//////////////////////////////////////////////////////////// 1049 # Inserting Terminal Leafs 1050 #//////////////////////////////////////////////////////////// 1051
1052 -class LeafInitRule(AbstractChartRule):
1053 NUM_EDGES=0
1054 - def apply_iter(self, chart, grammar):
1055 for index in range(chart.num_leaves()): 1056 new_edge = LeafEdge(chart.leaf(index), index) 1057 if chart.insert(new_edge, ()): 1058 yield new_edge
1059 1060 #//////////////////////////////////////////////////////////// 1061 # Top-Down Prediction 1062 #//////////////////////////////////////////////////////////// 1063
1064 -class TopDownInitRule(AbstractChartRule):
1065 """ 1066 A rule licensing edges corresponding to the grammar productions for 1067 the grammar's start symbol. In particular, this rule specifies that: 1068 - [S S{->} * S{alpha}][0:i] 1069 is licensed for each grammar production C{S S{->} S{alpha}}, where 1070 C{S} is the grammar's start symbol. 1071 """ 1072 NUM_EDGES = 0
1073 - def apply_iter(self, chart, grammar):
1074 for prod in grammar.productions(lhs=grammar.start()): 1075 new_edge = TreeEdge.from_production(prod, 0) 1076 if chart.insert(new_edge, ()): 1077 yield new_edge
1078
1079 -class TopDownPredictRule(AbstractChartRule):
1080 """ 1081 A rule licensing edges corresponding to the grammar productions 1082 for the nonterminal following an incomplete edge's dot. In 1083 particular, this rule specifies that: 1084 - [A S{->} S{alpha} * B S{beta}][i:j] 1085 licenses the edge: 1086 - [B S{->} * S{gamma}][j:j] 1087 for each grammar production C{B S{->} S{gamma}}. 1088 1089 @note: This rule corresponds to the Predictor Rule in Earley parsing. 1090 """ 1091 NUM_EDGES = 1
1092 - def apply_iter(self, chart, grammar, edge):
1093 if edge.is_complete(): return 1094 for prod in grammar.productions(lhs=edge.next()): 1095 new_edge = TreeEdge.from_production(prod, edge.end()) 1096 if chart.insert(new_edge, ()): 1097 yield new_edge
1098
1099 -class CachedTopDownPredictRule(TopDownPredictRule):
1100 """ 1101 A cached version of L{TopDownPredictRule}. After the first time 1102 this rule is applied to an edge with a given C{end} and C{next}, 1103 it will not generate any more edges for edges with that C{end} and 1104 C{next}. 1105 1106 If C{chart} or C{grammar} are changed, then the cache is flushed. 1107 """
1108 - def __init__(self):
1109 TopDownPredictRule.__init__(self) 1110 self._done = {}
1111
1112 - def apply_iter(self, chart, grammar, edge):
1113 if edge.is_complete(): return 1114 next, index = edge.next(), edge.end() 1115 if not is_nonterminal(next): return 1116 1117 # If we've already applied this rule to an edge with the same 1118 # next & end, and the chart & grammar have not changed, then 1119 # just return (no new edges to add). 1120 done = self._done.get((next, index), (None,None)) 1121 if done[0] is chart and done[1] is grammar: return 1122 1123 # Add all the edges indicated by the top down expand rule. 1124 for prod in grammar.productions(lhs=next): 1125 # If the left corner in the predicted production is 1126 # leaf, it must match with the input. 1127 if prod.rhs(): 1128 first = prod.rhs()[0] 1129 if is_terminal(first): 1130 if index >= chart.num_leaves() or first != chart.leaf(index): continue 1131 1132 new_edge = TreeEdge.from_production(prod, index) 1133 if chart.insert(new_edge, ()): 1134 yield new_edge 1135 1136 # Record the fact that we've applied this rule. 1137 self._done[next, index] = (chart, grammar)
1138 1139 #//////////////////////////////////////////////////////////// 1140 # Bottom-Up Prediction 1141 #//////////////////////////////////////////////////////////// 1142
1143 -class BottomUpPredictRule(AbstractChartRule):
1144 """ 1145 A rule licensing any edge corresponding to a production whose 1146 right-hand side begins with a complete edge's left-hand side. In 1147 particular, this rule specifies that: 1148 - [A S{->} S{alpha} *] 1149 licenses the edge: 1150 - [B S{->} * A S{beta}] 1151 for each grammar production C{B S{->} A S{beta}}. 1152 """ 1153 NUM_EDGES = 1
1154 - def apply_iter(self, chart, grammar, edge):
1155 if edge.is_incomplete(): return 1156 for prod in grammar.productions(rhs=edge.lhs()): 1157 new_edge = TreeEdge.from_production(prod, edge.start()) 1158 if chart.insert(new_edge, ()): 1159 yield new_edge
1160
1161 -class BottomUpPredictCombineRule(BottomUpPredictRule):
1162 """ 1163 A rule licensing any edge corresponding to a production whose 1164 right-hand side begins with a complete edge's left-hand side. In 1165 particular, this rule specifies that: 1166 - [A S{->} S{alpha} *] 1167 licenses the edge: 1168 - [B S{->} A * S{beta}] 1169 for each grammar production C{B S{->} A S{beta}}. 1170 1171 @note: This is like L{BottomUpPredictRule}, but it also applies 1172 the L{FundamentalRule} to the resulting edge. 1173 """ 1174 NUM_EDGES = 1
1175 - def apply_iter(self, chart, grammar, edge):
1176 if edge.is_incomplete(): return 1177 for prod in grammar.productions(rhs=edge.lhs()): 1178 new_edge = TreeEdge(edge.span(), prod.lhs(), prod.rhs(), 1) 1179 if chart.insert(new_edge, (edge,)): 1180 yield new_edge
1181
1182 -class EmptyPredictRule(AbstractChartRule):
1183 """ 1184 A rule that inserts all empty productions as passive edges, 1185 in every position in the chart. 1186 """ 1187 NUM_EDGES = 0
1188 - def apply_iter(self, chart, grammar):
1189 for prod in grammar.productions(empty=True): 1190 for index in xrange(chart.num_leaves() + 1): 1191 new_edge = TreeEdge.from_production(prod, index) 1192 if chart.insert(new_edge, ()): 1193 yield new_edge
1194 1195 1196 ######################################################################## 1197 ## Filtered Bottom Up 1198 ######################################################################## 1199
1200 -class FilteredSingleEdgeFundamentalRule(SingleEdgeFundamentalRule):
1201 - def _apply_complete(self, chart, grammar, right_edge):
1202 end = right_edge.end() 1203 nexttoken = end < chart.num_leaves() and chart.leaf(end) 1204 for left_edge in chart.select(end=right_edge.start(), 1205 is_complete=False, 1206 next=right_edge.lhs()): 1207 if _bottomup_filter(grammar, nexttoken, left_edge.rhs(), left_edge.dot()): 1208 new_edge = left_edge.move_dot_forward(right_edge.end()) 1209 if chart.insert_with_backpointer(new_edge, left_edge, right_edge): 1210 yield new_edge
1211
1212 - def _apply_incomplete(self, chart, grammar, left_edge):
1213 for right_edge in chart.select(start=left_edge.end(), 1214 is_complete=True, 1215 lhs=left_edge.next()): 1216 end = right_edge.end() 1217 nexttoken = end < chart.num_leaves() and chart.leaf(end) 1218 if _bottomup_filter(grammar, nexttoken, left_edge.rhs(), left_edge.dot()): 1219 new_edge = left_edge.move_dot_forward(right_edge.end()) 1220 if chart.insert_with_backpointer(new_edge, left_edge, right_edge): 1221 yield new_edge
1222
1223 -class FilteredBottomUpPredictCombineRule(BottomUpPredictCombineRule):
1224 - def apply_iter(self, chart, grammar, edge):
1225 if edge.is_incomplete(): return 1226 leftcorners = grammar.leftcorners 1227 end = edge.end() 1228 nexttoken = end < chart.num_leaves() and chart.leaf(end) 1229 for prod in grammar.productions(rhs=edge.lhs()): 1230 if _bottomup_filter(grammar, nexttoken, prod.rhs()): 1231 new_edge = TreeEdge(edge.span(), prod.lhs(), prod.rhs(), 1) 1232 if chart.insert(new_edge, (edge,)): 1233 yield new_edge
1234
1235 -def _bottomup_filter(grammar, nexttoken, rhs, dot=0):
1236 if len(rhs) <= dot + 1: 1237 return True 1238 next = rhs[dot + 1] 1239 if is_terminal(next): 1240 return nexttoken == next 1241 else: 1242 return grammar.is_leftcorner(next, nexttoken)
1243 1244 1245 ######################################################################## 1246 ## Generic Chart Parser 1247 ######################################################################## 1248 1249 TD_STRATEGY = [LeafInitRule(), 1250 TopDownInitRule(), 1251 CachedTopDownPredictRule(), 1252 SingleEdgeFundamentalRule()] 1253 BU_STRATEGY = [LeafInitRule(), 1254 EmptyPredictRule(), 1255 BottomUpPredictRule(), 1256 SingleEdgeFundamentalRule()] 1257 BU_LC_STRATEGY = [LeafInitRule(), 1258 EmptyPredictRule(), 1259 BottomUpPredictCombineRule(), 1260 SingleEdgeFundamentalRule()] 1261 1262 LC_STRATEGY = [LeafInitRule(), 1263 FilteredBottomUpPredictCombineRule(), 1264 FilteredSingleEdgeFundamentalRule()] 1265
1266 -class ChartParser(ParserI):
1267 """ 1268 A generic chart parser. A X{strategy}, or list of 1269 L{ChartRules<ChartRuleI>}, is used to decide what edges to add to 1270 the chart. In particular, C{ChartParser} uses the following 1271 algorithm to parse texts: 1272 1273 - Until no new edges are added: 1274 - For each I{rule} in I{strategy}: 1275 - Apply I{rule} to any applicable edges in the chart. 1276 - Return any complete parses in the chart 1277 """
1278 - def __init__(self, grammar, strategy=BU_LC_STRATEGY, trace=0, 1279 trace_chart_width=50, use_agenda=True, chart_class=Chart):
1280 """ 1281 Create a new chart parser, that uses C{grammar} to parse 1282 texts. 1283 1284 @type grammar: L{ContextFreeGrammar} 1285 @param grammar: The grammar used to parse texts. 1286 @type strategy: C{list} of L{ChartRuleI} 1287 @param strategy: A list of rules that should be used to decide 1288 what edges to add to the chart (top-down strategy by default). 1289 @type trace: C{int} 1290 @param trace: The level of tracing that should be used when 1291 parsing a text. C{0} will generate no tracing output; 1292 and higher numbers will produce more verbose tracing 1293 output. 1294 @type trace_chart_width: C{int} 1295 @param trace_chart_width: The default total width reserved for 1296 the chart in trace output. The remainder of each line will 1297 be used to display edges. 1298 @type use_agenda: C{bool} 1299 @param use_agenda: Use an optimized agenda-based algorithm, 1300 if possible. 1301 @param chart_class: The class that should be used to create 1302 the parse charts. 1303 """ 1304 self._grammar = grammar 1305 self._strategy = strategy 1306 self._trace = trace 1307 self._trace_chart_width = trace_chart_width 1308 # If the strategy only consists of axioms (NUM_EDGES==0) and 1309 # inference rules (NUM_EDGES==1), we can use an agenda-based algorithm: 1310 self._use_agenda = use_agenda 1311 self._chart_class = chart_class 1312 1313 self._axioms = [] 1314 self._inference_rules = [] 1315 for rule in strategy: 1316 if rule.NUM_EDGES == 0: 1317 self._axioms.append(rule) 1318 elif rule.NUM_EDGES == 1: 1319 self._inference_rules.append(rule) 1320 else: 1321 self._use_agenda = False
1322
1323 - def grammar(self):
1324 return self._grammar
1325
1326 - def _trace_new_edges(self, chart, rule, new_edges, trace, edge_width):
1327 if not trace: return 1328 should_print_rule_header = trace > 1 1329 for edge in new_edges: 1330 if should_print_rule_header: 1331 print '%s:' % rule 1332 should_print_rule_header = False 1333 print chart.pp_edge(edge, edge_width)
1334
1335 - def chart_parse(self, tokens, trace=None):
1336 """ 1337 @return: The final parse L{Chart}, 1338 from which all possible parse trees can be extracted. 1339 1340 @param tokens: The sentence to be parsed 1341 @type tokens: L{list} of L{string} 1342 @rtype: L{Chart} 1343 """ 1344 if trace is None: trace = self._trace 1345 trace_new_edges = self._trace_new_edges 1346 1347 tokens = list(tokens) 1348 self._grammar.check_coverage(tokens) 1349 chart = self._chart_class(tokens) 1350 grammar = self._grammar 1351 1352 # Width, for printing trace edges. 1353 trace_edge_width = self._trace_chart_width / (chart.num_leaves() + 1) 1354 if trace: print chart.pp_leaves(trace_edge_width) 1355 1356 if self._use_agenda: 1357 # Use an agenda-based algorithm. 1358 for axiom in self._axioms: 1359 new_edges = axiom.apply(chart, grammar) 1360 trace_new_edges(chart, axiom, new_edges, trace, trace_edge_width) 1361 1362 inference_rules = self._inference_rules 1363 agenda = chart.edges() 1364 # We reverse the initial agenda, since it is a stack 1365 # but chart.edges() functions as a queue. 1366 agenda.reverse() 1367 while agenda: 1368 edge = agenda.pop() 1369 for rule in inference_rules: 1370 new_edges = rule.apply_iter(chart, grammar, edge) 1371 if trace: 1372 new_edges = list(new_edges) 1373 trace_new_edges(chart, rule, new_edges, trace, trace_edge_width) 1374 agenda += new_edges 1375 1376 else: 1377 # Do not use an agenda-based algorithm. 1378 edges_added = True 1379 while edges_added: 1380 edges_added = False 1381 for rule in self._strategy: 1382 new_edges = rule.apply_everywhere(chart, grammar) 1383 edges_added = len(new_edges) 1384 trace_new_edges(chart, rule, new_edges, trace, trace_edge_width) 1385 1386 # Return the final chart. 1387 return chart
1388
1389 - def nbest_parse(self, tokens, n=None, tree_class=Tree):
1390 chart = self.chart_parse(tokens) 1391 # Return a list of complete parses. 1392 return chart.parses(self._grammar.start(), tree_class=tree_class)[:n]
1393
1394 -class TopDownChartParser(ChartParser):
1395 """ 1396 A L{ChartParser} using a top-down parsing strategy. 1397 See L{ChartParser} for more information. 1398 """
1399 - def __init__(self, grammar, **parser_args):
1400 ChartParser.__init__(self, grammar, TD_STRATEGY, **parser_args)
1401
1402 -class BottomUpChartParser(ChartParser):
1403 """ 1404 A L{ChartParser} using a bottom-up parsing strategy. 1405 See L{ChartParser} for more information. 1406 """
1407 - def __init__(self, grammar, **parser_args):
1408 if isinstance(grammar, WeightedGrammar): 1409 warnings.warn("BottomUpChartParser only works for ContextFreeGrammar, " 1410 "use BottomUpProbabilisticChartParser instead", 1411 category=DeprecationWarning) 1412 ChartParser.__init__(self, grammar, BU_STRATEGY, **parser_args)
1413
1414 -class BottomUpLeftCornerChartParser(ChartParser):
1415 """ 1416 A L{ChartParser} using a bottom-up left-corner parsing strategy. 1417 This strategy is often more efficient than standard bottom-up. 1418 See L{ChartParser} for more information. 1419 """
1420 - def __init__(self, grammar, **parser_args):
1421 ChartParser.__init__(self, grammar, BU_LC_STRATEGY, **parser_args)
1422
1423 -class LeftCornerChartParser(ChartParser):
1424 - def __init__(self, grammar, **parser_args):
1425 if not grammar.is_nonempty(): 1426 raise ValueError("LeftCornerParser only works for grammars " 1427 "without empty productions.") 1428 ChartParser.__init__(self, grammar, LC_STRATEGY, **parser_args)
1429 1430 ######################################################################## 1431 ## Stepping Chart Parser 1432 ######################################################################## 1433
1434 -class SteppingChartParser(ChartParser):
1435 """ 1436 A C{ChartParser} that allows you to step through the parsing 1437 process, adding a single edge at a time. It also allows you to 1438 change the parser's strategy or grammar midway through parsing a 1439 text. 1440 1441 The C{initialize} method is used to start parsing a text. C{step} 1442 adds a single edge to the chart. C{set_strategy} changes the 1443 strategy used by the chart parser. C{parses} returns the set of 1444 parses that has been found by the chart parser. 1445 1446 @ivar _restart: Records whether the parser's strategy, grammar, 1447 or chart has been changed. If so, then L{step} must restart 1448 the parsing algorithm. 1449 """
1450 - def __init__(self, grammar, strategy=[], trace=0):
1451 self._chart = None 1452 self._current_chartrule = None 1453 self._restart = False 1454 ChartParser.__init__(self, grammar, strategy, trace)
1455 1456 #//////////////////////////////////////////////////////////// 1457 # Initialization 1458 #//////////////////////////////////////////////////////////// 1459
1460 - def initialize(self, tokens):
1461 "Begin parsing the given tokens." 1462 self._chart = Chart(list(tokens)) 1463 self._restart = True
1464 1465 #//////////////////////////////////////////////////////////// 1466 # Stepping 1467 #//////////////////////////////////////////////////////////// 1468
1469 - def step(self):
1470 """ 1471 @return: A generator that adds edges to the chart, one at a 1472 time. Each time the generator is resumed, it adds a single 1473 edge and yields that edge. If no more edges can be added, 1474 then it yields C{None}. 1475 1476 If the parser's strategy, grammar, or chart is changed, then 1477 the generator will continue adding edges using the new 1478 strategy, grammar, or chart. 1479 1480 Note that this generator never terminates, since the grammar 1481 or strategy might be changed to values that would add new 1482 edges. Instead, it yields C{None} when no more edges can be 1483 added with the current strategy and grammar. 1484 """ 1485 if self._chart is None: 1486 raise ValueError, 'Parser must be initialized first' 1487 while 1: 1488 self._restart = False 1489 w = 50/(self._chart.num_leaves()+1) 1490 1491 for e in self._parse(): 1492 if self._trace > 1: print self._current_chartrule 1493 if self._trace > 0: print self._chart.pp_edge(e,w) 1494 yield e 1495 if self._restart: break 1496 else: 1497 yield None # No more edges.
1498
1499 - def _parse(self):
1500 """ 1501 A generator that implements the actual parsing algorithm. 1502 L{step} iterates through this generator, and restarts it 1503 whenever the parser's strategy, grammar, or chart is modified. 1504 """ 1505 chart = self._chart 1506 grammar = self._grammar 1507 edges_added = 1 1508 while edges_added > 0: 1509 edges_added = 0 1510 for rule in self._strategy: 1511 self._current_chartrule = rule 1512 for e in rule.apply_everywhere_iter(chart, grammar): 1513 edges_added += 1 1514 yield e
1515 1516 #//////////////////////////////////////////////////////////// 1517 # Accessors 1518 #//////////////////////////////////////////////////////////// 1519
1520 - def strategy(self):
1521 "@return: The strategy used by this parser." 1522 return self._strategy
1523
1524 - def grammar(self):
1525 "@return: The grammar used by this parser." 1526 return self._grammar
1527
1528 - def chart(self):
1529 "@return: The chart that is used by this parser." 1530 return self._chart
1531
1532 - def current_chartrule(self):
1533 "@return: The chart rule used to generate the most recent edge." 1534 return self._current_chartrule
1535
1536 - def parses(self, tree_class=Tree):
1537 "@return: The parse trees currently contained in the chart." 1538 return self._chart.parses(self._grammar.start(), tree_class)
1539 1540 #//////////////////////////////////////////////////////////// 1541 # Parser modification 1542 #//////////////////////////////////////////////////////////// 1543
1544 - def set_strategy(self, strategy):
1545 """ 1546 Change the startegy that the parser uses to decide which edges 1547 to add to the chart. 1548 @type strategy: C{list} of L{ChartRuleI} 1549 @param strategy: A list of rules that should be used to decide 1550 what edges to add to the chart. 1551 """ 1552 if strategy == self._strategy: return 1553 self._strategy = strategy[:] # Make a copy. 1554 self._restart = True
1555
1556 - def set_grammar(self, grammar):
1557 "Change the grammar used by the parser." 1558 if grammar is self._grammar: return 1559 self._grammar = grammar 1560 self._restart = True
1561
1562 - def set_chart(self, chart):
1563 "Load a given chart into the chart parser." 1564 if chart is self._chart: return 1565 self._chart = chart 1566 self._restart = True
1567 1568 #//////////////////////////////////////////////////////////// 1569 # Standard parser methods 1570 #//////////////////////////////////////////////////////////// 1571
1572 - def nbest_parse(self, tokens, n=None, tree_class=Tree):
1573 tokens = list(tokens) 1574 self._grammar.check_coverage(tokens) 1575 1576 # Initialize ourselves. 1577 self.initialize(tokens) 1578 1579 # Step until no more edges are generated. 1580 for e in self.step(): 1581 if e is None: break 1582 1583 # Return a list of complete parses. 1584 return self.parses(tree_class=tree_class)[:n]
1585 1586 ######################################################################## 1587 ## Demo Code 1588 ######################################################################## 1589
1590 -def demo_grammar():
1591 from nltk.grammar import parse_cfg 1592 return parse_cfg(""" 1593 S -> NP VP 1594 PP -> "with" NP 1595 NP -> NP PP 1596 VP -> VP PP 1597 VP -> Verb NP 1598 VP -> Verb 1599 NP -> Det Noun 1600 NP -> "John" 1601 NP -> "I" 1602 Det -> "the" 1603 Det -> "my" 1604 Det -> "a" 1605 Noun -> "dog" 1606 Noun -> "cookie" 1607 Verb -> "ate" 1608 Verb -> "saw" 1609 Prep -> "with" 1610 Prep -> "under" 1611 """)
1612
1613 -def demo(choice=None, 1614 should_print_times=True, should_print_grammar=False, 1615 should_print_trees=True, trace=2, 1616 sent='I saw John with a dog with my cookie', numparses=5):
1617 """ 1618 A demonstration of the chart parsers. 1619 """ 1620 import sys, time 1621 from nltk import nonterminals, Production, ContextFreeGrammar 1622 1623 # The grammar for ChartParser and SteppingChartParser: 1624 grammar = demo_grammar() 1625 if should_print_grammar: 1626 print "* Grammar" 1627 print grammar 1628 1629 # Tokenize the sample sentence. 1630 print "* Sentence:" 1631 print sent 1632 tokens = sent.split() 1633 print tokens 1634 print 1635 1636 # Ask the user which parser to test, 1637 # if the parser wasn't provided as an argument 1638 if choice is None: 1639 print ' 1: Top-down chart parser' 1640 print ' 2: Bottom-up chart parser' 1641 print ' 3: Bottom-up left-corner chart parser' 1642 print ' 4: Left-corner chart parser with bottom-up filter' 1643 print ' 5: Stepping chart parser (alternating top-down & bottom-up)' 1644 print ' 6: All parsers' 1645 print '\nWhich parser (1-6)? ', 1646 choice = sys.stdin.readline().strip() 1647 print 1648 1649 choice = str(choice) 1650 if choice not in "123456": 1651 print 'Bad parser number' 1652 return 1653 1654 # Keep track of how long each parser takes. 1655 times = {} 1656 1657 strategies = {'1': ('Top-down', TD_STRATEGY), 1658 '2': ('Bottom-up', BU_STRATEGY), 1659 '3': ('Bottom-up left-corner', BU_LC_STRATEGY), 1660 '4': ('Filtered left-corner', LC_STRATEGY)} 1661 choices = [] 1662 if strategies.has_key(choice): choices = [choice] 1663 if choice=='6': choices = "1234" 1664 1665 # Run the requested chart parser(s), except the stepping parser. 1666 for strategy in choices: 1667 print "* Strategy: " + strategies[strategy][0] 1668 print 1669 cp = ChartParser(grammar, strategies[strategy][1], trace=trace) 1670 t = time.time() 1671 chart = cp.chart_parse(tokens) 1672 parses = chart.parses(grammar.start()) 1673 times[strategies[strategy][0]] = time.time()-t 1674 print "Nr edges in chart:", len(chart.edges()) 1675 if numparses: 1676 assert len(parses)==numparses, 'Not all parses found' 1677 if should_print_trees: 1678 for tree in parses: print tree 1679 else: 1680 print "Nr trees:", len(parses) 1681 print 1682 1683 # Run the stepping parser, if requested. 1684 if choice in "56": 1685 print "* Strategy: Stepping (top-down vs bottom-up)" 1686 print 1687 t = time.time() 1688 cp = SteppingChartParser(grammar, trace=trace) 1689 cp.initialize(tokens) 1690 for i in range(5): 1691 print '*** SWITCH TO TOP DOWN' 1692 cp.set_strategy(TD_STRATEGY) 1693 for j, e in enumerate(cp.step()): 1694 if j>20 or e is None: break 1695 print '*** SWITCH TO BOTTOM UP' 1696 cp.set_strategy(BU_STRATEGY) 1697 for j, e in enumerate(cp.step()): 1698 if j>20 or e is None: break 1699 times['Stepping'] = time.time()-t 1700 print "Nr edges in chart:", len(cp.chart().edges()) 1701 if numparses: 1702 assert len(cp.parses())==numparses, 'Not all parses found' 1703 if should_print_trees: 1704 for tree in cp.parses(): print tree 1705 else: 1706 print "Nr trees:", len(cp.parses()) 1707 print 1708 1709 # Print the times of all parsers: 1710 if not (should_print_times and times): return 1711 print "* Parsing times" 1712 print 1713 maxlen = max(len(key) for key in times.keys()) 1714 format = '%' + `maxlen` + 's parser: %6.3fsec' 1715 times_items = times.items() 1716 times_items.sort(lambda a,b:cmp(a[1], b[1])) 1717 for (parser, t) in times_items: 1718 print format % (parser, t)
1719 1720 if __name__ == '__main__': demo() 1721