1
2
3
4
5
6
7
8
9
10
11
12
13
14 """
15 Data classes and parser implementations for \"chart parsers\", which
16 use dynamic programming to efficiently parse a text. A X{chart
17 parser} derives parse trees for a text by iteratively adding \"edges\"
18 to a \"chart.\" Each X{edge} represents a hypothesis about the tree
19 structure for a subsequence of the text. The X{chart} is a
20 \"blackboard\" for composing and combining these hypotheses.
21
22 When a chart parser begins parsing a text, it creates a new (empty)
23 chart, spanning the text. It then incrementally adds new edges to the
24 chart. A set of X{chart rules} specifies the conditions under which
25 new edges should be added to the chart. Once the chart reaches a
26 stage where none of the chart rules adds any new edges, parsing is
27 complete.
28
29 Charts are encoded with the L{Chart} class, and edges are encoded with
30 the L{TreeEdge} and L{LeafEdge} classes. The chart parser module
31 defines three chart parsers:
32
33 - C{ChartParser} is a simple and flexible chart parser. Given a
34 set of chart rules, it will apply those rules to the chart until
35 no more edges are added.
36
37 - C{SteppingChartParser} is a subclass of C{ChartParser} that can
38 be used to step through the parsing process.
39 """
40
41 from nltk.tree import Tree
42 from nltk.grammar import WeightedGrammar, is_nonterminal, is_terminal
43 from nltk.compat import defaultdict
44
45 from api import *
46
47 import re
48 import warnings
49
50
51
52
53
55 """
56 A hypothesis about the structure of part of a sentence.
57 Each edge records the fact that a structure is (partially)
58 consistent with the sentence. An edge contains:
59
60 - A X{span}, indicating what part of the sentence is
61 consistent with the hypothesized structure.
62
63 - A X{left-hand side}, specifying what kind of structure is
64 hypothesized.
65
66 - A X{right-hand side}, specifying the contents of the
67 hypothesized structure.
68
69 - A X{dot position}, indicating how much of the hypothesized
70 structure is consistent with the sentence.
71
72 Every edge is either X{complete} or X{incomplete}:
73
74 - An edge is X{complete} if its structure is fully consistent
75 with the sentence.
76
77 - An edge is X{incomplete} if its structure is partially
78 consistent with the sentence. For every incomplete edge, the
79 span specifies a possible prefix for the edge's structure.
80
81 There are two kinds of edge:
82
83 - C{TreeEdges<TreeEdge>} record which trees have been found to
84 be (partially) consistent with the text.
85
86 - C{LeafEdges<leafEdge>} record the tokens occur in the text.
87
88 The C{EdgeI} interface provides a common interface to both types
89 of edge, allowing chart parsers to treat them in a uniform manner.
90 """
92 if self.__class__ == EdgeI:
93 raise TypeError('Edge is an abstract interface')
94
95
96
97
98
100 """
101 @return: A tuple C{(s,e)}, where C{subtokens[s:e]} is the
102 portion of the sentence that is consistent with this
103 edge's structure.
104 @rtype: C{(int, int)}
105 """
106 raise AssertionError('EdgeI is an abstract interface')
107
109 """
110 @return: The start index of this edge's span.
111 @rtype: C{int}
112 """
113 raise AssertionError('EdgeI is an abstract interface')
114
116 """
117 @return: The end index of this edge's span.
118 @rtype: C{int}
119 """
120 raise AssertionError('EdgeI is an abstract interface')
121
123 """
124 @return: The length of this edge's span.
125 @rtype: C{int}
126 """
127 raise AssertionError('EdgeI is an abstract interface')
128
129
130
131
132
134 """
135 @return: This edge's left-hand side, which specifies what kind
136 of structure is hypothesized by this edge.
137 @see: L{TreeEdge} and L{LeafEdge} for a description of
138 the left-hand side values for each edge type.
139 """
140 raise AssertionError('EdgeI is an abstract interface')
141
142
143
144
145
147 """
148 @return: This edge's right-hand side, which specifies
149 the content of the structure hypothesized by this
150 edge.
151 @see: L{TreeEdge} and L{LeafEdge} for a description of
152 the right-hand side values for each edge type.
153 """
154 raise AssertionError('EdgeI is an abstract interface')
155
157 """
158 @return: This edge's dot position, which indicates how much of
159 the hypothesized structure is consistent with the
160 sentence. In particular, C{self.rhs[:dot]} is consistent
161 with C{subtoks[self.start():self.end()]}.
162 @rtype: C{int}
163 """
164 raise AssertionError('EdgeI is an abstract interface')
165
167 """
168 @return: The element of this edge's right-hand side that
169 immediately follows its dot.
170 @rtype: C{Nonterminal} or X{terminal} or C{None}
171 """
172 raise AssertionError('EdgeI is an abstract interface')
173
175 """
176 @return: True if this edge's structure is fully consistent
177 with the text.
178 @rtype: C{boolean}
179 """
180 raise AssertionError('EdgeI is an abstract interface')
181
183 """
184 @return: True if this edge's structure is partially consistent
185 with the text.
186 @rtype: C{boolean}
187 """
188 raise AssertionError('EdgeI is an abstract interface')
189
190
191
192
194 raise AssertionError('EdgeI is an abstract interface')
195
197 raise AssertionError('EdgeI is an abstract interface')
198
200 """
201 An edge that records the fact that a tree is (partially)
202 consistent with the sentence. A tree edge consists of:
203
204 - A X{span}, indicating what part of the sentence is
205 consistent with the hypothesized tree.
206
207 - A X{left-hand side}, specifying the hypothesized tree's node
208 value.
209
210 - A X{right-hand side}, specifying the hypothesized tree's
211 children. Each element of the right-hand side is either a
212 terminal, specifying a token with that terminal as its leaf
213 value; or a nonterminal, specifying a subtree with that
214 nonterminal's symbol as its node value.
215
216 - A X{dot position}, indicating which children are consistent
217 with part of the sentence. In particular, if C{dot} is the
218 dot position, C{rhs} is the right-hand size, C{(start,end)}
219 is the span, and C{sentence} is the list of subtokens in the
220 sentence, then C{subtokens[start:end]} can be spanned by the
221 children specified by C{rhs[:dot]}.
222
223 For more information about edges, see the L{EdgeI} interface.
224 """
225 - def __init__(self, span, lhs, rhs, dot=0):
226 """
227 Construct a new C{TreeEdge}.
228
229 @type span: C{(int, int)}
230 @param span: A tuple C{(s,e)}, where C{subtokens[s:e]} is the
231 portion of the sentence that is consistent with the new
232 edge's structure.
233 @type lhs: L{Nonterminal}
234 @param lhs: The new edge's left-hand side, specifying the
235 hypothesized tree's node value.
236 @type rhs: C{list} of (L{Nonterminal} and C{string})
237 @param rhs: The new edge's right-hand side, specifying the
238 hypothesized tree's children.
239 @type dot: C{int}
240 @param dot: The position of the new edge's dot. This position
241 specifies what prefix of the production's right hand side
242 is consistent with the text. In particular, if
243 C{sentence} is the list of subtokens in the sentence, then
244 C{subtokens[span[0]:span[1]]} can be spanned by the
245 children specified by C{rhs[:dot]}.
246 """
247 self._lhs = lhs
248 self._rhs = tuple(rhs)
249 self._span = span
250 self._dot = dot
251
252
254 """
255 @return: A new C{TreeEdge} formed from the given production.
256 The new edge's left-hand side and right-hand side will
257 be taken from C{production}; its span will be
258 C{(index,index)}; and its dot position will be C{0}.
259 @rtype: L{TreeEdge}
260 """
261 return TreeEdge(span=(index, index), lhs=production.lhs(),
262 rhs=production.rhs(), dot=0)
263 from_production = staticmethod(from_production)
264
266 """
267 @return: A new C{TreeEdge} formed from this edge.
268 The new edge's dot position is increased by C{1},
269 and its end index will be replaced by C{new_end}.
270 @rtype: L{TreeEdge}
271 @param new_end: The new end index.
272 @type new_end: C{int}
273 """
274 return TreeEdge(span=(self._span[0], new_end),
275 lhs=self._lhs, rhs=self._rhs,
276 dot=self._dot+1)
277
278
279 - def lhs(self): return self._lhs
280 - def span(self): return self._span
281 - def start(self): return self._span[0]
282 - def end(self): return self._span[1]
283 - def length(self): return self._span[1] - self._span[0]
284 - def rhs(self): return self._rhs
285 - def dot(self): return self._dot
286 - def is_complete(self): return self._dot == len(self._rhs)
289 if self._dot >= len(self._rhs): return None
290 else: return self._rhs[self._dot]
291
292
294 if self.__class__ != other.__class__: return -1
295 return cmp((self._span, self.lhs(), self.rhs(), self._dot),
296 (other._span, other.lhs(), other.rhs(), other._dot))
298 return hash((self.lhs(), self.rhs(), self._span, self._dot))
299
300
302 str = '[%s:%s] ' % (self._span[0], self._span[1])
303 str += '%-2r ->' % (self._lhs,)
304
305 for i in range(len(self._rhs)):
306 if i == self._dot: str += ' *'
307 str += ' %r' % (self._rhs[i],)
308 if len(self._rhs) == self._dot: str += ' *'
309 return str
310
312 return '[Edge: %s]' % self
313
315 """
316 An edge that records the fact that a leaf value is consistent with
317 a word in the sentence. A leaf edge consists of:
318
319 - An X{index}, indicating the position of the word.
320 - A X{leaf}, specifying the word's content.
321
322 A leaf edge's left-hand side is its leaf value, and its right hand
323 side is C{()}. Its span is C{[index, index+1]}, and its dot
324 position is C{0}.
325 """
327 """
328 Construct a new C{LeafEdge}.
329
330 @param leaf: The new edge's leaf value, specifying the word
331 that is recorded by this edge.
332 @param index: The new edge's index, specifying the position of
333 the word that is recorded by this edge.
334 """
335 self._leaf = leaf
336 self._index = index
337
338
339 - def lhs(self): return self._leaf
344 - def rhs(self): return ()
345 - def dot(self): return 0
348 - def next(self): return None
349
350
352 if not isinstance(other, LeafEdge): return -1
353 return cmp((self._index, self._leaf), (other._index, other._leaf))
355 return hash((self._index, self._leaf))
356
357
359 return '[%s:%s] %r' % (self._index, self._index+1, self._leaf)
361 return '[Edge: %s]' % (self)
362
363
364
365
366
368 """
369 A blackboard for hypotheses about the syntactic constituents of a
370 sentence. A chart contains a set of edges, and each edge encodes
371 a single hypothesis about the structure of some portion of the
372 sentence.
373
374 The L{select} method can be used to select a specific collection
375 of edges. For example C{chart.select(is_complete=True, start=0)}
376 yields all complete edges whose start indices are 0. To ensure
377 the efficiency of these selection operations, C{Chart} dynamically
378 creates and maintains an index for each set of attributes that
379 have been selected on.
380
381 In order to reconstruct the trees that are represented by an edge,
382 the chart associates each edge with a set of child pointer lists.
383 A X{child pointer list} is a list of the edges that license an
384 edge's right-hand side.
385
386 @ivar _tokens: The sentence that the chart covers.
387 @ivar _num_leaves: The number of tokens.
388 @ivar _edges: A list of the edges in the chart
389 @ivar _edge_to_cpls: A dictionary mapping each edge to a set
390 of child pointer lists that are associated with that edge.
391 @ivar _indexes: A dictionary mapping tuples of edge attributes
392 to indices, where each index maps the corresponding edge
393 attribute values to lists of edges.
394 """
396 """
397 Construct a new chart. The chart is initialized with the
398 leaf edges corresponding to the terminal leaves.
399
400 @type tokens: L{list}
401 @param tokens: The sentence that this chart will be used to parse.
402 """
403
404 self._tokens = tuple(tokens)
405 self._num_leaves = len(self._tokens)
406
407
408 self.initialize()
409
411 """
412 Clear the chart.
413 """
414
415 self._edges = []
416
417
418 self._edge_to_cpls = {}
419
420
421
422 self._indexes = {}
423
424
425
426
427
429 """
430 @return: The number of words in this chart's sentence.
431 @rtype: C{int}
432 """
433 return self._num_leaves
434
435 - def leaf(self, index):
436 """
437 @return: The leaf value of the word at the given index.
438 @rtype: C{string}
439 """
440 return self._tokens[index]
441
443 """
444 @return: A list of the leaf values of each word in the
445 chart's sentence.
446 @rtype: C{list} of C{string}
447 """
448 return self._tokens
449
450
451
452
453
455 """
456 @return: A list of all edges in this chart. New edges
457 that are added to the chart after the call to edges()
458 will I{not} be contained in this list.
459 @rtype: C{list} of L{EdgeI}
460 @see: L{iteredges}, L{select}
461 """
462 return self._edges[:]
463
465 """
466 @return: An iterator over the edges in this chart. It is
467 I{not} guaranteed that new edges which are added to the
468 chart before the iterator is exhausted will also be
469 generated.
470 @rtype: C{iter} of L{EdgeI}
471 @see: L{edges}, L{select}
472 """
473 return iter(self._edges)
474
475
476 __iter__ = iteredges
477
479 """
480 @return: The number of edges contained in this chart.
481 @rtype: C{int}
482 """
483 return len(self._edge_to_cpls)
484
485 - def select(self, **restrictions):
486 """
487 @return: An iterator over the edges in this chart. Any
488 new edges that are added to the chart before the iterator
489 is exahusted will also be generated. C{restrictions}
490 can be used to restrict the set of edges that will be
491 generated.
492 @rtype: C{iter} of L{EdgeI}
493
494 @kwarg span: Only generate edges C{e} where C{e.span()==span}
495 @kwarg start: Only generate edges C{e} where C{e.start()==start}
496 @kwarg end: Only generate edges C{e} where C{e.end()==end}
497 @kwarg length: Only generate edges C{e} where C{e.length()==length}
498 @kwarg lhs: Only generate edges C{e} where C{e.lhs()==lhs}
499 @kwarg rhs: Only generate edges C{e} where C{e.rhs()==rhs}
500 @kwarg next: Only generate edges C{e} where C{e.next()==next}
501 @kwarg dot: Only generate edges C{e} where C{e.dot()==dot}
502 @kwarg is_complete: Only generate edges C{e} where
503 C{e.is_complete()==is_complete}
504 @kwarg is_incomplete: Only generate edges C{e} where
505 C{e.is_incomplete()==is_incomplete}
506 """
507
508 if restrictions=={}: return iter(self._edges)
509
510
511 restr_keys = restrictions.keys()
512 restr_keys.sort()
513 restr_keys = tuple(restr_keys)
514
515
516 if restr_keys not in self._indexes:
517 self._add_index(restr_keys)
518
519 vals = tuple(restrictions[key] for key in restr_keys)
520 return iter(self._indexes[restr_keys].get(vals, []))
521
523 """
524 A helper function for L{select}, which creates a new index for
525 a given set of attributes (aka restriction keys).
526 """
527
528 for key in restr_keys:
529 if not hasattr(EdgeI, key):
530 raise ValueError, 'Bad restriction: %s' % key
531
532
533 index = self._indexes[restr_keys] = {}
534
535
536 for edge in self._edges:
537 vals = tuple(getattr(edge, key)() for key in restr_keys)
538 index.setdefault(vals, []).append(edge)
539
541 """
542 A helper function for L{insert}, which registers the new
543 edge with all existing indexes.
544 """
545 for (restr_keys, index) in self._indexes.items():
546 vals = tuple(getattr(edge, key)() for key in restr_keys)
547 index.setdefault(vals, []).append(edge)
548
549
550
551
552
554 """
555 Add a new edge to the chart, using a pointer to the previous edge.
556 """
557 cpls = self.child_pointer_lists(previous_edge)
558 new_cpls = [cpl+(child_edge,) for cpl in cpls]
559 return self.insert(new_edge, *new_cpls)
560
561 - def insert(self, edge, *child_pointer_lists):
562 """
563 Add a new edge to the chart.
564
565 @type edge: L{EdgeI}
566 @param edge: The new edge
567 @type child_pointer_lists: C(sequence} of C{tuple} of L{EdgeI}
568 @param child_pointer_lists: A sequence of lists of the edges that
569 were used to form this edge. This list is used to reconstruct
570 the trees (or partial trees) that are associated with C{edge}.
571 @rtype: C{bool}
572 @return: True if this operation modified the chart. In
573 particular, return true iff the chart did not already
574 contain C{edge}, or if it did not already associate
575 C{child_pointer_lists} with C{edge}.
576 """
577
578 if edge not in self._edge_to_cpls:
579
580 self._append_edge(edge)
581
582 self._register_with_indexes(edge)
583
584
585 cpls = self._edge_to_cpls.setdefault(edge,{})
586 chart_was_modified = False
587 for child_pointer_list in child_pointer_lists:
588 child_pointer_list = tuple(child_pointer_list)
589 if child_pointer_list not in cpls:
590
591 cpls[child_pointer_list] = True
592 chart_was_modified = True
593 return chart_was_modified
594
597
598
599
600
601
603 """
604 @return: A list of the complete tree structures that span
605 the entire chart, and whose root node is C{root}.
606 """
607 trees = []
608 for edge in self.select(start=0, end=self._num_leaves, lhs=root):
609 trees += self.trees(edge, tree_class=tree_class, complete=True)
610 return trees
611
612 - def trees(self, edge, tree_class=Tree, complete=False):
613 """
614 @return: A list of the tree structures that are associated
615 with C{edge}.
616
617 If C{edge} is incomplete, then the unexpanded children will be
618 encoded as childless subtrees, whose node value is the
619 corresponding terminal or nonterminal.
620
621 @rtype: C{list} of L{Tree}
622 @note: If two trees share a common subtree, then the same
623 C{Tree} may be used to encode that subtree in
624 both trees. If you need to eliminate this subtree
625 sharing, then create a deep copy of each tree.
626 """
627 return self._trees(edge, complete, memo={}, tree_class=tree_class)
628
629 - def _trees(self, edge, complete, memo, tree_class):
630 """
631 A helper function for L{trees}.
632 @param memo: A dictionary used to record the trees that we've
633 generated for each edge, so that when we see an edge more
634 than once, we can reuse the same trees.
635 """
636
637 if edge in memo:
638 return memo[edge]
639
640 trees = []
641
642
643 if complete and edge.is_incomplete():
644 return trees
645
646
647
648
649
650
651
652 memo[edge] = []
653
654
655 if isinstance(edge, LeafEdge):
656 leaf = self._tokens[edge.start()]
657 memo[edge] = leaf
658 return [leaf]
659
660
661 for cpl in self.child_pointer_lists(edge):
662
663
664
665 child_choices = [self._trees(cp, complete, memo, tree_class)
666 for cp in cpl]
667
668
669 for children in self._choose_children(child_choices):
670 lhs = edge.lhs().symbol()
671 trees.append(tree_class(lhs, children))
672
673
674 if edge.is_incomplete():
675 unexpanded = [tree_class(elt,[])
676 for elt in edge.rhs()[edge.dot():]]
677 for tree in trees:
678 tree.extend(unexpanded)
679
680
681 memo[edge] = trees
682
683
684 return trees
685
687 """
688 A helper function for L{_trees} that finds the possible sets
689 of subtrees for a new tree.
690
691 @param child_choices: A list that specifies the options for
692 each child. In particular, C{child_choices[i]} is a list of
693 tokens and subtrees that can be used as the C{i}th child.
694 """
695 children_lists = [[]]
696 for child_choice in child_choices:
697 if hasattr(child_choice, '__iter__') and \
698 not isinstance(child_choice, basestring):
699
700
701 children_lists = [child_list+[child]
702 for child in child_choice
703 for child_list in children_lists]
704 else:
705
706
707 children_lists = [child_list+[child_choice]
708 for child_list in children_lists]
709 return children_lists
710
712 """
713 @rtype: C{list} of C{list} of C{EdgeI}
714 @return: The set of child pointer lists for the given edge.
715 Each child pointer list is a list of edges that have
716 been used to form this edge.
717 """
718
719 return self._edge_to_cpls.get(edge, {}).keys()
720
721
722
723
724 - def pp_edge(self, edge, width=None):
753
755 """
756 @return: A pretty-printed string representation of this
757 chart's leaves. This string can be used as a header
758 for calls to L{pp_edge}.
759 """
760 if width is None: width = 50/(self.num_leaves()+1)
761
762 if self._tokens is not None and width>1:
763 header = '|.'
764 for tok in self._tokens:
765 header += tok[:width-1].center(width-1)+'.'
766 header += '|'
767 else:
768 header = ''
769
770 return header
771
772 - def pp(self, width=None):
788
789
790
791
792
794
795 s = 'digraph nltk_chart {\n'
796
797 s += ' rankdir=LR;\n'
798 s += ' node [height=0.1,width=0.1];\n'
799 s += ' node [style=filled, color="lightgray"];\n'
800
801
802 for y in range(self.num_edges(), -1, -1):
803 if y == 0:
804 s += ' node [style=filled, color="black"];\n'
805 for x in range(self.num_leaves()+1):
806 if y == 0 or (x <= self._edges[y-1].start() or
807 x >= self._edges[y-1].end()):
808 s += ' %04d.%04d [label=""];\n' % (x,y)
809
810
811 s += ' x [style=invis]; x->0000.0000 [style=invis];\n'
812
813
814 for x in range(self.num_leaves()+1):
815 s += ' {rank=same;'
816 for y in range(self.num_edges()+1):
817 if y == 0 or (x <= self._edges[y-1].start() or
818 x >= self._edges[y-1].end()):
819 s += ' %04d.%04d' % (x,y)
820 s += '}\n'
821
822
823 s += ' edge [style=invis, weight=100];\n'
824 s += ' node [shape=plaintext]\n'
825 s += ' 0000.0000'
826 for x in range(self.num_leaves()):
827 s += '->%s->%04d.0000' % (self.leaf(x), x+1)
828 s += ';\n\n'
829
830
831 s += ' edge [style=solid, weight=1];\n'
832 for y, edge in enumerate(self):
833 for x in range(edge.start()):
834 s += (' %04d.%04d -> %04d.%04d [style="invis"];\n' %
835 (x, y+1, x+1, y+1))
836 s += (' %04d.%04d -> %04d.%04d [label="%s"];\n' %
837 (edge.start(), y+1, edge.end(), y+1, edge))
838 for x in range(edge.end(), self.num_leaves()):
839 s += (' %04d.%04d -> %04d.%04d [style="invis"];\n' %
840 (x, y+1, x+1, y+1))
841 s += '}\n'
842 return s
843
844
845
846
847
849 """
850 A rule that specifies what new edges are licensed by any given set
851 of existing edges. Each chart rule expects a fixed number of
852 edges, as indicated by the class variable L{NUM_EDGES}. In
853 particular:
854
855 - A chart rule with C{NUM_EDGES=0} specifies what new edges are
856 licensed, regardless of existing edges.
857
858 - A chart rule with C{NUM_EDGES=1} specifies what new edges are
859 licensed by a single existing edge.
860
861 - A chart rule with C{NUM_EDGES=2} specifies what new edges are
862 licensed by a pair of existing edges.
863
864 @type NUM_EDGES: C{int}
865 @cvar NUM_EDGES: The number of existing edges that this rule uses
866 to license new edges. Typically, this number ranges from zero
867 to two.
868 """
869 - def apply(self, chart, grammar, *edges):
870 """
871 Add the edges licensed by this rule and the given edges to the
872 chart.
873
874 @type edges: C{list} of L{EdgeI}
875 @param edges: A set of existing edges. The number of edges
876 that should be passed to C{apply} is specified by the
877 L{NUM_EDGES} class variable.
878 @rtype: C{list} of L{EdgeI}
879 @return: A list of the edges that were added.
880 """
881 raise AssertionError, 'ChartRuleI is an abstract interface'
882
884 """
885 @return: A generator that will add edges licensed by this rule
886 and the given edges to the chart, one at a time. Each
887 time the generator is resumed, it will either add a new
888 edge and yield that edge; or return.
889 @rtype: C{iter} of L{EdgeI}
890
891 @type edges: C{list} of L{EdgeI}
892 @param edges: A set of existing edges. The number of edges
893 that should be passed to C{apply} is specified by the
894 L{NUM_EDGES} class variable.
895 """
896 raise AssertionError, 'ChartRuleI is an abstract interface'
897
899 """
900 Add all the edges licensed by this rule and the edges in the
901 chart to the chart.
902
903 @rtype: C{list} of L{EdgeI}
904 @return: A list of the edges that were added.
905 """
906 raise AssertionError, 'ChartRuleI is an abstract interface'
907
909 """
910 @return: A generator that will add all edges licensed by
911 this rule, given the edges that are currently in the
912 chart, one at a time. Each time the generator is resumed,
913 it will either add a new edge and yield that edge; or
914 return.
915 @rtype: C{iter} of L{EdgeI}
916 """
917 raise AssertionError, 'ChartRuleI is an abstract interface'
918
920 """
921 An abstract base class for chart rules. C{AbstractChartRule}
922 provides:
923 - A default implementation for C{apply}, based on C{apply_iter}.
924 - A default implementation for C{apply_everywhere_iter},
925 based on C{apply_iter}.
926 - A default implementation for C{apply_everywhere}, based on
927 C{apply_everywhere_iter}. Currently, this implementation
928 assumes that C{NUM_EDGES}<=3.
929 - A default implementation for C{__str__}, which returns a
930 name basd on the rule's class name.
931 """
932
933
935 raise AssertionError, 'AbstractChartRule is an abstract class'
936
937
938
940 if self.NUM_EDGES == 0:
941 for new_edge in self.apply_iter(chart, grammar):
942 yield new_edge
943
944 elif self.NUM_EDGES == 1:
945 for e1 in chart:
946 for new_edge in self.apply_iter(chart, grammar, e1):
947 yield new_edge
948
949 elif self.NUM_EDGES == 2:
950 for e1 in chart:
951 for e2 in chart:
952 for new_edge in self.apply_iter(chart, grammar, e1, e2):
953 yield new_edge
954
955 elif self.NUM_EDGES == 3:
956 for e1 in chart:
957 for e2 in chart:
958 for e3 in chart:
959 for new_edge in self.apply_iter(chart,grammar,e1,e2,e3):
960 yield new_edge
961
962 else:
963 raise AssertionError, 'NUM_EDGES>3 is not currently supported'
964
965
966 - def apply(self, chart, grammar, *edges):
968
969
972
973
975
976 return re.sub('([a-z])([A-Z])', r'\1 \2', self.__class__.__name__)
977
978
979
980
981
983 """
984 A rule that joins two adjacent edges to form a single combined
985 edge. In particular, this rule specifies that any pair of edges:
986
987 - [A S{->} S{alpha} * B S{beta}][i:j]
988 - [B S{->} S{gamma} *][j:k]
989 licenses the edge:
990 - [A S{->} S{alpha} B * S{beta}][i:j]
991 """
992 NUM_EDGES = 2
993 - def apply_iter(self, chart, grammar, left_edge, right_edge):
1007
1009 """
1010 A rule that joins a given edge with adjacent edges in the chart,
1011 to form combined edges. In particular, this rule specifies that
1012 either of the edges:
1013 - [A S{->} S{alpha} * B S{beta}][i:j]
1014 - [B S{->} S{gamma} *][j:k]
1015 licenses the edge:
1016 - [A S{->} S{alpha} B * S{beta}][i:j]
1017 if the other edge is already in the chart.
1018
1019 @note: This is basically L{FundamentalRule}, with one edge left
1020 unspecified.
1021 """
1022 NUM_EDGES = 1
1023
1031
1039
1047
1048
1049
1050
1051
1059
1060
1061
1062
1063
1065 """
1066 A rule licensing edges corresponding to the grammar productions for
1067 the grammar's start symbol. In particular, this rule specifies that:
1068 - [S S{->} * S{alpha}][0:i]
1069 is licensed for each grammar production C{S S{->} S{alpha}}, where
1070 C{S} is the grammar's start symbol.
1071 """
1072 NUM_EDGES = 0
1078
1080 """
1081 A rule licensing edges corresponding to the grammar productions
1082 for the nonterminal following an incomplete edge's dot. In
1083 particular, this rule specifies that:
1084 - [A S{->} S{alpha} * B S{beta}][i:j]
1085 licenses the edge:
1086 - [B S{->} * S{gamma}][j:j]
1087 for each grammar production C{B S{->} S{gamma}}.
1088
1089 @note: This rule corresponds to the Predictor Rule in Earley parsing.
1090 """
1091 NUM_EDGES = 1
1098
1100 """
1101 A cached version of L{TopDownPredictRule}. After the first time
1102 this rule is applied to an edge with a given C{end} and C{next},
1103 it will not generate any more edges for edges with that C{end} and
1104 C{next}.
1105
1106 If C{chart} or C{grammar} are changed, then the cache is flushed.
1107 """
1111
1138
1139
1140
1141
1142
1144 """
1145 A rule licensing any edge corresponding to a production whose
1146 right-hand side begins with a complete edge's left-hand side. In
1147 particular, this rule specifies that:
1148 - [A S{->} S{alpha} *]
1149 licenses the edge:
1150 - [B S{->} * A S{beta}]
1151 for each grammar production C{B S{->} A S{beta}}.
1152 """
1153 NUM_EDGES = 1
1160
1162 """
1163 A rule licensing any edge corresponding to a production whose
1164 right-hand side begins with a complete edge's left-hand side. In
1165 particular, this rule specifies that:
1166 - [A S{->} S{alpha} *]
1167 licenses the edge:
1168 - [B S{->} A * S{beta}]
1169 for each grammar production C{B S{->} A S{beta}}.
1170
1171 @note: This is like L{BottomUpPredictRule}, but it also applies
1172 the L{FundamentalRule} to the resulting edge.
1173 """
1174 NUM_EDGES = 1
1181
1183 """
1184 A rule that inserts all empty productions as passive edges,
1185 in every position in the chart.
1186 """
1187 NUM_EDGES = 0
1194
1195
1196
1197
1198
1199
1222
1234
1243
1244
1245
1246
1247
1248
1249 TD_STRATEGY = [LeafInitRule(),
1250 TopDownInitRule(),
1251 CachedTopDownPredictRule(),
1252 SingleEdgeFundamentalRule()]
1253 BU_STRATEGY = [LeafInitRule(),
1254 EmptyPredictRule(),
1255 BottomUpPredictRule(),
1256 SingleEdgeFundamentalRule()]
1257 BU_LC_STRATEGY = [LeafInitRule(),
1258 EmptyPredictRule(),
1259 BottomUpPredictCombineRule(),
1260 SingleEdgeFundamentalRule()]
1261
1262 LC_STRATEGY = [LeafInitRule(),
1263 FilteredBottomUpPredictCombineRule(),
1264 FilteredSingleEdgeFundamentalRule()]
1265
1267 """
1268 A generic chart parser. A X{strategy}, or list of
1269 L{ChartRules<ChartRuleI>}, is used to decide what edges to add to
1270 the chart. In particular, C{ChartParser} uses the following
1271 algorithm to parse texts:
1272
1273 - Until no new edges are added:
1274 - For each I{rule} in I{strategy}:
1275 - Apply I{rule} to any applicable edges in the chart.
1276 - Return any complete parses in the chart
1277 """
1280 """
1281 Create a new chart parser, that uses C{grammar} to parse
1282 texts.
1283
1284 @type grammar: L{ContextFreeGrammar}
1285 @param grammar: The grammar used to parse texts.
1286 @type strategy: C{list} of L{ChartRuleI}
1287 @param strategy: A list of rules that should be used to decide
1288 what edges to add to the chart (top-down strategy by default).
1289 @type trace: C{int}
1290 @param trace: The level of tracing that should be used when
1291 parsing a text. C{0} will generate no tracing output;
1292 and higher numbers will produce more verbose tracing
1293 output.
1294 @type trace_chart_width: C{int}
1295 @param trace_chart_width: The default total width reserved for
1296 the chart in trace output. The remainder of each line will
1297 be used to display edges.
1298 @type use_agenda: C{bool}
1299 @param use_agenda: Use an optimized agenda-based algorithm,
1300 if possible.
1301 @param chart_class: The class that should be used to create
1302 the parse charts.
1303 """
1304 self._grammar = grammar
1305 self._strategy = strategy
1306 self._trace = trace
1307 self._trace_chart_width = trace_chart_width
1308
1309
1310 self._use_agenda = use_agenda
1311 self._chart_class = chart_class
1312
1313 self._axioms = []
1314 self._inference_rules = []
1315 for rule in strategy:
1316 if rule.NUM_EDGES == 0:
1317 self._axioms.append(rule)
1318 elif rule.NUM_EDGES == 1:
1319 self._inference_rules.append(rule)
1320 else:
1321 self._use_agenda = False
1322
1324 return self._grammar
1325
1327 if not trace: return
1328 should_print_rule_header = trace > 1
1329 for edge in new_edges:
1330 if should_print_rule_header:
1331 print '%s:' % rule
1332 should_print_rule_header = False
1333 print chart.pp_edge(edge, edge_width)
1334
1336 """
1337 @return: The final parse L{Chart},
1338 from which all possible parse trees can be extracted.
1339
1340 @param tokens: The sentence to be parsed
1341 @type tokens: L{list} of L{string}
1342 @rtype: L{Chart}
1343 """
1344 if trace is None: trace = self._trace
1345 trace_new_edges = self._trace_new_edges
1346
1347 tokens = list(tokens)
1348 self._grammar.check_coverage(tokens)
1349 chart = self._chart_class(tokens)
1350 grammar = self._grammar
1351
1352
1353 trace_edge_width = self._trace_chart_width / (chart.num_leaves() + 1)
1354 if trace: print chart.pp_leaves(trace_edge_width)
1355
1356 if self._use_agenda:
1357
1358 for axiom in self._axioms:
1359 new_edges = axiom.apply(chart, grammar)
1360 trace_new_edges(chart, axiom, new_edges, trace, trace_edge_width)
1361
1362 inference_rules = self._inference_rules
1363 agenda = chart.edges()
1364
1365
1366 agenda.reverse()
1367 while agenda:
1368 edge = agenda.pop()
1369 for rule in inference_rules:
1370 new_edges = rule.apply_iter(chart, grammar, edge)
1371 if trace:
1372 new_edges = list(new_edges)
1373 trace_new_edges(chart, rule, new_edges, trace, trace_edge_width)
1374 agenda += new_edges
1375
1376 else:
1377
1378 edges_added = True
1379 while edges_added:
1380 edges_added = False
1381 for rule in self._strategy:
1382 new_edges = rule.apply_everywhere(chart, grammar)
1383 edges_added = len(new_edges)
1384 trace_new_edges(chart, rule, new_edges, trace, trace_edge_width)
1385
1386
1387 return chart
1388
1393
1395 """
1396 A L{ChartParser} using a top-down parsing strategy.
1397 See L{ChartParser} for more information.
1398 """
1399 - def __init__(self, grammar, **parser_args):
1400 ChartParser.__init__(self, grammar, TD_STRATEGY, **parser_args)
1401
1403 """
1404 A L{ChartParser} using a bottom-up parsing strategy.
1405 See L{ChartParser} for more information.
1406 """
1407 - def __init__(self, grammar, **parser_args):
1408 if isinstance(grammar, WeightedGrammar):
1409 warnings.warn("BottomUpChartParser only works for ContextFreeGrammar, "
1410 "use BottomUpProbabilisticChartParser instead",
1411 category=DeprecationWarning)
1412 ChartParser.__init__(self, grammar, BU_STRATEGY, **parser_args)
1413
1415 """
1416 A L{ChartParser} using a bottom-up left-corner parsing strategy.
1417 This strategy is often more efficient than standard bottom-up.
1418 See L{ChartParser} for more information.
1419 """
1420 - def __init__(self, grammar, **parser_args):
1421 ChartParser.__init__(self, grammar, BU_LC_STRATEGY, **parser_args)
1422
1429
1430
1431
1432
1433
1435 """
1436 A C{ChartParser} that allows you to step through the parsing
1437 process, adding a single edge at a time. It also allows you to
1438 change the parser's strategy or grammar midway through parsing a
1439 text.
1440
1441 The C{initialize} method is used to start parsing a text. C{step}
1442 adds a single edge to the chart. C{set_strategy} changes the
1443 strategy used by the chart parser. C{parses} returns the set of
1444 parses that has been found by the chart parser.
1445
1446 @ivar _restart: Records whether the parser's strategy, grammar,
1447 or chart has been changed. If so, then L{step} must restart
1448 the parsing algorithm.
1449 """
1450 - def __init__(self, grammar, strategy=[], trace=0):
1455
1456
1457
1458
1459
1461 "Begin parsing the given tokens."
1462 self._chart = Chart(list(tokens))
1463 self._restart = True
1464
1465
1466
1467
1468
1470 """
1471 @return: A generator that adds edges to the chart, one at a
1472 time. Each time the generator is resumed, it adds a single
1473 edge and yields that edge. If no more edges can be added,
1474 then it yields C{None}.
1475
1476 If the parser's strategy, grammar, or chart is changed, then
1477 the generator will continue adding edges using the new
1478 strategy, grammar, or chart.
1479
1480 Note that this generator never terminates, since the grammar
1481 or strategy might be changed to values that would add new
1482 edges. Instead, it yields C{None} when no more edges can be
1483 added with the current strategy and grammar.
1484 """
1485 if self._chart is None:
1486 raise ValueError, 'Parser must be initialized first'
1487 while 1:
1488 self._restart = False
1489 w = 50/(self._chart.num_leaves()+1)
1490
1491 for e in self._parse():
1492 if self._trace > 1: print self._current_chartrule
1493 if self._trace > 0: print self._chart.pp_edge(e,w)
1494 yield e
1495 if self._restart: break
1496 else:
1497 yield None
1498
1500 """
1501 A generator that implements the actual parsing algorithm.
1502 L{step} iterates through this generator, and restarts it
1503 whenever the parser's strategy, grammar, or chart is modified.
1504 """
1505 chart = self._chart
1506 grammar = self._grammar
1507 edges_added = 1
1508 while edges_added > 0:
1509 edges_added = 0
1510 for rule in self._strategy:
1511 self._current_chartrule = rule
1512 for e in rule.apply_everywhere_iter(chart, grammar):
1513 edges_added += 1
1514 yield e
1515
1516
1517
1518
1519
1521 "@return: The strategy used by this parser."
1522 return self._strategy
1523
1525 "@return: The grammar used by this parser."
1526 return self._grammar
1527
1529 "@return: The chart that is used by this parser."
1530 return self._chart
1531
1533 "@return: The chart rule used to generate the most recent edge."
1534 return self._current_chartrule
1535
1537 "@return: The parse trees currently contained in the chart."
1538 return self._chart.parses(self._grammar.start(), tree_class)
1539
1540
1541
1542
1543
1545 """
1546 Change the startegy that the parser uses to decide which edges
1547 to add to the chart.
1548 @type strategy: C{list} of L{ChartRuleI}
1549 @param strategy: A list of rules that should be used to decide
1550 what edges to add to the chart.
1551 """
1552 if strategy == self._strategy: return
1553 self._strategy = strategy[:]
1554 self._restart = True
1555
1557 "Change the grammar used by the parser."
1558 if grammar is self._grammar: return
1559 self._grammar = grammar
1560 self._restart = True
1561
1563 "Load a given chart into the chart parser."
1564 if chart is self._chart: return
1565 self._chart = chart
1566 self._restart = True
1567
1568
1569
1570
1571
1585
1586
1587
1588
1589
1591 from nltk.grammar import parse_cfg
1592 return parse_cfg("""
1593 S -> NP VP
1594 PP -> "with" NP
1595 NP -> NP PP
1596 VP -> VP PP
1597 VP -> Verb NP
1598 VP -> Verb
1599 NP -> Det Noun
1600 NP -> "John"
1601 NP -> "I"
1602 Det -> "the"
1603 Det -> "my"
1604 Det -> "a"
1605 Noun -> "dog"
1606 Noun -> "cookie"
1607 Verb -> "ate"
1608 Verb -> "saw"
1609 Prep -> "with"
1610 Prep -> "under"
1611 """)
1612
1613 -def demo(choice=None,
1614 should_print_times=True, should_print_grammar=False,
1615 should_print_trees=True, trace=2,
1616 sent='I saw John with a dog with my cookie', numparses=5):
1617 """
1618 A demonstration of the chart parsers.
1619 """
1620 import sys, time
1621 from nltk import nonterminals, Production, ContextFreeGrammar
1622
1623
1624 grammar = demo_grammar()
1625 if should_print_grammar:
1626 print "* Grammar"
1627 print grammar
1628
1629
1630 print "* Sentence:"
1631 print sent
1632 tokens = sent.split()
1633 print tokens
1634 print
1635
1636
1637
1638 if choice is None:
1639 print ' 1: Top-down chart parser'
1640 print ' 2: Bottom-up chart parser'
1641 print ' 3: Bottom-up left-corner chart parser'
1642 print ' 4: Left-corner chart parser with bottom-up filter'
1643 print ' 5: Stepping chart parser (alternating top-down & bottom-up)'
1644 print ' 6: All parsers'
1645 print '\nWhich parser (1-6)? ',
1646 choice = sys.stdin.readline().strip()
1647 print
1648
1649 choice = str(choice)
1650 if choice not in "123456":
1651 print 'Bad parser number'
1652 return
1653
1654
1655 times = {}
1656
1657 strategies = {'1': ('Top-down', TD_STRATEGY),
1658 '2': ('Bottom-up', BU_STRATEGY),
1659 '3': ('Bottom-up left-corner', BU_LC_STRATEGY),
1660 '4': ('Filtered left-corner', LC_STRATEGY)}
1661 choices = []
1662 if strategies.has_key(choice): choices = [choice]
1663 if choice=='6': choices = "1234"
1664
1665
1666 for strategy in choices:
1667 print "* Strategy: " + strategies[strategy][0]
1668 print
1669 cp = ChartParser(grammar, strategies[strategy][1], trace=trace)
1670 t = time.time()
1671 chart = cp.chart_parse(tokens)
1672 parses = chart.parses(grammar.start())
1673 times[strategies[strategy][0]] = time.time()-t
1674 print "Nr edges in chart:", len(chart.edges())
1675 if numparses:
1676 assert len(parses)==numparses, 'Not all parses found'
1677 if should_print_trees:
1678 for tree in parses: print tree
1679 else:
1680 print "Nr trees:", len(parses)
1681 print
1682
1683
1684 if choice in "56":
1685 print "* Strategy: Stepping (top-down vs bottom-up)"
1686 print
1687 t = time.time()
1688 cp = SteppingChartParser(grammar, trace=trace)
1689 cp.initialize(tokens)
1690 for i in range(5):
1691 print '*** SWITCH TO TOP DOWN'
1692 cp.set_strategy(TD_STRATEGY)
1693 for j, e in enumerate(cp.step()):
1694 if j>20 or e is None: break
1695 print '*** SWITCH TO BOTTOM UP'
1696 cp.set_strategy(BU_STRATEGY)
1697 for j, e in enumerate(cp.step()):
1698 if j>20 or e is None: break
1699 times['Stepping'] = time.time()-t
1700 print "Nr edges in chart:", len(cp.chart().edges())
1701 if numparses:
1702 assert len(cp.parses())==numparses, 'Not all parses found'
1703 if should_print_trees:
1704 for tree in cp.parses(): print tree
1705 else:
1706 print "Nr trees:", len(cp.parses())
1707 print
1708
1709
1710 if not (should_print_times and times): return
1711 print "* Parsing times"
1712 print
1713 maxlen = max(len(key) for key in times.keys())
1714 format = '%' + `maxlen` + 's parser: %6.3fsec'
1715 times_items = times.items()
1716 times_items.sort(lambda a,b:cmp(a[1], b[1]))
1717 for (parser, t) in times_items:
1718 print format % (parser, t)
1719
1720 if __name__ == '__main__': demo()
1721