1
2
3
4
5
6
7
8
9 import re
10 import types
11
12 from nltk.tree import Tree
13
14 from nltk.chunk.api import *
15 from nltk.chunk.util import *
22 """
23 A string-based encoding of a particular chunking of a text.
24 Internally, the C{ChunkString} class uses a single string to
25 encode the chunking of the input text. This string contains a
26 sequence of angle-bracket delimited tags, with chunking indicated
27 by braces. An example of this encoding is::
28
29 {<DT><JJ><NN>}<VBN><IN>{<DT><NN>}<.>{<DT><NN>}<VBD><.>
30
31 C{ChunkString} are created from tagged texts (i.e., C{list}s of
32 C{tokens} whose type is C{TaggedType}). Initially, nothing is
33 chunked.
34
35 The chunking of a C{ChunkString} can be modified with the C{xform}
36 method, which uses a regular expression to transform the string
37 representation. These transformations should only add and remove
38 braces; they should I{not} modify the sequence of angle-bracket
39 delimited tags.
40
41 @type _str: C{string}
42 @ivar _str: The internal string representation of the text's
43 encoding. This string representation contains a sequence of
44 angle-bracket delimited tags, with chunking indicated by
45 braces. An example of this encoding is::
46
47 {<DT><JJ><NN>}<VBN><IN>{<DT><NN>}<.>{<DT><NN>}<VBD><.>
48
49 @type _pieces: C{list} of pieces (tagged tokens and chunks)
50 @ivar _pieces: The tagged tokens and chunks encoded by this C{ChunkString}.
51 @ivar _debug: The debug level. See the constructor docs.
52
53 @cvar IN_CHUNK_PATTERN: A zero-width regexp pattern string that
54 will only match positions that are in chunks.
55 @cvar IN_CHINK_PATTERN: A zero-width regexp pattern string that
56 will only match positions that are in chinks.
57 """
58 CHUNK_TAG_CHAR = r'[^\{\}<>]'
59 CHUNK_TAG = r'(<%s+?>)' % CHUNK_TAG_CHAR
60
61 IN_CHUNK_PATTERN = r'(?=[^\{]*\})'
62 IN_CHINK_PATTERN = r'(?=[^\}]*(\{|$))'
63
64
65 _CHUNK = r'(\{%s+?\})+?' % CHUNK_TAG
66 _CHINK = r'(%s+?)+?' % CHUNK_TAG
67 _VALID = re.compile(r'^(\{?%s\}?)*?$' % CHUNK_TAG)
68 _BRACKETS = re.compile('[^\{\}]+')
69 _BALANCED_BRACKETS = re.compile(r'(\{\})*$')
70
71 - def __init__(self, chunk_struct, debug_level=1):
72 """
73 Construct a new C{ChunkString} that encodes the chunking of
74 the text C{tagged_tokens}.
75
76 @type chunk_struct: C{Tree}
77 @param chunk_struct: The chunk structure to be further chunked.
78 @type debug_level: int
79 @param debug_level: The level of debugging which should be
80 applied to transformations on the C{ChunkString}. The
81 valid levels are:
82 - 0: no checks
83 - 1: full check on to_chunkstruct
84 - 2: full check on to_chunkstruct and cursory check after
85 each transformation.
86 - 3: full check on to_chunkstruct and full check after
87 each transformation.
88 We recommend you use at least level 1. You should
89 probably use level 3 if you use any non-standard
90 subclasses of C{RegexpChunkRule}.
91 """
92 self._top_node = chunk_struct.node
93 self._pieces = chunk_struct[:]
94 tags = [self._tag(tok) for tok in self._pieces]
95 self._str = '<' + '><'.join(tags) + '>'
96 self._debug = debug_level
97
98 - def _tag(self, tok):
99 if type(tok) == types.TupleType:
100 return tok[1]
101 elif isinstance(tok, Tree):
102 return tok.node
103 else:
104 raise ValueError('chunk structures must contain tagged '
105 'tokens or trees')
106
107 - def _verify(self, s, verify_tags):
108 """
109 Check to make sure that C{s} still corresponds to some chunked
110 version of C{_pieces}.
111
112 @type verify_tags: C{boolean}
113 @param verify_tags: Whether the individual tags should be
114 checked. If this is false, C{_verify} will check to make
115 sure that C{_str} encodes a chunked version of I{some}
116 list of tokens. If this is true, then C{_verify} will
117 check to make sure that the tags in C{_str} match those in
118 C{_pieces}.
119
120 @raise ValueError: if this C{ChunkString}'s internal string
121 representation is invalid or not consistent with _pieces.
122 """
123
124 if not ChunkString._VALID.match(s):
125 raise ValueError('Transformation generated invalid '
126 'chunkstring:\n %s' % s)
127
128
129
130
131 brackets = ChunkString._BRACKETS.sub('', s)
132 for i in range(1+len(brackets)/5000):
133 substr = brackets[i*5000:i*5000+5000]
134 if not ChunkString._BALANCED_BRACKETS.match(substr):
135 raise ValueError('Transformation generated invalid '
136 'chunkstring:\n %s' % s)
137
138 if verify_tags<=0: return
139
140 tags1 = (re.split(r'[\{\}<>]+', s))[1:-1]
141 tags2 = [self._tag(piece) for piece in self._pieces]
142 if tags1 != tags2:
143 raise ValueError('Transformation generated invalid '
144 'chunkstring: tag changed')
145
147 """
148 @return: the chunk structure encoded by this C{ChunkString}.
149 @rtype: C{Tree}
150 @raise ValueError: If a transformation has generated an
151 invalid chunkstring.
152 """
153 if self._debug > 0: self._verify(self._str, 1)
154
155
156 pieces = []
157 index = 0
158 piece_in_chunk = 0
159 for piece in re.split('[{}]', self._str):
160
161
162 length = piece.count('<')
163 subsequence = self._pieces[index:index+length]
164
165
166 if piece_in_chunk:
167 pieces.append(Tree(chunk_node, subsequence))
168 else:
169 pieces += subsequence
170
171
172 index += length
173 piece_in_chunk = not piece_in_chunk
174
175 return Tree(self._top_node, pieces)
176
215
217 """
218 @rtype: C{string}
219 @return: A string representation of this C{ChunkString}. This
220 string representation has the form::
221
222 <ChunkString: '{<DT><JJ><NN>}<VBN><IN>{<DT><NN>}'>
223
224 """
225 return '<ChunkString: %s>' % `self._str`
226
228 """
229 @rtype: C{string}
230 @return: A formatted representation of this C{ChunkString}'s
231 string encoding. This representation will include extra
232 spaces to ensure that tags will line up with the
233 representation of other C{ChunkStrings} for the same text,
234 regardless of the chunking.
235 """
236
237 str = re.sub(r'>(?!\})', r'> ', self._str)
238 str = re.sub(r'([^\{])<', r'\1 <', str)
239 if str[0] == '<': str = ' ' + str
240 return str
241
247 """
248 A rule specifying how to modify the chunking in a C{ChunkString},
249 using a transformational regular expression. The
250 C{RegexpChunkRule} class itself can be used to implement any
251 transformational rule based on regular expressions. There are
252 also a number of subclasses, which can be used to implement
253 simpler types of rules, based on matching regular expressions.
254
255 Each C{RegexpChunkRule} has a regular expression and a
256 replacement expression. When a C{RegexpChunkRule} is X{applied}
257 to a C{ChunkString}, it searches the C{ChunkString} for any
258 substring that matches the regular expression, and replaces it
259 using the replacement expression. This search/replace operation
260 has the same semantics as C{re.sub}.
261
262 Each C{RegexpChunkRule} also has a description string, which
263 gives a short (typically less than 75 characters) description of
264 the purpose of the rule.
265
266 This transformation defined by this C{RegexpChunkRule} should
267 only add and remove braces; it should I{not} modify the sequence
268 of angle-bracket delimited tags. Furthermore, this transformation
269 may not result in nested or mismatched bracketing.
270 """
271 - def __init__(self, regexp, repl, descr):
272 """
273 Construct a new RegexpChunkRule.
274
275 @type regexp: C{regexp} or C{string}
276 @param regexp: This C{RegexpChunkRule}'s regular expression.
277 When this rule is applied to a C{ChunkString}, any
278 substring that matches C{regexp} will be replaced using
279 the replacement string C{repl}. Note that this must be a
280 normal regular expression, not a tag pattern.
281 @type repl: C{string}
282 @param repl: This C{RegexpChunkRule}'s replacement
283 expression. When this rule is applied to a
284 C{ChunkString}, any substring that matches C{regexp} will
285 be replaced using C{repl}.
286 @type descr: C{string}
287 @param descr: A short description of the purpose and/or effect
288 of this rule.
289 """
290 if isinstance(regexp, basestring):
291 regexp = re.compile(regexp)
292 self._repl = repl
293 self._descr = descr
294 self._regexp = regexp
295
296 - def apply(self, chunkstr):
297
298 """
299 Apply this rule to the given C{ChunkString}. See the
300 class reference documentation for a description of what it
301 means to apply a rule.
302
303 @type chunkstr: C{ChunkString}
304 @param chunkstr: The chunkstring to which this rule is
305 applied.
306 @rtype: C{None}
307 @raise ValueError: If this transformation generated an
308 invalid chunkstring.
309 """
310 chunkstr.xform(self._regexp, self._repl)
311
313 """
314 @rtype: C{string}
315 @return: a short description of the purpose and/or effect of
316 this rule.
317 """
318 return self._descr
319
321 """
322 @rtype: C{string}
323 @return: A string representation of this rule. This
324 string representation has the form::
325
326 <RegexpChunkRule: '{<IN|VB.*>}'->'<IN>'>
327
328 Note that this representation does not include the
329 description string; that string can be accessed
330 separately with the C{descr} method.
331 """
332 return ('<RegexpChunkRule: '+`self._regexp.pattern`+
333 '->'+`self._repl`+'>')
334
335 @staticmethod
337 """
338 Create a RegexpChunkRule from a string description.
339 Currently, the following formats are supported::
340
341 {regexp} # chunk rule
342 }regexp{ # chink rule
343 regexp}{regexp # split rule
344 regexp{}regexp # merge rule
345
346 Where C{regexp} is a regular expression for the rule. Any
347 text following the comment marker (C{#}) will be used as
348 the rule's description:
349
350 >>> RegexpChunkRule.parse('{<DT>?<NN.*>+}')
351 <ChunkRule: '<DT>?<NN.*>+'>
352 """
353
354 m = re.match(r'(?P<rule>(\\.|[^#])*)(?P<comment>#.*)?', s)
355 rule = m.group('rule').strip()
356 comment = (m.group('comment') or '')[1:].strip()
357
358
359 try:
360 if not rule:
361 raise ValueError('Empty chunk pattern')
362 if rule[0] == '{' and rule[-1] == '}':
363 return ChunkRule(rule[1:-1], comment)
364 elif rule[0] == '}' and rule[-1] == '{':
365 return ChinkRule(rule[1:-1], comment)
366 elif '}{' in rule:
367 left, right = rule.split('}{')
368 return SplitRule(left, right, comment)
369 elif '{}' in rule:
370 left, right = rule.split('{}')
371 return MergeRule(left, right, comment)
372 elif re.match('[^{}]*{[^{}]*}[^{}]*', rule):
373 left, chunk, right = re.split('[{}]', rule)
374 return ChunkRuleWithContext(left, chunk, right, comment)
375 else:
376 raise ValueError('Illegal chunk pattern: %s' % rule)
377 except (ValueError, re.error):
378 raise ValueError('Illegal chunk pattern: %s' % rule)
379
382 """
383 A rule specifying how to add chunks to a C{ChunkString}, using a
384 matching tag pattern. When applied to a C{ChunkString}, it will
385 find any substring that matches this tag pattern and that is not
386 already part of a chunk, and create a new chunk containing that
387 substring.
388 """
389 - def __init__(self, tag_pattern, descr):
390
391 """
392 Construct a new C{ChunkRule}.
393
394 @type tag_pattern: C{string}
395 @param tag_pattern: This rule's tag pattern. When
396 applied to a C{ChunkString}, this rule will
397 chunk any substring that matches this tag pattern and that
398 is not already part of a chunk.
399 @type descr: C{string}
400 @param descr: A short description of the purpose and/or effect
401 of this rule.
402 """
403 self._pattern = tag_pattern
404 regexp = re.compile('(?P<chunk>%s)%s' %
405 (tag_pattern2re_pattern(tag_pattern),
406 ChunkString.IN_CHINK_PATTERN))
407 RegexpChunkRule.__init__(self, regexp, '{\g<chunk>}', descr)
408
410 """
411 @rtype: C{string}
412 @return: A string representation of this rule. This
413 string representation has the form::
414
415 <ChunkRule: '<IN|VB.*>'>
416
417 Note that this representation does not include the
418 description string; that string can be accessed
419 separately with the C{descr} method.
420 """
421 return '<ChunkRule: '+`self._pattern`+'>'
422
424 """
425 A rule specifying how to remove chinks to a C{ChunkString},
426 using a matching tag pattern. When applied to a
427 C{ChunkString}, it will find any substring that matches this
428 tag pattern and that is contained in a chunk, and remove it
429 from that chunk, thus creating two new chunks.
430 """
431 - def __init__(self, tag_pattern, descr):
432 """
433 Construct a new C{ChinkRule}.
434
435 @type tag_pattern: C{string}
436 @param tag_pattern: This rule's tag pattern. When
437 applied to a C{ChunkString}, this rule will
438 find any substring that matches this tag pattern and that
439 is contained in a chunk, and remove it from that chunk,
440 thus creating two new chunks.
441 @type descr: C{string}
442 @param descr: A short description of the purpose and/or effect
443 of this rule.
444 """
445 self._pattern = tag_pattern
446 regexp = re.compile('(?P<chink>%s)%s' %
447 (tag_pattern2re_pattern(tag_pattern),
448 ChunkString.IN_CHUNK_PATTERN))
449 RegexpChunkRule.__init__(self, regexp, '}\g<chink>{', descr)
450
452 """
453 @rtype: C{string}
454 @return: A string representation of this rule. This
455 string representation has the form::
456
457 <ChinkRule: '<IN|VB.*>'>
458
459 Note that this representation does not include the
460 description string; that string can be accessed
461 separately with the C{descr} method.
462 """
463 return '<ChinkRule: '+`self._pattern`+'>'
464
466 """
467 A rule specifying how to remove chunks to a C{ChunkString},
468 using a matching tag pattern. When applied to a
469 C{ChunkString}, it will find any complete chunk that matches this
470 tag pattern, and un-chunk it.
471 """
472 - def __init__(self, tag_pattern, descr):
473 """
474 Construct a new C{UnChunkRule}.
475
476 @type tag_pattern: C{string}
477 @param tag_pattern: This rule's tag pattern. When
478 applied to a C{ChunkString}, this rule will
479 find any complete chunk that matches this tag pattern,
480 and un-chunk it.
481 @type descr: C{string}
482 @param descr: A short description of the purpose and/or effect
483 of this rule.
484 """
485 self._pattern = tag_pattern
486 regexp = re.compile('\{(?P<chunk>%s)\}' %
487 tag_pattern2re_pattern(tag_pattern))
488 RegexpChunkRule.__init__(self, regexp, '\g<chunk>', descr)
489
491 """
492 @rtype: C{string}
493 @return: A string representation of this rule. This
494 string representation has the form::
495
496 <UnChunkRule: '<IN|VB.*>'>
497
498 Note that this representation does not include the
499 description string; that string can be accessed
500 separately with the C{descr} method.
501 """
502 return '<UnChunkRule: '+`self._pattern`+'>'
503
505 """
506 A rule specifying how to merge chunks in a C{ChunkString}, using
507 two matching tag patterns: a left pattern, and a right pattern.
508 When applied to a C{ChunkString}, it will find any chunk whose end
509 matches left pattern, and immediately followed by a chunk whose
510 beginning matches right pattern. It will then merge those two
511 chunks into a single chunk.
512 """
513 - def __init__(self, left_tag_pattern, right_tag_pattern, descr):
514 """
515 Construct a new C{MergeRule}.
516
517 @type right_tag_pattern: C{string}
518 @param right_tag_pattern: This rule's right tag
519 pattern. When applied to a C{ChunkString}, this
520 rule will find any chunk whose end matches
521 C{left_tag_pattern}, and immediately followed by a chunk
522 whose beginning matches this pattern. It will
523 then merge those two chunks into a single chunk.
524 @type left_tag_pattern: C{string}
525 @param left_tag_pattern: This rule's left tag
526 pattern. When applied to a C{ChunkString}, this
527 rule will find any chunk whose end matches
528 this pattern, and immediately followed by a chunk
529 whose beginning matches C{right_tag_pattern}. It will
530 then merge those two chunks into a single chunk.
531
532 @type descr: C{string}
533 @param descr: A short description of the purpose and/or effect
534 of this rule.
535 """
536
537
538 re.compile(tag_pattern2re_pattern(left_tag_pattern))
539 re.compile(tag_pattern2re_pattern(right_tag_pattern))
540
541 self._left_tag_pattern = left_tag_pattern
542 self._right_tag_pattern = right_tag_pattern
543 regexp = re.compile('(?P<left>%s)}{(?=%s)' %
544 (tag_pattern2re_pattern(left_tag_pattern),
545 tag_pattern2re_pattern(right_tag_pattern)))
546 RegexpChunkRule.__init__(self, regexp, '\g<left>', descr)
547
549 """
550 @rtype: C{string}
551 @return: A string representation of this rule. This
552 string representation has the form::
553
554 <MergeRule: '<NN|DT|JJ>', '<NN|JJ>'>
555
556 Note that this representation does not include the
557 description string; that string can be accessed
558 separately with the C{descr} method.
559 """
560 return ('<MergeRule: '+`self._left_tag_pattern`+', '+
561 `self._right_tag_pattern`+'>')
562
564 """
565 A rule specifying how to split chunks in a C{ChunkString}, using
566 two matching tag patterns: a left pattern, and a right pattern.
567 When applied to a C{ChunkString}, it will find any chunk that
568 matches the left pattern followed by the right pattern. It will
569 then split the chunk into two new chunks, at the point between the
570 two pattern matches.
571 """
572 - def __init__(self, left_tag_pattern, right_tag_pattern, descr):
573 """
574 Construct a new C{SplitRule}.
575
576 @type right_tag_pattern: C{string}
577 @param right_tag_pattern: This rule's right tag
578 pattern. When applied to a C{ChunkString}, this rule will
579 find any chunk containing a substring that matches
580 C{left_tag_pattern} followed by this pattern. It will
581 then split the chunk into two new chunks at the point
582 between these two matching patterns.
583 @type left_tag_pattern: C{string}
584 @param left_tag_pattern: This rule's left tag
585 pattern. When applied to a C{ChunkString}, this rule will
586 find any chunk containing a substring that matches this
587 pattern followed by C{right_tag_pattern}. It will then
588 split the chunk into two new chunks at the point between
589 these two matching patterns.
590 @type descr: C{string}
591 @param descr: A short description of the purpose and/or effect
592 of this rule.
593 """
594
595
596 re.compile(tag_pattern2re_pattern(left_tag_pattern))
597 re.compile(tag_pattern2re_pattern(right_tag_pattern))
598
599 self._left_tag_pattern = left_tag_pattern
600 self._right_tag_pattern = right_tag_pattern
601 regexp = re.compile('(?P<left>%s)(?=%s)' %
602 (tag_pattern2re_pattern(left_tag_pattern),
603 tag_pattern2re_pattern(right_tag_pattern)))
604 RegexpChunkRule.__init__(self, regexp, r'\g<left>}{', descr)
605
607 """
608 @rtype: C{string}
609 @return: A string representation of this rule. This
610 string representation has the form::
611
612 <SplitRule: '<NN>', '<DT>'>
613
614 Note that this representation does not include the
615 description string; that string can be accessed
616 separately with the C{descr} method.
617 """
618 return ('<SplitRule: '+`self._left_tag_pattern`+', '+
619 `self._right_tag_pattern`+'>')
620
622 """
623 A rule specifying how to expand chunks in a C{ChunkString} to the left,
624 using two matching tag patterns: a left pattern, and a right pattern.
625 When applied to a C{ChunkString}, it will find any chunk whose beginning
626 matches right pattern, and immediately preceded by a chink whose
627 end matches left pattern. It will then expand the chunk to incorporate
628 the new material on the left.
629 """
630 - def __init__(self, left_tag_pattern, right_tag_pattern, descr):
631 """
632 Construct a new C{ExpandRightRule}.
633
634 @type right_tag_pattern: C{string}
635 @param right_tag_pattern: This rule's right tag
636 pattern. When applied to a C{ChunkString}, this
637 rule will find any chunk whose beginning matches
638 C{right_tag_pattern}, and immediately preceded by a chink
639 whose end matches this pattern. It will
640 then merge those two chunks into a single chunk.
641 @type left_tag_pattern: C{string}
642 @param left_tag_pattern: This rule's left tag
643 pattern. When applied to a C{ChunkString}, this
644 rule will find any chunk whose beginning matches
645 this pattern, and immediately preceded by a chink
646 whose end matches C{left_tag_pattern}. It will
647 then expand the chunk to incorporate the new material on the left.
648
649 @type descr: C{string}
650 @param descr: A short description of the purpose and/or effect
651 of this rule.
652 """
653
654
655 re.compile(tag_pattern2re_pattern(left_tag_pattern))
656 re.compile(tag_pattern2re_pattern(right_tag_pattern))
657
658 self._left_tag_pattern = left_tag_pattern
659 self._right_tag_pattern = right_tag_pattern
660 regexp = re.compile('(?P<left>%s)\{(?P<right>%s)' %
661 (tag_pattern2re_pattern(left_tag_pattern),
662 tag_pattern2re_pattern(right_tag_pattern)))
663 RegexpChunkRule.__init__(self, regexp, '{\g<left>\g<right>', descr)
664
666 """
667 @rtype: C{string}
668 @return: A string representation of this rule. This
669 string representation has the form::
670
671 <ExpandLeftRule: '<NN|DT|JJ>', '<NN|JJ>'>
672
673 Note that this representation does not include the
674 description string; that string can be accessed
675 separately with the C{descr} method.
676 """
677 return ('<ExpandLeftRule: '+`self._left_tag_pattern`+', '+
678 `self._right_tag_pattern`+'>')
679
681 """
682 A rule specifying how to expand chunks in a C{ChunkString} to the
683 right, using two matching tag patterns: a left pattern, and a
684 right pattern. When applied to a C{ChunkString}, it will find any
685 chunk whose end matches left pattern, and immediately followed by
686 a chink whose beginning matches right pattern. It will then
687 expand the chunk to incorporate the new material on the right.
688 """
689 - def __init__(self, left_tag_pattern, right_tag_pattern, descr):
690 """
691 Construct a new C{ExpandRightRule}.
692
693 @type right_tag_pattern: C{string}
694 @param right_tag_pattern: This rule's right tag
695 pattern. When applied to a C{ChunkString}, this
696 rule will find any chunk whose end matches
697 C{left_tag_pattern}, and immediately followed by a chink
698 whose beginning matches this pattern. It will
699 then merge those two chunks into a single chunk.
700 @type left_tag_pattern: C{string}
701 @param left_tag_pattern: This rule's left tag
702 pattern. When applied to a C{ChunkString}, this
703 rule will find any chunk whose end matches
704 this pattern, and immediately followed by a chink
705 whose beginning matches C{right_tag_pattern}. It will
706 then expand the chunk to incorporate the new material on the right.
707
708 @type descr: C{string}
709 @param descr: A short description of the purpose and/or effect
710 of this rule.
711 """
712
713
714 re.compile(tag_pattern2re_pattern(left_tag_pattern))
715 re.compile(tag_pattern2re_pattern(right_tag_pattern))
716
717 self._left_tag_pattern = left_tag_pattern
718 self._right_tag_pattern = right_tag_pattern
719 regexp = re.compile('(?P<left>%s)\}(?P<right>%s)' %
720 (tag_pattern2re_pattern(left_tag_pattern),
721 tag_pattern2re_pattern(right_tag_pattern)))
722 RegexpChunkRule.__init__(self, regexp, '\g<left>\g<right>}', descr)
723
725 """
726 @rtype: C{string}
727 @return: A string representation of this rule. This
728 string representation has the form::
729
730 <ExpandRightRule: '<NN|DT|JJ>', '<NN|JJ>'>
731
732 Note that this representation does not include the
733 description string; that string can be accessed
734 separately with the C{descr} method.
735 """
736 return ('<ExpandRightRule: '+`self._left_tag_pattern`+', '+
737 `self._right_tag_pattern`+'>')
738
739 -class ChunkRuleWithContext(RegexpChunkRule):
740 """
741 A rule specifying how to add chunks to a C{ChunkString}, using
742 three matching tag patterns: one for the left context, one for the
743 chunk, and one for the right context. When applied to a
744 C{ChunkString}, it will find any substring that matches the chunk
745 tag pattern, is surrounded by substrings that match the two
746 context patterns, and is not already part of a chunk; and create a
747 new chunk containing the substring that matched the chunk tag
748 pattern.
749
750 Caveat: Both the left and right context are consumed when this
751 rule matches; therefore, if you need to find overlapping matches,
752 you will need to apply your rule more than once.
753 """
754 - def __init__(self, left_context_tag_pattern, chunk_tag_pattern,
755 right_context_tag_pattern, descr):
756 """
757 Construct a new C{ChunkRuleWithContext}.
758
759 @type left_context_tag_pattern: C{string}
760 @param left_context_tag_pattern: A tag pattern that must match
761 the left context of C{chunk_tag_pattern} for this rule to
762 apply.
763 @type chunk_tag_pattern: C{string}
764 @param chunk_tag_pattern: A tag pattern that must match for this
765 rule to apply. If the rule does apply, then this pattern
766 also identifies the substring that will be made into a chunk.
767 @type right_context_tag_pattern: C{string}
768 @param right_context_tag_pattern: A tag pattern that must match
769 the right context of C{chunk_tag_pattern} for this rule to
770 apply.
771 @type descr: C{string}
772 @param descr: A short description of the purpose and/or effect
773 of this rule.
774 """
775
776
777 re.compile(tag_pattern2re_pattern(left_context_tag_pattern))
778 re.compile(tag_pattern2re_pattern(chunk_tag_pattern))
779 re.compile(tag_pattern2re_pattern(right_context_tag_pattern))
780
781 self._left_context_tag_pattern = left_context_tag_pattern
782 self._chunk_tag_pattern = chunk_tag_pattern
783 self._right_context_tag_pattern = right_context_tag_pattern
784 regexp = re.compile('(?P<left>%s)(?P<chunk>%s)(?P<right>%s)%s' %
785 (tag_pattern2re_pattern(left_context_tag_pattern),
786 tag_pattern2re_pattern(chunk_tag_pattern),
787 tag_pattern2re_pattern(right_context_tag_pattern),
788 ChunkString.IN_CHINK_PATTERN))
789 replacement = r'\g<left>{\g<chunk>}\g<right>'
790 RegexpChunkRule.__init__(self, regexp, replacement, descr)
791
792 - def __repr__(self):
793 """
794 @rtype: C{string}
795 @return: A string representation of this rule. This
796 string representation has the form::
797
798 <ChunkRuleWithContext: '<IN>', '<NN>', '<DT>'>
799
800 Note that this representation does not include the
801 description string; that string can be accessed
802 separately with the C{descr} method.
803 """
804 return '<ChunkRuleWithContext: %r, %r, %r>' % (
805 self._left_context_tag_pattern, self._chunk_tag_pattern,
806 self._right_context_tag_pattern)
807
808
809
810
811
812
813
814 CHUNK_TAG_PATTERN = re.compile(r'^((%s|<%s>)*)$' %
815 ('[^\{\}<>]+',
816 '[^\{\}<>]+'))
819 """
820 Convert a tag pattern to a regular expression pattern. A X{tag
821 pattern} is a modified version of a regular expression, designed
822 for matching sequences of tags. The differences between regular
823 expression patterns and tag patterns are:
824
825 - In tag patterns, C{'<'} and C{'>'} act as parentheses; so
826 C{'<NN>+'} matches one or more repetitions of C{'<NN>'}, not
827 C{'<NN'} followed by one or more repetitions of C{'>'}.
828 - Whitespace in tag patterns is ignored. So
829 C{'<DT> | <NN>'} is equivalant to C{'<DT>|<NN>'}
830 - In tag patterns, C{'.'} is equivalant to C{'[^{}<>]'}; so
831 C{'<NN.*>'} matches any single tag starting with C{'NN'}.
832
833 In particular, C{tag_pattern2re_pattern} performs the following
834 transformations on the given pattern:
835
836 - Replace '.' with '[^<>{}]'
837 - Remove any whitespace
838 - Add extra parens around '<' and '>', to make '<' and '>' act
839 like parentheses. E.g., so that in '<NN>+', the '+' has scope
840 over the entire '<NN>'; and so that in '<NN|IN>', the '|' has
841 scope over 'NN' and 'IN', but not '<' or '>'.
842 - Check to make sure the resulting pattern is valid.
843
844 @type tag_pattern: C{string}
845 @param tag_pattern: The tag pattern to convert to a regular
846 expression pattern.
847 @raise ValueError: If C{tag_pattern} is not a valid tag pattern.
848 In particular, C{tag_pattern} should not include braces; and it
849 should not contain nested or mismatched angle-brackets.
850 @rtype: C{string}
851 @return: A regular expression pattern corresponding to
852 C{tag_pattern}.
853 """
854
855 tag_pattern = re.sub(r'\s', '', tag_pattern)
856 tag_pattern = re.sub(r'<', '(<(', tag_pattern)
857 tag_pattern = re.sub(r'>', ')>)', tag_pattern)
858
859
860 if not CHUNK_TAG_PATTERN.match(tag_pattern):
861 raise ValueError('Bad tag pattern: %r' % tag_pattern)
862
863
864
865
866
867
868
869 def reverse_str(str):
870 lst = list(str)
871 lst.reverse()
872 return ''.join(lst)
873 tc_rev = reverse_str(ChunkString.CHUNK_TAG_CHAR)
874 reversed = reverse_str(tag_pattern)
875 reversed = re.sub(r'\.(?!\\(\\\\)*($|[^\\]))', tc_rev, reversed)
876 tag_pattern = reverse_str(reversed)
877
878 return tag_pattern
879
886 """
887 A regular expression based chunk parser. C{RegexpChunkParser} uses a
888 sequence of X{rules} to find chunks of a single type within a
889 text. The chunking of the text is encoded using a C{ChunkString},
890 and each rule acts by modifying the chunking in the
891 C{ChunkString}. The rules are all implemented using regular
892 expression matching and substitution.
893
894 The C{RegexpChunkRule} class and its subclasses (C{ChunkRule},
895 C{ChinkRule}, C{UnChunkRule}, C{MergeRule}, and C{SplitRule})
896 define the rules that are used by C{RegexpChunkParser}. Each rule
897 defines an C{apply} method, which modifies the chunking encoded
898 by a given C{ChunkString}.
899
900 @type _rules: C{list} of C{RegexpChunkRule}
901 @ivar _rules: The list of rules that should be applied to a text.
902 @type _trace: C{int}
903 @ivar _trace: The default level of tracing.
904
905 """
906 - def __init__(self, rules, chunk_node='NP', top_node='S', trace=0):
907 """
908 Construct a new C{RegexpChunkParser}.
909
910 @type rules: C{list} of C{RegexpChunkRule}
911 @param rules: The sequence of rules that should be used to
912 generate the chunking for a tagged text.
913 @type chunk_node: C{string}
914 @param chunk_node: The node value that should be used for
915 chunk subtrees. This is typically a short string
916 describing the type of information contained by the chunk,
917 such as C{"NP"} for base noun phrases.
918 @type top_node: C{string}
919 @param top_node: The node value that should be used for the
920 top node of the chunk structure.
921 @type trace: C{int}
922 @param trace: The level of tracing that should be used when
923 parsing a text. C{0} will generate no tracing output;
924 C{1} will generate normal tracing output; and C{2} or
925 higher will generate verbose tracing output.
926 """
927 self._rules = rules
928 self._trace = trace
929 self._chunk_node = chunk_node
930 self._top_node = top_node
931
933 """
934 Apply each of this C{RegexpChunkParser}'s rules to C{chunkstr}, in
935 turn. Generate trace output between each rule. If C{verbose}
936 is true, then generate verbose output.
937
938 @type chunkstr: C{ChunkString}
939 @param chunkstr: The chunk string to which each rule should be
940 applied.
941 @type verbose: C{boolean}
942 @param verbose: Whether output should be verbose.
943 @rtype: C{None}
944 """
945 print '# Input:'
946 print chunkstr
947 for rule in self._rules:
948 rule.apply(chunkstr)
949 if verbose:
950 print '#', rule.descr()+' ('+`rule`+'):'
951 else:
952 print '#', rule.descr()+':'
953 print chunkstr
954
956 """
957 Apply each of this C{RegexpChunkParser}'s rules to C{chunkstr}, in
958 turn.
959
960 @param chunkstr: The chunk string to which each rule should be
961 applied.
962 @type chunkstr: C{ChunkString}
963 @rtype: C{None}
964 """
965
966 for rule in self._rules:
967 rule.apply(chunkstr)
968
969 - def parse(self, chunk_struct, trace=None):
970 """
971 @type chunk_struct: C{Tree}
972 @param chunk_struct: the chunk structure to be (further) chunked
973 @type trace: C{int}
974 @param trace: The level of tracing that should be used when
975 parsing a text. C{0} will generate no tracing output;
976 C{1} will generate normal tracing output; and C{2} or
977 highter will generate verbose tracing output. This value
978 overrides the trace level value that was given to the
979 constructor.
980 @rtype: C{Tree}
981 @return: a chunk structure that encodes the chunks in a given
982 tagged sentence. A chunk is a non-overlapping linguistic
983 group, such as a noun phrase. The set of chunks
984 identified in the chunk structure depends on the rules
985 used to define this C{RegexpChunkParser}.
986 """
987 if len(chunk_struct) == 0:
988 print 'Warning: parsing empty text'
989 return Tree(self._top_node, [])
990
991 try:
992 chunk_struct.node
993 except AttributeError:
994 chunk_struct = Tree(self._top_node, chunk_struct)
995
996
997 if trace == None: trace = self._trace
998
999 chunkstr = ChunkString(chunk_struct)
1000
1001
1002 if trace:
1003 verbose = (trace>1)
1004 self._trace_apply(chunkstr, verbose)
1005 else:
1006 self._notrace_apply(chunkstr)
1007
1008
1009 return chunkstr.to_chunkstruct(self._chunk_node)
1010
1012 """
1013 @return: the sequence of rules used by C{RegexpChunkParser}.
1014 @rtype: C{list} of C{RegexpChunkRule}
1015 """
1016 return self._rules
1017
1019 """
1020 @return: a concise string representation of this
1021 C{RegexpChunkParser}.
1022 @rtype: C{string}
1023 """
1024 return "<RegexpChunkParser with %d rules>" % len(self._rules)
1025
1027 """
1028 @return: a verbose string representation of this C{RegexpChunkParser}.
1029 @rtype: C{string}
1030 """
1031 s = "RegexpChunkParser with %d rules:\n" % len(self._rules)
1032 margin = 0
1033 for rule in self._rules:
1034 margin = max(margin, len(rule.descr()))
1035 if margin < 35:
1036 format = " %" + `-(margin+3)` + "s%s\n"
1037 else:
1038 format = " %s\n %s\n"
1039 for rule in self._rules:
1040 s += format % (rule.descr(), `rule`)
1041 return s[:-1]
1042
1048 """
1049 A grammar based chunk parser. C{chunk.RegexpParser} uses a set of
1050 regular expression patterns to specify the behavior of the parser.
1051 The chunking of the text is encoded using a C{ChunkString}, and
1052 each rule acts by modifying the chunking in the C{ChunkString}.
1053 The rules are all implemented using regular expression matching
1054 and substitution.
1055
1056 A grammar contains one or more clauses in the following form::
1057
1058 NP:
1059 {<DT|JJ>} # chunk determiners and adjectives
1060 }<[\.VI].*>+{ # chink any tag beginning with V, I, or .
1061 <.*>}{<DT> # split a chunk at a determiner
1062 <DT|JJ>{}<NN.*> # merge chunk ending with det/adj
1063 # with one starting with a noun
1064
1065 The patterns of a clause are executed in order. An earlier
1066 pattern may introduce a chunk boundary that prevents a later
1067 pattern from executing. Sometimes an individual pattern will
1068 match on multiple, overlapping extents of the input. As with
1069 regular expression substitution more generally, the chunker will
1070 identify the first match possible, then continue looking for matches
1071 after this one has ended.
1072
1073 The clauses of a grammar are also executed in order. A cascaded
1074 chunk parser is one having more than one clause. The maximum depth
1075 of a parse tree created by this chunk parser is the same as the
1076 number of clauses in the grammar.
1077
1078 When tracing is turned on, the comment portion of a line is displayed
1079 each time the corresponding pattern is applied.
1080
1081 @type _start: C{string}
1082 @ivar _start: The start symbol of the grammar (the root node of
1083 resulting trees)
1084 @type _stages: C{int}
1085 @ivar _stages: The list of parsing stages corresponding to the grammar
1086
1087 """
1088 - def __init__(self, grammar, top_node='S', loop=1, trace=0):
1089 """
1090 Create a new chunk parser, from the given start state
1091 and set of chunk patterns.
1092
1093 @param grammar: The grammar, or a list of RegexpChunkParser objects
1094 @type grammar: C{string} or C{list} of C{RegexpChunkParser}
1095 @param top_node: The top node of the tree being created
1096 @type top_node: C{string} or C{Nonterminal}
1097 @param loop: The number of times to run through the patterns
1098 @type loop: C{int}
1099 @type trace: C{int}
1100 @param trace: The level of tracing that should be used when
1101 parsing a text. C{0} will generate no tracing output;
1102 C{1} will generate normal tracing output; and C{2} or
1103 higher will generate verbose tracing output.
1104 """
1105 self._trace = trace
1106 self._stages = []
1107 self._grammar = grammar
1108 self._loop = loop
1109
1110 if isinstance(grammar, basestring):
1111 self._parse_grammar(grammar, top_node, trace)
1112 else:
1113
1114 type_err = ('Expected string or list of RegexpChunkParsers '
1115 'for the grammar.')
1116 try: grammar = list(grammar)
1117 except: raise TypeError(type_err)
1118 for elt in grammar:
1119 if not isinstance(elt, RegexpChunkParser):
1120 raise TypeError(type_err)
1121 self._stages = grammar
1122
1151
1152 - def _add_stage(self, rules, lhs, top_node, trace):
1153 """
1154 Helper function for __init__: add a new stage to the parser.
1155 """
1156 if rules != []:
1157 if not lhs:
1158 raise ValueError('Expected stage marker (eg NP:)')
1159 parser = RegexpChunkParser(rules, chunk_node=lhs,
1160 top_node=top_node, trace=trace)
1161 self._stages.append(parser)
1162
1163 - def parse(self, chunk_struct, trace=None):
1164 """
1165 Apply the chunk parser to this input.
1166
1167 @type chunk_struct: C{Tree}
1168 @param chunk_struct: the chunk structure to be (further) chunked
1169 (this tree is modified, and is also returned)
1170 @type trace: C{int}
1171 @param trace: The level of tracing that should be used when
1172 parsing a text. C{0} will generate no tracing output;
1173 C{1} will generate normal tracing output; and C{2} or
1174 highter will generate verbose tracing output. This value
1175 overrides the trace level value that was given to the
1176 constructor.
1177 @return: the chunked output.
1178 @rtype: C{Tree}
1179 """
1180 if trace == None: trace = self._trace
1181 for i in range(self._loop):
1182 for parser in self._stages:
1183 chunk_struct = parser.parse(chunk_struct, trace=trace)
1184 return chunk_struct
1185
1187 """
1188 @return: a concise string representation of this C{chunk.RegexpParser}.
1189 @rtype: C{string}
1190 """
1191 return "<chunk.RegexpParser with %d stages>" % len(self._stages)
1192
1194 """
1195 @return: a verbose string representation of this
1196 C{RegexpChunkParser}.
1197 @rtype: C{string}
1198 """
1199 s = "chunk.RegexpParser with %d stages:\n" % len(self._stages)
1200 margin = 0
1201 for parser in self._stages:
1202 s += parser.__str__() + "\n"
1203 return s[:-1]
1204
1205
1206
1207
1208
1209 -def demo_eval(chunkparser, text):
1210 """
1211 Demonstration code for evaluating a chunk parser, using a
1212 C{ChunkScore}. This function assumes that C{text} contains one
1213 sentence per line, and that each sentence has the form expected by
1214 C{tree.chunk}. It runs the given chunk parser on each sentence in
1215 the text, and scores the result. It prints the final score
1216 (precision, recall, and f-measure); and reports the set of chunks
1217 that were missed and the set of chunks that were incorrect. (At
1218 most 10 missing chunks and 10 incorrect chunks are reported).
1219
1220 @param chunkparser: The chunkparser to be tested
1221 @type chunkparser: C{ChunkParserI}
1222 @param text: The chunked tagged text that should be used for
1223 evaluation.
1224 @type text: C{string}
1225 """
1226 from nltk import chunk
1227 from nltk.tree import Tree
1228
1229
1230 chunkscore = chunk.ChunkScore()
1231
1232 for sentence in text.split('\n'):
1233 print sentence
1234 sentence = sentence.strip()
1235 if not sentence: continue
1236 gold = chunk.tagstr2tree(sentence)
1237 tokens = gold.leaves()
1238 test = chunkparser.parse(Tree('S', tokens), trace=1)
1239 chunkscore.score(gold, test)
1240 print
1241
1242 print '/'+('='*75)+'\\'
1243 print 'Scoring', chunkparser
1244 print ('-'*77)
1245 print 'Precision: %5.1f%%' % (chunkscore.precision()*100), ' '*4,
1246 print 'Recall: %5.1f%%' % (chunkscore.recall()*100), ' '*6,
1247 print 'F-Measure: %5.1f%%' % (chunkscore.f_measure()*100)
1248
1249
1250
1251 if chunkscore.missed():
1252 print 'Missed:'
1253 missed = chunkscore.missed()
1254 for chunk in missed[:10]:
1255 print ' ', ' '.join(c.__str__() for c in chunk)
1256 if len(chunkscore.missed()) > 10:
1257 print ' ...'
1258
1259
1260 if chunkscore.incorrect():
1261 print 'Incorrect:'
1262 incorrect = chunkscore.incorrect()
1263 for chunk in incorrect[:10]:
1264 print ' ', ' '.join(c.__str__() for c in chunk)
1265 if len(chunkscore.incorrect()) > 10:
1266 print ' ...'
1267
1268 print '\\'+('='*75)+'/'
1269 print
1270
1272 """
1273 A demonstration for the C{RegexpChunkParser} class. A single text is
1274 parsed with four different chunk parsers, using a variety of rules
1275 and strategies.
1276 """
1277
1278 from nltk import chunk, Tree
1279
1280 text = """\
1281 [ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] ./.
1282 [ John/NNP ] saw/VBD [the/DT cats/NNS] [the/DT dog/NN] chased/VBD ./.
1283 [ John/NNP ] thinks/VBZ [ Mary/NN ] saw/VBD [ the/DT cat/NN ] sit/VB on/IN [ the/DT mat/NN ]./.
1284 """
1285
1286 print '*'*75
1287 print 'Evaluation text:'
1288 print text
1289 print '*'*75
1290 print
1291
1292 grammar = r"""
1293 NP: # NP stage
1294 {<DT>?<JJ>*<NN>} # chunk determiners, adjectives and nouns
1295 {<NNP>+} # chunk proper nouns
1296 """
1297 cp = chunk.RegexpParser(grammar)
1298 chunk.demo_eval(cp, text)
1299
1300 grammar = r"""
1301 NP:
1302 {<.*>} # start by chunking each tag
1303 }<[\.VI].*>+{ # unchunk any verbs, prepositions or periods
1304 <DT|JJ>{}<NN.*> # merge det/adj with nouns
1305 """
1306 cp = chunk.RegexpParser(grammar)
1307 chunk.demo_eval(cp, text)
1308
1309 grammar = r"""
1310 NP: {<DT>?<JJ>*<NN>} # chunk determiners, adjectives and nouns
1311 VP: {<TO>?<VB.*>} # VP = verb words
1312 """
1313 cp = chunk.RegexpParser(grammar)
1314 chunk.demo_eval(cp, text)
1315
1316 grammar = r"""
1317 NP: {<.*>*} # start by chunking everything
1318 }<[\.VI].*>+{ # chink any verbs, prepositions or periods
1319 <.*>}{<DT> # separate on determiners
1320 PP: {<IN><NP>} # PP = preposition + noun phrase
1321 VP: {<VB.*><NP|PP>*} # VP = verb words + NPs and PPs
1322 """
1323 cp = chunk.RegexpParser(grammar)
1324 chunk.demo_eval(cp, text)
1325
1326
1327
1328 from nltk.corpus import conll2000
1329
1330 print
1331 print "Demonstration of empty grammar:"
1332
1333 cp = chunk.RegexpParser("")
1334 print chunk.accuracy(cp, conll2000.chunked_sents('test.txt',
1335 chunk_types=('NP',)))
1336
1337 print
1338 print "Demonstration of accuracy evaluation using CoNLL tags:"
1339
1340 grammar = r"""
1341 NP:
1342 {<.*>} # start by chunking each tag
1343 }<[\.VI].*>+{ # unchunk any verbs, prepositions or periods
1344 <DT|JJ>{}<NN.*> # merge det/adj with nouns
1345 """
1346 cp = chunk.RegexpParser(grammar)
1347 print chunk.accuracy(cp, conll2000.chunked_sents('test.txt')[:5])
1348
1349 print
1350 print "Demonstration of tagged token input"
1351
1352 grammar = r"""
1353 NP: {<.*>*} # start by chunking everything
1354 }<[\.VI].*>+{ # chink any verbs, prepositions or periods
1355 <.*>}{<DT> # separate on determiners
1356 PP: {<IN><NP>} # PP = preposition + noun phrase
1357 VP: {<VB.*><NP|PP>*} # VP = verb words + NPs and PPs
1358 """
1359 cp = chunk.RegexpParser(grammar)
1360 print cp.parse([("the","DT"), ("little","JJ"), ("cat", "NN"),
1361 ("sat", "VBD"), ("on", "IN"), ("the", "DT"),
1362 ("mat", "NN"), (".", ".")])
1363
1364 if __name__ == '__main__':
1365 demo()
1366