1
2
3
4
5
6
7
8
9
10 """
11 Classes for tagging sentences sequentially, left to right. The
12 abstract base class L{SequentialBackoffTagger} serves as the base
13 class for all the taggers in this module. Tagging of individual words
14 is performed by the method L{choose_tag()
15 <SequentialBackoffTagger.choose_tag>}, which is defined by
16 subclasses of L{SequentialBackoffTagger}. If a tagger is unable to
17 determine a tag for the specified token, then its I{backoff tagger} is
18 consulted instead. Any C{SequentialBackoffTagger} may serve as a
19 backoff tagger for any other C{SequentialBackoffTagger}.
20 """
21
22 import re, yaml
23
24 from nltk.probability import FreqDist, ConditionalFreqDist
25 from nltk.classify.naivebayes import NaiveBayesClassifier
26
27 from api import *
28 from util import *
29
30
31
32
34 """
35 An abstract base class for taggers that tags words sequentially,
36 left to right. Tagging of individual words is performed by the
37 method L{choose_tag()}, which should be defined by subclasses. If
38 a tagger is unable to determine a tag for the specified token,
39 then its backoff tagger is consulted.
40
41 @ivar _taggers: A list of all the taggers that should be tried to
42 tag a token (i.e., C{self} and its backoff taggers).
43 """
45 if backoff is None:
46 self._taggers = [self]
47 else:
48 self._taggers = [self] + backoff._taggers
49
51 if len(self._taggers) < 2: return None
52 else: return self._taggers[1]
53 backoff = property(_get_backoff, doc='''
54 The backoff tagger for this tagger.''')
55
56 - def tag(self, tokens):
62
63 - def tag_one(self, tokens, index, history):
64 """
65 Determine an appropriate tag for the specified token, and
66 return that tag. If this tagger is unable to determine a tag
67 for the specified token, then its backoff tagger is consulted.
68
69 @rtype: C{str}
70 @type tokens: C{list}
71 @param tokens: The list of words that are being tagged.
72 @type index: C{int}
73 @param index: The index of the word whose tag should be
74 returned.
75 @type history: C{list} of C{str}
76 @param history: A list of the tags for all words before
77 C{index}.
78 """
79 tag = None
80 for tagger in self._taggers:
81 tag = tagger.choose_tag(tokens, index, history)
82 if tag is not None: break
83 return tag
84
86 """
87 Decide which tag should be used for the specified token, and
88 return that tag. If this tagger is unable to determine a tag
89 for the specified token, return C{None} -- do I{not} consult
90 the backoff tagger. This method should be overridden by
91 subclasses of C{SequentialBackoffTagger}.
92
93 @rtype: C{str}
94 @type tokens: C{list}
95 @param tokens: The list of words that are being tagged.
96 @type index: C{int}
97 @param index: The index of the word whose tag should be
98 returned.
99 @type history: C{list} of C{str}
100 @param history: A list of the tags for all words before
101 C{index}.
102 """
103 raise AssertionError('SequentialBackoffTagger is an abstract class')
104
105
106 -class ContextTagger(SequentialBackoffTagger):
107 """
108 An abstract base class for sequential backoff taggers that choose
109 a tag for a token based on the value of its "context". Different
110 subclasses are used to define different contexts.
111
112 A C{ContextTagger} chooses the tag for a token by calculating the
113 token's context, and looking up the corresponding tag in a table.
114 This table can be constructed manually; or it can be automatically
115 constructed based on a training corpus, using the L{_train()}
116 factory method.
117
118 @ivar _context_to_tag: Dictionary mapping contexts to tags.
119 """
120 - def __init__(self, context_to_tag, backoff=None):
121 """
122 @param context_to_tag: A dictionary mapping contexts to tags.
123 @param backoff: The backoff tagger that should be used for this tagger.
124 """
125 SequentialBackoffTagger.__init__(self, backoff)
126 if context_to_tag:
127 self._context_to_tag = context_to_tag
128 else:
129 self._context_to_tag = {}
130
131 - def context(self, tokens, index, history):
132 """
133 @return: the context that should be used to look up the tag
134 for the specified token; or C{None} if the specified token
135 should not be handled by this tagger.
136 @rtype: (hashable)
137 """
138 raise AssertionError('Abstract base class')
139
140 - def choose_tag(self, tokens, index, history):
141 context = self.context(tokens, index, history)
142 return self._context_to_tag.get(context)
143
145 """
146 @return: The number of entries in the table used by this
147 tagger to map from contexts to tags.
148 """
149 return len(self._context_to_tag)
150
151 - def __repr__(self):
152 return '<%s: size=%d>' % (self.__class__.__name__, self.size())
153
154 - def _train(self, tagged_corpus, cutoff=0, verbose=False):
155 """
156 Initialize this C{ContextTagger}'s L{_context_to_tag} table
157 based on the given training data. In particular, for each
158 context C{I{c}} in the training data, set
159 C{_context_to_tag[I{c}]} to the most frequent tag for that
160 context. However, exclude any contexts that are already
161 tagged perfectly by the backoff tagger(s).
162
163 The old value of C{self._context_to_tag} (if any) is discarded.
164
165 @param tagged_corpus: A tagged corpus. Each item should be
166 a C{list} of C{(word, tag)} tuples.
167 @param cutoff: If the most likely tag for a context occurs
168 fewer than C{cutoff} times, then exclude it from the
169 context-to-tag table for the new tagger.
170 """
171
172 token_count = hit_count = 0
173
174
175
176 useful_contexts = set()
177
178
179 fd = ConditionalFreqDist()
180 for sentence in tagged_corpus:
181 tokens, tags = zip(*sentence)
182 for index, (token, tag) in enumerate(sentence):
183
184 token_count += 1
185 context = self.context(tokens, index, tags[:index])
186 if context is None: continue
187 fd[context].inc(tag)
188
189 if (self.backoff is None or
190 tag != self.backoff.tag_one(tokens, index, tags[:index])):
191 useful_contexts.add(context)
192
193
194
195
196 for context in useful_contexts:
197 best_tag = fd[context].max()
198 hits = fd[context][best_tag]
199 if hits > cutoff:
200 self._context_to_tag[context] = best_tag
201 hit_count += hits
202
203
204 if verbose:
205 size = len(self._context_to_tag)
206 backoff = 100 - (hit_count * 100.0)/ token_count
207 pruning = 100 - (size * 100.0) / len(fd.conditions())
208 print "[Trained Unigram tagger:",
209 print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
210 size, backoff, pruning)
211
212
213
214
215
217 """
218 A tagger that assigns the same tag to every token.
219 """
220 yaml_tag = '!nltk.DefaultTagger'
221
228
231
233 return '<DefaultTagger: tag=%s>' % self._tag
234
235
237 """
238 A tagger that chooses a token's tag based on its word string and
239 on the preceeding I{n} word's tags. In particular, a tuple
240 C{(tags[i-n:i-1], words[i])} is looked up in a table, and the
241 corresponding tag is returned. N-gram taggers are typically
242 trained on a tagged corpus.
243 """
244 yaml_tag = '!nltk.NgramTagger'
245
246 - def __init__(self, n, train=None, model=None,
247 backoff=None, cutoff=0, verbose=False):
248 """
249 Train a new C{NgramTagger} using the given training data or
250 the supplied model. In particular, construct a new tagger
251 whose table maps from each context C{(tag[i-n:i-1], word[i])}
252 to the most frequent tag for that context. But exclude any
253 contexts that are already tagged perfectly by the backoff
254 tagger.
255
256 @param train: A tagged corpus consisting of a C{list} of tagged
257 sentences, where each sentence is a C{list} of C{(word, tag)} tuples.
258 @param backoff: A backoff tagger, to be used by the new
259 tagger if it encounters an unknown context.
260 @param cutoff: If the most likely tag for a context occurs
261 fewer than C{cutoff} times, then exclude it from the
262 context-to-tag table for the new tagger.
263 """
264 self._n = n
265 self._check_params(train, model)
266
267 ContextTagger.__init__(self, model, backoff)
268
269 if train:
270 self._train(train, cutoff, verbose)
271
272 - def context(self, tokens, index, history):
273 tag_context = tuple(history[max(0,index-self._n+1):index])
274 return (tag_context, tokens[index])
275
276
278 """
279 A tagger that chooses a token's tag based its word string.
280 Unigram taggers are typically trained on a tagged corpus.
281 """
282 yaml_tag = '!nltk.UnigramTagger'
283
284 - def __init__(self, train=None, model=None,
285 backoff=None, cutoff=0, verbose=False):
288
289 - def context(self, tokens, index, history):
291
292
294 """
295 A tagger that chooses a token's tag based its word string and on
296 the preceeding words' tag. In particular, a tuple consisting
297 of the previous tag and the word is looked up in a table, and
298 the corresponding tag is returned. Bigram taggers are typically
299 trained on a tagged corpus.
300 """
301 yaml_tag = '!nltk.BigramTagger'
302
303 - def __init__(self, train, model=None,
304 backoff=None, cutoff=0, verbose=False):
307
308
310 """
311 A tagger that chooses a token's tag based its word string and on
312 the preceeding two words' tags. In particular, a tuple consisting
313 of the previous two tags and the word is looked up in a table, and
314 the corresponding tag is returned. Trigram taggers are typically
315 trained them on a tagged corpus.
316 """
317 yaml_tag = '!nltk.TrigramTagger'
318
319 - def __init__(self, train=None, model=None,
320 backoff=None, cutoff=0, verbose=False):
323
324
326 """
327 A tagger that chooses a token's tag based on a leading or trailing
328 substring of its word string. (It is important to note that these
329 substrings are not necessarily "true" morphological affixes). In
330 particular, a fixed-length substring of the word is looked up in a
331 table, and the corresponding tag is returned. Affix taggers are
332 typically constructed by training them on a tagged corpus; see
333 L{the constructor <__init__>}.
334 """
335 yaml_tag = '!nltk.AffixTagger'
336
337 - def __init__(self, train=None, model=None, affix_length=-3,
338 min_stem_length=2, backoff=None, cutoff=0, verbose=False):
339 """
340 Construct a new affix tagger.
341
342 @param affix_length: The length of the affixes that should be
343 considered during training and tagging. Use negative
344 numbers for suffixes.
345 @param min_stem_length: Any words whose length is less than
346 C{min_stem_length+abs(affix_length)} will be assigned a
347 tag of C{None} by this tagger.
348 """
349 self._check_params(train, model)
350
351 ContextTagger.__init__(self, model, backoff)
352
353 self._affix_length = affix_length
354 self._min_word_length = min_stem_length + abs(affix_length)
355
356 if train:
357 self._train(train, cutoff, verbose)
358
359 - def context(self, tokens, index, history):
360 token = tokens[index]
361 if len(token) < self._min_word_length:
362 return None
363 elif self._affix_length > 0:
364 return token[:self._affix_length]
365 else:
366 return token[self._affix_length:]
367
368
369 -class RegexpTagger(SequentialBackoffTagger, yaml.YAMLObject):
370 """
371 A tagger that assigns tags to words based on regular expressions
372 over word strings.
373 """
374 yaml_tag = '!nltk.RegexpTagger'
375 - def __init__(self, regexps, backoff=None):
376 """
377 Construct a new regexp tagger.
378
379 @type regexps: C{list} of C{(str, str)}
380 @param regexps: A list of C{(regexp, tag)} pairs, each of
381 which indicates that a word matching C{regexp} should
382 be tagged with C{tag}. The pairs will be evalutated in
383 order. If none of the regexps match a word, then the
384 optional backoff tagger is invoked, else it is
385 assigned the tag C{None}.
386 """
387 self._regexps = regexps
388 SequentialBackoffTagger.__init__(self, backoff)
389
395
397 return '<Regexp Tagger: size=%d>' % len(self._regexps)
398
400 """
401 A sequential tagger that uses a classifier to choose the tag for
402 each token in a sentence. The featureset input for the classifier
403 is generated by a feature detector function::
404
405 feature_detector(tokens, index, history) -> featureset
406
407 Where C{tokens} is the list of unlabeled tokens in the sentence;
408 C{index} is the index of the token for which feature detection
409 should be performed; and C{history} is list of the tags for all
410 tokens before C{index}.
411 """
412 - def __init__(self, feature_detector=None, train=None,
413 classifier_builder=NaiveBayesClassifier.train,
414 classifier=None, backoff=None,
415 cutoff_prob=None, verbose=False):
416 """
417 Construct a new classifier-based sequential tagger.
418
419 @param feature_detector: A function used to generate the
420 featureset input for the classifier::
421 feature_detector(tokens, index, history) -> featureset
422
423 @param train: A tagged corpus consisting of a C{list} of tagged
424 sentences, where each sentence is a C{list} of C{(word, tag)} tuples.
425
426 @param backoff: A backoff tagger, to be used by the new tagger
427 if it encounters an unknown context.
428
429 @param classifier_builder: A function used to train a new
430 classifier based on the data in C{train}. It should take
431 one argument, a list of labeled featuresets (i.e.,
432 C{(featureset, label)} tuples).
433
434 @param classifier: The classifier that should be used by the
435 tagger. This is only useful if you want to manually
436 construct the classifier; normally, you would use C{train}
437 instead.
438
439 @param backoff: A backoff tagger, used if this tagger is
440 unable to determine a tag for a given token.
441
442 @param cutoff_prob: If specified, then this tagger will fall
443 back on its backoff tagger if the probability of the most
444 likely tag is less than C{cutoff_prob}.
445 """
446 self._check_params(train, classifier)
447
448 SequentialBackoffTagger.__init__(self, backoff)
449
450 if (train and classifier) or (not train and not classifier):
451 raise ValueError('Must specify either training data or '
452 'trained classifier.')
453
454 if feature_detector is not None:
455 self._feature_detector = feature_detector
456
457
458
459 self._cutoff_prob = cutoff_prob
460 """Cutoff probability for tagging -- if the probability of the
461 most likely tag is less than this, then use backoff."""
462
463 self._classifier = classifier
464 """The classifier used to choose a tag for each token."""
465
466 if train:
467 self._train(train, classifier_builder, verbose)
468
470
471 featureset = self.feature_detector(tokens, index, history)
472
473
474
475
476 if self._cutoff_prob is None:
477 return self._classifier.classify(featureset)
478 else:
479 pdist = self._classifier.prob_classify(featureset)
480 tag = pdist.max()
481 if pdist.prob(tag) >= self._cutoff_prob:
482 return tag
483 else:
484 return None
485
486 - def _train(self, tagged_corpus, classifier_builder, verbose):
487 """
488 Build a new classifier, based on the given training data
489 (C{tagged_corpus}).
490 """
491
492 classifier_corpus = []
493 if verbose:
494 print 'Constructing training corpus for classifier.'
495
496 for sentence in tagged_corpus:
497 history = []
498 untagged_sentence, tags = zip(*sentence)
499 for index in range(len(sentence)):
500 featureset = self.feature_detector(untagged_sentence,
501 index, history)
502 classifier_corpus.append( (featureset, tags[index]) )
503 history.append(tags[index])
504
505 if verbose:
506 print 'Training classifier (%d instances)' % len(classifier_corpus)
507 self._classifier = classifier_builder(classifier_corpus)
508
510 return '<ClassifierBasedTagger: %r>' % self._classifier
511
513 """
514 Return the feature detector that this tagger uses to generate
515 featuresets for its classifier. The feature detector is a
516 function with the signature::
517
518 feature_detector(tokens, index, history) -> featureset
519
520 @see: L{classifier()}
521 """
522 return self._feature_detector(tokens, index, history)
523
525 """
526 Return the classifier that this tagger uses to choose a tag
527 for each word in a sentence. The input for this classifier is
528 generated using this tagger's feature detector.
529
530 @see: L{feature_detector()}
531 """
532 return self._classifier
533
534 -class ClassifierBasedPOSTagger(ClassifierBasedTagger):
535 """
536 A classifier based part of speech tagger.
537 """
538 - def feature_detector(self, tokens, index, history):
539 word = tokens[index]
540 if index == 0:
541 prevword = prevprevword = None
542 prevtag = prevprevtag = None
543 elif index == 1:
544 prevword = tokens[index-1].lower()
545 prevprevword = None
546 prevtag = history[index-1]
547 prevprevtag = None
548 else:
549 prevword = tokens[index-1].lower()
550 prevprevword = tokens[index-2].lower()
551 prevtag = history[index-1]
552 prevprevtag = history[index-2]
553
554 if re.match('[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$', word):
555 shape = 'number'
556 elif re.match('\W+$', word):
557 shape = 'punct'
558 elif re.match('[A-Z][a-z]+$', word):
559 shape = 'upcase'
560 elif re.match('[a-z]+$', word):
561 shape = 'downcase'
562 elif re.match('\w+$', word):
563 shape = 'mixedcase'
564 else:
565 shape = 'other'
566
567 features = {
568 'prevtag': prevtag,
569 'prevprevtag': prevprevtag,
570 'word': word,
571 'word.lower': word.lower(),
572 'suffix3': word.lower()[-3:],
573 'suffix2': word.lower()[-2:],
574 'suffix1': word.lower()[-1:],
575 'prevprevword': prevprevword,
576 'prevword': prevword,
577 'prevtag+word': '%s+%s' % (prevtag, word.lower()),
578 'prevprevtag+word': '%s+%s' % (prevprevtag, word.lower()),
579 'prevword+word': '%s+%s' % (prevword, word.lower()),
580 'shape': shape,
581 }
582 return features
583