
Source Code for Module nltk.tag.sequential

# Natural Language Toolkit: Sequential Backoff Taggers
#
# Copyright (C) 2001-2011 NLTK Project
# Author: Edward Loper <edloper@gradient.cis.upenn.edu>
#         Steven Bird <sb@csse.unimelb.edu.au> (minor additions)
#         Tiago Tresoldi <tresoldi@users.sf.net> (original affix tagger)
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Classes for tagging sentences sequentially, left to right.  The
abstract base class L{SequentialBackoffTagger} serves as the base
class for all the taggers in this module.  Tagging of individual words
is performed by the method L{choose_tag()
<SequentialBackoffTagger.choose_tag>}, which is defined by
subclasses of L{SequentialBackoffTagger}.  If a tagger is unable to
determine a tag for the specified token, then its I{backoff tagger} is
consulted instead.  Any C{SequentialBackoffTagger} may serve as a
backoff tagger for any other C{SequentialBackoffTagger}.
"""

import re, yaml

from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.classify.naivebayes import NaiveBayesClassifier

from api import *
from util import *
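
# Illustrative usage sketch (not part of the original module): any
# tagger defined below can serve as the backoff for any other, and a
# token that one tagger cannot handle falls through the chain.
# Assuming an NLTK 2.x installation:
#
#     >>> from nltk.tag.sequential import DefaultTagger, UnigramTagger
#     >>> default = DefaultTagger('NN')
#     >>> uni = UnigramTagger(model={'the': 'DT'}, backoff=default)
#     >>> uni.tag('the cat'.split())
#     [('the', 'DT'), ('cat', 'NN')]

######################################################################
#{ Abstract Base Classes
######################################################################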
class SequentialBackoffTagger(TaggerI):
    """
    An abstract base class for taggers that tag words sequentially,
    left to right.  Tagging of individual words is performed by the
    method L{choose_tag()}, which should be defined by subclasses.  If
    a tagger is unable to determine a tag for the specified token,
    then its backoff tagger is consulted.

    @ivar _taggers: A list of all the taggers that should be tried to
        tag a token (i.e., C{self} and its backoff taggers).
    """
    def __init__(self, backoff=None):
        if backoff is None:
            self._taggers = [self]
        else:
            self._taggers = [self] + backoff._taggers

    def _get_backoff(self):
        if len(self._taggers) < 2: return None
        else: return self._taggers[1]
    backoff = property(_get_backoff, doc='''
        The backoff tagger for this tagger.''')

    def tag(self, tokens):
        # docs inherited from TaggerI
        tags = []
        for i in range(len(tokens)):
            tags.append(self.tag_one(tokens, i, tags))
        return zip(tokens, tags)

    def tag_one(self, tokens, index, history):
        """
        Determine an appropriate tag for the specified token, and
        return that tag.  If this tagger is unable to determine a tag
        for the specified token, then its backoff tagger is consulted.

        @rtype: C{str}
        @type tokens: C{list}
        @param tokens: The list of words that are being tagged.
        @type index: C{int}
        @param index: The index of the word whose tag should be
            returned.
        @type history: C{list} of C{str}
        @param history: A list of the tags for all words before
            C{index}.
        """
        tag = None
        for tagger in self._taggers:
            tag = tagger.choose_tag(tokens, index, history)
            if tag is not None: break
        return tag

    def choose_tag(self, tokens, index, history):
        """
        Decide which tag should be used for the specified token, and
        return that tag.  If this tagger is unable to determine a tag
        for the specified token, return C{None} -- do I{not} consult
        the backoff tagger.  This method should be overridden by
        subclasses of C{SequentialBackoffTagger}.

        @rtype: C{str}
        @type tokens: C{list}
        @param tokens: The list of words that are being tagged.
        @type index: C{int}
        @param index: The index of the word whose tag should be
            returned.
        @type history: C{list} of C{str}
        @param history: A list of the tags for all words before
            C{index}.
        """
        raise AssertionError('SequentialBackoffTagger is an abstract class')
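
# Note on the implementation above: because each constructor prepends
# C{self} to its backoff's C{_taggers}, the whole backoff chain is kept
# as one flat list, so C{tag_one()} is a simple loop rather than a
# recursion.  A small sketch (illustrative only; pokes at a private
# attribute):
#
#     >>> t0 = DefaultTagger('NN')
#     >>> t1 = UnigramTagger(model={'the': 'DT'}, backoff=t0)
#     >>> t1._taggers == [t1, t0]
#     True
#     >>> t1.backoff is t0
#     True
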
class ContextTagger(SequentialBackoffTagger):
    """
    An abstract base class for sequential backoff taggers that choose
    a tag for a token based on the value of its "context".  Different
    subclasses are used to define different contexts.

    A C{ContextTagger} chooses the tag for a token by calculating the
    token's context, and looking up the corresponding tag in a table.
    This table can be constructed manually, or it can be constructed
    automatically from a training corpus, using the L{_train()}
    method.

    @ivar _context_to_tag: Dictionary mapping contexts to tags.
    """
    def __init__(self, context_to_tag, backoff=None):
        """
        @param context_to_tag: A dictionary mapping contexts to tags.
        @param backoff: The backoff tagger that should be used for this tagger.
        """
        SequentialBackoffTagger.__init__(self, backoff)
        if context_to_tag:
            self._context_to_tag = context_to_tag
        else:
            self._context_to_tag = {}

    def context(self, tokens, index, history):
        """
        @return: the context that should be used to look up the tag
            for the specified token; or C{None} if the specified token
            should not be handled by this tagger.
        @rtype: (hashable)
        """
        raise AssertionError('Abstract base class')

    def choose_tag(self, tokens, index, history):
        context = self.context(tokens, index, history)
        return self._context_to_tag.get(context)

    def size(self):
        """
        @return: The number of entries in the table used by this
            tagger to map from contexts to tags.
        """
        return len(self._context_to_tag)

    def __repr__(self):
        return '<%s: size=%d>' % (self.__class__.__name__, self.size())

    def _train(self, tagged_corpus, cutoff=0, verbose=False):
        """
        Initialize this C{ContextTagger}'s L{_context_to_tag} table
        based on the given training data.  In particular, for each
        context C{I{c}} in the training data, set
        C{_context_to_tag[I{c}]} to the most frequent tag for that
        context.  However, exclude any contexts that are already
        tagged perfectly by the backoff tagger(s).

        The old value of C{self._context_to_tag} (if any) is discarded.

        @param tagged_corpus: A tagged corpus.  Each item should be
            a C{list} of C{(word, tag)} tuples.
        @param cutoff: If the most likely tag for a context occurs
            C{cutoff} times or fewer, then exclude it from the
            context-to-tag table for the new tagger.
        """
        token_count = hit_count = 0

        # A context is considered 'useful' if it's not already tagged
        # perfectly by the backoff tagger.
        useful_contexts = set()

        # Count how many times each tag occurs in each context.
        fd = ConditionalFreqDist()
        for sentence in tagged_corpus:
            tokens, tags = zip(*sentence)
            for index, (token, tag) in enumerate(sentence):
                # Record the event.
                token_count += 1
                context = self.context(tokens, index, tags[:index])
                if context is None: continue
                fd[context].inc(tag)
                # If the backoff got it wrong, this context is useful:
                if (self.backoff is None or
                    tag != self.backoff.tag_one(tokens, index, tags[:index])):
                    useful_contexts.add(context)

        # Build the context_to_tag table -- for each context, figure
        # out what the most likely tag is.  Only include contexts whose
        # most likely tag was seen more than `cutoff` times.
        for context in useful_contexts:
            best_tag = fd[context].max()
            hits = fd[context][best_tag]
            if hits > cutoff:
                self._context_to_tag[context] = best_tag
                hit_count += hits

        # Display some stats, if requested.
        if verbose:
            size = len(self._context_to_tag)
            backoff = 100 - (hit_count * 100.0) / token_count
            pruning = 100 - (size * 100.0) / len(fd.conditions())
            print "[Trained %s:" % self.__class__.__name__,
            print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
                size, backoff, pruning)

######################################################################
#{ Tagger Classes
######################################################################
class DefaultTagger(SequentialBackoffTagger, yaml.YAMLObject):
    """
    A tagger that assigns the same tag to every token.
    """
    yaml_tag = '!nltk.DefaultTagger'

    def __init__(self, tag):
        """
        Construct a new tagger that assigns C{tag} to all tokens.
        """
        self._tag = tag
        SequentialBackoffTagger.__init__(self, None)

    def choose_tag(self, tokens, index, history):
        return self._tag  # ignore token and history

    def __repr__(self):
        return '<DefaultTagger: tag=%s>' % self._tag
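
# Example (illustrative): C{DefaultTagger} is mainly useful as the last
# link in a backoff chain, but it can also tag on its own:
#
#     >>> DefaultTagger('NN').tag('the cat sat'.split())
#     [('the', 'NN'), ('cat', 'NN'), ('sat', 'NN')]
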
class NgramTagger(ContextTagger, yaml.YAMLObject):
    """
    A tagger that chooses a token's tag based on its word string and
    on the preceding I{n-1} words' tags.  In particular, a tuple
    C{(tags[i-n+1:i], words[i])} is looked up in a table, and the
    corresponding tag is returned.  N-gram taggers are typically
    trained on a tagged corpus.
    """
    yaml_tag = '!nltk.NgramTagger'

    def __init__(self, n, train=None, model=None,
                 backoff=None, cutoff=0, verbose=False):
        """
        Train a new C{NgramTagger} using the given training data or
        the supplied model.  In particular, construct a new tagger
        whose table maps from each context C{(tags[i-n+1:i], words[i])}
        to the most frequent tag for that context.  But exclude any
        contexts that are already tagged perfectly by the backoff
        tagger.

        @param train: A tagged corpus consisting of a C{list} of tagged
            sentences, where each sentence is a C{list} of C{(word, tag)} tuples.
        @param backoff: A backoff tagger, to be used by the new
            tagger if it encounters an unknown context.
        @param cutoff: If the most likely tag for a context occurs
            C{cutoff} times or fewer, then exclude it from the
            context-to-tag table for the new tagger.
        """
        self._n = n
        self._check_params(train, model)

        ContextTagger.__init__(self, model, backoff)

        if train:
            self._train(train, cutoff, verbose)

    def context(self, tokens, index, history):
        tag_context = tuple(history[max(0, index-self._n+1):index])
        return (tag_context, tokens[index])
class UnigramTagger(NgramTagger):
    """
    A tagger that chooses a token's tag based on its word string.
    Unigram taggers are typically trained on a tagged corpus.
    """
    yaml_tag = '!nltk.UnigramTagger'

    def __init__(self, train=None, model=None,
                 backoff=None, cutoff=0, verbose=False):
        NgramTagger.__init__(self, 1, train, model,
                             backoff, cutoff, verbose)

    def context(self, tokens, index, history):
        return tokens[index]
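
# Typical training (illustrative; assumes the Brown corpus data is
# installed for NLTK 2.x -- the exact tags depend on the training
# slice, so the output below is only indicative):
#
#     >>> from nltk.corpus import brown
#     >>> train_sents = brown.tagged_sents(categories='news')[:500]
#     >>> uni = UnigramTagger(train_sents, backoff=DefaultTagger('NN'))
#     >>> uni.tag('the cat sat on the mat'.split())  # doctest: +SKIP
#     [('the', 'AT'), ('cat', 'NN'), ...]
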
class BigramTagger(NgramTagger):
    """
    A tagger that chooses a token's tag based on its word string and on
    the preceding word's tag.  In particular, a tuple consisting
    of the previous tag and the word is looked up in a table, and
    the corresponding tag is returned.  Bigram taggers are typically
    trained on a tagged corpus.
    """
    yaml_tag = '!nltk.BigramTagger'

    def __init__(self, train=None, model=None,
                 backoff=None, cutoff=0, verbose=False):
        NgramTagger.__init__(self, 2, train, model,
                             backoff, cutoff, verbose)
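
# Bigram contexts are sparse on their own, so a bigram tagger is almost
# always combined with unigram and default backoffs -- a sketch, using
# C{train_sents} from the example above:
#
#     >>> t0 = DefaultTagger('NN')
#     >>> t1 = UnigramTagger(train_sents, backoff=t0)
#     >>> t2 = BigramTagger(train_sents, backoff=t1)
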
class TrigramTagger(NgramTagger):
    """
    A tagger that chooses a token's tag based on its word string and on
    the preceding two words' tags.  In particular, a tuple consisting
    of the previous two tags and the word is looked up in a table, and
    the corresponding tag is returned.  Trigram taggers are typically
    trained on a tagged corpus.
    """
    yaml_tag = '!nltk.TrigramTagger'

    def __init__(self, train=None, model=None,
                 backoff=None, cutoff=0, verbose=False):
        NgramTagger.__init__(self, 3, train, model,
                             backoff, cutoff, verbose)

class AffixTagger(ContextTagger, yaml.YAMLObject):
    """
    A tagger that chooses a token's tag based on a leading or trailing
    substring of its word string.  (It is important to note that these
    substrings are not necessarily "true" morphological affixes).  In
    particular, a fixed-length substring of the word is looked up in a
    table, and the corresponding tag is returned.  Affix taggers are
    typically constructed by training them on a tagged corpus; see
    L{the constructor <__init__>}.
    """
    yaml_tag = '!nltk.AffixTagger'

    def __init__(self, train=None, model=None, affix_length=-3,
                 min_stem_length=2, backoff=None, cutoff=0, verbose=False):
        """
        Construct a new affix tagger.

        @param affix_length: The length of the affixes that should be
            considered during training and tagging.  Use negative
            numbers for suffixes.
        @param min_stem_length: Any words whose length is less than
            C{min_stem_length+abs(affix_length)} will be assigned a
            tag of C{None} by this tagger.
        """
        self._check_params(train, model)

        ContextTagger.__init__(self, model, backoff)

        self._affix_length = affix_length
        self._min_word_length = min_stem_length + abs(affix_length)

        if train:
            self._train(train, cutoff, verbose)

    def context(self, tokens, index, history):
        token = tokens[index]
        if len(token) < self._min_word_length:
            return None
        elif self._affix_length > 0:
            return token[:self._affix_length]
        else:
            return token[self._affix_length:]
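
# Sketch of C{context()} above (illustrative): with the defaults
# (affix_length=-3, min_stem_length=2), the context is the last three
# letters of any word of length five or more, and C{None} otherwise.
#
#     >>> t = AffixTagger(model={'ing': 'VBG'})
#     >>> t.context(['running'], 0, [])
#     'ing'
#     >>> t.context(['cat'], 0, [])    # too short: returns None
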
class RegexpTagger(SequentialBackoffTagger, yaml.YAMLObject):
    """
    A tagger that assigns tags to words based on regular expressions
    over word strings.
    """
    yaml_tag = '!nltk.RegexpTagger'

    def __init__(self, regexps, backoff=None):
        """
        Construct a new regexp tagger.

        @type regexps: C{list} of C{(str, str)}
        @param regexps: A list of C{(regexp, tag)} pairs, each of
            which indicates that a word matching C{regexp} should
            be tagged with C{tag}.  The pairs will be evaluated in
            order.  If none of the regexps match a word, then this
            tagger assigns it the tag C{None}, and the optional
            backoff tagger is consulted.
        """
        self._regexps = regexps
        SequentialBackoffTagger.__init__(self, backoff)

    def choose_tag(self, tokens, index, history):
        for regexp, tag in self._regexps:
            if re.match(regexp, tokens[index]):  # ignore history
                return tag
        return None

    def __repr__(self):
        return '<Regexp Tagger: size=%d>' % len(self._regexps)
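
# Example pattern list (illustrative), in the order the tagger will try
# them -- note the catch-all pattern comes last:
#
#     >>> patterns = [
#     ...     (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),   # cardinal numbers
#     ...     (r'.*ing$', 'VBG'),                 # gerunds
#     ...     (r'.*ed$', 'VBD'),                  # simple past
#     ...     (r'.*s$', 'NNS'),                   # plural nouns
#     ...     (r'.*', 'NN'),                      # nouns (default)
#     ... ]
#     >>> RegexpTagger(patterns).tag('32 cats purring'.split())
#     [('32', 'CD'), ('cats', 'NNS'), ('purring', 'VBG')]
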
class ClassifierBasedTagger(SequentialBackoffTagger, FeaturesetTaggerI):
    """
    A sequential tagger that uses a classifier to choose the tag for
    each token in a sentence.  The featureset input for the classifier
    is generated by a feature detector function::

        feature_detector(tokens, index, history) -> featureset

    Where C{tokens} is the list of unlabeled tokens in the sentence;
    C{index} is the index of the token for which feature detection
    should be performed; and C{history} is a list of the tags for all
    tokens before C{index}.
    """
    def __init__(self, feature_detector=None, train=None,
                 classifier_builder=NaiveBayesClassifier.train,
                 classifier=None, backoff=None,
                 cutoff_prob=None, verbose=False):
        """
        Construct a new classifier-based sequential tagger.

        @param feature_detector: A function used to generate the
            featureset input for the classifier::
                feature_detector(tokens, index, history) -> featureset

        @param train: A tagged corpus consisting of a C{list} of tagged
            sentences, where each sentence is a C{list} of C{(word, tag)} tuples.

        @param classifier_builder: A function used to train a new
            classifier based on the data in C{train}.  It should take
            one argument, a list of labeled featuresets (i.e.,
            C{(featureset, label)} tuples).

        @param classifier: The classifier that should be used by the
            tagger.  This is only useful if you want to manually
            construct the classifier; normally, you would use C{train}
            instead.

        @param backoff: A backoff tagger, used if this tagger is
            unable to determine a tag for a given token.

        @param cutoff_prob: If specified, then this tagger will fall
            back on its backoff tagger if the probability of the most
            likely tag is less than C{cutoff_prob}.
        """
        self._check_params(train, classifier)

        SequentialBackoffTagger.__init__(self, backoff)

        if (train and classifier) or (not train and not classifier):
            raise ValueError('Must specify either training data or '
                             'trained classifier.')

        if feature_detector is not None:
            self._feature_detector = feature_detector
            # The feature detector function, used to generate a featureset
            # for each token: feature_detector(tokens, index, history) -> featureset

        self._cutoff_prob = cutoff_prob
        """Cutoff probability for tagging -- if the probability of the
        most likely tag is less than this, then use backoff."""

        self._classifier = classifier
        """The classifier used to choose a tag for each token."""

        if train:
            self._train(train, classifier_builder, verbose)

    def choose_tag(self, tokens, index, history):
        # Use our feature detector to get the featureset.
        featureset = self.feature_detector(tokens, index, history)

        # Use the classifier to pick a tag.  If a cutoff probability
        # was specified, then check that the tag's probability is
        # higher than that cutoff first; otherwise, return None.
        if self._cutoff_prob is None:
            return self._classifier.classify(featureset)
        else:
            pdist = self._classifier.prob_classify(featureset)
            tag = pdist.max()
            if pdist.prob(tag) >= self._cutoff_prob:
                return tag
            else:
                return None

    def _train(self, tagged_corpus, classifier_builder, verbose):
        """
        Build a new classifier, based on the given training data
        (C{tagged_corpus}).
        """
        classifier_corpus = []
        if verbose:
            print 'Constructing training corpus for classifier.'

        for sentence in tagged_corpus:
            history = []
            untagged_sentence, tags = zip(*sentence)
            for index in range(len(sentence)):
                featureset = self.feature_detector(untagged_sentence,
                                                   index, history)
                classifier_corpus.append( (featureset, tags[index]) )
                history.append(tags[index])

        if verbose:
            print 'Training classifier (%d instances)' % len(classifier_corpus)
        self._classifier = classifier_builder(classifier_corpus)

    def __repr__(self):
        return '<ClassifierBasedTagger: %r>' % self._classifier

    def feature_detector(self, tokens, index, history):
        """
        Return the featureset that this tagger's feature detector
        generates for the specified token.  The feature detector is a
        function with the signature::

            feature_detector(tokens, index, history) -> featureset

        @see: L{classifier()}
        """
        return self._feature_detector(tokens, index, history)

    def classifier(self):
        """
        Return the classifier that this tagger uses to choose a tag
        for each word in a sentence.  The input for this classifier is
        generated using this tagger's feature detector.

        @see: L{feature_detector()}
        """
        return self._classifier
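
# Usage sketch (illustrative): any function with the signature
# C{feature_detector(tokens, index, history) -> featureset} can drive
# the tagger.  Assuming a tagged corpus C{train_sents} as in the
# earlier examples:
#
#     >>> def fd(tokens, index, history):
#     ...     return {'word': tokens[index].lower(),
#     ...             'suffix3': tokens[index].lower()[-3:],
#     ...             'prevtag': history[index-1] if index else None}
#     >>> ct = ClassifierBasedTagger(feature_detector=fd,
#     ...                            train=train_sents)
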
class ClassifierBasedPOSTagger(ClassifierBasedTagger):
    """
    A classifier-based part of speech tagger.
    """
    def feature_detector(self, tokens, index, history):
        word = tokens[index]
        if index == 0:
            prevword = prevprevword = None
            prevtag = prevprevtag = None
        elif index == 1:
            prevword = tokens[index-1].lower()
            prevprevword = None
            prevtag = history[index-1]
            prevprevtag = None
        else:
            prevword = tokens[index-1].lower()
            prevprevword = tokens[index-2].lower()
            prevtag = history[index-1]
            prevprevtag = history[index-2]

        # Classify the word's "shape" (number, punctuation, case
        # pattern).  The whole number alternation is grouped before the
        # trailing $, so that e.g. '123abc' is not treated as a number.
        if re.match(r'([0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+)$', word):
            shape = 'number'
        elif re.match(r'\W+$', word):
            shape = 'punct'
        elif re.match(r'[A-Z][a-z]+$', word):
            shape = 'upcase'
        elif re.match(r'[a-z]+$', word):
            shape = 'downcase'
        elif re.match(r'\w+$', word):
            shape = 'mixedcase'
        else:
            shape = 'other'

        features = {
            'prevtag': prevtag,
            'prevprevtag': prevprevtag,
            'word': word,
            'word.lower': word.lower(),
            'suffix3': word.lower()[-3:],
            'suffix2': word.lower()[-2:],
            'suffix1': word.lower()[-1:],
            'prevprevword': prevprevword,
            'prevword': prevword,
            'prevtag+word': '%s+%s' % (prevtag, word.lower()),
            'prevprevtag+word': '%s+%s' % (prevprevtag, word.lower()),
            'prevword+word': '%s+%s' % (prevword, word.lower()),
            'shape': shape,
        }
        return features
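
# Training sketch (illustrative; assumes C{train_sents} as in the
# earlier examples).  Training a NaiveBayes classifier over these
# features can be slow on large corpora, so start with a modest slice:
#
#     >>> cpos = ClassifierBasedPOSTagger(train=train_sents)
#     >>> cpos.tag('the cat sat on the mat'.split())  # doctest: +SKIP
#     [('the', 'AT'), ('cat', 'NN'), ...]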