Package nltk :: Package tag :: Module tnt
[hide private]
[frames] | no frames]

Source Code for Module nltk.tag.tnt

  1  # Natural Language Toolkit: TnT Tagger 
  2  # 
  3  # Copyright (C) 2001-2011 NLTK Project 
  4  # Author: Sam Huston <sjh900@gmail.com> 
  5  #         Steven Bird <sb@csse.unimelb.edu.au> (modifications) 
  6  #          
  7  # URL: <http://www.nltk.org/> 
  8  # For license information, see LICENSE.TXT 
  9   
 10  ''' 
 11  Implementation of 'TnT - A Statisical Part of Speech Tagger' 
 12  by Thorsten Brants 
 13   
 14  http://acl.ldc.upenn.edu/A/A00/A00-1031.pdf 
 15  ''' 
 16   
 17  import nltk 
 18  from api import * 
 19   
20 -class TnT(TaggerI):
21 ''' 22 TnT - Statistical POS tagger 23 24 IMPORTANT NOTES: 25 26 * DOES NOT AUTOMATICALLY DEAL WITH UNSEEN WORDS 27 It is possible to provide an untrained POS tagger to 28 create tags for unknown words, see __init__ function 29 30 * SHOULD BE USED WITH SENTENCE-DELIMITED INPUT 31 - Due to the nature of this tagger, it works best when 32 trained over sentence delimited input. 33 - However it still produces good results if the training 34 data and testing data are separated on all punctuation eg: [,.?!] 35 - Input for training is expected to be a list of sentences 36 where each sentence is a list of (word, tag) tuples 37 - Input for tag function is a single sentence 38 Input for tagdata function is a list of sentences 39 Output is of a similar form 40 41 * Function provided to process text that is unsegmented 42 - Please see basic_sent_chop() 43 44 45 TnT uses a second order Markov model to produce tags for 46 a sequence of input, specifically: 47 48 argmax [Proj(P(t_i|t_i-1,t_i-2)P(w_i|t_i))] P(t_T+1 | t_T) 49 50 IE: the maximum projection of a set of probabilities 51 52 The set of possible tags for a given word is derived 53 from the training data. It is the set of all tags 54 that exact word has been assigned. 55 56 The probability of a tag for a given word is the linear 57 interpolation of 3 markov models; a zero-order, first-order, 58 and a second order model. 59 60 P(t_i| t_i-1, t_i-2) = l1*P(t_i) + l2*P(t_i| t_i-1) + 61 l3*P(t_i| t_i-1, t_i-2) 62 63 A beam search is used to limit the memory usage of the algorithm. 64 The degree of the beam can be changed using N in the initialization. 65 N represents the maximum number of possible solutions to maintain 66 while tagging. 67 68 It is possible to differentiate the tags which are assigned to 69 capitalized words. However this does not result in a significant 70 gain in the accuracy of the results. 71 ''' 72
73 - def __init__(self, unk=None, Trained=False, N=1000, C=False):
74 ''' 75 Construct a TnT statistical tagger. Tagger must be trained 76 before being used to tag input. 77 78 @param unk: instance of a POS tagger, conforms to TaggerI 79 @type unk:(TaggerI) 80 @param Trained: Indication that the POS tagger is trained or not 81 @type Trained: boolean 82 @param N: Beam search degree (see above) 83 @type N:(int) 84 @param C: Capitalization flag 85 @type C: boolean 86 87 Initializer, creates frequency distributions to be used 88 for tagging 89 90 _lx values represent the portion of the tri/bi/uni taggers 91 to be used to calculate the probability 92 93 N value is the number of possible solutions to maintain 94 while tagging. A good value for this is 1000 95 96 C is a boolean value which specifies to use or 97 not use the Capitalization of the word as additional 98 information for tagging. 99 NOTE: using capitalization may not increase the accuracy 100 of the tagger 101 ''' 102 103 self._uni = nltk.probability.FreqDist() 104 self._bi = nltk.probability.ConditionalFreqDist() 105 self._tri = nltk.probability.ConditionalFreqDist() 106 self._wd = nltk.probability.ConditionalFreqDist() 107 self._eos = nltk.probability.ConditionalFreqDist() 108 self._l1 = 0.0 109 self._l2 = 0.0 110 self._l3 = 0.0 111 self._N = N 112 self._C = C 113 self._T = Trained 114 115 self._unk = unk 116 117 # statistical tools (ignore or delete me) 118 self.unknown = 0 119 self.known = 0
120
121 - def train(self, data):
122 ''' 123 Uses a set of tagged data to train the tagger. 124 If an unknown word tagger is specified, 125 it is trained on the same data. 126 127 @param data: List of lists of (word, tag) tuples 128 @type data: L{tuple} of L{str} 129 ''' 130 131 # Ensure that local C flag is initialized before use 132 C = False 133 134 if self._unk != None and self._T == False: 135 self._unk.train(data) 136 137 for sent in data: 138 history = ['BOS', 'BOS'] 139 for w, t in sent: 140 141 # if capitalization is requested, 142 # and the word begins with a capital 143 # set local flag C to True 144 if self._C and w[0].isupper(): C=True 145 146 self._wd[w].inc(t) 147 self._uni.inc((t,C)) 148 self._bi[history[1]].inc((t,C)) 149 self._tri[tuple(history)].inc((t,C)) 150 151 history.append((t,C)) 152 history.pop(0) 153 154 # set local flag C to false for the next word 155 C = False 156 157 self._eos[t].inc('EOS') 158 159 160 # compute lambda values from the trained frequency distributions 161 self._compute_lambda()
162 163 #(debugging -- ignore or delete me) 164 #print "lambdas" 165 #print i, self._l1, i, self._l2, i, self._l3 166 167
168 - def _compute_lambda(self):
169 ''' 170 creates lambda values based upon training data 171 172 NOTE: no need to explicitly reference C, 173 it is contained within the tag variable :: tag == (tag,C) 174 175 for each tag trigram (t1, t2, t3) 176 depending on the maximum value of 177 - f(t1,t2,t3)-1 / f(t1,t2)-1 178 - f(t2,t3)-1 / f(t2)-1 179 - f(t3)-1 / N-1 180 181 increment l3,l2, or l1 by f(t1,t2,t3) 182 183 ISSUES -- Resolutions: 184 if 2 values are equal, increment both lambda values 185 by (f(t1,t2,t3) / 2) 186 ''' 187 188 # temporary lambda variables 189 tl1 = 0.0 190 tl2 = 0.0 191 tl3 = 0.0 192 193 # for each t1,t2 in system 194 for history in self._tri.conditions(): 195 (h1, h2) = history 196 197 # for each t3 given t1,t2 in system 198 # (NOTE: tag actually represents (tag,C)) 199 # However no effect within this function 200 for tag in self._tri[history].samples(): 201 202 # if there has only been 1 occurance of this tag in the data 203 # then ignore this trigram. 204 if self._uni[tag] == 1: 205 continue 206 207 # safe_div provides a safe floating point division 208 # it returns -1 if the denominator is 0 209 c3 = self._safe_div((self._tri[history][tag]-1), (self._tri[history].N()-1)) 210 c2 = self._safe_div((self._bi[h2][tag]-1), (self._bi[h2].N()-1)) 211 c1 = self._safe_div((self._uni[tag]-1), (self._uni.N()-1)) 212 213 214 # if c1 is the maximum value: 215 if (c1 > c3) and (c1 > c2): 216 tl1 += self._tri[history][tag] 217 218 # if c2 is the maximum value 219 elif (c2 > c3) and (c2 > c1): 220 tl2 += self._tri[history][tag] 221 222 # if c3 is the maximum value 223 elif (c3 > c2) and (c3 > c1): 224 tl3 += self._tri[history][tag] 225 226 # if c3, and c2 are equal and larger than c1 227 elif (c3 == c2) and (c3 > c1): 228 tl2 += float(self._tri[history][tag]) /2.0 229 tl3 += float(self._tri[history][tag]) /2.0 230 231 # if c1, and c2 are equal and larger than c3 232 # this might be a dumb thing to do....(not sure yet) 233 elif (c2 == c1) and (c1 > c3): 234 tl1 += float(self._tri[history][tag]) /2.0 235 tl2 += float(self._tri[history][tag]) /2.0 236 237 # otherwise there might be a problem 238 # eg: all values = 0 239 else: 240 #print "Problem", c1, c2 ,c3 241 pass 242 243 # Lambda normalisation: 244 # ensures that l1+l2+l3 = 1 245 self._l1 = tl1 / (tl1+tl2+tl3) 246 self._l2 = tl2 / (tl1+tl2+tl3) 247 self._l3 = tl3 / (tl1+tl2+tl3)
248 249 250
251 - def _safe_div(self, v1, v2):
252 ''' 253 Safe floating point division function, does not allow division by 0 254 returns -1 if the denominator is 0 255 ''' 256 if v2 == 0: 257 return -1 258 else: 259 return float(v1) / float(v2)
260
261 - def tagdata(self, data):
262 ''' 263 Tags each sentence in a list of sentences 264 265 @param data:list of list of words 266 @type data: [[string,],] 267 @return: list of list of (word, tag) tuples 268 269 Invokes tag(sent) function for each sentence 270 compiles the results into a list of tagged sentences 271 each tagged sentence is a list of (word, tag) tuples 272 ''' 273 res = [] 274 for sent in data: 275 res1 = self.tag(sent) 276 res.append(res1) 277 return res
278 279
280 - def tag(self, data):
281 ''' 282 Tags a single sentence 283 284 @param data: list of words 285 @type data: [string,] 286 287 @return: [(word, tag),] 288 289 Calls recursive function '_tagword' 290 to produce a list of tags 291 292 Associates the sequence of returned tags 293 with the correct words in the input sequence 294 295 returns a list of (word, tag) tuples 296 ''' 297 298 current_state = [(['BOS', 'BOS'], 1.0)] 299 300 sent = list(data) 301 302 tags = self._tagword(sent, current_state) 303 304 res = [] 305 for i in range(len(sent)): 306 # unpack and discard the C flags 307 (t,C) = tags[i+2] 308 res.append((sent[i], t)) 309 310 return res
311 312
313 - def _tagword(self, sent, current_states):
314 ''' 315 @param sent : List of words remaining in the sentence 316 @type sent : [word,] 317 @param current_states : List of possible tag combinations for 318 the sentence so far, and the probability 319 associated with each tag combination 320 @type current_states : [([tag, ],prob), ] 321 322 Tags the first word in the sentence and 323 recursively tags the reminder of sentence 324 325 Uses formula specified above to calculate the probability 326 of a particular tag 327 ''' 328 329 # if this word marks the end of the sentance, 330 # return the most probable tag 331 if sent == []: 332 (h,p) = current_states[0] 333 return h 334 335 # otherwise there are more words to be tagged 336 word = sent[0] 337 sent = sent[1:] 338 new_states = [] 339 340 # if the Capitalisation is requested, 341 # initalise the flag for this word 342 C = False 343 if self._C and word[0].isupper(): C=True 344 345 # if word is known 346 # compute the set of possible tags 347 # and their associated probabilities 348 if word in self._wd.conditions(): 349 self.known += 1 350 351 for (history, curr_sent_prob) in current_states: 352 probs = [] 353 354 for t in self._wd[word].samples(): 355 p_uni = self._uni.freq((t,C)) 356 p_bi = self._bi[history[-1]].freq((t,C)) 357 p_tri = self._tri[tuple(history[-2:])].freq((t,C)) 358 p_wd = float(self._wd[word][t])/float(self._uni[(t,C)]) 359 p = self._l1 *p_uni + self._l2 *p_bi + self._l3 *p_tri 360 p2 = p * p_wd 361 362 probs.append(((t,C), p2)) 363 364 365 # compute the result of appending each tag to this history 366 for (tag, prob) in probs: 367 new_states.append((history + [tag], curr_sent_prob*prob)) 368 369 370 371 372 # otherwise a new word, set of possible tags is unknown 373 else: 374 self.unknown += 1 375 376 # since a set of possible tags, 377 # and the probability of each specific tag 378 # can not be returned from most classifiers: 379 # specify that any unknown words are tagged with certainty 380 p = 1 381 382 # if no unknown word tagger has been specified 383 # then use the tag 'Unk' 384 if self._unk == None: 385 tag = ('Unk',C) 386 387 # otherwise apply the unknown word tagger 388 else : 389 [(_w, t)] = list(self._unk.tag([word])) 390 tag = (t,C) 391 392 for (history, prob) in current_states: 393 history.append(tag) 394 395 new_states = current_states 396 397 398 399 # now have computed a set of possible new_states 400 401 # sort states by prob 402 # _cmp_tup is a comparison function, 403 # set is now ordered greatest to least probability 404 new_states.sort(self._cmp_tup) 405 406 # del everything after N (threshold) 407 # this is the beam search cut 408 if len(new_states) > self._N: 409 new_states = new_states[:self._N] 410 411 412 # compute the tags for the rest of the sentence 413 # return the best list of tags for the sentence 414 return self._tagword(sent, new_states)
415 416 417
418 - def _cmp_tup(self, (_hq, p1), (_h2, p2)):
419 ''' 420 comparison function 421 422 @params : (_, prob) 423 @types : (_, int) tuple 424 425 used to sort a list of these tuples 426 into descending order 427 ''' 428 if (p2-p1) > 0: 429 return 1 430 else: 431 return -1
432 433 434 ######################################## 435 # helper function -- basic sentence tokenizer 436 ######################################## 437
438 -def basic_sent_chop(data, raw=True):
439 ''' 440 Basic method for tokenizing input into sentences 441 for this tagger: 442 443 @param data: list of tokens 444 tokens can be either 445 words or (word, tag) tuples 446 @type data: [string,] 447 or [(string, string),] 448 449 @param raw: boolean flag marking the input data 450 as a list of words or a list of tagged words 451 @type raw: Boolean 452 453 @ret : list of sentences 454 sentences are a list of tokens 455 tokens are the same as the input 456 457 Function takes a list of tokens and separates the tokens into lists 458 where each list represents a sentence fragment 459 This function can separate both tagged and raw sequences into 460 basic sentences. 461 462 Sentence markers are the set of [,.!?] 463 464 This is a simple method which enhances the performance of the TnT 465 tagger. Better sentence tokenization will further enhance the results. 466 ''' 467 468 new_data = [] 469 curr_sent = [] 470 sent_mark = [',','.','?','!'] 471 472 473 if raw: 474 for word in data: 475 if word in sent_mark: 476 curr_sent.append(word) 477 new_data.append(curr_sent) 478 curr_sent = [] 479 else: 480 curr_sent.append(word) 481 482 else: 483 for (word,tag) in data: 484 if word in sent_mark: 485 curr_sent.append((word,tag)) 486 new_data.append(curr_sent) 487 curr_sent = [] 488 else: 489 curr_sent.append((word,tag)) 490 return new_data
491 492 493
494 -def demo():
495 from nltk.tag import tnt 496 from nltk.corpus import brown 497 sents = list(brown.tagged_sents()) 498 test = list(brown.sents()) 499 500 # create and train the tagger 501 tagger = tnt.TnT() 502 tagger.train(sents[200:1000]) 503 504 # tag some data 505 tagged_data = tagger.tagdata(test[100:120]) 506 507 # print results 508 for j in range(len(tagged_data)): 509 s = tagged_data[j] 510 t = sents[j+100] 511 for i in range(len(s)): 512 print s[i],'--', t[i] 513 print
514 515
516 -def demo2():
517 from nltk import tag 518 from nltk.tag import tnt 519 from nltk.corpus import treebank 520 521 d = list(treebank.tagged_sents()) 522 523 t = tnt.TnT(N=1000, C=False) 524 s = tnt.TnT(N=1000, C=True) 525 t.train(d[(11)*100:]) 526 s.train(d[(11)*100:]) 527 528 for i in range(10): 529 tacc = tag.accuracy(t, d[i*100:((i+1)*100)]) 530 tp_un = float(t.unknown) / float(t.known +t.unknown) 531 tp_kn = float(t.known) / float(t.known + t.unknown) 532 t.unknown = 0 533 t.known = 0 534 535 print 'Capitalization off:' 536 print 'Accuracy:', tacc 537 print 'Percentage known:', tp_kn 538 print 'Percentage unknown:', tp_un 539 print 'Accuracy over known words:', (tacc / tp_kn) 540 541 sacc = tag.accuracy(s, d[i*100:((i+1)*100)]) 542 sp_un = float(s.unknown) / float(s.known +s.unknown) 543 sp_kn = float(s.known) / float(s.known + s.unknown) 544 s.unknown = 0 545 s.known = 0 546 547 print 'Capitalization on:' 548 print 'Accuracy:', sacc 549 print 'Percentage known:', sp_kn 550 print 'Percentage unknown:', sp_un 551 print 'Accuracy over known words:', (sacc / sp_kn)
552
553 -def demo3():
554 from nltk import tag 555 from nltk.corpus import treebank, brown 556 from nltk.tag import tnt 557 558 d = list(treebank.tagged_sents()) 559 e = list(brown.tagged_sents()) 560 561 d = d[:1000] 562 e = e[:1000] 563 564 d10 = int(len(d)*0.1) 565 e10 = int(len(e)*0.1) 566 567 tknacc = 0 568 sknacc = 0 569 tallacc = 0 570 sallacc = 0 571 tknown = 0 572 sknown = 0 573 574 for i in range(10): 575 576 t = tnt.TnT(N=1000, C=False) 577 s = tnt.TnT(N=1000, C=False) 578 579 dtest = d[(i*d10):((i+1)*d10)] 580 etest = e[(i*e10):((i+1)*e10)] 581 582 dtrain = d[:(i*d10)] + d[((i+1)*d10):] 583 etrain = e[:(i*e10)] + e[((i+1)*e10):] 584 585 t.train(dtrain) 586 s.train(etrain) 587 588 tacc = tag.accuracy(t, dtest) 589 tp_un = float(t.unknown) / float(t.known +t.unknown) 590 tp_kn = float(t.known) / float(t.known + t.unknown) 591 tknown += tp_kn 592 t.unknown = 0 593 t.known = 0 594 595 sacc = tag.accuracy(s, etest) 596 sp_un = float(s.unknown) / float(s.known + s.unknown) 597 sp_kn = float(s.known) / float(s.known + s.unknown) 598 sknown += sp_kn 599 s.unknown = 0 600 s.known = 0 601 602 tknacc += (tacc / tp_kn) 603 sknacc += (sacc / tp_kn) 604 tallacc += tacc 605 sallacc += sacc 606 607 #print i+1, (tacc / tp_kn), i+1, (sacc / tp_kn), i+1, tacc, i+1, sacc 608 609 610 print "brown: acc over words known:", 10 * tknacc 611 print " : overall accuracy:", 10 * tallacc 612 print " : words known:", 10 * tknown 613 print "treebank: acc over words known:", 10 * sknacc 614 print " : overall accuracy:", 10 * sallacc 615 print " : words known:", 10 * sknown
616 617 618 if __name__ == "__main__": 619 demo() 620