1
2
3
4
5
6
7
8
9
10 '''
11 Implementation of 'TnT - A Statisical Part of Speech Tagger'
12 by Thorsten Brants
13
14 http://acl.ldc.upenn.edu/A/A00/A00-1031.pdf
15 '''
16
17 import nltk
18 from api import *
19
21 '''
22 TnT - Statistical POS tagger
23
24 IMPORTANT NOTES:
25
26 * DOES NOT AUTOMATICALLY DEAL WITH UNSEEN WORDS
27 It is possible to provide an untrained POS tagger to
28 create tags for unknown words, see __init__ function
29
30 * SHOULD BE USED WITH SENTENCE-DELIMITED INPUT
31 - Due to the nature of this tagger, it works best when
32 trained over sentence delimited input.
33 - However it still produces good results if the training
34 data and testing data are separated on all punctuation eg: [,.?!]
35 - Input for training is expected to be a list of sentences
36 where each sentence is a list of (word, tag) tuples
37 - Input for tag function is a single sentence
38 Input for tagdata function is a list of sentences
39 Output is of a similar form
40
41 * Function provided to process text that is unsegmented
42 - Please see basic_sent_chop()
43
44
45 TnT uses a second order Markov model to produce tags for
46 a sequence of input, specifically:
47
48 argmax [Proj(P(t_i|t_i-1,t_i-2)P(w_i|t_i))] P(t_T+1 | t_T)
49
50 IE: the maximum projection of a set of probabilities
51
52 The set of possible tags for a given word is derived
53 from the training data. It is the set of all tags
54 that exact word has been assigned.
55
56 The probability of a tag for a given word is the linear
57 interpolation of 3 markov models; a zero-order, first-order,
58 and a second order model.
59
60 P(t_i| t_i-1, t_i-2) = l1*P(t_i) + l2*P(t_i| t_i-1) +
61 l3*P(t_i| t_i-1, t_i-2)
62
63 A beam search is used to limit the memory usage of the algorithm.
64 The degree of the beam can be changed using N in the initialization.
65 N represents the maximum number of possible solutions to maintain
66 while tagging.
67
68 It is possible to differentiate the tags which are assigned to
69 capitalized words. However this does not result in a significant
70 gain in the accuracy of the results.
71 '''
72
73 - def __init__(self, unk=None, Trained=False, N=1000, C=False):
74 '''
75 Construct a TnT statistical tagger. Tagger must be trained
76 before being used to tag input.
77
78 @param unk: instance of a POS tagger, conforms to TaggerI
79 @type unk:(TaggerI)
80 @param Trained: Indication that the POS tagger is trained or not
81 @type Trained: boolean
82 @param N: Beam search degree (see above)
83 @type N:(int)
84 @param C: Capitalization flag
85 @type C: boolean
86
87 Initializer, creates frequency distributions to be used
88 for tagging
89
90 _lx values represent the portion of the tri/bi/uni taggers
91 to be used to calculate the probability
92
93 N value is the number of possible solutions to maintain
94 while tagging. A good value for this is 1000
95
96 C is a boolean value which specifies to use or
97 not use the Capitalization of the word as additional
98 information for tagging.
99 NOTE: using capitalization may not increase the accuracy
100 of the tagger
101 '''
102
103 self._uni = nltk.probability.FreqDist()
104 self._bi = nltk.probability.ConditionalFreqDist()
105 self._tri = nltk.probability.ConditionalFreqDist()
106 self._wd = nltk.probability.ConditionalFreqDist()
107 self._eos = nltk.probability.ConditionalFreqDist()
108 self._l1 = 0.0
109 self._l2 = 0.0
110 self._l3 = 0.0
111 self._N = N
112 self._C = C
113 self._T = Trained
114
115 self._unk = unk
116
117
118 self.unknown = 0
119 self.known = 0
120
122 '''
123 Uses a set of tagged data to train the tagger.
124 If an unknown word tagger is specified,
125 it is trained on the same data.
126
127 @param data: List of lists of (word, tag) tuples
128 @type data: L{tuple} of L{str}
129 '''
130
131
132 C = False
133
134 if self._unk != None and self._T == False:
135 self._unk.train(data)
136
137 for sent in data:
138 history = ['BOS', 'BOS']
139 for w, t in sent:
140
141
142
143
144 if self._C and w[0].isupper(): C=True
145
146 self._wd[w].inc(t)
147 self._uni.inc((t,C))
148 self._bi[history[1]].inc((t,C))
149 self._tri[tuple(history)].inc((t,C))
150
151 history.append((t,C))
152 history.pop(0)
153
154
155 C = False
156
157 self._eos[t].inc('EOS')
158
159
160
161 self._compute_lambda()
162
163
164
165
166
167
169 '''
170 creates lambda values based upon training data
171
172 NOTE: no need to explicitly reference C,
173 it is contained within the tag variable :: tag == (tag,C)
174
175 for each tag trigram (t1, t2, t3)
176 depending on the maximum value of
177 - f(t1,t2,t3)-1 / f(t1,t2)-1
178 - f(t2,t3)-1 / f(t2)-1
179 - f(t3)-1 / N-1
180
181 increment l3,l2, or l1 by f(t1,t2,t3)
182
183 ISSUES -- Resolutions:
184 if 2 values are equal, increment both lambda values
185 by (f(t1,t2,t3) / 2)
186 '''
187
188
189 tl1 = 0.0
190 tl2 = 0.0
191 tl3 = 0.0
192
193
194 for history in self._tri.conditions():
195 (h1, h2) = history
196
197
198
199
200 for tag in self._tri[history].samples():
201
202
203
204 if self._uni[tag] == 1:
205 continue
206
207
208
209 c3 = self._safe_div((self._tri[history][tag]-1), (self._tri[history].N()-1))
210 c2 = self._safe_div((self._bi[h2][tag]-1), (self._bi[h2].N()-1))
211 c1 = self._safe_div((self._uni[tag]-1), (self._uni.N()-1))
212
213
214
215 if (c1 > c3) and (c1 > c2):
216 tl1 += self._tri[history][tag]
217
218
219 elif (c2 > c3) and (c2 > c1):
220 tl2 += self._tri[history][tag]
221
222
223 elif (c3 > c2) and (c3 > c1):
224 tl3 += self._tri[history][tag]
225
226
227 elif (c3 == c2) and (c3 > c1):
228 tl2 += float(self._tri[history][tag]) /2.0
229 tl3 += float(self._tri[history][tag]) /2.0
230
231
232
233 elif (c2 == c1) and (c1 > c3):
234 tl1 += float(self._tri[history][tag]) /2.0
235 tl2 += float(self._tri[history][tag]) /2.0
236
237
238
239 else:
240
241 pass
242
243
244
245 self._l1 = tl1 / (tl1+tl2+tl3)
246 self._l2 = tl2 / (tl1+tl2+tl3)
247 self._l3 = tl3 / (tl1+tl2+tl3)
248
249
250
252 '''
253 Safe floating point division function, does not allow division by 0
254 returns -1 if the denominator is 0
255 '''
256 if v2 == 0:
257 return -1
258 else:
259 return float(v1) / float(v2)
260
262 '''
263 Tags each sentence in a list of sentences
264
265 @param data:list of list of words
266 @type data: [[string,],]
267 @return: list of list of (word, tag) tuples
268
269 Invokes tag(sent) function for each sentence
270 compiles the results into a list of tagged sentences
271 each tagged sentence is a list of (word, tag) tuples
272 '''
273 res = []
274 for sent in data:
275 res1 = self.tag(sent)
276 res.append(res1)
277 return res
278
279
280 - def tag(self, data):
281 '''
282 Tags a single sentence
283
284 @param data: list of words
285 @type data: [string,]
286
287 @return: [(word, tag),]
288
289 Calls recursive function '_tagword'
290 to produce a list of tags
291
292 Associates the sequence of returned tags
293 with the correct words in the input sequence
294
295 returns a list of (word, tag) tuples
296 '''
297
298 current_state = [(['BOS', 'BOS'], 1.0)]
299
300 sent = list(data)
301
302 tags = self._tagword(sent, current_state)
303
304 res = []
305 for i in range(len(sent)):
306
307 (t,C) = tags[i+2]
308 res.append((sent[i], t))
309
310 return res
311
312
313 - def _tagword(self, sent, current_states):
314 '''
315 @param sent : List of words remaining in the sentence
316 @type sent : [word,]
317 @param current_states : List of possible tag combinations for
318 the sentence so far, and the probability
319 associated with each tag combination
320 @type current_states : [([tag, ],prob), ]
321
322 Tags the first word in the sentence and
323 recursively tags the reminder of sentence
324
325 Uses formula specified above to calculate the probability
326 of a particular tag
327 '''
328
329
330
331 if sent == []:
332 (h,p) = current_states[0]
333 return h
334
335
336 word = sent[0]
337 sent = sent[1:]
338 new_states = []
339
340
341
342 C = False
343 if self._C and word[0].isupper(): C=True
344
345
346
347
348 if word in self._wd.conditions():
349 self.known += 1
350
351 for (history, curr_sent_prob) in current_states:
352 probs = []
353
354 for t in self._wd[word].samples():
355 p_uni = self._uni.freq((t,C))
356 p_bi = self._bi[history[-1]].freq((t,C))
357 p_tri = self._tri[tuple(history[-2:])].freq((t,C))
358 p_wd = float(self._wd[word][t])/float(self._uni[(t,C)])
359 p = self._l1 *p_uni + self._l2 *p_bi + self._l3 *p_tri
360 p2 = p * p_wd
361
362 probs.append(((t,C), p2))
363
364
365
366 for (tag, prob) in probs:
367 new_states.append((history + [tag], curr_sent_prob*prob))
368
369
370
371
372
373 else:
374 self.unknown += 1
375
376
377
378
379
380 p = 1
381
382
383
384 if self._unk == None:
385 tag = ('Unk',C)
386
387
388 else :
389 [(_w, t)] = list(self._unk.tag([word]))
390 tag = (t,C)
391
392 for (history, prob) in current_states:
393 history.append(tag)
394
395 new_states = current_states
396
397
398
399
400
401
402
403
404 new_states.sort(self._cmp_tup)
405
406
407
408 if len(new_states) > self._N:
409 new_states = new_states[:self._N]
410
411
412
413
414 return self._tagword(sent, new_states)
415
416
417
418 - def _cmp_tup(self, (_hq, p1), (_h2, p2)):
419 '''
420 comparison function
421
422 @params : (_, prob)
423 @types : (_, int) tuple
424
425 used to sort a list of these tuples
426 into descending order
427 '''
428 if (p2-p1) > 0:
429 return 1
430 else:
431 return -1
432
433
434
435
436
437
439 '''
440 Basic method for tokenizing input into sentences
441 for this tagger:
442
443 @param data: list of tokens
444 tokens can be either
445 words or (word, tag) tuples
446 @type data: [string,]
447 or [(string, string),]
448
449 @param raw: boolean flag marking the input data
450 as a list of words or a list of tagged words
451 @type raw: Boolean
452
453 @ret : list of sentences
454 sentences are a list of tokens
455 tokens are the same as the input
456
457 Function takes a list of tokens and separates the tokens into lists
458 where each list represents a sentence fragment
459 This function can separate both tagged and raw sequences into
460 basic sentences.
461
462 Sentence markers are the set of [,.!?]
463
464 This is a simple method which enhances the performance of the TnT
465 tagger. Better sentence tokenization will further enhance the results.
466 '''
467
468 new_data = []
469 curr_sent = []
470 sent_mark = [',','.','?','!']
471
472
473 if raw:
474 for word in data:
475 if word in sent_mark:
476 curr_sent.append(word)
477 new_data.append(curr_sent)
478 curr_sent = []
479 else:
480 curr_sent.append(word)
481
482 else:
483 for (word,tag) in data:
484 if word in sent_mark:
485 curr_sent.append((word,tag))
486 new_data.append(curr_sent)
487 curr_sent = []
488 else:
489 curr_sent.append((word,tag))
490 return new_data
491
492
493
514
515
517 from nltk import tag
518 from nltk.tag import tnt
519 from nltk.corpus import treebank
520
521 d = list(treebank.tagged_sents())
522
523 t = tnt.TnT(N=1000, C=False)
524 s = tnt.TnT(N=1000, C=True)
525 t.train(d[(11)*100:])
526 s.train(d[(11)*100:])
527
528 for i in range(10):
529 tacc = tag.accuracy(t, d[i*100:((i+1)*100)])
530 tp_un = float(t.unknown) / float(t.known +t.unknown)
531 tp_kn = float(t.known) / float(t.known + t.unknown)
532 t.unknown = 0
533 t.known = 0
534
535 print 'Capitalization off:'
536 print 'Accuracy:', tacc
537 print 'Percentage known:', tp_kn
538 print 'Percentage unknown:', tp_un
539 print 'Accuracy over known words:', (tacc / tp_kn)
540
541 sacc = tag.accuracy(s, d[i*100:((i+1)*100)])
542 sp_un = float(s.unknown) / float(s.known +s.unknown)
543 sp_kn = float(s.known) / float(s.known + s.unknown)
544 s.unknown = 0
545 s.known = 0
546
547 print 'Capitalization on:'
548 print 'Accuracy:', sacc
549 print 'Percentage known:', sp_kn
550 print 'Percentage unknown:', sp_un
551 print 'Accuracy over known words:', (sacc / sp_kn)
552
554 from nltk import tag
555 from nltk.corpus import treebank, brown
556 from nltk.tag import tnt
557
558 d = list(treebank.tagged_sents())
559 e = list(brown.tagged_sents())
560
561 d = d[:1000]
562 e = e[:1000]
563
564 d10 = int(len(d)*0.1)
565 e10 = int(len(e)*0.1)
566
567 tknacc = 0
568 sknacc = 0
569 tallacc = 0
570 sallacc = 0
571 tknown = 0
572 sknown = 0
573
574 for i in range(10):
575
576 t = tnt.TnT(N=1000, C=False)
577 s = tnt.TnT(N=1000, C=False)
578
579 dtest = d[(i*d10):((i+1)*d10)]
580 etest = e[(i*e10):((i+1)*e10)]
581
582 dtrain = d[:(i*d10)] + d[((i+1)*d10):]
583 etrain = e[:(i*e10)] + e[((i+1)*e10):]
584
585 t.train(dtrain)
586 s.train(etrain)
587
588 tacc = tag.accuracy(t, dtest)
589 tp_un = float(t.unknown) / float(t.known +t.unknown)
590 tp_kn = float(t.known) / float(t.known + t.unknown)
591 tknown += tp_kn
592 t.unknown = 0
593 t.known = 0
594
595 sacc = tag.accuracy(s, etest)
596 sp_un = float(s.unknown) / float(s.known + s.unknown)
597 sp_kn = float(s.known) / float(s.known + s.unknown)
598 sknown += sp_kn
599 s.unknown = 0
600 s.known = 0
601
602 tknacc += (tacc / tp_kn)
603 sknacc += (sacc / tp_kn)
604 tallacc += tacc
605 sallacc += sacc
606
607
608
609
610 print "brown: acc over words known:", 10 * tknacc
611 print " : overall accuracy:", 10 * tallacc
612 print " : words known:", 10 * tknown
613 print "treebank: acc over words known:", 10 * sknacc
614 print " : overall accuracy:", 10 * sallacc
615 print " : words known:", 10 * sknown
616
617
618 if __name__ == "__main__":
619 demo()
620