Package nltk :: Package chunk :: Module util
[hide private]
[frames] | no frames]

Source Code for Module nltk.chunk.util

  1  # Natural Language Toolkit: Chunk format conversions 
  2  # 
  3  # Copyright (C) 2001-2011 NLTK Project 
  4  # Author: Edward Loper <edloper@gradient.cis.upenn.edu> 
  5  #         Steven Bird <sb@csse.unimelb.edu.au> (minor additions) 
  6  # URL: <http://www.nltk.org/> 
  7  # For license information, see LICENSE.TXT 
  8   
  9  import re 
 10  import string 
 11   
 12  from nltk.tree import Tree 
 13  import nltk.tag.util 
 14   
 15  from api import * 
 16   
 17  ##////////////////////////////////////////////////////// 
 18  ## EVALUATION 
 19  ##////////////////////////////////////////////////////// 
 20   
 21  from nltk.metrics import accuracy as _accuracy 
22 -def accuracy(chunker, gold):
23 """ 24 Score the accuracy of the chunker against the gold standard. 25 Strip the chunk information from the gold standard and rechunk it using 26 the chunker, then compute the accuracy score. 27 28 @type chunker: C{ChunkParserI} 29 @param chunker: The chunker being evaluated. 30 @type gold: C{tree} 31 @param gold: The chunk structures to score the chunker on. 32 @rtype: C{float} 33 """ 34 35 gold_tags = [] 36 test_tags = [] 37 for gold_tree in gold: 38 test_tree = chunker.parse(gold_tree.flatten()) 39 gold_tags += tree2conlltags(gold_tree) 40 test_tags += tree2conlltags(test_tree) 41 42 # print 'GOLD:', gold_tags[:50] 43 # print 'TEST:', test_tags[:50] 44 return _accuracy(gold_tags, test_tags)
45 46 47 # Patched for increased performance by Yoav Goldberg <yoavg@cs.bgu.ac.il>, 2006-01-13 48 # -- statistics are evaluated only on demand, instead of at every sentence evaluation 49 # 50 # SB: use nltk.metrics for precision/recall scoring? 51 #
52 -class ChunkScore(object):
53 """ 54 A utility class for scoring chunk parsers. C{ChunkScore} can 55 evaluate a chunk parser's output, based on a number of statistics 56 (precision, recall, f-measure, misssed chunks, incorrect chunks). 57 It can also combine the scores from the parsing of multiple texts; 58 this makes it signifigantly easier to evaluate a chunk parser that 59 operates one sentence at a time. 60 61 Texts are evaluated with the C{score} method. The results of 62 evaluation can be accessed via a number of accessor methods, such 63 as C{precision} and C{f_measure}. A typical use of the 64 C{ChunkScore} class is:: 65 66 >>> chunkscore = ChunkScore() 67 >>> for correct in correct_sentences: 68 ... guess = chunkparser.parse(correct.leaves()) 69 ... chunkscore.score(correct, guess) 70 >>> print 'F Measure:', chunkscore.f_measure() 71 F Measure: 0.823 72 73 @ivar kwargs: Keyword arguments: 74 75 - max_tp_examples: The maximum number actual examples of true 76 positives to record. This affects the C{correct} member 77 function: C{correct} will not return more than this number 78 of true positive examples. This does *not* affect any of 79 the numerical metrics (precision, recall, or f-measure) 80 81 - max_fp_examples: The maximum number actual examples of false 82 positives to record. This affects the C{incorrect} member 83 function and the C{guessed} member function: C{incorrect} 84 will not return more than this number of examples, and 85 C{guessed} will not return more than this number of true 86 positive examples. This does *not* affect any of the 87 numerical metrics (precision, recall, or f-measure) 88 89 - max_fn_examples: The maximum number actual examples of false 90 negatives to record. This affects the C{missed} member 91 function and the C{correct} member function: C{missed} 92 will not return more than this number of examples, and 93 C{correct} will not return more than this number of true 94 negative examples. This does *not* affect any of the 95 numerical metrics (precision, recall, or f-measure) 96 97 - chunk_node: A regular expression indicating which chunks 98 should be compared. Defaults to C{'.*'} (i.e., all chunks). 99 100 @type _tp: C{list} of C{Token} 101 @ivar _tp: List of true positives 102 @type _fp: C{list} of C{Token} 103 @ivar _fp: List of false positives 104 @type _fn: C{list} of C{Token} 105 @ivar _fn: List of false negatives 106 107 @type _tp_num: C{int} 108 @ivar _tp_num: Number of true positives 109 @type _fp_num: C{int} 110 @ivar _fp_num: Number of false positives 111 @type _fn_num: C{int} 112 @ivar _fn_num: Number of false negatives. 113 """
114 - def __init__(self, **kwargs):
115 self._correct = set() 116 self._guessed = set() 117 self._tp = set() 118 self._fp = set() 119 self._fn = set() 120 self._max_tp = kwargs.get('max_tp_examples', 100) 121 self._max_fp = kwargs.get('max_fp_examples', 100) 122 self._max_fn = kwargs.get('max_fn_examples', 100) 123 self._chunk_node = kwargs.get('chunk_node', '.*') 124 self._tp_num = 0 125 self._fp_num = 0 126 self._fn_num = 0 127 self._count = 0 128 self._tags_correct = 0.0 129 self._tags_total = 0.0 130 131 self._measuresNeedUpdate = False
132
133 - def _updateMeasures(self):
134 if (self._measuresNeedUpdate): 135 self._tp = self._guessed & self._correct 136 self._fn = self._correct - self._guessed 137 self._fp = self._guessed - self._correct 138 self._tp_num = len(self._tp) 139 self._fp_num = len(self._fp) 140 self._fn_num = len(self._fn) 141 self._measuresNeedUpdate = False
142
143 - def score(self, correct, guessed):
144 """ 145 Given a correctly chunked sentence, score another chunked 146 version of the same sentence. 147 148 @type correct: chunk structure 149 @param correct: The known-correct ("gold standard") chunked 150 sentence. 151 @type guessed: chunk structure 152 @param guessed: The chunked sentence to be scored. 153 """ 154 self._correct |= _chunksets(correct, self._count, self._chunk_node) 155 self._guessed |= _chunksets(guessed, self._count, self._chunk_node) 156 self._count += 1 157 self._measuresNeedUpdate = True 158 # Keep track of per-tag accuracy (if possible) 159 try: 160 correct_tags = tree2conlltags(correct) 161 guessed_tags = tree2conlltags(guessed) 162 except ValueError: 163 # This exception case is for nested chunk structures, 164 # where tree2conlltags will fail with a ValueError: "Tree 165 # is too deeply nested to be printed in CoNLL format." 166 correct_tags = guessed_tags = () 167 self._tags_total += len(correct_tags) 168 self._tags_correct += sum(1 for (t,g) in zip(guessed_tags, 169 correct_tags) 170 if t==g)
171
172 - def accuracy(self):
173 """ 174 @return: The overall tag-based accuracy for all text that have 175 been scored by this C{ChunkScore}, using the IOB (conll2000) 176 tag encoding. 177 """ 178 if self._tags_total == 0: return 1 179 return self._tags_correct/self._tags_total
180
181 - def precision(self):
182 """ 183 @return: the overall precision for all texts that have been 184 scored by this C{ChunkScore}. 185 @rtype: C{float} 186 """ 187 self._updateMeasures() 188 div = self._tp_num + self._fp_num 189 if div == 0: return 0 190 else: return float(self._tp_num) / div
191
192 - def recall(self):
193 """ 194 @return: the overall recall for all texts that have been 195 scored by this C{ChunkScore}. 196 @rtype: C{float} 197 """ 198 self._updateMeasures() 199 div = self._tp_num + self._fn_num 200 if div == 0: return 0 201 else: return float(self._tp_num) / div
202
203 - def f_measure(self, alpha=0.5):
204 """ 205 @return: the overall F measure for all texts that have been 206 scored by this C{ChunkScore}. 207 @rtype: C{float} 208 209 @param alpha: the relative weighting of precision and recall. 210 Larger alpha biases the score towards the precision value, 211 while smaller alpha biases the score towards the recall 212 value. C{alpha} should have a value in the range [0,1]. 213 @type alpha: C{float} 214 """ 215 self._updateMeasures() 216 p = self.precision() 217 r = self.recall() 218 if p == 0 or r == 0: # what if alpha is 0 or 1? 219 return 0 220 return 1/(alpha/p + (1-alpha)/r)
221
222 - def missed(self):
223 """ 224 @rtype: C{list} of chunks 225 @return: the chunks which were included in the 226 correct chunk structures, but not in the guessed chunk 227 structures, listed in input order. 228 """ 229 self._updateMeasures() 230 chunks = list(self._fn) 231 return [c[1] for c in chunks] # discard position information
232
233 - def incorrect(self):
234 """ 235 @rtype: C{list} of chunks 236 @return: the chunks which were included in the 237 guessed chunk structures, but not in the correct chunk 238 structures, listed in input order. 239 """ 240 self._updateMeasures() 241 chunks = list(self._fp) 242 return [c[1] for c in chunks] # discard position information
243
244 - def correct(self):
245 """ 246 @rtype: C{list} of chunks 247 @return: the chunks which were included in the correct 248 chunk structures, listed in input order. 249 """ 250 chunks = list(self._correct) 251 return [c[1] for c in chunks] # discard position information
252
253 - def guessed(self):
254 """ 255 @rtype: C{list} of chunks 256 @return: the chunks which were included in the guessed 257 chunk structures, listed in input order. 258 """ 259 chunks = list(self._guessed) 260 return [c[1] for c in chunks] # discard position information
261
262 - def __len__(self):
263 self._updateMeasures() 264 return self._tp_num + self._fn_num
265
266 - def __repr__(self):
267 """ 268 @rtype: C{String} 269 @return: a concise representation of this C{ChunkScoring}. 270 """ 271 return '<ChunkScoring of '+`len(self)`+' chunks>'
272
273 - def __str__(self):
274 """ 275 @rtype: C{String} 276 @return: a verbose representation of this C{ChunkScoring}. 277 This representation includes the precision, recall, and 278 f-measure scores. For other information about the score, 279 use the accessor methods (e.g., C{missed()} and 280 C{incorrect()}). 281 """ 282 return ("ChunkParse score:\n" + 283 (" IOB Accuracy: %5.1f%%\n" % (self.accuracy()*100)) + 284 (" Precision: %5.1f%%\n" % (self.precision()*100)) + 285 (" Recall: %5.1f%%\n" % (self.recall()*100))+ 286 (" F-Measure: %5.1f%%" % (self.f_measure()*100)))
287 288 # extract chunks, and assign unique id, the absolute position of 289 # the first word of the chunk
290 -def _chunksets(t, count, chunk_node):
291 pos = 0 292 chunks = [] 293 for child in t: 294 if isinstance(child, Tree): 295 if re.match(chunk_node, child.node): 296 chunks.append(((count, pos), child.freeze())) 297 pos += len(child.leaves()) 298 else: 299 pos += 1 300 return set(chunks)
301 302
303 -def tagstr2tree(s, chunk_node="NP", top_node="S", sep='/'):
304 """ 305 Divide a string of bracketted tagged text into 306 chunks and unchunked tokens, and produce a C{Tree}. 307 Chunks are marked by square brackets (C{[...]}). Words are 308 delimited by whitespace, and each word should have the form 309 C{I{text}/I{tag}}. Words that do not contain a slash are 310 assigned a C{tag} of C{None}. 311 312 @return: A tree corresponding to the string representation. 313 @rtype: C{tree} 314 @param s: The string to be converted 315 @type s: C{string} 316 @param chunk_node: The label to use for chunk nodes 317 @type chunk_node: C{string} 318 @param top_node: The label to use for the root of the tree 319 @type top_node: C{string} 320 """ 321 322 WORD_OR_BRACKET = re.compile(r'\[|\]|[^\[\]\s]+') 323 324 stack = [Tree(top_node, [])] 325 for match in WORD_OR_BRACKET.finditer(s): 326 text = match.group() 327 if text[0] == '[': 328 if len(stack) != 1: 329 raise ValueError('Unexpected [ at char %d' % match.start()) 330 chunk = Tree(chunk_node, []) 331 stack[-1].append(chunk) 332 stack.append(chunk) 333 elif text[0] == ']': 334 if len(stack) != 2: 335 raise ValueError('Unexpected ] at char %d' % match.start()) 336 stack.pop() 337 else: 338 if sep is None: 339 stack[-1].append(text) 340 else: 341 stack[-1].append(nltk.tag.util.str2tuple(text, sep)) 342 343 if len(stack) != 1: 344 raise ValueError('Expected ] at char %d' % len(s)) 345 return stack[0]
346 347 ### CONLL 348 349 _LINE_RE = re.compile('(\S+)\s+(\S+)\s+([IOB])-?(\S+)?')
350 -def conllstr2tree(s, chunk_types=('NP', 'PP', 'VP'), top_node="S"):
351 """ 352 Convert a CoNLL IOB string into a tree. Uses the specified chunk types 353 (defaults to NP, PP and VP), and creates a tree rooted at a node 354 labeled S (by default). 355 356 @param s: The CoNLL string to be converted. 357 @type s: C{string} 358 @param chunk_types: The chunk types to be converted. 359 @type chunk_types: C{tuple} 360 @param top_node: The node label to use for the root. 361 @type top_node: C{string} 362 @return: A chunk structure for a single sentence 363 encoded in the given CONLL 2000 style string. 364 @rtype: L{Tree} 365 """ 366 367 stack = [Tree(top_node, [])] 368 369 for lineno, line in enumerate(s.split('\n')): 370 if not line.strip(): continue 371 372 # Decode the line. 373 match = _LINE_RE.match(line) 374 if match is None: 375 raise ValueError, 'Error on line %d' % lineno 376 (word, tag, state, chunk_type) = match.groups() 377 378 # If it's a chunk type we don't care about, treat it as O. 379 if (chunk_types is not None and 380 chunk_type not in chunk_types): 381 state = 'O' 382 383 # For "Begin"/"Outside", finish any completed chunks - 384 # also do so for "Inside" which don't match the previous token. 385 mismatch_I = state == 'I' and chunk_type != stack[-1].node 386 if state in 'BO' or mismatch_I: 387 if len(stack) == 2: stack.pop() 388 389 # For "Begin", start a new chunk. 390 if state == 'B' or mismatch_I: 391 chunk = Tree(chunk_type, []) 392 stack[-1].append(chunk) 393 stack.append(chunk) 394 395 # Add the new word token. 396 stack[-1].append((word, tag)) 397 398 return stack[0]
399
400 -def tree2conlltags(t):
401 """ 402 Convert a tree to the CoNLL IOB tag format 403 404 @param t: The tree to be converted. 405 @type t: C{Tree} 406 @return: A list of 3-tuples containing word, tag and IOB tag. 407 @rtype: C{list} of C{tuple} 408 """ 409 410 tags = [] 411 for child in t: 412 try: 413 category = child.node 414 prefix = "B-" 415 for contents in child: 416 if isinstance(contents, Tree): 417 raise ValueError, "Tree is too deeply nested to be printed in CoNLL format" 418 tags.append((contents[0], contents[1], prefix+category)) 419 prefix = "I-" 420 except AttributeError: 421 tags.append((child[0], child[1], "O")) 422 return tags
423
424 -def conlltags2tree(sentence, chunk_types=('NP','PP','VP'), 425 top_node='S', strict=False):
426 """ 427 Convert the CoNLL IOB format to a tree. 428 """ 429 tree = nltk.Tree(top_node, []) 430 for (word, postag, chunktag) in sentence: 431 if chunktag is None: 432 if strict: 433 raise ValueError("Bad conll tag sequence") 434 else: 435 # Treat as O 436 tree.append((word,postag)) 437 elif chunktag.startswith('B-'): 438 tree.append(nltk.Tree(chunktag[2:], [(word,postag)])) 439 elif chunktag.startswith('I-'): 440 if (len(tree)==0 or not isinstance(tree[-1], nltk.Tree) or 441 tree[-1].node != chunktag[2:]): 442 if strict: 443 raise ValueError("Bad conll tag sequence") 444 else: 445 # Treat as B-* 446 tree.append(nltk.Tree(chunktag[2:], [(word,postag)])) 447 else: 448 tree[-1].append((word,postag)) 449 elif chunktag == 'O': 450 tree.append((word,postag)) 451 else: 452 raise ValueError("Bad conll tag %r" % chunktag) 453 return tree
454
455 -def tree2conllstr(t):
456 """ 457 Convert a tree to the CoNLL IOB string format 458 459 @param t: The tree to be converted. 460 @type t: C{Tree} 461 @return: A multiline string where each line contains a word, tag and IOB tag. 462 @rtype: C{string} 463 """ 464 lines = [string.join(token) for token in tree2conlltags(t)] 465 return '\n'.join(lines)
466 467 ### IEER 468 469 _IEER_DOC_RE = re.compile(r'<DOC>\s*' 470 r'(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>\s*)?' 471 r'(<DOCTYPE>\s*(?P<doctype>.+?)\s*</DOCTYPE>\s*)?' 472 r'(<DATE_TIME>\s*(?P<date_time>.+?)\s*</DATE_TIME>\s*)?' 473 r'<BODY>\s*' 474 r'(<HEADLINE>\s*(?P<headline>.+?)\s*</HEADLINE>\s*)?' 475 r'<TEXT>(?P<text>.*?)</TEXT>\s*' 476 r'</BODY>\s*</DOC>\s*', re.DOTALL) 477 478 _IEER_TYPE_RE = re.compile('<b_\w+\s+[^>]*?type="(?P<type>\w+)"') 479
480 -def _ieer_read_text(s, top_node):
481 stack = [Tree(top_node, [])] 482 # s will be None if there is no headline in the text 483 # return the empty list in place of a Tree 484 if s is None: 485 return [] 486 for piece_m in re.finditer('<[^>]+>|[^\s<]+', s): 487 piece = piece_m.group() 488 try: 489 if piece.startswith('<b_'): 490 m = _IEER_TYPE_RE.match(piece) 491 if m is None: print 'XXXX', piece 492 chunk = Tree(m.group('type'), []) 493 stack[-1].append(chunk) 494 stack.append(chunk) 495 elif piece.startswith('<e_'): 496 stack.pop() 497 # elif piece.startswith('<'): 498 # print "ERROR:", piece 499 # raise ValueError # Unexpected HTML 500 else: 501 stack[-1].append(piece) 502 except (IndexError, ValueError): 503 raise ValueError('Bad IEER string (error at character %d)' % 504 piece_m.start()) 505 if len(stack) != 1: 506 raise ValueError('Bad IEER string') 507 return stack[0]
508
509 -def ieerstr2tree(s, chunk_types = ['LOCATION', 'ORGANIZATION', 'PERSON', 'DURATION', 510 'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE'], top_node="S"):
511 """ 512 Convert a string of chunked tagged text in the IEER named 513 entity format into a chunk structure. Chunks are of several 514 types, LOCATION, ORGANIZATION, PERSON, DURATION, DATE, CARDINAL, 515 PERCENT, MONEY, and MEASURE. 516 517 @return: A chunk structure containing the chunked tagged text that is 518 encoded in the given IEER style string. 519 @rtype: L{Tree} 520 """ 521 522 # Try looking for a single document. If that doesn't work, then just 523 # treat everything as if it was within the <TEXT>...</TEXT>. 524 m = _IEER_DOC_RE.match(s) 525 if m: 526 return { 527 'text': _ieer_read_text(m.group('text'), top_node), 528 'docno': m.group('docno'), 529 'doctype': m.group('doctype'), 530 'date_time': m.group('date_time'), 531 #'headline': m.group('headline') 532 # we want to capture NEs in the headline too! 533 'headline': _ieer_read_text(m.group('headline'), top_node), 534 } 535 else: 536 return _ieer_read_text(s, top_node)
537 538
539 -def demo():
540 541 s = "[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./." 542 import nltk 543 t = nltk.chunk.tagstr2tree(s, chunk_node='NP') 544 print t.pprint() 545 print 546 547 s = """ 548 These DT B-NP 549 research NN I-NP 550 protocols NNS I-NP 551 offer VBP B-VP 552 to TO B-PP 553 the DT B-NP 554 patient NN I-NP 555 not RB O 556 only RB O 557 the DT B-NP 558 very RB I-NP 559 best JJS I-NP 560 therapy NN I-NP 561 which WDT B-NP 562 we PRP B-NP 563 have VBP B-VP 564 established VBN I-VP 565 today NN B-NP 566 but CC B-NP 567 also RB I-NP 568 the DT B-NP 569 hope NN I-NP 570 of IN B-PP 571 something NN B-NP 572 still RB B-ADJP 573 better JJR I-ADJP 574 . . O 575 """ 576 577 conll_tree = conllstr2tree(s, chunk_types=('NP', 'PP')) 578 print conll_tree.pprint() 579 580 # Demonstrate CoNLL output 581 print "CoNLL output:" 582 print nltk.chunk.tree2conllstr(conll_tree) 583 print
584 585 586 if __name__ == '__main__': 587 demo() 588