
Source Code for Module nltk.chunk.named_entity

# Natural Language Toolkit: Named Entity Chunker
#
# Copyright (C) 2001-2011 NLTK Project
# Author: Edward Loper <edloper@gradient.cis.upenn.edu>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Named entity chunker
"""

import os, re, pickle
from nltk.etree import ElementTree as ET
from nltk.chunk.api import *
from nltk.chunk.util import *
import nltk

# This really shouldn't be loaded at import time, but it's used by a
# static method.  Consider loading it lazily?
_short_en_wordlist = set(nltk.corpus.words.words('en-basic'))


class NEChunkParserTagger(nltk.tag.ClassifierBasedTagger):
    """
    The IOB tagger used by the chunk parser.
    """
    def __init__(self, train):
        nltk.tag.ClassifierBasedTagger.__init__(
            self, train=train,
            classifier_builder=self._classifier_builder)

    def _classifier_builder(self, train):
        return nltk.MaxentClassifier.train(train, algorithm='megam',
                                           gaussian_prior_sigma=1,
                                           trace=2)

    def _feature_detector(self, tokens, index, history):
        word = tokens[index][0]
        pos = simplify_pos(tokens[index][1])
        if index == 0:
            prevword = prevprevword = None
            prevpos = prevprevpos = None
            prevtag = prevprevtag = None
        elif index == 1:
            prevword = tokens[index-1][0].lower()
            prevprevword = None
            prevpos = simplify_pos(tokens[index-1][1])
            prevprevpos = None
            # Use the whole previous tag (was history[index-1][0], which
            # kept only its first character), for consistency with the
            # general case below.
            prevtag = history[index-1]
            prevprevtag = None
        else:
            prevword = tokens[index-1][0].lower()
            prevprevword = tokens[index-2][0].lower()
            prevpos = simplify_pos(tokens[index-1][1])
            prevprevpos = simplify_pos(tokens[index-2][1])
            prevtag = history[index-1]
            prevprevtag = history[index-2]
        if index == len(tokens)-1:
            nextword = nextnextword = None
            nextpos = nextnextpos = None
        elif index == len(tokens)-2:
            nextword = tokens[index+1][0].lower()
            nextpos = tokens[index+1][1].lower()
            nextnextword = None
            nextnextpos = None
        else:
            nextword = tokens[index+1][0].lower()
            nextpos = tokens[index+1][1].lower()
            nextnextword = tokens[index+2][0].lower()
            nextnextpos = tokens[index+2][1].lower()

        # 89.6
        features = {
            'bias': True,
            'shape': shape(word),
            'wordlen': len(word),
            'prefix3': word[:3].lower(),
            'suffix3': word[-3:].lower(),
            'pos': pos,
            'word': word,
            'en-wordlist': (word in _short_en_wordlist), # xx!
            'prevtag': prevtag,
            'prevpos': prevpos,
            'nextpos': nextpos,
            'prevword': prevword,
            'nextword': nextword,
            'word+nextpos': '%s+%s' % (word.lower(), nextpos),
            'pos+prevtag': '%s+%s' % (pos, prevtag),
            # Was '%s+%s' % (shape, prevtag), which formatted the shape
            # *function* rather than the shape of the current word.
            'shape+prevtag': '%s+%s' % (shape(word), prevtag),
            }

        return features

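# For illustration, on tokens = [('He', 'PRP'), ('visited', 'VBD'),
# ('Paris', 'NNP')] with history = ['O', 'O'], the detector maps index 2
# ('Paris') to a feature dict along these lines (a sketch traced from the
# code above, not captured output):
#
#     {'bias': True, 'shape': 'upcase', 'wordlen': 5,
#      'prefix3': 'par', 'suffix3': 'ris', 'pos': 'NNP', 'word': 'Paris',
#      'en-wordlist': False, 'prevtag': 'O', 'prevpos': 'V',
#      'nextpos': None, 'prevword': 'visited', 'nextword': None,
#      'word+nextpos': 'paris+None', 'pos+prevtag': 'NNP+O',
#      'shape+prevtag': 'upcase+O'}
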
class NEChunkParser(ChunkParserI):
    """
    Expected input: list of pos-tagged words
    """
    def __init__(self, train):
        self._train(train)

    def parse(self, tokens):
        """
        Each token should be a pos-tagged word.
        """
        tagged = self._tagger.tag(tokens)
        tree = self._tagged_to_parse(tagged)
        return tree

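    # Illustrative usage, assuming ``chunker`` is a trained NEChunkParser
    # (training requires the ACE data and the megam classifier, so this is
    # a sketch rather than a runnable doctest):
    #
    #     >>> sent = nltk.pos_tag(nltk.word_tokenize('John Smith works at IBM.'))
    #     >>> chunker.parse(sent)
    #     Tree('S', [Tree('NE', [('John', 'NNP'), ('Smith', 'NNP')]),
    #                ('works', 'VBZ'), ('at', 'IN'),
    #                Tree('NE', [('IBM', 'NNP')]), ('.', '.')])
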
    def _train(self, corpus):
        # Convert to tagged sequence
        corpus = [self._parse_to_tagged(s) for s in corpus]

        self._tagger = NEChunkParserTagger(train=corpus)

    def _tagged_to_parse(self, tagged_tokens):
        """
        Convert a list of tagged tokens to a chunk-parse tree.
        """
        sent = nltk.Tree('S', [])

        for (tok, tag) in tagged_tokens:
            if tag == 'O':
                sent.append(tok)
            elif tag.startswith('B-'):
                sent.append(nltk.Tree(tag[2:], [tok]))
            elif tag.startswith('I-'):
                if (sent and isinstance(sent[-1], nltk.Tree) and
                    sent[-1].node == tag[2:]):
                    sent[-1].append(tok)
                else:
                    sent.append(nltk.Tree(tag[2:], [tok]))
        return sent

    @staticmethod
    def _parse_to_tagged(sent):
        """
        Convert a chunk-parse tree to a list of tagged tokens.
        """
        toks = []
        for child in sent:
            if isinstance(child, nltk.Tree):
                if len(child) == 0:
                    print "Warning -- empty chunk in sentence"
                    continue
                toks.append((child[0], 'B-%s' % child.node))
                for tok in child[1:]:
                    toks.append((tok, 'I-%s' % child.node))
            else:
                toks.append((child, 'O'))
        return toks

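# The two helpers above are (near-)inverses: _parse_to_tagged flattens a
# chunk tree into IOB-tagged tokens, and _tagged_to_parse rebuilds the tree.
# For example (_parse_to_tagged is a staticmethod, so it can be called on
# the class directly):
#
#     >>> NEChunkParser._parse_to_tagged(
#     ...     nltk.Tree('S', [nltk.Tree('NE', ['John', 'Smith']), 'works']))
#     [('John', 'B-NE'), ('Smith', 'I-NE'), ('works', 'O')]
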
def shape(word):
    # Note: the original number pattern anchored '$' to its second
    # alternative only, so strings like '123abc' were classed as 'number';
    # the group below anchors both alternatives.
    if re.match(r'([0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+)$', word):
        return 'number'
    elif re.match(r'\W+$', word):
        return 'punct'
    elif re.match(r'[A-Z][a-z]+$', word):
        return 'upcase'
    elif re.match(r'[a-z]+$', word):
        return 'downcase'
    elif re.match(r'\w+$', word):
        return 'mixedcase'
    else:
        return 'other'

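# Quick doctest-style sanity checks for shape():
#
#     >>> [shape(w) for w in ['1999', '3.14', 'Hello', 'inc', '--', 'iPhone', 'U.S.']]
#     ['number', 'number', 'upcase', 'downcase', 'punct', 'mixedcase', 'other']
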
def simplify_pos(s):
    if s.startswith('V'): return "V"
    else: return s.split('-')[0]

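# All verb tags collapse to 'V', and hyphenated tags are truncated at the
# hyphen:
#
#     >>> simplify_pos('VBD')
#     'V'
#     >>> simplify_pos('NN-TL')
#     'NN'
#     >>> simplify_pos('NNP')
#     'NNP'
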
def postag_tree(tree):
    # Part-of-speech tagging.
    words = tree.leaves()
    tag_iter = (pos for (word, pos) in nltk.pos_tag(words))
    newtree = nltk.Tree('S', [])
    for child in tree:
        if isinstance(child, nltk.Tree):
            newtree.append(nltk.Tree(child.node, []))
            for subchild in child:
                newtree[-1].append( (subchild, tag_iter.next()) )
        else:
            newtree.append( (child, tag_iter.next()) )
    return newtree

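# Given an untagged chunk tree, postag_tree() returns a copy whose leaves
# are (word, tag) pairs.  Illustrative (the exact tags depend on the model
# behind nltk.pos_tag):
#
#     >>> postag_tree(nltk.Tree('S', [nltk.Tree('NE', ['John']), 'sleeps']))
#     Tree('S', [Tree('NE', [('John', 'NNP')]), ('sleeps', 'VBZ')])
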
def load_ace_data(roots, fmt='binary', skip_bnews=True):
    # Walk each corpus root, yielding one chunk tree per .sgm file
    # (optionally skipping the broadcast-news sections).
    for root in roots:
        for dirpath, dirs, files in os.walk(root):
            if dirpath.endswith('bnews') and skip_bnews:
                continue
            for f in files:
                if f.endswith('.sgm'):
                    for sent in load_ace_file(os.path.join(dirpath, f), fmt):
                        yield sent

def load_ace_file(textfile, fmt):
    print ' - %s' % os.path.split(textfile)[1]
    annfile = textfile+'.tmx.rdc.xml'

    # Read the xml file, and get a list of entities
    entities = []
    xml = ET.parse(open(annfile)).getroot()
    for entity in xml.findall('document/entity'):
        typ = entity.find('entity_type').text
        for mention in entity.findall('entity_mention'):
            if mention.get('TYPE') != 'NAME': continue # only NEs
            s = int(mention.find('head/charseq/start').text)
            e = int(mention.find('head/charseq/end').text)+1
            entities.append( (s, e, typ) )

    # Read the text file, and mark the entities.
    text = open(textfile).read()

    # Strip XML tags, since they don't count towards the indices
    text = re.sub('<(?!/?TEXT)[^>]+>', '', text)

    # Blank out anything before/after <TEXT>
    def subfunc(m): return ' '*(m.end()-m.start()-6)
    text = re.sub('[\s\S]*<TEXT>', subfunc, text)
    text = re.sub('</TEXT>[\s\S]*', '', text)

    # Simplify quotes
    text = re.sub("``", ' "', text)
    text = re.sub("''", '" ', text)

    # Binary distinction (NE or not NE)
    if fmt == 'binary':
        i = 0
        toks = nltk.Tree('S', [])
        for (s,e,typ) in sorted(entities):
            if s < i: s = i # Overlapping!  Deal with this better?
            if e <= s: continue
            toks.extend(nltk.word_tokenize(text[i:s]))
            toks.append(nltk.Tree('NE', text[s:e].split()))
            i = e
        toks.extend(nltk.word_tokenize(text[i:]))
        yield toks

    # Multiclass distinction (NE type)
    elif fmt == 'multiclass':
        i = 0
        toks = nltk.Tree('S', [])
        for (s,e,typ) in sorted(entities):
            if s < i: s = i # Overlapping!  Deal with this better?
            if e <= s: continue
            toks.extend(nltk.word_tokenize(text[i:s]))
            toks.append(nltk.Tree(typ, text[s:e].split()))
            i = e
        toks.extend(nltk.word_tokenize(text[i:]))
        yield toks

    else:
        raise ValueError('bad fmt value')

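# The tag-stripping step above removes every SGML tag except <TEXT> and
# </TEXT>, since the annotation offsets are computed on tag-free text.
# Illustrative:
#
#     >>> re.sub('<(?!/?TEXT)[^>]+>', '', '<DOC><TEXT>Jack <b>went</b></TEXT></DOC>')
#     '<TEXT>Jack went</TEXT>'
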
# This probably belongs in a more general-purpose location (as does
# the _parse_to_tagged function).
def cmp_chunks(correct, guessed):
    correct = NEChunkParser._parse_to_tagged(correct)
    guessed = NEChunkParser._parse_to_tagged(guessed)
    ellipsis = False
    for (w, ct), (w, gt) in zip(correct, guessed):
        if ct == gt == 'O':
            if not ellipsis:
                print "  %-15s %-15s %s" % (ct, gt, w)
                print '  %-15s %-15s %s' % ('...', '...', '...')
                ellipsis = True
        else:
            ellipsis = False
            print "  %-15s %-15s %s" % (ct, gt, w)

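# Prints correct tags, guessed tags, and words in three columns, eliding
# runs where both taggings agree on 'O'.  Sample output shape (values
# illustrative):
#
#       B-NE            B-NE            John
#       I-NE            I-NE            Smith
#       O               O               works
#       ...             ...             ...
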
def build_model(fmt='binary'):
    print 'Loading training data...'
    train_paths = [nltk.data.find('corpora/ace_data/ace.dev'),
                   nltk.data.find('corpora/ace_data/ace.heldout'),
                   nltk.data.find('corpora/ace_data/bbn.dev'),
                   nltk.data.find('corpora/ace_data/muc.dev')]
    train_trees = load_ace_data(train_paths, fmt)
    train_data = [postag_tree(t) for t in train_trees]
    print 'Training...'
    cp = NEChunkParser(train_data)
    del train_data

    print 'Loading eval data...'
    eval_paths = [nltk.data.find('corpora/ace_data/ace.eval')]
    eval_trees = load_ace_data(eval_paths, fmt)
    eval_data = [postag_tree(t) for t in eval_trees]

    print 'Evaluating...'
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_data):
        guess = cp.parse(correct.leaves())
        chunkscore.score(correct, guess)
        if i < 3: cmp_chunks(correct, guess)
    print chunkscore

    outfilename = '/tmp/ne_chunker_%s.pickle' % fmt
    print 'Saving chunker to %s...' % outfilename
    out = open(outfilename, 'wb')
    pickle.dump(cp, out, -1)
    out.close()

    return cp
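
# Once build_model() has run, the pickled chunker can be reloaded and
# applied to fresh text (a sketch; the path matches outfilename above):
#
#     >>> cp = pickle.load(open('/tmp/ne_chunker_binary.pickle', 'rb'))
#     >>> cp.parse(nltk.pos_tag(nltk.word_tokenize('Mary lives in Boston.')))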

if __name__ == '__main__':
    # Make sure that the pickled object has the right class name:
    from nltk.chunk.named_entity import build_model

    build_model('binary')
    build_model('multiclass')