1
2
3
4
5
6
7
8 """
9 Named entity chunker
10 """
11
12 import os, re, pickle
13 from nltk.etree import ElementTree as ET
14 from nltk.chunk.api import *
15 from nltk.chunk.util import *
16 import nltk
17
18
19
# Words from the 'en-basic' file of the NLTK words corpus, held in a set so
# that the 'en-wordlist' feature can test membership directly.
_short_en_wordlist = set(
    nltk.corpus.words.words('en-basic')
)
24 """
25 The IOB tagger used by the chunk parser.
26 """
31
36
38 word = tokens[index][0]
39 pos = simplify_pos(tokens[index][1])
40 if index == 0:
41 prevword = prevprevword = None
42 prevpos = prevprevpos = None
43 prevtag = prevprevtag = None
44 elif index == 1:
45 prevword = tokens[index-1][0].lower()
46 prevprevword = None
47 prevpos = simplify_pos(tokens[index-1][1])
48 prevprevpos = None
49 prevtag = history[index-1][0]
50 prevprevtag = None
51 else:
52 prevword = tokens[index-1][0].lower()
53 prevprevword = tokens[index-2][0].lower()
54 prevpos = simplify_pos(tokens[index-1][1])
55 prevprevpos = simplify_pos(tokens[index-2][1])
56 prevtag = history[index-1]
57 prevprevtag = history[index-2]
58 if index == len(tokens)-1:
59 nextword = nextnextword = None
60 nextpos = nextnextpos = None
61 elif index == len(tokens)-2:
62 nextword = tokens[index+1][0].lower()
63 nextpos = tokens[index+1][1].lower()
64 nextnextword = None
65 nextnextpos = None
66 else:
67 nextword = tokens[index+1][0].lower()
68 nextpos = tokens[index+1][1].lower()
69 nextnextword = tokens[index+2][0].lower()
70 nextnextpos = tokens[index+2][1].lower()
71
72
73 features = {
74 'bias': True,
75 'shape': shape(word),
76 'wordlen': len(word),
77 'prefix3': word[:3].lower(),
78 'suffix3': word[-3:].lower(),
79 'pos': pos,
80 'word': word,
81 'en-wordlist': (word in _short_en_wordlist),
82 'prevtag': prevtag,
83 'prevpos': prevpos,
84 'nextpos': nextpos,
85 'prevword': prevword,
86 'nextword': nextword,
87 'word+nextpos': '%s+%s' % (word.lower(), nextpos),
88 'pos+prevtag': '%s+%s' % (pos, prevtag),
89 'shape+prevtag': '%s+%s' % (shape, prevtag),
90 }
91
92 return features
93
95 """
96 Expected input: list of pos-tagged words
97 """
100
101 - def parse(self, tokens):
108
114
133
134 @staticmethod
151
# Ordered (pattern, label) pairs; the first pattern that matches the whole
# word decides its shape.  Raw strings keep the regex escapes (\. \W \w)
# from being read as string escapes.
_SHAPE_PATTERNS = (
    # The alternation is grouped so that the trailing '$' applies to BOTH
    # number forms.  Without the group, only the second alternative is
    # end-anchored and any token that merely starts with digits (e.g.
    # '1st', '1990s') is classified as a number.
    (re.compile(r'(?:[0-9]+(?:\.[0-9]*)?|[0-9]*\.[0-9]+)$'), 'number'),
    (re.compile(r'\W+$'), 'punct'),
    (re.compile(r'[A-Z][a-z]+$'), 'upcase'),
    (re.compile(r'[a-z]+$'), 'downcase'),
    (re.compile(r'\w+$'), 'mixedcase'),
)


def shape(word):
    """
    Return a coarse orthographic class for ``word``.

    The classes are tried in this order and the first match wins:

    - ``'number'``:    digits with an optional decimal part (``12``,
      ``3.5``, ``5.``, ``.5``)
    - ``'punct'``:     one or more non-word characters
    - ``'upcase'``:    one capital letter followed by lowercase letters
    - ``'downcase'``:  lowercase letters only
    - ``'mixedcase'``: any other run of word characters (``NLTK``, ``1st``)
    - ``'other'``:     everything else, including the empty string

    :param word: the token text to classify.
    :return: one of the six label strings listed above.
    """
    for pattern, label in _SHAPE_PATTERNS:
        if pattern.match(word):
            return label
    return 'other'
165
def simplify_pos(s):
    """
    Reduce a part-of-speech tag to a coarser form.

    Any tag beginning with ``'V'`` collapses to ``"V"``; every other tag is
    cut at its first ``'-'`` and the leading piece is returned.
    """
    if not s.startswith('V'):
        head = s.split('-')[0]
        return head
    return "V"
169
def postag_tree(tree):
    """
    Build a part-of-speech tagged copy of ``tree``.

    All leaves of ``tree`` are tagged with a single ``nltk.pos_tag`` call.
    The returned tree is rooted at ``'S'``: each top-level word becomes a
    ``(word, pos)`` pair, and each top-level subtree is rebuilt under its
    original ``node`` label with its children turned into such pairs.
    Children of a subtree are not checked for further nesting.
    """
    tagged = nltk.pos_tag(tree.leaves())
    # Tags are consumed strictly in leaf order while the tree is walked
    # left to right below.
    pos_tags = (tag for (_token, tag) in tagged)
    result = Tree('S', [])
    for item in tree:
        if not isinstance(item, nltk.Tree):
            result.append((item, pos_tags.next()))
            continue
        chunk = Tree(item.node, [])
        result.append(chunk)
        for leaf in item:
            chunk.append((leaf, pos_tags.next()))
    return result
183
193
195 print ' - %s' % os.path.split(textfile)[1]
196 annfile = textfile+'.tmx.rdc.xml'
197
198
199 entities = []
200 xml = ET.parse(open(annfile)).getroot()
201 for entity in xml.findall('document/entity'):
202 typ = entity.find('entity_type').text
203 for mention in entity.findall('entity_mention'):
204 if mention.get('TYPE') != 'NAME': continue
205 s = int(mention.find('head/charseq/start').text)
206 e = int(mention.find('head/charseq/end').text)+1
207 entities.append( (s, e, typ) )
208
209
210 text = open(textfile).read()
211
212
213 text = re.sub('<(?!/?TEXT)[^>]+>', '', text)
214
215
216 def subfunc(m): return ' '*(m.end()-m.start()-6)
217 text = re.sub('[\s\S]*<TEXT>', subfunc, text)
218 text = re.sub('</TEXT>[\s\S]*', '', text)
219
220
221 text = re.sub("``", ' "', text)
222 text = re.sub("''", '" ', text)
223
224 entity_types = set(typ for (s,e,typ) in entities)
225
226
227 if fmt == 'binary':
228 i = 0
229 toks = nltk.Tree('S', [])
230 for (s,e,typ) in sorted(entities):
231 if s < i: s = i
232 if e <= s: continue
233 toks.extend(nltk.word_tokenize(text[i:s]))
234 toks.append(nltk.Tree('NE', text[s:e].split()))
235 i = e
236 toks.extend(nltk.word_tokenize(text[i:]))
237 yield toks
238
239
240 elif fmt == 'multiclass':
241 i = 0
242 toks = nltk.Tree('S', [])
243 for (s,e,typ) in sorted(entities):
244 if s < i: s = i
245 if e <= s: continue
246 toks.extend(nltk.word_tokenize(text[i:s]))
247 toks.append(nltk.Tree(typ, text[s:e].split()))
248 i = e
249 toks.extend(nltk.word_tokenize(text[i:]))
250 yield toks
251
252 else:
253 raise ValueError('bad fmt value')
254
258 correct = NEChunkParser._parse_to_tagged(correct)
259 guessed = NEChunkParser._parse_to_tagged(guessed)
260 ellipsis = False
261 for (w, ct), (w, gt) in zip(correct, guessed):
262 if ct == gt == 'O':
263 if not ellipsis:
264 print " %-15s %-15s %s" % (ct, gt, w)
265 print ' %-15s %-15s %s' % ('...', '...', '...')
266 ellipsis = True
267 else:
268 ellipsis = False
269 print " %-15s %-15s %s" % (ct, gt, w)
270
272 print 'Loading training data...'
273 train_paths = [nltk.data.find('corpora/ace_data/ace.dev'),
274 nltk.data.find('corpora/ace_data/ace.heldout'),
275 nltk.data.find('corpora/ace_data/bbn.dev'),
276 nltk.data.find('corpora/ace_data/muc.dev')]
277 train_trees = load_ace_data(train_paths, fmt)
278 train_data = [postag_tree(t) for t in train_trees]
279 print 'Training...'
280 cp = NEChunkParser(train_data)
281 del train_data
282
283 print 'Loading eval data...'
284 eval_paths = [nltk.data.find('corpora/ace_data/ace.eval')]
285 eval_trees = load_ace_data(eval_paths, fmt)
286 eval_data = [postag_tree(t) for t in eval_trees]
287
288 print 'Evaluating...'
289 chunkscore = ChunkScore()
290 for i, correct in enumerate(eval_data):
291 guess = cp.parse(correct.leaves())
292 chunkscore.score(correct, guess)
293 if i < 3: cmp_chunks(correct, guess)
294 print chunkscore
295
296 outfilename = '/tmp/ne_chunker_%s.pickle' % fmt
297 print 'Saving chunker to %s...' % outfilename
298 out = open(outfilename, 'wb')
299 pickle.dump(cp, out, -1)
300 out.close()
301
302 return cp
303
304
if __name__ == '__main__':
    # Use build_model as exposed by the installed package rather than the
    # copy living in this script's __main__ namespace.
    # NOTE(review): presumably so the pickled chunker refers to classes in
    # nltk.chunk.named_entity instead of __main__ -- confirm.
    from nltk.chunk.named_entity import build_model

    # Build one chunker per output format, binary first.
    for model_fmt in ('binary', 'multiclass'):
        build_model(model_fmt)
311