1
2
3
4
5
6
7
8
9 """
10 Read CoNLL-style chunk files.
11 """
12
13 import os
14 import codecs
15 import textwrap
16
17 from nltk.tree import Tree
18 from nltk.util import LazyMap, LazyConcatenation
19
20 from util import *
21 from api import *
24 """
25 A corpus reader for CoNLL-style files. These files consist of a
26 series of sentences, separated by blank lines. Each sentence is
27 encoded using a table (or I{grid}) of values, where each line
28 corresponds to a single word, and each column corresponds to an
29 annotation type. The set of columns used by CoNLL-style files can
30 vary from corpus to corpus; the C{ConllCorpusReader} constructor
31 therefore takes an argument, C{columntypes}, which is used to
32 specify the columns that are used by a given corpus.
33
34 @todo: Add support for reading from corpora where different
35 parallel files contain different columns.
36 @todo: Possibly add caching of the grid corpus view? This would
37 allow the same grid view to be used by different data access
38 methods (eg words() and parsed_sents() could both share the
39 same grid corpus view object).
40 @todo: Better support for -DOCSTART-. Currently, we just ignore
41 it, but it could be used to define methods that retrieve a
42 document at a time (eg parsed_documents()).
43 """
44
45
46
47
48
49 WORDS = 'words'
50 POS = 'pos'
51 TREE = 'tree'
52 CHUNK = 'chunk'
53 NE = 'ne'
54 SRL = 'srl'
55 IGNORE = 'ignore'
56
57
58 COLUMN_TYPES = (WORDS, POS, TREE, CHUNK, NE, SRL, IGNORE)
59
60
61
62
63
def __init__(self, root, fileids, columntypes,
             chunk_types=None, top_node='S', pos_in_tree=False,
             srl_includes_roleset=True, encoding=None,
             tree_class=Tree, tag_mapping_function=None):
    """
    Configure the reader and hand the file location off to
    C{CorpusReader.__init__}.

    @param root: forwarded unchanged to C{CorpusReader.__init__}.
    @param fileids: forwarded unchanged to C{CorpusReader.__init__}.
    @param columntypes: the column types of the corpus, in column
        order.  Every entry must be a member of C{COLUMN_TYPES}.
    @param chunk_types: a single chunk type (a string) or a
        collection of chunk types; a lone string is wrapped in a list.
    @param top_node: stored as C{self._top_node}.
    @param pos_in_tree: stored as C{self._pos_in_tree}.
    @param srl_includes_roleset: stored as
        C{self._srl_includes_roleset}.
    @param encoding: forwarded unchanged to C{CorpusReader.__init__}.
    @param tree_class: stored as C{self._tree_class}.
    @param tag_mapping_function: stored as
        C{self._tag_mapping_function}.
    @raise ValueError: if C{columntypes} names an unknown column type.
    """
    # Validate first, so nothing is stored for a bad specification.
    for name in columntypes:
        if name not in self.COLUMN_TYPES:
            raise ValueError('Bad column type %r' % name)

    # A bare string is shorthand for a one-element list.
    if isinstance(chunk_types, basestring):
        chunk_types = [chunk_types]
    self._chunk_types = chunk_types

    # Map each column type to its column index (last one wins on repeats).
    column_index = {}
    for position, name in enumerate(columntypes):
        column_index[name] = position
    self._colmap = column_index

    self._pos_in_tree = pos_in_tree
    self._top_node = top_node
    self._srl_includes_roleset = srl_includes_roleset
    self._tree_class = tree_class

    CorpusReader.__init__(self, root, fileids, encoding)
    self._tag_mapping_function = tag_mapping_function
81
82
83
84
85
86 - def raw(self, fileids=None):
90
91 - def words(self, fileids=None):
94
95 - def sents(self, fileids=None):
98
103 return LazyConcatenation(LazyMap(get_tagged_words,
104 self._grids(fileids)))
105
110 return LazyMap(get_tagged_words, self._grids(fileids))
111
112 - def chunked_words(self, fileids=None, chunk_types=None,
113 simplify_tags=False):
118 return LazyConcatenation(LazyMap(get_chunked_words,
119 self._grids(fileids)))
120
121 - def chunked_sents(self, fileids=None, chunk_types=None,
122 simplify_tags=False):
127 return LazyMap(get_chunked_words, self._grids(fileids))
128
def parsed_sents(self, fileids=None, pos_in_tree=None, simplify_tags=False):
    """
    Lazily map every sentence grid of the given files through
    C{self._get_parsed_sent}.

    @param fileids: forwarded to C{self._grids}.
    @param pos_in_tree: forwarded to C{self._get_parsed_sent}; when
        C{None}, the reader's own C{self._pos_in_tree} is used instead.
    @param simplify_tags: forwarded to C{self._get_parsed_sent}.
    @return: a C{LazyMap} with one entry per sentence grid.
    """
    # The words, pos and tree columns are checked via self._require.
    self._require(self.WORDS, self.POS, self.TREE)

    if pos_in_tree is None:
        pos_in_tree = self._pos_in_tree

    def grid_to_tree(grid):
        return self._get_parsed_sent(grid, pos_in_tree, simplify_tags)

    sentence_grids = self._grids(fileids)
    return LazyMap(grid_to_tree, sentence_grids)
135
139
140 - def srl_instances(self, fileids=None, pos_in_tree=None, flatten=True):
145 result = LazyMap(get_srl_instances, self._grids(fileids))
146 if flatten: result = LazyConcatenation(result)
147 return result
148
def iob_words(self, fileids=None, simplify_tags=False):
    """
    @return: a list of word/tag/IOB tuples, built lazily by
        concatenating the per-sentence results of
        C{self._get_iob_words}.
    @rtype: C{list} of C{tuple}
    @param fileids: the list of fileids that make up this corpus
    @type fileids: C{None} or C{str} or C{list}
    @param simplify_tags: forwarded to C{self._get_iob_words}.
    """
    # The words, pos and chunk columns are checked via self._require.
    self._require(self.WORDS, self.POS, self.CHUNK)

    def grid_to_triples(grid):
        return self._get_iob_words(grid, simplify_tags)

    per_sentence = LazyMap(grid_to_triples, self._grids(fileids))
    return LazyConcatenation(per_sentence)
160
def iob_sents(self, fileids=None, simplify_tags=False):
    """
    @return: a list of lists of word/tag/IOB tuples, one inner list
        per sentence grid, built lazily via C{self._get_iob_words}.
    @rtype: C{list} of C{list}
    @param fileids: the list of fileids that make up this corpus
    @type fileids: C{None} or C{str} or C{list}
    @param simplify_tags: forwarded to C{self._get_iob_words}.
    """
    # The words, pos and chunk columns are checked via self._require.
    self._require(self.WORDS, self.POS, self.CHUNK)

    def grid_to_triples(grid):
        return self._get_iob_words(grid, simplify_tags)

    sentence_grids = self._grids(fileids)
    return LazyMap(grid_to_triples, sentence_grids)
172
173
174
175
176
177 - def _grids(self, fileids=None):
184
186 grids = []
187 for block in read_blankline_block(stream):
188 block = block.strip()
189 if not block: continue
190
191 grid = [line.split() for line in block.split('\n')]
192
193
194
195 if grid[0][self._colmap.get('words', 0)] == '-DOCSTART-':
196 del grid[0]
197
198
199 for row in grid:
200 if len(row) != len(grid[0]):
201 raise ValueError('Inconsistent number of columns:\n%s'
202 % block)
203 grids.append(grid)
204 return grids
205
206
207
208
209
210
211
214
216 pos_tags = self._get_column(grid, self._colmap['pos'])
217 if simplify_tags:
218 pos_tags = [self._tag_mapping_function(t) for t in pos_tags]
219 return zip(self._get_column(grid, self._colmap['words']), pos_tags)
220
222 pos_tags = self._get_column(grid, self._colmap['pos'])
223 if simplify_tags:
224 pos_tags = [self._tag_mapping_function(t) for t in pos_tags]
225 return zip(self._get_column(grid, self._colmap['words']), pos_tags,
226 self._get_column(grid, self._colmap['chunk']))
227
229
230 words = self._get_column(grid, self._colmap['words'])
231 pos_tags = self._get_column(grid, self._colmap['pos'])
232 if simplify_tags:
233 pos_tags = [self._tag_mapping_function(t) for t in pos_tags]
234 chunk_tags = self._get_column(grid, self._colmap['chunk'])
235
236 stack = [Tree(self._top_node, [])]
237
238 for (word, pos_tag, chunk_tag) in zip(words, pos_tags, chunk_tags):
239 if chunk_tag == 'O':
240 state, chunk_type = 'O', ''
241 else:
242 (state, chunk_type) = chunk_tag.split('-')
243
244 if chunk_types is not None and chunk_type not in chunk_types:
245 state = 'O'
246
247 if state == 'I' and chunk_type != stack[-1].node:
248 state = 'B'
249
250 if state in 'BO' and len(stack) == 2:
251 stack.pop()
252
253 if state == 'B':
254 new_chunk = Tree(chunk_type, [])
255 stack[-1].append(new_chunk)
256 stack.append(new_chunk)
257
258 stack[-1].append((word, pos_tag))
259
260 return stack[0]
261
263 words = self._get_column(grid, self._colmap['words'])
264 pos_tags = self._get_column(grid, self._colmap['pos'])
265 if simplify_tags:
266 pos_tags = [self._tag_mapping_function(t) for t in pos_tags]
267 parse_tags = self._get_column(grid, self._colmap['tree'])
268
269 treestr = ''
270 for (word, pos_tag, parse_tag) in zip(words, pos_tags, parse_tags):
271 if word == '(': word = '-LRB-'
272 if word == ')': word = '-RRB-'
273 if pos_tag == '(': pos_tag = '-LRB-'
274 if pos_tag == ')': pos_tag = '-RRB-'
275 (left, right) = parse_tag.split('*')
276 right = right.count(')')*')'
277 treestr += '%s (%s %s) %s' % (left, pos_tag, word, right)
278 try:
279 tree = self._tree_class.parse(treestr)
280 except (ValueError, IndexError):
281 tree = self._tree_class.parse('(%s %s)' %
282 (self._top_node, treestr))
283
284 if not pos_in_tree:
285 for subtree in tree.subtrees():
286 for i, child in enumerate(subtree):
287 if (isinstance(child, nltk.Tree) and len(child)==1 and
288 isinstance(child[0], basestring)):
289 subtree[i] = (child[0], child.node)
290
291 return tree
292
294 """
295 list of list of ((start, end), tag) tuples
296 """
297 if self._srl_includes_roleset:
298 predicates = self._get_column(grid, self._colmap['srl']+1)
299 start_col = self._colmap['srl']+2
300 else:
301 predicates = self._get_column(grid, self._colmap['srl'])
302 start_col = self._colmap['srl']+1
303
304
305
306 num_preds = len([p for p in predicates if p != '-'])
307
308 spanlists = []
309 for i in range(num_preds):
310 col = self._get_column(grid, start_col+i)
311 spanlist = []
312 stack = []
313 for wordnum, srl_tag in enumerate(col):
314 (left, right) = srl_tag.split('*')
315 for tag in left.split('('):
316 if tag:
317 stack.append((tag, wordnum))
318 for i in range(right.count(')')):
319 (tag, start) = stack.pop()
320 spanlist.append( ((start, wordnum+1), tag) )
321 spanlists.append(spanlist)
322
323 return spanlists
324
326 tree = self._get_parsed_sent(grid, pos_in_tree)
327 spanlists = self._get_srl_spans(grid)
328 if self._srl_includes_roleset:
329 predicates = self._get_column(grid, self._colmap['srl']+1)
330 rolesets = self._get_column(grid, self._colmap['srl'])
331 else:
332 predicates = self._get_column(grid, self._colmap['srl'])
333 rolesets = [None] * len(predicates)
334
335 instances = ConllSRLInstanceList(tree)
336 for wordnum, predicate in enumerate(predicates):
337 if predicate == '-': continue
338
339
340
341 for spanlist in spanlists:
342 for (start, end), tag in spanlist:
343 if wordnum in range(start,end) and tag in ('V', 'C-V'):
344 break
345 else: continue
346 break
347 else:
348 raise ValueError('No srl column found for %r' % predicate)
349 instances.append(ConllSRLInstance(tree, wordnum, predicate,
350 rolesets[wordnum], spanlist))
351
352 return instances
353
354
355
356
357
359 for columntype in columntypes:
360 if columntype not in self._colmap:
361 raise ValueError('This corpus does not contain a %s '
362 'column.' % columntype)
363
364 @staticmethod
367
370 """
371 An SRL instance from a CoNLL corpus, which identifies and
372 provides labels for the arguments of a single verb.
373 """
374
375
def __init__(self, tree, verb_head, verb_stem, roleset, tagged_spans):
    """
    Record one verb's SRL annotation for a sentence.

    @param tree: the parse tree for the sentence containing this
        instance; its C{leaves()} become C{self.words}.
    @param verb_head: the word index of the head word of the verb.
    @param verb_stem: stored as C{self.verb_stem}.
    @param roleset: stored as C{self.roleset}.
    @param tagged_spans: a list of C{((start, end), tag)} tuples.
        Spans tagged C{'V'} or C{'C-V'} contribute their word indices
        to C{self.verb}; every other span is recorded in
        C{self.arguments}.
    """
    # Word indices of the words that compose the verb whose arguments
    # are identified by this instance.  Holds several indices for
    # multi-word verbs (e.g. 'turn on').
    self.verb = []

    # Word index of the head word of the verb; e.g. for 'turn on'
    # this is the index of the word 'turn'.
    self.verb_head = verb_head

    self.verb_stem = verb_stem

    self.roleset = roleset

    # (argspan, argid) tuples giving the location and type of each
    # argument; argspan is (start, end), i.e. words[start:end].
    self.arguments = []

    # (span, id) tuples for every argument as well as the verb pieces
    # that make up this instance.
    self.tagged_spans = tagged_spans

    # The parse tree for the sentence containing this instance.
    self.tree = tree

    # The words in the sentence containing this instance.
    self.words = tree.leaves()

    # Split the tagged spans into verb pieces and arguments.
    for (first, last), label in tagged_spans:
        if label not in ('V', 'C-V'):
            self.arguments.append(((first, last), label))
        else:
            self.verb.extend(range(first, last))
417
419 plural = len(self.arguments)!=1 and 's' or ''
420 return '<ConllSRLInstance for %r with %d argument%s>' % (
421 (self.verb_stem, len(self.arguments), plural))
422
424 verbstr = ' '.join(self.words[i][0] for i in self.verb)
425 hdr = 'SRL for %r (stem=%r):\n' % (verbstr, self.verb_stem)
426 s = ''
427 for i, word in enumerate(self.words):
428 if isinstance(word, tuple): word = word[0]
429 for (start, end), argid in self.arguments:
430 if i == start: s += '[%s ' % argid
431 if i == end: s += '] '
432 if i in self.verb: word = '<<%s>>' % word
433 s += word + ' '
434 return hdr + textwrap.fill(s.replace(' ]', ']'),
435 initial_indent=' ',
436 subsequent_indent=' ')
437
439 """
440 Set of instances for a single sentence
441 """
442 - def __init__(self, tree, instances=()):
445
448
def pprint(self, include_tree=False):
    """
    Format the instances of this sentence as a CoNLL-style grid.

    One line is produced per word.  Each line holds a verb column
    (the C{verb_stem} of the first instance whose C{verb_head} is
    this word, or C{'-'}), followed by one bracketed-span column per
    instance.  When C{include_tree} is true, word, part-of-speech and
    syntax columns are placed in front of those.

    @param include_tree: if true, prepend the word, pos and syntax
        columns computed by C{self._tree2conll}.
    @return: the formatted grid, one newline-terminated line per word.
    @rtype: C{str}
    @raise ValueError: if any instance refers to a different tree
        than C{self.tree}.
    """
    # Sanity check: every instance must share this list's tree.
    for inst in self:
        if inst.tree != self.tree:
            raise ValueError('Tree mismatch!')

    # The word list drives the row loop below, so it is needed even
    # when the tree columns are not requested.  (Previously it was
    # only assigned under include_tree, so the default call failed
    # with a NameError.)
    words = self.tree.leaves()

    # If desired, compute the tree columns.
    if include_tree:
        pos = [None] * len(words)
        synt = ['*'] * len(words)
        self._tree2conll(self.tree, 0, words, pos, synt)

    pieces = []
    for i in range(len(words)):
        # Optional tree columns.
        if include_tree:
            pieces.append('%-20s ' % words[i])
            pieces.append('%-8s ' % pos[i])
            pieces.append('%15s*%-8s ' % tuple(synt[i].split('*')))

        # Verb column: stem of the first instance headed at this word.
        for inst in self:
            if i == inst.verb_head:
                pieces.append('%-20s ' % inst.verb_stem)
                break
        else:
            pieces.append('%-20s ' % '-')

        # One bracketed-span column per instance.
        for inst in self:
            argstr = '*'
            for (start, end), argid in inst.tagged_spans:
                if i == start:
                    argstr = '(%s%s' % (argid, argstr)
                if i == (end - 1):
                    argstr += ')'
            pieces.append('%-12s ' % argstr)
        pieces.append('\n')
    return ''.join(pieces)
486
def _tree2conll(self, tree, wordnum, words, pos, synt):
    """
    Recursively fill in the CoNLL pos and syntax columns for C{tree}.

    @param tree: the (sub)tree to convert; must be a C{Tree}.
    @param wordnum: the index of the first word covered by C{tree}.
    @param words: the list of words of the sentence.  String leaves
        are checked against it; a C{(word, pos)} tuple leaf replaces
        the entry at its index with the bare word.
    @param pos: list updated in place with the tag of each word.
    @param synt: list of bracket strings, each containing C{'*'},
        updated in place with the opening and closing brackets of
        each constituent.
    @return: the index of the first word following C{tree}.
    """
    assert isinstance(tree, Tree)
    if len(tree) == 1 and isinstance(tree[0], basestring):
        # Preterminal: the node label is the tag of its only word.
        pos[wordnum] = tree.node
        assert words[wordnum] == tree[0]
        return wordnum+1
    elif len(tree) == 1 and isinstance(tree[0], tuple):
        # Single (word, pos) leaf: split the pair across both
        # columns.  (Previously both halves were assigned to
        # pos[wordnum], leaving the tuple in words[wordnum].)
        assert len(tree[0]) == 2
        words[wordnum], pos[wordnum] = tree[0]
        return wordnum+1
    else:
        # Phrase: open a bracket on its first word, recurse into the
        # children, then close the bracket on its last word.
        synt[wordnum] = '(%s%s' % (tree.node, synt[wordnum])
        for child in tree:
            wordnum = self._tree2conll(child, wordnum, words,
                                       pos, synt)
        synt[wordnum-1] += ')'
        return wordnum
504
506 """
507 A ConllCorpusReader whose data file contains three columns: words,
508 pos, and chunk.
509 """
510 - def __init__(self, root, fileids, chunk_types, encoding=None,
511 tag_mapping_function=None):
516