Package nltk :: Package corpus :: Package reader :: Module conll
[hide private]
[frames | no frames]

Source Code for Module nltk.corpus.reader.conll

  1  # Natural Language Toolkit: CONLL Corpus Reader 
  2  # 
  3  # Copyright (C) 2001-2011 NLTK Project 
  4  # Author: Steven Bird <sb@ldc.upenn.edu> 
  5  #         Edward Loper <edloper@gradient.cis.upenn.edu> 
  6  # URL: <http://www.nltk.org/> 
  7  # For license information, see LICENSE.TXT 
  8   
  9  """ 
 10  Read CoNLL-style chunk files. 
 11  """        
 12   
 13  import os 
 14  import codecs 
 15  import textwrap 
 16   
 17  from nltk.tree import Tree 
 18  from nltk.util import LazyMap, LazyConcatenation 
 19   
 20  from util import * 
 21  from api import * 
class ConllCorpusReader(CorpusReader):
    """
    A corpus reader for CoNLL-style files.  These files consist of a
    series of sentences, separated by blank lines.  Each sentence is
    encoded using a table (or I{grid}) of values, where each line
    corresponds to a single word, and each column corresponds to an
    annotation type.  The set of columns used by CoNLL-style files can
    vary from corpus to corpus; the C{ConllCorpusReader} constructor
    therefore takes an argument, C{columntypes}, which is used to
    specify the columns that are used by a given corpus.

    @todo: Add support for reading from corpora where different
        parallel files contain different columns.
    @todo: Possibly add caching of the grid corpus view?  This would
        allow the same grid view to be used by different data access
        methods (eg words() and parsed_sents() could both share the
        same grid corpus view object).
    @todo: Better support for -DOCSTART-.  Currently, we just ignore
        it, but it could be used to define methods that retrieve a
        document at a time (eg parsed_documents()).
    """

    #/////////////////////////////////////////////////////////////////
    # Column Types
    #/////////////////////////////////////////////////////////////////

    WORDS = 'words'    #: column type for words
    POS = 'pos'        #: column type for part-of-speech tags
    TREE = 'tree'      #: column type for parse trees
    CHUNK = 'chunk'    #: column type for chunk structures
    NE = 'ne'          #: column type for named entities
    SRL = 'srl'        #: column type for semantic role labels
    IGNORE = 'ignore'  #: column type for column that should be ignored

    #: A list of all column types supported by the conll corpus reader.
    COLUMN_TYPES = (WORDS, POS, TREE, CHUNK, NE, SRL, IGNORE)

    #/////////////////////////////////////////////////////////////////
    # Constructor
    #/////////////////////////////////////////////////////////////////

    def __init__(self, root, fileids, columntypes,
                 chunk_types=None, top_node='S', pos_in_tree=False,
                 srl_includes_roleset=True, encoding=None,
                 tree_class=Tree, tag_mapping_function=None):
        """
        @param root: passed through to C{CorpusReader.__init__}.
        @param fileids: passed through to C{CorpusReader.__init__}.
        @param columntypes: the column types of the corpus files, in
            column order; each must be one of L{COLUMN_TYPES}.
        @param chunk_types: default chunk types used by
            L{chunked_words} and L{chunked_sents}; a single string is
            wrapped in a list.  C{None} means no filtering.
        @param top_node: node label of the root of chunk trees, and of
            the wrapper added around parse strings that do not parse
            on their own.
        @param pos_in_tree: default for the C{pos_in_tree} argument of
            L{parsed_sents} and L{srl_instances}.
        @param srl_includes_roleset: if true, the C{srl} column is
            taken to hold the roleset, the next column the predicate,
            and the argument columns start two columns after C{srl};
            otherwise the C{srl} column itself holds the predicate.
        @param encoding: passed through to C{CorpusReader.__init__}.
        @param tree_class: class whose C{parse} method builds parse
            trees from bracketed strings.
        @param tag_mapping_function: callable applied to every POS tag
            when a data access method is called with
            C{simplify_tags=True}.
        @raise ValueError: if C{columntypes} contains an unknown type.
        """
        for columntype in columntypes:
            if columntype not in self.COLUMN_TYPES:
                raise ValueError('Bad column type %r' % columntype)
        if isinstance(chunk_types, basestring):
            chunk_types = [chunk_types]
        self._chunk_types = chunk_types
        # Map each column type to its column index.
        self._colmap = dict((c, i) for (i, c) in enumerate(columntypes))
        self._pos_in_tree = pos_in_tree
        self._top_node = top_node  # for chunks
        self._srl_includes_roleset = srl_includes_roleset
        self._tree_class = tree_class
        CorpusReader.__init__(self, root, fileids, encoding)
        self._tag_mapping_function = tag_mapping_function

    #/////////////////////////////////////////////////////////////////
    # Data Access Methods
    #/////////////////////////////////////////////////////////////////

    def raw(self, fileids=None):
        """
        @return: the concatenated contents of the given fileids (all
            fileids of this reader when C{fileids} is C{None}).
        @param fileids: C{None}, a single fileid, or a list of fileids.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, basestring):
            fileids = [fileids]
        contents = []
        for fileid in fileids:
            stream = self.open(fileid)
            # Close each stream as soon as it has been read, instead of
            # leaving it to the garbage collector.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def words(self, fileids=None):
        """
        @return: a C{LazyConcatenation} of the per-sentence word lists.
        @raise ValueError: if the corpus has no C{words} column.
        """
        self._require(self.WORDS)
        return LazyConcatenation(LazyMap(self._get_words,
                                         self._grids(fileids)))

    def sents(self, fileids=None):
        """
        @return: a C{LazyMap} yielding one list of words per sentence.
        @raise ValueError: if the corpus has no C{words} column.
        """
        self._require(self.WORDS)
        return LazyMap(self._get_words, self._grids(fileids))

    def tagged_words(self, fileids=None, simplify_tags=False):
        """
        @return: a C{LazyConcatenation} of C{(word, pos)} pairs.
        @param simplify_tags: if true, map every POS tag through the
            reader's tag mapping function.
        @raise ValueError: if a C{words} or C{pos} column is missing.
        """
        self._require(self.WORDS, self.POS)
        def get_tagged_words(grid):
            return self._get_tagged_words(grid, simplify_tags)
        return LazyConcatenation(LazyMap(get_tagged_words,
                                         self._grids(fileids)))

    def tagged_sents(self, fileids=None, simplify_tags=False):
        """
        @return: a C{LazyMap} yielding, per sentence, the
            C{(word, pos)} pairs of that sentence.
        @param simplify_tags: if true, map every POS tag through the
            reader's tag mapping function.
        @raise ValueError: if a C{words} or C{pos} column is missing.
        """
        self._require(self.WORDS, self.POS)
        def get_tagged_words(grid):
            return self._get_tagged_words(grid, simplify_tags)
        return LazyMap(get_tagged_words, self._grids(fileids))

    def chunked_words(self, fileids=None, chunk_types=None,
                      simplify_tags=False):
        """
        @return: a C{LazyConcatenation} over the children of the
            per-sentence chunk trees built by L{_get_chunked_words}.
        @param chunk_types: chunk types to keep; defaults to the value
            given to the constructor.
        @raise ValueError: if a C{words}, C{pos} or C{chunk} column is
            missing.
        """
        self._require(self.WORDS, self.POS, self.CHUNK)
        if chunk_types is None:
            chunk_types = self._chunk_types
        def get_chunked_words(grid):  # capture chunk_types as local var
            return self._get_chunked_words(grid, chunk_types, simplify_tags)
        return LazyConcatenation(LazyMap(get_chunked_words,
                                         self._grids(fileids)))

    def chunked_sents(self, fileids=None, chunk_types=None,
                      simplify_tags=False):
        """
        @return: a C{LazyMap} yielding one chunk tree per sentence.
        @param chunk_types: chunk types to keep; defaults to the value
            given to the constructor.
        @raise ValueError: if a C{words}, C{pos} or C{chunk} column is
            missing.
        """
        self._require(self.WORDS, self.POS, self.CHUNK)
        if chunk_types is None:
            chunk_types = self._chunk_types
        def get_chunked_words(grid):  # capture chunk_types as local var
            return self._get_chunked_words(grid, chunk_types, simplify_tags)
        return LazyMap(get_chunked_words, self._grids(fileids))

    def parsed_sents(self, fileids=None, pos_in_tree=None,
                     simplify_tags=False):
        """
        @return: a C{LazyMap} yielding one parse tree per sentence.
        @param pos_in_tree: see L{_get_parsed_sent}; defaults to the
            value given to the constructor.
        @raise ValueError: if a C{words}, C{pos} or C{tree} column is
            missing.
        """
        self._require(self.WORDS, self.POS, self.TREE)
        if pos_in_tree is None:
            pos_in_tree = self._pos_in_tree
        def get_parsed_sent(grid):  # capture pos_in_tree as local var
            return self._get_parsed_sent(grid, pos_in_tree, simplify_tags)
        return LazyMap(get_parsed_sent, self._grids(fileids))

    def srl_spans(self, fileids=None):
        """
        @return: a C{LazyMap} yielding, per sentence, the span lists
            produced by L{_get_srl_spans}.
        @raise ValueError: if the corpus has no C{srl} column.
        """
        self._require(self.SRL)
        return LazyMap(self._get_srl_spans, self._grids(fileids))

    def srl_instances(self, fileids=None, pos_in_tree=None, flatten=True):
        """
        @return: the SRL instances of the corpus: a C{LazyMap} of one
            L{ConllSRLInstanceList} per sentence, wrapped in a
            C{LazyConcatenation} when C{flatten} is true.
        @param pos_in_tree: see L{_get_parsed_sent}; defaults to the
            value given to the constructor.
        @raise ValueError: if a C{words}, C{pos}, C{tree} or C{srl}
            column is missing.
        """
        self._require(self.WORDS, self.POS, self.TREE, self.SRL)
        if pos_in_tree is None:
            pos_in_tree = self._pos_in_tree
        def get_srl_instances(grid):  # capture pos_in_tree as local var
            return self._get_srl_instances(grid, pos_in_tree)
        result = LazyMap(get_srl_instances, self._grids(fileids))
        if flatten:
            result = LazyConcatenation(result)
        return result

    def iob_words(self, fileids=None, simplify_tags=False):
        """
        @return: a list of word/tag/IOB tuples
        @rtype: C{list} of C{tuple}
        @param fileids: the list of fileids that make up this corpus
        @type fileids: C{None} or C{str} or C{list}
        """
        self._require(self.WORDS, self.POS, self.CHUNK)
        def get_iob_words(grid):
            return self._get_iob_words(grid, simplify_tags)
        return LazyConcatenation(LazyMap(get_iob_words,
                                         self._grids(fileids)))

    def iob_sents(self, fileids=None, simplify_tags=False):
        """
        @return: a list of lists of word/tag/IOB tuples
        @rtype: C{list} of C{list}
        @param fileids: the list of fileids that make up this corpus
        @type fileids: C{None} or C{str} or C{list}
        """
        self._require(self.WORDS, self.POS, self.CHUNK)
        def get_iob_words(grid):
            return self._get_iob_words(grid, simplify_tags)
        return LazyMap(get_iob_words, self._grids(fileids))

    #/////////////////////////////////////////////////////////////////
    # Grid Reading
    #/////////////////////////////////////////////////////////////////

    def _grids(self, fileids=None):
        """
        @return: the concatenation of one C{StreamBackedCorpusView}
            per file, each reading grids with L{_read_grid_block}.
        """
        # n.b.: we could cache the object returned here (keyed on
        # fileids), which would let us reuse the same corpus view for
        # different things (eg srl and parse trees).
        return concat([StreamBackedCorpusView(fileid, self._read_grid_block,
                                              encoding=enc)
                       for (fileid, enc) in self.abspaths(fileids, True)])

    def _read_grid_block(self, stream):
        """
        Read the blocks returned by C{read_blankline_block(stream)}
        and turn each non-empty one into a grid: a list of rows, each
        row being the whitespace-separated fields of one line.

        @return: the list of grids that were read.
        @raise ValueError: if the rows of a grid differ in length.
        """
        grids = []
        for block in read_blankline_block(stream):
            block = block.strip()
            if not block:
                continue

            grid = [line.split() for line in block.split('\n')]

            # If there's a docstart row, then discard. ([xx] eventually it
            # would be good to actually use it)
            # NOTE: a block holding nothing but the docstart row leaves an
            # empty grid, which is still appended below.
            if grid[0][self._colmap.get('words', 0)] == '-DOCSTART-':
                del grid[0]

            # Check that the grid is consistent.
            for row in grid:
                if len(row) != len(grid[0]):
                    raise ValueError('Inconsistent number of columns:\n%s'
                                     % block)
            grids.append(grid)
        return grids

    #/////////////////////////////////////////////////////////////////
    # Transforms
    #/////////////////////////////////////////////////////////////////
    # given a grid, transform it into some representation (e.g.,
    # a list of words or a parse tree).

    def _get_words(self, grid):
        """@return: the C{words} column of C{grid}."""
        return self._get_column(grid, self._colmap['words'])

    def _get_tagged_words(self, grid, simplify_tags=False):
        """
        @return: the words of C{grid} zipped with their POS tags,
            the tags being passed through the tag mapping function
            when C{simplify_tags} is true.
        """
        pos_tags = self._get_column(grid, self._colmap['pos'])
        if simplify_tags:
            pos_tags = [self._tag_mapping_function(t) for t in pos_tags]
        return zip(self._get_column(grid, self._colmap['words']), pos_tags)

    def _get_iob_words(self, grid, simplify_tags=False):
        """
        @return: the words of C{grid} zipped with their POS tags and
            their raw chunk (IOB) tags.
        """
        pos_tags = self._get_column(grid, self._colmap['pos'])
        if simplify_tags:
            pos_tags = [self._tag_mapping_function(t) for t in pos_tags]
        return zip(self._get_column(grid, self._colmap['words']), pos_tags,
                   self._get_column(grid, self._colmap['chunk']))

    def _get_chunked_words(self, grid, chunk_types, simplify_tags=False):
        """
        Build a two-level chunk tree from the IOB tags of C{grid}.

        @param chunk_types: chunk types to keep, or C{None} to keep
            all; words in other chunks are attached to the root.
        @return: a C{Tree} labelled with the reader's top node whose
            children are C{(word, pos)} tuples and chunk subtrees.
        """
        # n.b.: this method is very similar to conllstr2tree.
        words = self._get_column(grid, self._colmap['words'])
        pos_tags = self._get_column(grid, self._colmap['pos'])
        if simplify_tags:
            pos_tags = [self._tag_mapping_function(t) for t in pos_tags]
        chunk_tags = self._get_column(grid, self._colmap['chunk'])

        # stack[0] is the root; stack[1], when present, is the open chunk.
        stack = [Tree(self._top_node, [])]

        for (word, pos_tag, chunk_tag) in zip(words, pos_tags, chunk_tags):
            if chunk_tag == 'O':
                state, chunk_type = 'O', ''
            else:
                # Split on the first hyphen only, so that chunk types
                # which themselves contain a hyphen are accepted.
                (state, chunk_type) = chunk_tag.split('-', 1)
            # If it's a chunk we don't care about, treat it as O.
            if chunk_types is not None and chunk_type not in chunk_types:
                state = 'O'
            # Treat a mismatching I like a B.
            if state == 'I' and chunk_type != stack[-1].node:
                state = 'B'
            # For B or O: close any open chunks
            if state in ('B', 'O') and len(stack) == 2:
                stack.pop()
            # For B: start a new chunk.
            if state == 'B':
                new_chunk = Tree(chunk_type, [])
                stack[-1].append(new_chunk)
                stack.append(new_chunk)
            # Add the word token.
            stack[-1].append((word, pos_tag))

        return stack[0]

    def _get_parsed_sent(self, grid, pos_in_tree, simplify_tags=False):
        """
        Build the parse tree encoded in the C{tree} column of C{grid}.

        @param pos_in_tree: if false, every subtree that has a single
            string child is replaced by a C{(word, pos)} tuple.
        @return: the tree built by the reader's tree class.
        """
        words = self._get_column(grid, self._colmap['words'])
        pos_tags = self._get_column(grid, self._colmap['pos'])
        if simplify_tags:
            pos_tags = [self._tag_mapping_function(t) for t in pos_tags]
        parse_tags = self._get_column(grid, self._colmap['tree'])

        # Literal brackets would be mistaken for tree structure.
        escapes = {'(': '-LRB-', ')': '-RRB-'}
        pieces = []
        for (word, pos_tag, parse_tag) in zip(words, pos_tags, parse_tags):
            word = escapes.get(word, word)
            pos_tag = escapes.get(pos_tag, pos_tag)
            (left, right) = parse_tag.split('*')
            right = right.count(')') * ')'  # only keep ')'.
            pieces.append('%s (%s %s) %s' % (left, pos_tag, word, right))
        treestr = ''.join(pieces)
        try:
            tree = self._tree_class.parse(treestr)
        except (ValueError, IndexError):
            # Retry with the string wrapped in a top node.
            tree = self._tree_class.parse('(%s %s)' %
                                          (self._top_node, treestr))

        if not pos_in_tree:
            for subtree in tree.subtrees():
                for i, child in enumerate(subtree):
                    # Use the explicitly imported Tree class; the name
                    # `nltk` is not imported by this module itself.
                    if (isinstance(child, Tree) and len(child) == 1 and
                        isinstance(child[0], basestring)):
                        subtree[i] = (child[0], child.node)

        return tree

    def _get_srl_spans(self, grid):
        """
        @return: one span list per predicate of the sentence, each a
            list of C{((start, end), tag)} tuples where C{end} is
            exclusive.
        """
        if self._srl_includes_roleset:
            predicates = self._get_column(grid, self._colmap['srl'] + 1)
            start_col = self._colmap['srl'] + 2
        else:
            predicates = self._get_column(grid, self._colmap['srl'])
            start_col = self._colmap['srl'] + 1

        # Count how many predicates there are.  This tells us how many
        # columns to expect for SRL data.
        num_preds = len([p for p in predicates if p != '-'])

        spanlists = []
        for pred_index in range(num_preds):
            col = self._get_column(grid, start_col + pred_index)
            spanlist = []
            stack = []  # (tag, start word index) of the open spans
            for wordnum, srl_tag in enumerate(col):
                (left, right) = srl_tag.split('*')
                for tag in left.split('('):
                    if tag:
                        stack.append((tag, wordnum))
                # One closing bracket closes the innermost open span.
                for _ in range(right.count(')')):
                    (tag, start) = stack.pop()
                    spanlist.append(((start, wordnum + 1), tag))
            spanlists.append(spanlist)

        return spanlists

    def _get_srl_instances(self, grid, pos_in_tree):
        """
        @return: a L{ConllSRLInstanceList} holding one
            L{ConllSRLInstance} per predicate of the sentence.
        @raise ValueError: if no span list has a C{V} or C{C-V} span
            covering a predicate's word.
        """
        tree = self._get_parsed_sent(grid, pos_in_tree)
        spanlists = self._get_srl_spans(grid)
        if self._srl_includes_roleset:
            predicates = self._get_column(grid, self._colmap['srl'] + 1)
            rolesets = self._get_column(grid, self._colmap['srl'])
        else:
            predicates = self._get_column(grid, self._colmap['srl'])
            rolesets = [None] * len(predicates)

        instances = ConllSRLInstanceList(tree)
        for wordnum, predicate in enumerate(predicates):
            if predicate == '-':
                continue
            # Decide which spanlist to use.  Don't assume that they're
            # sorted in the same order as the predicates (even though
            # they usually are).
            spanlist = self._find_verb_spanlist(spanlists, wordnum)
            if spanlist is None:
                raise ValueError('No srl column found for %r' % predicate)
            instances.append(ConllSRLInstance(tree, wordnum, predicate,
                                              rolesets[wordnum], spanlist))

        return instances

    @staticmethod
    def _find_verb_spanlist(spanlists, wordnum):
        """
        @return: the first span list in C{spanlists} with a C{V} or
            C{C-V} span covering word index C{wordnum}, or C{None}.
        """
        for spanlist in spanlists:
            for (start, end), tag in spanlist:
                if start <= wordnum < end and tag in ('V', 'C-V'):
                    return spanlist
        return None

    #/////////////////////////////////////////////////////////////////
    # Helper Methods
    #/////////////////////////////////////////////////////////////////

    def _require(self, *columntypes):
        """
        @raise ValueError: if any of C{columntypes} is not a column of
            this corpus.
        """
        for columntype in columntypes:
            if columntype not in self._colmap:
                raise ValueError('This corpus does not contain a %s '
                                 'column.' % columntype)

    @staticmethod
    def _get_column(grid, column_index):
        """@return: the list of values at C{column_index} in each row."""
        return [row[column_index] for row in grid]
367
class ConllSRLInstance(object):
    """
    An SRL instance from a CoNLL corpus, which identifies and
    provides labels for the arguments of a single verb.
    """
    # [xx] add inst.core_arguments, inst.argm_arguments?

    def __init__(self, tree, verb_head, verb_stem, roleset, tagged_spans):
        """
        @param tree: the parse tree of the sentence; only its
            C{leaves()} method is used here.
        @param verb_head: word index of the head word of the verb.
        @param verb_stem: stored as C{self.verb_stem}.
        @param roleset: stored as C{self.roleset}.
        @param tagged_spans: list of C{((start, end), tag)} tuples;
            spans tagged C{V} or C{C-V} make up the verb, all others
            are arguments.
        """
        #: A list of the word indices of the words that compose the
        #: verb whose arguments are identified by this instance.
        #: This will contain multiple word indices when multi-word
        #: verbs are used (e.g. 'turn on').
        self.verb = []

        #: The word index of the head word of the verb whose arguments
        #: are identified by this instance.  E.g., for a sentence that
        #: uses the verb 'turn on,' C{verb_head} will be the word index
        #: of the word 'turn'.
        self.verb_head = verb_head

        self.verb_stem = verb_stem

        self.roleset = roleset

        #: A list of C{(argspan, argid)} tuples, specifying the location
        #: and type for each of the arguments identified by this
        #: instance.  C{argspan} is a tuple C{start, end}, indicating
        #: that the argument consists of the C{words[start:end]}.
        self.arguments = []

        #: A list of C{(span, id)} tuples, specifying the location and
        #: type for each of the arguments, as well as the verb pieces,
        #: that make up this instance.
        self.tagged_spans = tagged_spans

        #: The parse tree for the sentence containing this instance.
        self.tree = tree

        #: A list of the words in the sentence containing this
        #: instance.
        self.words = tree.leaves()

        # Fill in the self.verb and self.arguments values.
        for (start, end), tag in tagged_spans:
            if tag in ('V', 'C-V'):
                self.verb.extend(range(start, end))
            else:
                self.arguments.append(((start, end), tag))

    def __repr__(self):
        if len(self.arguments) != 1:
            plural = 's'
        else:
            plural = ''
        return '<ConllSRLInstance for %r with %d argument%s>' % (
            (self.verb_stem, len(self.arguments), plural))

    def pprint(self):
        """
        @return: a header line naming the verb, followed by the
            sentence with each argument wrapped in C{[tag ...]} and
            each verb word wrapped in C{<<...>>}.
        """
        # Leaves are either plain strings or (word, pos) tuples; reduce
        # both to the bare word once, so that the header and the body
        # agree on what a word is.
        tokens = []
        for word in self.words:
            if isinstance(word, tuple):
                word = word[0]
            tokens.append(word)

        verbstr = ' '.join(tokens[i] for i in self.verb)
        hdr = 'SRL for %r (stem=%r):\n' % (verbstr, self.verb_stem)
        s = ''
        for i, word in enumerate(tokens):
            for (start, end), argid in self.arguments:
                if i == start:
                    s += '[%s ' % argid
                if i == end:
                    s += '] '
            if i in self.verb:
                word = '<<%s>>' % word
            s += word + ' '
        # An argument that runs to the end of the sentence has
        # end == len(tokens), which the loop above never reaches; close
        # it here.
        for (start, end), argid in self.arguments:
            if end == len(tokens):
                s += '] '
        # NOTE(review): the indent literals are a single space in the
        # listing this was taken from; confirm that no wider indent was
        # lost when the listing was extracted.
        return hdr + textwrap.fill(s.replace(' ]', ']'),
                                   initial_indent=' ',
                                   subsequent_indent=' ')
437
class ConllSRLInstanceList(list):
    """
    Set of instances for a single sentence
    """
    def __init__(self, tree, instances=()):
        """
        @param tree: the parse tree shared by all the instances.
        @param instances: initial contents of the list.
        """
        self.tree = tree
        list.__init__(self, instances)

    def __str__(self):
        return self.pprint()

    def pprint(self, include_tree=False):
        """
        Render the instances as a CoNLL-style table with one line per
        word: a verb-stem column (C{-} for words that are not a verb
        head) followed by one bracketed span column per instance.

        @param include_tree: if true, prefix each line with word,
            part-of-speech and syntax columns derived from the tree.
        @raise ValueError: if an instance's tree differs from
            C{self.tree}.
        """
        # Sanity check: trees should be the same
        for inst in self:
            if inst.tree != self.tree:
                raise ValueError('Tree mismatch!')

        # The word list is needed for the row count whether or not the
        # tree columns are printed, so compute it unconditionally.
        words = list(self.tree.leaves())

        # If desired, add trees:
        if include_tree:
            pos = [None] * len(words)
            synt = ['*'] * len(words)
            self._tree2conll(self.tree, 0, words, pos, synt)

        rows = []
        for i in range(len(words)):
            cells = []
            # optional tree columns
            if include_tree:
                cells.append('%-20s ' % words[i])
                cells.append('%-8s ' % pos[i])
                cells.append('%15s*%-8s ' % tuple(synt[i].split('*')))

            # verb head column
            for inst in self:
                if i == inst.verb_head:
                    cells.append('%-20s ' % inst.verb_stem)
                    break
            else:
                cells.append('%-20s ' % '-')
            # Remaining columns: self
            for inst in self:
                argstr = '*'
                for (start, end), argid in inst.tagged_spans:
                    if i == start:
                        argstr = '(%s%s' % (argid, argstr)
                    if i == (end - 1):
                        argstr += ')'
                cells.append('%-12s ' % argstr)
            cells.append('\n')
            rows.append(''.join(cells))
        return ''.join(rows)

    def _tree2conll(self, tree, wordnum, words, pos, synt):
        """
        Fill C{pos} and C{synt} (and, for tuple leaves, C{words}) for
        the words covered by C{tree}, starting at index C{wordnum}.

        Two tree shapes are handled: part-of-speech nodes with a
        single string child, and phrase nodes whose children are
        subtrees and/or C{(word, pos)} tuples.

        @return: the index of the first word after C{tree}.
        """
        assert isinstance(tree, Tree)
        if len(tree) == 1 and isinstance(tree[0], basestring):
            # Part-of-speech node directly above a word.
            pos[wordnum] = tree.node
            assert words[wordnum] == tree[0]
            return wordnum + 1

        # Phrase node: open its bracket on the first covered word and
        # close it on the last one.
        synt[wordnum] = '(%s%s' % (tree.node, synt[wordnum])
        for child in tree:
            if isinstance(child, tuple):
                # (word, pos) leaf: store the bare word, so that the
                # word column prints a string rather than a tuple.
                assert len(child) == 2
                words[wordnum], pos[wordnum] = child
                wordnum += 1
            else:
                wordnum = self._tree2conll(child, wordnum, words,
                                           pos, synt)
        synt[wordnum - 1] += ')'
        return wordnum
504
class ConllChunkCorpusReader(ConllCorpusReader):
    """
    A C{ConllCorpusReader} for chunking corpora: the data files are
    read as three columns, in the order words, pos, chunk.
    """
    def __init__(self, root, fileids, chunk_types, encoding=None,
                 tag_mapping_function=None):
        """
        Forward to C{ConllCorpusReader.__init__} with the fixed column
        layout; all other arguments are passed through unchanged.
        """
        column_layout = ('words', 'pos', 'chunk')
        options = dict(chunk_types=chunk_types,
                       encoding=encoding,
                       tag_mapping_function=tag_mapping_function)
        ConllCorpusReader.__init__(self, root, fileids, column_layout,
                                   **options)
516