Package nltk :: Package corpus :: Package reader :: Module propbank
[hide private]
[frames | no frames]

Source Code for Module nltk.corpus.reader.propbank

  1  # Natural Language Toolkit: PropBank Corpus Reader 
  2  # 
  3  # Copyright (C) 2001-2011 NLTK Project 
  4  # Author: Edward Loper <edloper@gradient.cis.upenn.edu> 
  5  # URL: <http://www.nltk.org/> 
  6  # For license information, see LICENSE.TXT 
  7   
  8  import re 
  9  import codecs 
 10   
 11  from nltk.tree import Tree 
 12  from nltk.etree import ElementTree 
 13   
 14  from util import * 
 15  from api import * 
class PropbankCorpusReader(CorpusReader):
    """
    Corpus reader for the propbank corpus, which augments the Penn
    Treebank with information about the predicate argument structure
    of every verb instance.  The corpus consists of two parts: the
    predicate-argument annotations themselves, and a set of X{frameset
    files} which define the argument labels used by the annotations,
    on a per-verb basis.  Each X{frameset file} contains one or more
    predicates, such as C{'turn'} or C{'turn_on'}, each of which is
    divided into coarse-grained word senses called X{rolesets}.  For
    each X{roleset}, the frameset file provides descriptions of the
    argument roles, along with examples.
    """
    def __init__(self, root, propfile, framefiles='',
                 verbsfile=None, parse_fileid_xform=None,
                 parse_corpus=None, encoding=None):
        """
        @param root: The root directory for this corpus.
        @param propfile: The name of the file containing the predicate-
            argument annotations (relative to C{root}).
        @param framefiles: A list or regexp specifying the frameset
            fileids for this corpus.
        @param verbsfile: The name of the file read by L{verbs()}, or
            C{None} if there is no such file.
        @param parse_fileid_xform: A transform that should be applied
            to the fileids in this corpus.  This should be a function
            of one argument (a fileid) that returns a string (the new
            fileid).
        @param parse_corpus: The corpus containing the parse trees
            corresponding to this corpus.  These parse trees are
            necessary to resolve the tree pointers used by propbank.
        @param encoding: The encoding argument handed on unchanged to
            C{CorpusReader.__init__}.
        """
        # If framefiles is specified as a regexp, expand it.
        if isinstance(framefiles, basestring):
            framefiles = find_corpus_fileids(root, framefiles)
        framefiles = list(framefiles)

        # verbsfile is optional: leave it out when it was not given, so
        # that None never shows up among this reader's fileids.
        fileids = [propfile]
        if verbsfile is not None:
            fileids.append(verbsfile)

        # Initialize the corpus reader.
        CorpusReader.__init__(self, root, fileids + framefiles, encoding)

        # Record our frame fileids & prop file.
        self._propfile = propfile
        self._framefiles = framefiles
        self._verbsfile = verbsfile
        self._parse_fileid_xform = parse_fileid_xform
        self._parse_corpus = parse_corpus

    def raw(self, fileids=None):
        """
        @param fileids: A single fileid, a list of fileids, or C{None}
            to use every fileid of this reader.
        @return: the text contents of the given fileids, as a single string.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, basestring):
            fileids = [fileids]

        contents = []
        for f in fileids:
            stream = self.open(f)
            # Close each stream as soon as it has been read, rather
            # than leaving it to the garbage collector.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def instances(self):
        """
        @return: a corpus view that acts as a list of
        L{PropbankInstance} objects, one for each verb in the corpus.
        """
        return StreamBackedCorpusView(self.abspath(self._propfile),
                                      self._read_instance_block,
                                      encoding=self.encoding(self._propfile))

    def lines(self):
        """
        @return: a corpus view that acts as a list of strings, one for
        each line in the predicate-argument annotation file.
        """
        return StreamBackedCorpusView(self.abspath(self._propfile),
                                      read_line_block,
                                      encoding=self.encoding(self._propfile))

    def roleset(self, roleset_id):
        """
        @param roleset_id: A roleset identifier; the part before the
            first C{'.'} selects the frameset file C{frames/<lemma>.xml}.
        @return: the xml description for the given roleset.
        @raise ValueError: If the frameset file is not one of this
            corpus's frame files, or if it contains no roleset whose
            C{id} attribute equals C{roleset_id}.
        """
        lemma = roleset_id.split('.')[0]
        framefile = 'frames/%s.xml' % lemma
        if framefile not in self._framefiles:
            raise ValueError('Frameset file for %s not found' %
                             roleset_id)

        # n.b.: The encoding for XML fileids is specified by the file
        # itself; so we ignore self._encoding here.
        stream = self.abspath(framefile).open()
        # The document is parsed completely before getroot() returns,
        # so the stream can be closed right away.
        try:
            etree = ElementTree.parse(stream).getroot()
        finally:
            stream.close()

        for roleset in etree.findall('predicate/roleset'):
            if roleset.attrib['id'] == roleset_id:
                return roleset
        raise ValueError('Roleset %s not found in %s' %
                         (roleset_id, framefile))

    def verbs(self):
        """
        @return: a corpus view that acts as a list of all verb lemmas
        in this corpus (from the verbs.txt file).
        """
        return StreamBackedCorpusView(self.abspath(self._verbsfile),
                                      read_line_block,
                                      encoding=self.encoding(self._verbsfile))

    def _read_instance_block(self, stream):
        """
        Block reader used by L{instances()}: read up to 100 lines from
        C{stream} and parse each non-blank one.

        @return: a list of L{PropbankInstance} objects.
        """
        block = []

        # Read 100 at a time.
        for i in range(100):
            line = stream.readline().strip()
            if line:
                block.append(PropbankInstance.parse(
                    line, self._parse_fileid_xform,
                    self._parse_corpus))

        return block
129
######################################################################
#{ Propbank Instance & related datatypes
######################################################################

class PropbankInstance(object):
    """
    A single annotated predicate: where it occurs, which roleset and
    inflection it carries, and where its arguments are located.
    """

    def __init__(self, fileid, sentnum, wordnum, tagger, roleset,
                 inflection, predicate, arguments, parse_corpus=None):

        self.fileid = fileid
        """The name of the file containing the parse tree for this
        instance's sentence."""

        self.sentnum = sentnum
        """The sentence number of this sentence within L{fileid}.
        Indexing starts from zero."""

        self.wordnum = wordnum
        """The word number of this instance's predicate within its
        containing sentence.  Word numbers are indexed starting from
        zero, and include traces and other empty parse elements."""

        self.tagger = tagger
        """An identifier for the tagger who tagged this instance; or
        C{'gold'} if this is an adjudicated instance."""

        self.roleset = roleset
        """The name of the roleset used by this instance's predicate.
        Use L{propbank.roleset() <PropbankCorpusReader.roleset>} to
        look up information about the roleset."""

        self.inflection = inflection
        """A L{PropbankInflection} object describing the inflection of
        this instance's predicate."""

        self.predicate = predicate
        """A L{PropbankTreePointer} indicating the position of this
        instance's predicate within its containing sentence."""

        self.arguments = tuple(arguments)
        """A tuple of pairs (argloc, argid), specifying the location
        and identifier for each of the predicate's arguments in the
        containing sentence.  Argument identifiers are strings such as
        C{'ARG0'} or C{'ARGM-TMP'}.  This tuple does *not* contain
        the predicate."""

        self.parse_corpus = parse_corpus
        """A corpus reader for the parse trees corresponding to the
        instances in this propbank corpus."""

    def __repr__(self):
        location = (self.fileid, self.sentnum, self.wordnum)
        return '<PropbankInstance: %s, sent %s, word %s>' % location

    def __str__(self):
        # Fixed leading fields, then the predicate ('rel') and the
        # arguments together in sorted order.
        header = '%s %s %s %s %s %s' % (self.fileid, self.sentnum,
                                        self.wordnum, self.tagger,
                                        self.roleset, self.inflection)
        items = self.arguments + ((self.predicate, 'rel'),)
        tail = [' %s-%s' % (argloc, argid)
                for (argloc, argid) in sorted(items)]
        return header + ''.join(tail)

    def _get_tree(self):
        corpus = self.parse_corpus
        if corpus is None or self.fileid not in corpus.fileids():
            return None
        return corpus.parsed_sents(self.fileid)[self.sentnum]
    tree = property(_get_tree, doc="""
        The parse tree corresponding to this instance, or C{None} if
        the corresponding tree is not available.""")

    @staticmethod
    def parse(s, parse_fileid_xform=None, parse_corpus=None):
        """
        Build a L{PropbankInstance} from one whitespace-separated
        annotation line.

        @param s: The line: six leading fields (fileid, sentence
            number, word number, tagger, roleset, inflection) followed
            by pointer-label items, exactly one of which ends in
            C{'-rel'}.
        @param parse_fileid_xform: Optional function applied to the
            fileid field.
        @param parse_corpus: Stored on the new instance as its
            C{parse_corpus}.
        @raise ValueError: If the line has fewer than seven fields or
            does not contain exactly one C{'-rel'} item.
        """
        pieces = s.split()
        if len(pieces) < 7:
            raise ValueError('Badly formatted propbank line: %r' % s)

        # Divide the line into its basic pieces.
        (fileid, sentnum, wordnum,
         tagger, roleset, inflection) = pieces[:6]

        # Sort the remaining items into the predicate marker(s) and
        # the ordinary arguments in a single pass.
        rel = []
        args = []
        for piece in pieces[6:]:
            if piece.endswith('-rel'):
                rel.append(piece)
            else:
                args.append(piece)
        if len(rel) != 1:
            raise ValueError('Badly formatted propbank line: %r' % s)

        # Apply the fileid selector, if any.
        if parse_fileid_xform is not None:
            fileid = parse_fileid_xform(fileid)

        # Convert sentence & word numbers to ints.
        sentnum = int(sentnum)
        wordnum = int(wordnum)

        # Parse the inflection.
        inflection = PropbankInflection.parse(inflection)

        # Parse the predicate location (dropping the '-rel' suffix).
        predicate = PropbankTreePointer.parse(rel[0][:-len('-rel')])

        # Parse the arguments: everything before the first '-' is the
        # pointer, the rest is the argument label.
        arguments = [(PropbankTreePointer.parse(argloc), argid)
                     for (argloc, argid)
                     in (arg.split('-', 1) for arg in args)]

        # Put it all together.
        return PropbankInstance(fileid, sentnum, wordnum, tagger,
                                roleset, inflection, predicate,
                                arguments, parse_corpus)
238
class PropbankPointer(object):
    """
    A pointer used by propbank to identify one or more constituents in
    a parse tree.  C{PropbankPointer} is an abstract base class with
    three concrete subclasses:

      - L{PropbankTreePointer} is used to point to single constituents.
      - L{PropbankSplitTreePointer} is used to point to 'split'
        constituents, which consist of a sequence of two or more
        C{PropbankTreePointer}s.
      - L{PropbankChainTreePointer} is used to point to entire trace
        chains in a tree.  It consists of a sequence of pieces, which
        can be C{PropbankTreePointer}s or C{PropbankSplitTreePointer}s.
    """
    def __init__(self):
        """
        @raise AssertionError: If C{PropbankPointer} itself (rather
            than a subclass) is being instantiated.
        """
        # Only the base class is refused; subclasses that reach this
        # constructor pass through.
        if self.__class__ == PropbankPointer:
            raise AssertionError('PropbankPointer is an abstract base class')
256
class PropbankChainTreePointer(PropbankPointer):
    """
    A pointer made of several pieces, written with the pieces joined
    by C{'*'}.
    """
    def __init__(self, pieces):
        self.pieces = pieces
        """A list of the pieces that make up this chain.  Elements may
           be either L{PropbankSplitTreePointer}s or
           L{PropbankTreePointer}s."""

    def __str__(self):
        rendered = ['%s' % piece for piece in self.pieces]
        return '*'.join(rendered)

    def __repr__(self):
        return '<PropbankChainTreePointer: %s>' % (self,)

    def select(self, tree):
        """
        @return: a C{'*CHAIN*'} tree whose children are the results of
            calling C{select(tree)} on each piece, in order.
        @raise ValueError: If C{tree} is C{None}.
        """
        if tree is None:
            raise ValueError('Parse tree not avaialable')
        selected = [piece.select(tree) for piece in self.pieces]
        return Tree('*CHAIN*', selected)
271
class PropbankSplitTreePointer(PropbankPointer):
    """
    A pointer made of several pieces, written with the pieces joined
    by C{','}.
    """
    def __init__(self, pieces):
        self.pieces = pieces
        """A list of the pieces that make up this split pointer.
           Elements are all L{PropbankTreePointer}s."""

    def __str__(self):
        rendered = ['%s' % piece for piece in self.pieces]
        return ','.join(rendered)

    def __repr__(self):
        return '<PropbankSplitTreePointer: %s>' % (self,)

    def select(self, tree):
        """
        @return: a C{'*SPLIT*'} tree whose children are the results of
            calling C{select(tree)} on each piece, in order.
        @raise ValueError: If C{tree} is C{None}.
        """
        if tree is None:
            raise ValueError('Parse tree not avaialable')
        selected = [piece.select(tree) for piece in self.pieces]
        return Tree('*SPLIT*', selected)
285
class PropbankTreePointer(PropbankPointer):
    """
    A pointer to a single constituent, written C{wordnum:height}.

    L{parse()} also accepts the compound forms and returns the
    matching pointer class for them::

        wordnum:height*wordnum:height*...   (chain)
        wordnum:height,wordnum:height,...   (split)
    """
    def __init__(self, wordnum, height):
        self.wordnum = wordnum
        self.height = height

    @staticmethod
    def parse(s):
        """
        Parse a pointer string.

        @return: a L{PropbankChainTreePointer} if C{s} contains
            C{'*'}; otherwise a L{PropbankSplitTreePointer} if it
            contains C{','}; otherwise a L{PropbankTreePointer}.
        @raise ValueError: If a simple pointer does not consist of
            exactly two C{':'}-separated fields, or a field is not an
            integer.
        """
        # Deal with chains (xx*yy*zz)
        links = s.split('*')
        if len(links) > 1:
            return PropbankChainTreePointer(
                [PropbankTreePointer.parse(link) for link in links])

        # Deal with split args (xx,yy,zz)
        parts = s.split(',')
        if len(parts) > 1:
            return PropbankSplitTreePointer(
                [PropbankTreePointer.parse(part) for part in parts])

        # Deal with normal pointers.
        fields = s.split(':')
        if len(fields) != 2:
            raise ValueError('bad propbank pointer %r' % s)
        return PropbankTreePointer(int(fields[0]), int(fields[1]))

    def __str__(self):
        return '%s:%s' % (self.wordnum, self.height)

    def __repr__(self):
        return 'PropbankTreePointer(%d, %d)' % (self.wordnum, self.height)

    def __cmp__(self, other):
        # A chain or split pointer is compared through its first
        # piece, unwrapping as many levels as necessary.
        target = other
        while isinstance(target, (PropbankChainTreePointer,
                                  PropbankSplitTreePointer)):
            target = target.pieces[0]

        if isinstance(target, PropbankTreePointer):
            # Order by word number, then by decreasing height.
            return cmp((self.wordnum, -self.height),
                       (target.wordnum, -target.height))

        # Anything else: fall back to comparing object identities.
        return cmp(id(self), id(target))

    def select(self, tree):
        """
        @return: the subtree of C{tree} found at L{treepos(tree)
            <treepos>}.
        @raise ValueError: If C{tree} is C{None}.
        """
        if tree is None:
            raise ValueError('Parse tree not avaialable')
        return tree[self.treepos(tree)]

    def treepos(self, tree):
        """
        Convert this pointer to a standard 'tree position' pointer,
        given that it points to the given tree.

        The tree is walked depth-first, counting leaves from zero.
        When leaf number C{wordnum} is reached, the path to that leaf
        with its last C{height + 1} entries removed is returned.

        @raise ValueError: If C{tree} is C{None}.
        """
        if tree is None:
            raise ValueError('Parse tree not avaialable')
        nodes = [tree]      # nodes on the path currently being explored
        path = []           # child index chosen at each Tree on that path
        leaves_seen = 0

        while True:
            current = nodes[-1]

            # word node:
            if not isinstance(current, Tree):
                if leaves_seen == self.wordnum:
                    return tuple(path[:len(path) - self.height - 1])
                leaves_seen += 1
                nodes.pop()
                continue

            # tree node: select the next child.
            if len(path) < len(nodes):
                path.append(0)
            else:
                path[-1] += 1

            # Update the stack.
            if path[-1] < len(current):
                nodes.append(current[path[-1]])
            else:
                # End of node's child list: pop up a level.
                nodes.pop()
                path.pop()
370
class PropbankInflection(object):
    """
    The five-character inflection code of a predicate: one character
    each for form, tense, aspect, person and voice, with C{'-'}
    meaning 'none'.
    """
    #{ Inflection Form
    INFINITIVE = 'i'
    GERUND = 'g'
    PARTICIPLE = 'p'
    FINITE = 'v'
    #{ Inflection Tense
    FUTURE = 'f'
    PAST = 'p'
    PRESENT = 'n'
    #{ Inflection Aspect
    PERFECT = 'p'
    PROGRESSIVE = 'o'
    PERFECT_AND_PROGRESSIVE = 'b'
    #{ Inflection Person
    THIRD_PERSON = '3'
    #{ Inflection Voice
    ACTIVE = 'a'
    PASSIVE = 'p'
    #{ Inflection
    NONE = '-'
    #}

    def __init__(self, form='-', tense='-', aspect='-', person='-', voice='-'):
        self.form = form
        self.tense = tense
        self.aspect = aspect
        self.person = person
        self.voice = voice

    def __str__(self):
        # Same field order as the code accepted by parse().
        return (self.form + self.tense + self.aspect +
                self.person + self.voice)

    def __repr__(self):
        return '<PropbankInflection: %s>' % (self,)

    # One character class per field, in the order form, tense, aspect,
    # person, voice.
    _VALIDATE = re.compile(r'[igpv\-][fpn\-][pob\-][3\-][ap\-]$')

    @staticmethod
    def parse(s):
        """
        Build a L{PropbankInflection} from a five-character code.

        @raise TypeError: If C{s} is not a string.
        @raise ValueError: If C{s} is not exactly five characters long
            or does not match the allowed characters for each field.
        """
        if not isinstance(s, basestring):
            raise TypeError('expected a string')
        well_formed = (len(s) == 5 and
                       PropbankInflection._VALIDATE.match(s))
        if not well_formed:
            raise ValueError('Bad propbank inflection string %r' % s)
        form, tense, aspect, person, voice = s
        return PropbankInflection(form, tense, aspect, person, voice)
417