Package nltk :: Package corpus :: Package reader :: Module nombank
[hide private]
[frames] | [no frames]

Source Code for Module nltk.corpus.reader.nombank

  1  # Natural Language Toolkit: NomBank Corpus Reader 
  2  # 
  3  # Copyright (C) 2001-2011 NLTK Project 
  4  # Authors: Paul Bedaride <paul.bedaride@gmail.com>  
  5  #          Edward Loper <edloper@gradient.cis.upenn.edu> 
  6  # URL: <http://www.nltk.org/> 
  7  # For license information, see LICENSE.TXT 
  8   
  9  import re 
 10  import codecs 
 11   
 12  from nltk.tree import Tree 
 13  from nltk.etree import ElementTree 
 14   
 15  from nltk.corpus.reader.util import * 
 16  from nltk.corpus.reader.api import * 
class NombankCorpusReader(CorpusReader):
    """
    Corpus reader for the nombank corpus, which augments the Penn
    Treebank with information about the predicate argument structure
    of every noun instance.  The corpus consists of two parts: the
    predicate-argument annotations themselves, and a set of X{frameset
    files} which define the argument labels used by the annotations,
    on a per-noun basis.  Each X{frameset file} contains one or more
    predicates, such as C{'turn'} or C{'turn_on'}, each of which is
    divided into coarse-grained word senses called X{rolesets}.  For
    each X{roleset}, the frameset file provides descriptions of the
    argument roles, along with examples.
    """
    def __init__(self, root, nomfile, framefiles='',
                 nounsfile=None, parse_fileid_xform=None,
                 parse_corpus=None, encoding=None):
        """
        @param root: The root directory for this corpus.
        @param nomfile: The name of the file containing the predicate-
            argument annotations (relative to C{root}).
        @param framefiles: A list or regexp specifying the frameset
            fileids for this corpus.
        @param nounsfile: The name of the file listing the noun lemmas
            (read by L{nouns()}), or C{None} if there is no such file.
        @param parse_fileid_xform: A transform that should be applied
            to the fileids in this corpus.  This should be a function
            of one argument (a fileid) that returns a string (the new
            fileid).
        @param parse_corpus: The corpus containing the parse trees
            corresponding to this corpus.  These parse trees are
            necessary to resolve the tree pointers used by nombank.
        @param encoding: The encoding specification, passed through
            unchanged to C{CorpusReader.__init__}.
        """
        # If framefiles is specified as a regexp, expand it.
        if isinstance(framefiles, basestring):
            framefiles = find_corpus_fileids(root, framefiles)
        framefiles = list(framefiles)

        # Initialize the corpus reader.  The nouns file is optional, so
        # it is only registered as a fileid when one was actually given;
        # otherwise the fileid list would contain None and raw() with
        # its default argument would try to open it.
        fileids = [nomfile]
        if nounsfile is not None:
            fileids.append(nounsfile)
        CorpusReader.__init__(self, root, fileids + framefiles, encoding)

        # Record our frame fileids & nom file.
        self._nomfile = nomfile
        self._framefiles = framefiles
        self._nounsfile = nounsfile
        self._parse_fileid_xform = parse_fileid_xform
        self._parse_corpus = parse_corpus

    def raw(self, fileids=None):
        """
        @param fileids: A fileid, a list of fileids, or C{None} to
            select every fileid of this corpus.
        @return: the text contents of the given fileids, as a single string.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, basestring):
            fileids = [fileids]

        contents = []
        for fileid in fileids:
            # The object returned by open() is treated as a file-like
            # stream; close it explicitly instead of leaking the handle.
            stream = self.open(fileid)
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def instances(self):
        """
        @return: a corpus view that acts as a list of
        L{NombankInstance} objects, one for each noun in the corpus.
        """
        return StreamBackedCorpusView(self.abspath(self._nomfile),
                                      self._read_instance_block,
                                      encoding=self.encoding(self._nomfile))

    def lines(self):
        """
        @return: a corpus view that acts as a list of strings, one for
        each line in the predicate-argument annotation file.
        """
        return StreamBackedCorpusView(self.abspath(self._nomfile),
                                      read_line_block,
                                      encoding=self.encoding(self._nomfile))

    def roleset(self, roleset_id):
        """
        @param roleset_id: A roleset identifier of the form
            C{'lemma.sense'}; the part before the first C{'.'} selects
            the frameset file C{frames/<lemma>.xml}.
        @return: the xml description for the given roleset.
        @raise ValueError: If the frameset file is not one of this
            corpus's frame files, or if it contains no roleset with
            the given id.
        """
        lemma = roleset_id.split('.')[0]
        framefile = 'frames/%s.xml' % lemma
        if framefile not in self._framefiles:
            raise ValueError('Frameset file for %s not found' %
                             roleset_id)

        # n.b.: The encoding for XML fileids is specified by the file
        # itself; so we ignore self._encoding here.
        stream = self.abspath(framefile).open()
        try:
            etree = ElementTree.parse(stream).getroot()
        finally:
            # ElementTree.parse() does not close a stream it was handed.
            stream.close()

        for roleset in etree.findall('predicate/roleset'):
            if roleset.attrib['id'] == roleset_id:
                return roleset
        raise ValueError('Roleset %s not found in %s' %
                         (roleset_id, framefile))

    def nouns(self):
        """
        @return: a corpus view that acts as a list of all noun lemmas
        in this corpus (from the nombank.1.0.words file).
        @raise ValueError: If this reader was created without a
            C{nounsfile}.
        """
        if self._nounsfile is None:
            raise ValueError('No nouns file was specified for this corpus')
        return StreamBackedCorpusView(self.abspath(self._nounsfile),
                                      read_line_block,
                                      encoding=self.encoding(self._nounsfile))

    def _read_instance_block(self, stream):
        """
        Read up to 100 lines from C{stream} and parse each non-blank
        one into a L{NombankInstance}.

        @return: The list of instances parsed from this block; empty
            once the stream is exhausted.
        """
        block = []

        # Read 100 at a time.
        for _ in range(100):
            raw_line = stream.readline()
            if not raw_line:
                # readline() returned nothing at all: end of stream, so
                # further reads in this block would be wasted.
                break
            line = raw_line.strip()
            if line:
                block.append(NombankInstance.parse(
                    line, self._parse_fileid_xform,
                    self._parse_corpus))

        return block
130
######################################################################
#{ Nombank Instance & related datatypes
######################################################################

class NombankInstance(object):
    """
    A single annotated noun instance from the nombank annotation file:
    the location of the predicate, its roleset, and the locations and
    labels of its arguments.
    """

    def __init__(self, fileid, sentnum, wordnum, baseform, sensenumber,
                 predicate, predid, arguments, parse_corpus=None):

        self.fileid = fileid
        """The name of the file containing the parse tree for this
        instance's sentence."""

        self.sentnum = sentnum
        """The sentence number of this sentence within L{fileid}.
        Indexing starts from zero."""

        self.wordnum = wordnum
        """The word number of this instance's predicate within its
        containing sentence.  Word numbers are indexed starting from
        zero, and include traces and other empty parse elements."""

        self.baseform = baseform
        """The baseform of the predicate."""

        self.sensenumber = sensenumber
        """The sense number of the predicate."""

        self.predicate = predicate
        """A L{NombankTreePointer} indicating the position of this
        instance's predicate within its containing sentence."""

        self.predid = predid
        """Identifier of the predicate."""

        self.arguments = tuple(arguments)
        """A tuple of pairs (argloc, argid), specifying the location
        and identifier for each of the predicate's argument in the
        containing sentence.  Argument identifiers are strings such as
        C{'ARG0'} or C{'ARGM-TMP'}.  This tuple does *not* contain
        the predicate."""

        self.parse_corpus = parse_corpus
        """A corpus reader for the parse trees corresponding to the
        instances in this nombank corpus."""

    @property
    def roleset(self):
        """The name of the roleset used by this instance's predicate.
        Use L{nombank.roleset() <NombankCorpusReader.roleset>} to
        look up information about the roleset."""
        return '%s.%s' % (self.baseform, self.sensenumber)

    def __repr__(self):
        return ('<NombankInstance: %s, sent %s, word %s>' %
                (self.fileid, self.sentnum, self.wordnum))

    def __str__(self):
        """
        @return: This instance rendered as one line: the five leading
            fields followed by the sorted C{location-label} fields,
            with the predicate labelled C{rel}.
        """
        fields = ['%s %s %s %s %s' % (self.fileid, self.sentnum,
                                      self.wordnum, self.baseform,
                                      self.sensenumber)]
        items = self.arguments + ((self.predicate, 'rel'),)
        for (argloc, argid) in sorted(items):
            fields.append('%s-%s' % (argloc, argid))
        return ' '.join(fields)

    def _get_tree(self):
        if self.parse_corpus is None: return None
        if self.fileid not in self.parse_corpus.fileids(): return None
        return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum]
    tree = property(_get_tree, doc="""
        The parse tree corresponding to this instance, or C{None} if
        the corresponding tree is not available.""")

    @staticmethod
    def parse(s, parse_fileid_xform=None, parse_corpus=None):
        """
        Build a L{NombankInstance} from one line of the annotation file.

        @param s: The line: five whitespace-separated leading fields
            (fileid, sentence number, word number, baseform, sense
            number) followed by C{location-label} fields, exactly one
            of which must contain C{'-rel'} (the predicate).
        @param parse_fileid_xform: Optional function applied to the
            fileid before it is stored.
        @param parse_corpus: Optional parse corpus stored on the
            resulting instance.
        @raise ValueError: If the line has fewer than six fields, does
            not contain exactly one C{'-rel'} field, contains a field
            without a C{'-'} separator, or has non-integer sentence or
            word numbers.
        """
        pieces = s.split()
        if len(pieces) < 6:
            raise ValueError('Badly formatted nombank line: %r' % s)

        # Divide the line into its basic pieces.
        (fileid, sentnum, wordnum,
         baseform, sensenumber) = pieces[:5]

        # Separate the predicate field(s) from the argument fields.
        # Two passes over an unmodified list, so that every '-rel'
        # field is counted even when two of them are adjacent.
        rel = []
        args = []
        for piece in pieces[5:]:
            if '-rel' in piece:
                rel.append(piece)
            else:
                args.append(piece)
        if len(rel) != 1:
            raise ValueError('Badly formatted nombank line: %r' % s)

        # Apply the fileid selector, if any.
        if parse_fileid_xform is not None:
            fileid = parse_fileid_xform(fileid)

        # Convert sentence & word numbers to ints.
        sentnum = int(sentnum)
        wordnum = int(wordnum)

        # Parse the predicate location.
        predloc, predid = rel[0].split('-', 1)
        predicate = NombankTreePointer.parse(predloc)

        # Parse the arguments.
        arguments = []
        for arg in args:
            if '-' not in arg:
                raise ValueError('Badly formatted nombank line: %r' % s)
            argloc, argid = arg.split('-', 1)
            arguments.append((NombankTreePointer.parse(argloc), argid))

        # Put it all together.
        return NombankInstance(fileid, sentnum, wordnum, baseform,
                               sensenumber, predicate, predid, arguments,
                               parse_corpus)
241
class NombankPointer(object):
    """
    A pointer used by nombank to identify one or more constituents in
    a parse tree.  C{NombankPointer} is an abstract base class with
    three concrete subclasses:

      - L{NombankTreePointer} is used to point to single constituents.
      - L{NombankSplitTreePointer} is used to point to 'split'
        constituents, which consist of a sequence of two or more
        C{NombankTreePointer}s.
      - L{NombankChainTreePointer} is used to point to entire trace
        chains in a tree.  It consists of a sequence of pieces, which
        can be C{NombankTreePointer}s or C{NombankSplitTreePointer}s.
    """
    def __init__(self):
        """
        @raise AssertionError: If C{NombankPointer} itself (rather
            than a subclass) is instantiated.
        """
        # Only the base class is rejected; subclasses that reach this
        # constructor pass straight through.
        if self.__class__ is NombankPointer:
            raise AssertionError('NombankPointer is an abstract base class')
259
class NombankChainTreePointer(NombankPointer):
    """
    A pointer to a trace chain, stored as a sequence of pieces and
    written as those pieces joined with C{'*'}.
    """
    def __init__(self, pieces):
        self.pieces = pieces
        """The pieces that make up this chain, in order.  Each element
        is either a L{NombankSplitTreePointer} or a
        L{NombankTreePointer}."""

    def __str__(self):
        rendered = ['%s' % piece for piece in self.pieces]
        return '*'.join(rendered)

    def __repr__(self):
        return '<NombankChainTreePointer: %s>' % (self,)

    def select(self, tree):
        """
        @return: A C{Tree} labelled C{'*CHAIN*'} whose children are
            the results of calling C{select(tree)} on each piece.
        @raise ValueError: If C{tree} is C{None}.
        """
        if tree is None:
            raise ValueError('Parse tree not avaialable')
        children = [piece.select(tree) for piece in self.pieces]
        return Tree('*CHAIN*', children)
274
class NombankSplitTreePointer(NombankPointer):
    """
    A pointer to a 'split' constituent, stored as a sequence of pieces
    and written as those pieces joined with C{','}.
    """
    def __init__(self, pieces):
        self.pieces = pieces
        """The pieces that make up this split constituent, in order.
        Every element is a L{NombankTreePointer}."""

    def __str__(self):
        rendered = ['%s' % piece for piece in self.pieces]
        return ','.join(rendered)

    def __repr__(self):
        return '<NombankSplitTreePointer: %s>' % (self,)

    def select(self, tree):
        """
        @return: A C{Tree} labelled C{'*SPLIT*'} whose children are
            the results of calling C{select(tree)} on each piece.
        @raise ValueError: If C{tree} is C{None}.
        """
        if tree is None:
            raise ValueError('Parse tree not avaialable')
        children = [piece.select(tree) for piece in self.pieces]
        return Tree('*SPLIT*', children)
288
class NombankTreePointer(NombankPointer):
    """
    A pointer to a single constituent, written C{wordnum:height}.

    L{parse()} also understands the two composite notations built from
    such pointers::

        wordnum:height*wordnum:height*...    (chain)
        wordnum:height,wordnum:height,...    (split)
    """
    def __init__(self, wordnum, height):
        self.wordnum = wordnum
        self.height = height

    @staticmethod
    def parse(s):
        """
        Convert the string form of a pointer into a pointer object.

        @return: A L{NombankChainTreePointer} if C{s} contains C{'*'},
            otherwise a L{NombankSplitTreePointer} if it contains
            C{','}, otherwise a L{NombankTreePointer}.
        @raise ValueError: If a simple pointer does not consist of
            exactly two C{':'}-separated integer fields.
        """
        # Chains (xx*yy*zz) bind loosest, so they are tried first.
        chain_parts = s.split('*')
        if len(chain_parts) > 1:
            return NombankChainTreePointer(
                [NombankTreePointer.parse(part) for part in chain_parts])

        # Then split arguments (xx,yy,zz).
        split_parts = s.split(',')
        if len(split_parts) > 1:
            return NombankSplitTreePointer(
                [NombankTreePointer.parse(part) for part in split_parts])

        # What remains must be a plain wordnum:height pointer.
        fields = s.split(':')
        if len(fields) != 2:
            raise ValueError('bad nombank pointer %r' % s)
        wordnum_text, height_text = fields
        return NombankTreePointer(int(wordnum_text), int(height_text))

    def __str__(self):
        return '%s:%s' % (self.wordnum, self.height)

    def __repr__(self):
        return 'NombankTreePointer(%d, %d)' % (self.wordnum, self.height)

    def __cmp__(self, other):
        """
        Order pointers by word number, then by decreasing height.  A
        chain or split pointer is compared through its first piece;
        any other kind of object is ordered by C{id()}.
        """
        composite_types = (NombankChainTreePointer, NombankSplitTreePointer)
        while isinstance(other, composite_types):
            other = other.pieces[0]

        if isinstance(other, NombankTreePointer):
            return cmp((self.wordnum, -self.height),
                       (other.wordnum, -other.height))

        return cmp(id(self), id(other))

    def select(self, tree):
        """
        @return: The subtree of C{tree} found at L{treepos(tree)}.
        @raise ValueError: If C{tree} is C{None}.
        """
        if tree is None:
            raise ValueError('Parse tree not avaialable')
        position = self.treepos(tree)
        return tree[position]

    def treepos(self, tree):
        """
        Convert this pointer to a standard 'tree position' pointer,
        given that it points to the given tree.

        @return: A tuple of child indices: the path to leaf number
            C{self.wordnum} with its last C{self.height + 1} steps
            removed.
        @raise ValueError: If C{tree} is C{None}.
        """
        if tree is None:
            raise ValueError('Parse tree not avaialable')

        # Depth-first walk with an explicit stack.  `nodes` holds the
        # nodes from the root down to the current one; `path` holds
        # the child index chosen at each Tree on that route.
        nodes = [tree]
        path = []
        leaves_seen = 0

        while True:
            current = nodes[-1]

            if not isinstance(current, Tree):
                # Leaf: either the word we are looking for, or one
                # more word to count before moving on.
                if leaves_seen == self.wordnum:
                    return tuple(path[:len(path) - self.height - 1])
                leaves_seen += 1
                nodes.pop()
                continue

            # Tree node: choose its first child on the first visit,
            # and the next sibling on every later visit.
            if len(path) < len(nodes):
                path.append(0)
            else:
                path[-1] += 1

            if path[-1] < len(current):
                nodes.append(current[path[-1]])
            else:
                # Ran past the last child: go back up one level.
                nodes.pop()
                path.pop()
373