1
2
3
4
5
6
7
8
9 import re
10 import codecs
11
12 from nltk.tree import Tree
13 from nltk.etree import ElementTree
14
15 from nltk.corpus.reader.util import *
16 from nltk.corpus.reader.api import *
19 """
20 Corpus reader for the nombank corpus, which augments the Penn
21 Treebank with information about the predicate argument structure
22 of every noun instance. The corpus consists of two parts: the
23 predicate-argument annotations themselves, and a set of X{frameset
24 files} which define the argument labels used by the annotations,
25 on a per-noun basis. Each X{frameset file} contains one or more
26 predicates, such as C{'turn'} or C{'turn_on'}, each of which is
27 divided into coarse-grained word senses called X{rolesets}. For
28 each X{roleset}, the frameset file provides descriptions of the
29 argument roles, along with examples.
30 """
def __init__(self, root, nomfile, framefiles='',
             nounsfile=None, parse_fileid_xform=None,
             parse_corpus=None, encoding=None):
    """
    Set up a reader over the nombank annotation file, the optional
    nouns file, and the frameset files.

    @param root: The root directory for this corpus.
    @param nomfile: The name of the file containing the predicate-
        argument annotations (relative to C{root}).
    @param framefiles: A list or regexp specifying the frameset
        fileids for this corpus.
    @param nounsfile: An optional extra fileid to register with the
        corpus; it is stored as C{self._nounsfile}.  (Presumably the
        list of nouns covered by nombank -- TODO confirm.)
    @param parse_fileid_xform: A transform that should be applied
        to the fileids in this corpus.  This should be a function
        of one argument (a fileid) that returns a string (the new
        fileid).
    @param parse_corpus: The corpus containing the parse trees
        corresponding to this corpus.  These parse trees are
        necessary to resolve the tree pointers used by nombank.
    @param encoding: Passed through unchanged to
        C{CorpusReader.__init__}.
    """
    # A string is handed to find_corpus_fileids() to be resolved into
    # fileids; anything else is taken to already be an iterable of
    # fileids.
    if isinstance(framefiles, basestring):
        framefiles = find_corpus_fileids(root, framefiles)
    framefiles = list(framefiles)

    # nounsfile is optional: leave it out of the fileid list when it
    # was not given, instead of registering None as a corpus fileid.
    fileids = [nomfile]
    if nounsfile is not None:
        fileids.append(nounsfile)

    # Initialize the base corpus reader.
    CorpusReader.__init__(self, root, fileids + framefiles, encoding)

    # Record the arguments for later use by the reader's methods.
    self._nomfile = nomfile
    self._framefiles = framefiles
    self._nounsfile = nounsfile
    self._parse_fileid_xform = parse_fileid_xform
    self._parse_corpus = parse_corpus
62
63 - def raw(self, fileids=None):
70
79
88
90 """
91 @return: the xml description for the given roleset.
92 """
93 lemma = roleset_id.split('.')[0]
94 framefile = 'frames/%s.xml' % lemma
95 if framefile not in self._framefiles:
96 raise ValueError('Frameset file for %s not found' %
97 roleset_id)
98
99
100
101 etree = ElementTree.parse(self.abspath(framefile).open()).getroot()
102 for roleset in etree.findall('predicate/roleset'):
103 if roleset.attrib['id'] == roleset_id:
104 return roleset
105 else:
106 raise ValueError('Roleset %s not found in %s' %
107 (roleset_id, framefile))
108
117
130
136
def __init__(self, fileid, sentnum, wordnum, baseform, sensenumber,
             predicate, predid, arguments, parse_corpus=None):
    """
    Store the fields of a single nombank annotation.

    @param fileid: The name of the file containing the parse tree
        for this instance's sentence.
    @param sentnum: The sentence number of this sentence within
        C{fileid}.  Indexing starts from zero.
    @param wordnum: The word number of this instance's predicate
        within its containing sentence.  Word numbers are indexed
        starting from zero, and include traces and other empty
        parse elements.
    @param baseform: The baseform of the predicate.
    @param sensenumber: The sense number of the predicate.
    @param predicate: A L{NombankTreePointer} indicating the position
        of this instance's predicate within its containing sentence.
    @param predid: Identifier of the predicate.
    @param arguments: An iterable of C{(argloc, argid)} pairs giving
        the location and identifier of each of the predicate's
        arguments in the containing sentence.  Argument identifiers
        are strings such as C{'ARG0'} or C{'ARGM-TMP'}.  The
        predicate itself is *not* included.
    @param parse_corpus: A corpus reader for the parse trees
        corresponding to the instances in this nombank corpus.
    """
    # Where the instance lives: file, sentence, and word position.
    self.fileid, self.sentnum, self.wordnum = fileid, sentnum, wordnum

    # Which predicate sense is being annotated.
    self.baseform, self.sensenumber = baseform, sensenumber

    # Tree pointer to the predicate, and the predicate's identifier.
    self.predicate, self.predid = predicate, predid

    # The argument pairs are copied into a tuple.
    self.arguments = tuple(arguments)

    # Source of parse trees for this instance; may be None.
    self.parse_corpus = parse_corpus
176
177 @property
179 """The name of the roleset used by this instance's predicate.
180 Use L{nombank.roleset() <NombankCorpusReader.roleset>} to
181 look up information about the roleset."""
182 return '%s.%s'%(self.baseform, self.sensenumber)
183
185 return ('<NombankInstance: %s, sent %s, word %s>' %
186 (self.fileid, self.sentnum, self.wordnum))
187
189 s = '%s %s %s %s %s' % (self.fileid, self.sentnum, self.wordnum,
190 self.basename, self.sensenumber)
191 items = self.arguments + ((self.predicate, 'rel'),)
192 for (argloc, argid) in sorted(items):
193 s += ' %s-%s' % (argloc, argid)
194 return s
195
197 if self.parse_corpus is None: return None
198 if self.fileid not in self.parse_corpus.fileids(): return None
199 return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum]
200 tree = property(_get_tree, doc="""
201 The parse tree corresponding to this instance, or C{None} if
202 the corresponding tree is not available.""")
203
@staticmethod
def parse(s, parse_fileid_xform=None, parse_corpus=None):
    """
    Build a L{NombankInstance} from one line of a nombank annotation
    file.

    The line is split on whitespace.  The first five fields are the
    fileid, sentence number, word number, baseform and sense number.
    Each remaining field has the form C{pointer-label}.  Exactly one
    of them must contain C{'-rel'}: it locates the predicate.  The
    others are the predicate's arguments.

    @param s: The annotation line to parse.
    @param parse_fileid_xform: If not C{None}, a function that is
        applied to the fileid read from the line; its result is used
        as the instance's fileid.
    @param parse_corpus: Passed through to the L{NombankInstance}
        constructor.
    @return: The L{NombankInstance} described by C{s}.
    @raise ValueError: If the line has fewer than six fields, does
        not contain exactly one C{'-rel'} field, contains a field
        without a C{'-'} separator, or has a sentence or word number
        that is not an integer.
    """
    pieces = s.split()
    if len(pieces) < 6:
        raise ValueError('Badly formatted nombank line: %r' % s)

    # Divide the line into the five fixed fields and the pointer fields.
    (fileid, sentnum, wordnum,
     baseform, sensenumber) = pieces[:5]

    # Separate the predicate ('-rel') field from the argument fields.
    # This is done without popping from the list being scanned:
    # removing items mid-iteration skips the element after each removal,
    # so two adjacent '-rel' fields would slip past the check below.
    rel = []
    args = []
    for piece in pieces[5:]:
        if '-rel' in piece:
            rel.append(piece)
        else:
            args.append(piece)
    if len(rel) != 1:
        raise ValueError('Badly formatted nombank line: %r' % s)

    # Apply the fileid transformation, if any.
    if parse_fileid_xform is not None:
        fileid = parse_fileid_xform(fileid)

    # Convert sentence & word numbers to ints.
    sentnum = int(sentnum)
    wordnum = int(wordnum)

    # Parse the predicate location; everything after the first '-' is
    # the predicate identifier.
    predloc, predid = rel[0].split('-', 1)
    predicate = NombankTreePointer.parse(predloc)

    # Parse the arguments.
    arguments = []
    for arg in args:
        if '-' not in arg:
            # Report the offending line rather than a bare unpacking
            # error from the split below.
            raise ValueError('Badly formatted nombank line: %r' % s)
        argloc, argid = arg.split('-', 1)
        arguments.append((NombankTreePointer.parse(argloc), argid))

    # Put it all together.
    return NombankInstance(fileid, sentnum, wordnum, baseform, sensenumber,
                           predicate, predid, arguments, parse_corpus)
241
243 """
244 A pointer used by nombank to identify one or more constituents in
245 a parse tree. C{NombankPointer} is an abstract base class with
246 three concrete subclasses:
247
248 - L{NombankTreePointer} is used to point to single constituents.
249 - L{NombankSplitTreePointer} is used to point to 'split'
250 constituents, which consist of a sequence of two or more
251 C{NombankTreePointer}s.
252 - L{NombankChainTreePointer} is used to point to entire trace
253 chains in a tree. It consists of a sequence of pieces, which
254 can be C{NombankTreePointer}s or C{NombankSplitTreePointer}s.
255 """
257 if self.__class__ == NombankPoitner:
258 raise AssertionError('NombankPointer is an abstract base class')
259
262 self.pieces = pieces
263 """A list of the pieces that make up this chain. Elements may
264 be either L{NombankSplitTreePointer}s or
265 L{NombankTreePointer}s."""
266
268 return '*'.join('%s' % p for p in self.pieces)
270 return '<NombankChainTreePointer: %s>' % self
272 if tree is None: raise ValueError('Parse tree not avaialable')
273 return Tree('*CHAIN*', [p.select(tree) for p in self.pieces])
274
277 self.pieces = pieces
278 """A list of the pieces that make up this chain. Elements are
279 all L{NombankTreePointer}s."""
280
282 return ','.join('%s' % p for p in self.pieces)
284 return '<NombankSplitTreePointer: %s>' % self
286 if tree is None: raise ValueError('Parse tree not avaialable')
287 return Tree('*SPLIT*', [p.select(tree) for p in self.pieces])
288
290 """
291 wordnum:height*wordnum:height*...
292 wordnum:height,
293
294 """
298
299 @staticmethod
317
319 return '%s:%s' % (self.wordnum, self.height)
320
322 return 'NombankTreePointer(%d, %d)' % (self.wordnum, self.height)
323
334
336 if tree is None: raise ValueError('Parse tree not avaialable')
337 return tree[self.treepos(tree)]
338
373