1
2
3
4
5
6
7
8 import re
9 import codecs
10
11 from nltk.tree import Tree
12 from nltk.etree import ElementTree
13
14 from util import *
15 from api import *
18 """
19 Corpus reader for the propbank corpus, which augments the Penn
20 Treebank with information about the predicate argument structure
21 of every verb instance. The corpus consists of two parts: the
22 predicate-argument annotations themselves, and a set of X{frameset
23 files} which define the argument labels used by the annotations,
24 on a per-verb basis. Each X{frameset file} contains one or more
25 predicates, such as C{'turn'} or C{'turn_on'}, each of which is
26 divided into coarse-grained word senses called X{rolesets}. For
27 each X{roleset}, the frameset file provides descriptions of the
28 argument roles, along with examples.
29 """
def __init__(self, root, propfile, framefiles='',
             verbsfile=None, parse_fileid_xform=None,
             parse_corpus=None, encoding=None):
    """
    Set up a reader over the propbank files found under C{root}.

    @param root: The root directory for this corpus.
    @param propfile: The name of the file containing the predicate-
        argument annotations (relative to C{root}).
    @param framefiles: A list or regexp specifying the frameset
        fileids for this corpus.  A string is expanded into a list
        of fileids with C{find_corpus_fileids}.
    @param verbsfile: Registered as one of this reader's fileids
        (directly after C{propfile}) and kept on the reader.
    @param parse_fileid_xform: A transform that should be applied
        to the fileids in this corpus.  This should be a function
        of one argument (a fileid) that returns a string (the new
        fileid).
    @param parse_corpus: The corpus containing the parse trees
        corresponding to this corpus.  These parse trees are
        necessary to resolve the tree pointers used by propbank.
    @param encoding: Handed on unchanged to
        C{CorpusReader.__init__}.
    """
    # A string is a regexp over fileids; anything else is taken to be
    # an iterable of fileids already.
    frame_fileids = framefiles
    if isinstance(frame_fileids, basestring):
        frame_fileids = find_corpus_fileids(root, frame_fileids)
    frame_fileids = list(frame_fileids)

    # The base reader is told about every file of the corpus:
    # annotations first, then the verbs file, then the framesets.
    all_fileids = [propfile, verbsfile]
    all_fileids.extend(frame_fileids)
    CorpusReader.__init__(self, root, all_fileids, encoding)

    # Keep the pieces that the reader's other methods refer back to.
    self._propfile = propfile
    self._framefiles = frame_fileids
    self._verbsfile = verbsfile
    self._parse_fileid_xform = parse_fileid_xform
    self._parse_corpus = parse_corpus
61
62 - def raw(self, fileids=None):
69
78
87
89 """
90 @return: the xml description for the given roleset.
91 """
92 lemma = roleset_id.split('.')[0]
93 framefile = 'frames/%s.xml' % lemma
94 if framefile not in self._framefiles:
95 raise ValueError('Frameset file for %s not found' %
96 roleset_id)
97
98
99
100 etree = ElementTree.parse(self.abspath(framefile).open()).getroot()
101 for roleset in etree.findall('predicate/roleset'):
102 if roleset.attrib['id'] == roleset_id:
103 return roleset
104 else:
105 raise ValueError('Roleset %s not found in %s' %
106 (roleset_id, framefile))
107
116
129
135
def __init__(self, fileid, sentnum, wordnum, tagger, roleset,
             inflection, predicate, arguments, parse_corpus=None):
    """
    Record the fields of a single propbank annotation.

    Every parameter is stored unchanged on the attribute of the same
    name, except C{arguments}, which is copied into a tuple.
    """

    self.fileid = fileid
    """The name of the file containing the parse tree for this
    instance's sentence."""

    self.sentnum = sentnum
    """The sentence number of this sentence within L{fileid}.
    Indexing starts from zero."""

    self.wordnum = wordnum
    """The word number of this instance's predicate within its
    containing sentence.  Word numbers are indexed starting from
    zero, and include traces and other empty parse elements."""

    self.tagger = tagger
    """An identifier for the tagger who tagged this instance; or
    C{'gold'} if this is an adjudicated instance."""

    self.roleset = roleset
    """The name of the roleset used by this instance's predicate.
    Use L{propbank.roleset() <PropbankCorpusReader.roleset>} to
    look up information about the roleset."""

    self.inflection = inflection
    """A L{PropbankInflection} object describing the inflection of
    this instance's predicate."""

    self.predicate = predicate
    """A L{PropbankTreePointer} indicating the position of this
    instance's predicate within its containing sentence."""

    # Copied into a tuple, so later changes to the caller's sequence
    # do not affect this instance.
    self.arguments = tuple(arguments)
    """A tuple of pairs (argloc, argid), specifying the location
    and identifier for each of the predicate's arguments in the
    containing sentence.  Argument identifiers are strings such as
    C{'ARG0'} or C{'ARGM-TMP'}.  This tuple does *not* contain
    the predicate."""

    self.parse_corpus = parse_corpus
    """A corpus reader for the parse trees corresponding to the
    instances in this propbank corpus."""
179
181 return ('<PropbankInstance: %s, sent %s, word %s>' %
182 (self.fileid, self.sentnum, self.wordnum))
183
185 s = '%s %s %s %s %s %s' % (self.fileid, self.sentnum, self.wordnum,
186 self.tagger, self.roleset, self.inflection)
187 items = self.arguments + ((self.predicate, 'rel'),)
188 for (argloc, argid) in sorted(items):
189 s += ' %s-%s' % (argloc, argid)
190 return s
191
193 if self.parse_corpus is None: return None
194 if self.fileid not in self.parse_corpus.fileids(): return None
195 return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum]
196 tree = property(_get_tree, doc="""
197 The parse tree corresponding to this instance, or C{None} if
198 the corresponding tree is not available.""")
199
@staticmethod
def parse(s, parse_fileid_xform=None, parse_corpus=None):
    """
    Build a C{PropbankInstance} from one line of propbank annotation.

    The line is split on whitespace into six leading fields (fileid,
    sentence number, word number, tagger, roleset, inflection)
    followed by one or more C{location-label} items.  Exactly one of
    those items must carry the label C{rel}; it locates the
    predicate.  Every other item is an argument.

    @param s: The annotation line to parse.
    @param parse_fileid_xform: If not C{None}, a function applied to
        the fileid field; its result is used as the instance's
        fileid.
    @param parse_corpus: Passed through to the new instance.
    @return: The C{PropbankInstance} described by C{s}.
    @raise ValueError: If the line has fewer than seven fields, does
        not contain exactly one C{-rel} item, or contains an argument
        item with no C{-} between location and label.
    """
    pieces = s.split()
    if len(pieces) < 7:
        raise ValueError('Badly formatted propbank line: %r' % s)

    # Divide the line into its fixed fields and the trailing items.
    (fileid, sentnum, wordnum,
     tagger, roleset, inflection) = pieces[:6]

    # Sort the trailing items into the predicate ('-rel') and the
    # arguments.  Arguments are split on the first '-' only, because
    # labels such as 'ARGM-TMP' contain a dash themselves.
    rel = []
    args = []
    for piece in pieces[6:]:
        if piece.endswith('-rel'):
            rel.append(piece)
        elif '-' in piece:
            args.append(tuple(piece.split('-', 1)))
        else:
            # No label at all: report it like every other format
            # error instead of failing on tuple unpacking.
            raise ValueError('Badly formatted propbank line: %r' % s)
    if len(rel) != 1:
        raise ValueError('Badly formatted propbank line: %r' % s)

    # Apply the fileid transform, if we were given one.
    if parse_fileid_xform is not None:
        fileid = parse_fileid_xform(fileid)

    # Convert the sentence and word numbers to integers.
    sentnum = int(sentnum)
    wordnum = int(wordnum)

    # Parse the inflection.
    inflection = PropbankInflection.parse(inflection)

    # Parse the predicate location, dropping the trailing '-rel'.
    predicate = PropbankTreePointer.parse(rel[0][:-4])

    # Parse the location of every argument.
    arguments = [(PropbankTreePointer.parse(argloc), argid)
                 for (argloc, argid) in args]

    # Put it all together.
    return PropbankInstance(fileid, sentnum, wordnum, tagger,
                            roleset, inflection, predicate,
                            arguments, parse_corpus)
238
240 """
241 A pointer used by propbank to identify one or more constituents in
242 a parse tree. C{PropbankPointer} is an abstract base class with
243 three concrete subclasses:
244
245 - L{PropbankTreePointer} is used to point to single constituents.
246 - L{PropbankSplitTreePointer} is used to point to 'split'
247 constituents, which consist of a sequence of two or more
248 C{PropbankTreePointer}s.
249 - L{PropbankChainTreePointer} is used to point to entire trace
250 chains in a tree. It consists of a sequence of pieces, which
251 can be C{PropbankTreePointer}s or C{PropbankSplitTreePointer}s.
252 """
# PropbankPointer is abstract: refuse to build it directly, so that
# only its concrete subclasses can be instantiated.
if self.__class__ == PropbankPointer:
    raise AssertionError('PropbankPointer is an abstract base class')
256
259 self.pieces = pieces
260 """A list of the pieces that make up this chain. Elements may
261 be either L{PropbankSplitTreePointer}s or
262 L{PropbankTreePointer}s."""
263
265 return '*'.join('%s' % p for p in self.pieces)
267 return '<PropbankChainTreePointer: %s>' % self
269 if tree is None: raise ValueError('Parse tree not avaialable')
270 return Tree('*CHAIN*', [p.select(tree) for p in self.pieces])
271
274 self.pieces = pieces
275 """A list of the pieces that make up this chain. Elements are
276 all L{PropbankTreePointer}s."""
277
279 return ','.join('%s' % p for p in self.pieces)
281 return '<PropbankSplitTreePointer: %s>' % self
283 if tree is None: raise ValueError('Parse tree not avaialable')
284 return Tree('*SPLIT*', [p.select(tree) for p in self.pieces])
285
287 """
288 wordnum:height*wordnum:height*...
289 wordnum:height,
290
291 """
295
296 @staticmethod
314
316 return '%s:%s' % (self.wordnum, self.height)
317
319 return 'PropbankTreePointer(%d, %d)' % (self.wordnum, self.height)
320
331
333 if tree is None: raise ValueError('Parse tree not avaialable')
334 return tree[self.treepos(tree)]
335
370
417