1
2
3
4
5
6
7
8 import re
9 import textwrap
10
11 from nltk.compat import *
12
13 from util import *
14 from api import *
15 from xmldocs import *
16
18
19
def __init__(self, root, fileids, wrap_etree=False):
    """
    Construct the reader and populate its lookup indexes.

    All three arguments are handed unchanged to
    C{XMLCorpusReader.__init__}.  The index dictionaries are then
    created empty and filled in by a single call to
    C{_quick_index()}.
    """
    XMLCorpusReader.__init__(self, root, fileids, wrap_etree)

    # verb lemma string -> list of verbnet class identifiers
    self._lemma_to_class = defaultdict(list)

    # wordnet identifier string -> list of verbnet class identifiers
    self._wordnet_to_class = defaultdict(list)

    # class identifier -> file identifier.  The keys of this
    # dictionary form the complete list of classes and subclasses.
    self._class_to_fileid = {}

    # short class identifier -> long class identifier
    self._shortid_to_longid = {}

    # Fill in all of the dictionaries above.
    self._quick_index()
42
_LONGID_RE = re.compile(r'([^\-\.]*)-([\d+.\-]+)$')
"""Regular expression that matches (and decomposes) longids such as
C{'confess-37.10'}.  Group 1 is the text before the hyphen (it may
not itself contain a hyphen or a period); group 2 is the trailing
run of digits, periods, hyphens and plus signs, i.e. the shortid."""

_SHORTID_RE = re.compile(r'[\d+.\-]+$')
"""Regular expression that matches shortids such as C{'37.10'}: a
string made up solely of digits, periods, hyphens and plus signs."""

_INDEX_RE = re.compile(r'<MEMBER name="\??([^"]+)" wn="([^"]*)"[^>]+>|'
                       r'<VNSUBCLASS ID="([^"]+)"/?>')
"""Regular expression used by the regexp-based indexing pass to
quickly scan the raw corpus text for basic information.  For a
C{MEMBER} tag, group 1 is the member name (an optional leading
C{'?'} is skipped) and group 2 is the contents of its C{wn}
attribute; for a C{VNSUBCLASS} tag, group 3 is the subclass ID."""
53
def lemmas(self, classid=None):
    """
    Return a list of verb lemmas.

    With no argument, every lemma in the lemma index is returned, in
    sorted order.  If C{classid} is given, only the names of the
    C{MEMBER} elements found directly under that class's C{MEMBERS}
    element are returned, in document order.
    """
    if classid is not None:
        members = self.vnclass(classid).findall('MEMBERS/MEMBER')
        return [entry.get('name') for entry in members]
    return sorted(self._lemma_to_class)
66
68 """
69 Return a list of all wordnet identifiers that appear in any
70 class, or in C{classid} if specified.
71 """
72 if classid is None:
73 return sorted(self._wordnet_to_class.keys())
74 else:
75
76 vnclass = self.vnclass(classid)
77 return sum([member.get('wn','').split() for member in
78 vnclass.findall('MEMBERS/MEMBER')], [])
79
def classids(self, lemma=None, wordnetid=None, fileid=None, classid=None):
    """
    Return a list of the verbnet class identifiers.  If a file
    identifier is specified, then return only the verbnet class
    identifiers for classes (and subclasses) defined by that file.
    If a lemma is specified, then return only verbnet class
    identifiers for classes that contain that lemma as a member.
    If a wordnetid is specified, then return only identifiers for
    classes that contain that wordnetid as a member.  If a classid
    is specified, then return only identifiers for subclasses of
    the specified verbnet class.

    With no arguments, all known class identifiers are returned in
    sorted order.  An unknown lemma or wordnetid yields an empty
    list.  The returned list is always a new list, so callers may
    modify it freely.

    @raise ValueError: If more than one of C{lemma}, C{wordnetid},
        C{fileid} and C{classid} is specified.
    """
    selectors = (lemma, wordnetid, fileid, classid)
    if sum(1 for value in selectors if value is not None) > 1:
        raise ValueError('Specify at most one of: lemma, wordnetid, '
                         'fileid, classid')

    if fileid is not None:
        return [c for (c, f) in self._class_to_fileid.items()
                if f == fileid]
    if lemma is not None:
        # Use .get() rather than indexing: the index is a defaultdict,
        # so indexing with an unknown lemma would silently add it.
        return list(self._lemma_to_class.get(lemma, ()))
    if wordnetid is not None:
        # Same reasoning as for the lemma index above.
        return list(self._wordnet_to_class.get(wordnetid, ()))
    if classid is not None:
        xmltree = self.vnclass(classid)
        return [subclass.get('ID') for subclass in
                xmltree.findall('SUBCLASSES/VNSUBCLASS')]
    return sorted(self._class_to_fileid)
109
def vnclass(self, fileid_or_classid):
    """
    Return an ElementTree containing the xml for the specified
    verbnet class.

    @param fileid_or_classid: An identifier specifying which class
        should be returned.  Can be a file identifier (such as
        C{'put-9.1.xml'}), or a verbnet class identifier (such as
        C{'put-9.1'}) or a short verbnet class identifier (such as
        C{'9.1'}).
    @raise ValueError: If the identifier is not a known file or
        class identifier, or if the class is indexed under a file
        whose xml does not actually define it.
    """
    # File identifier: just return the xml.
    if fileid_or_classid in self._fileids:
        return self.xml(fileid_or_classid)

    # Class identifier: normalise to the long form, then look up
    # the file that defines it.
    classid = self.longid(fileid_or_classid)
    if classid not in self._class_to_fileid:
        raise ValueError('Unknown identifier %s' % fileid_or_classid)

    fileid = self._class_to_fileid[classid]
    tree = self.xml(fileid)
    if classid == tree.get('ID'):
        return tree
    for subclass in tree.findall('.//VNSUBCLASS'):
        if classid == subclass.get('ID'):
            return subclass

    # The index says this file defines the class, but the xml does
    # not.  Raise explicitly rather than relying on an assert, which
    # is stripped when Python runs with -O.
    raise ValueError('Class identifier %s not found in file %s'
                     % (classid, fileid))
141
def fileids(self, vnclass_ids=None):
    """
    Return a list of fileids that make up this corpus.

    With no argument, the reader's own list of file identifiers is
    returned.  Otherwise C{vnclass_ids} may be a single class
    identifier string or an iterable of them; each is converted with
    C{longid()} and looked up in the class-to-file index, and the
    matching file identifiers are returned in the same order.
    """
    if vnclass_ids is None:
        return self._fileids
    # Treat a lone identifier as a one-element sequence.
    if isinstance(vnclass_ids, basestring):
        vnclass_ids = [vnclass_ids]
    return [self._class_to_fileid[self.longid(ident)]
            for ident in vnclass_ids]
155
156
157
158
159
160
162 """
163 Initialize the indexes L{_lemma_to_class},
164 L{_wordnet_to_class}, and L{_class_to_fileid} by scanning
165 through the corpus fileids. This is fast with cElementTree
166 (<0.1 secs), but quite slow (>10 secs) with the python
167 implementation of ElementTree.
168 """
169 for fileid in self._fileids:
170 self._index_helper(self.xml(fileid), fileid)
171
183
185 """
186 Initialize the indexes L{_lemma_to_class},
187 L{_wordnet_to_class}, and L{_class_to_fileid} by scanning
188 through the corpus fileids. This doesn't do proper xml parsing,
189 but is good enough to find everything in the standard verbnet
190 corpus -- and it runs about 30 times faster than xml parsing
191 (with the python ElementTree; only 2-3 times faster with
192 cElementTree).
193 """
194
195
196 for fileid in self._fileids:
197 vnclass = fileid[:-4]
198 self._class_to_fileid[vnclass] = fileid
199 self._shortid_to_longid[self.shortid(vnclass)] = vnclass
200 for m in self._INDEX_RE.finditer(self.open(fileid).read()):
201 groups = m.groups()
202 if groups[0] is not None:
203 self._lemma_to_class[groups[0]].append(vnclass)
204 for wn in groups[1].split():
205 self._wordnet_to_class[wn].append(vnclass)
206 elif groups[2] is not None:
207 self._class_to_fileid[groups[2]] = fileid
208 vnclass = groups[2]
209 self._shortid_to_longid[self.shortid(vnclass)] = vnclass
210 else:
211 assert False, 'unexpected match condition'
212
213
214
215
216
218 """Given a short verbnet class identifier (eg '37.10'), map it
219 to a long id (eg 'confess-37.10'). If C{shortid} is already a
220 long id, then return it as-is"""
221 if self._LONGID_RE.match(shortid):
222 return shortid
223 elif not self._SHORTID_RE.match(shortid):
224 raise ValueError('vnclass identifier %r not found' % shortid)
225 try:
226 return self._shortid_to_longid[shortid]
227 except KeyError:
228 raise ValueError('vnclass identifier %r not found' % shortid)
229
231 """Given a long verbnet class identifier (eg 'confess-37.10'),
232 map it to a short id (eg '37.10'). If C{longid} is already a
233 short id, then return it as-is."""
234 if self._SHORTID_RE.match(longid):
235 return longid
236 m = self._LONGID_RE.match(longid)
237 if m:
238 return m.group(2)
239 else:
240 raise ValueError('vnclass identifier %r not found' % longid)
241
242
243
244
245
247 """
248 Return a string containing a pretty-printed representation of
249 the given verbnet class.
250
251 @param vnclass: A verbnet class identifier; or an ElementTree
252 containing the xml contents of a verbnet class.
253 """
254 if isinstance(vnclass, basestring):
255 vnclass = self.vnclass(vnclass)
256
257 s = vnclass.get('ID') + '\n'
258 s += self.pprint_subclasses(vnclass, indent=' ') + '\n'
259 s += self.pprint_members(vnclass, indent=' ') + '\n'
260 s += ' Thematic roles:\n'
261 s += self.pprint_themroles(vnclass, indent=' ') + '\n'
262 s += ' Frames:\n'
263 s += '\n'.join(self.pprint_frame(vnframe, indent=' ')
264 for vnframe in vnclass.findall('FRAMES/FRAME'))
265 return s
266
268 """
269 Return a string containing a pretty-printed representation of
270 the given verbnet class's subclasses.
271
272 @param vnclass: A verbnet class identifier; or an ElementTree
273 containing the xml contents of a verbnet class.
274 """
275 if isinstance(vnclass, basestring):
276 vnclass = self.vnclass(vnclass)
277
278 subclasses = [subclass.get('ID') for subclass in
279 vnclass.findall('SUBCLASSES/VNSUBCLASS')]
280 if not subclasses: subclasses = ['(none)']
281 s = 'Subclasses: ' + ' '.join(subclasses)
282 return textwrap.fill(s, 70, initial_indent=indent,
283 subsequent_indent=indent+' ')
284
286 """
287 Return a string containing a pretty-printed representation of
288 the given verbnet class's member verbs.
289
290 @param vnclass: A verbnet class identifier; or an ElementTree
291 containing the xml contents of a verbnet class.
292 """
293 if isinstance(vnclass, basestring):
294 vnclass = self.vnclass(vnclass)
295
296 members = [member.get('name') for member in
297 vnclass.findall('MEMBERS/MEMBER')]
298 if not members: members = ['(none)']
299 s = 'Members: ' + ' '.join(members)
300 return textwrap.fill(s, 70, initial_indent=indent,
301 subsequent_indent=indent+' ')
302
304 """
305 Return a string containing a pretty-printed representation of
306 the given verbnet class's thematic roles.
307
308 @param vnclass: A verbnet class identifier; or an ElementTree
309 containing the xml contents of a verbnet class.
310 """
311 if isinstance(vnclass, basestring):
312 vnclass = self.vnclass(vnclass)
313
314 pieces = []
315 for themrole in vnclass.findall('THEMROLES/THEMROLE'):
316 piece = indent + '* ' + themrole.get('type')
317 modifiers = ['%(Value)s%(type)s' % restr.attrib
318 for restr in themrole.findall('SELRESTRS/SELRESTR')]
319 if modifiers:
320 piece += '[%s]' % ' '.join(modifiers)
321 pieces.append(piece)
322
323 return '\n'.join(pieces)
324
326 """
327 Return a string containing a pretty-printed representation of
328 the given verbnet frame.
329
330 @param vnframe: An ElementTree containing the xml contents of
331 a verbnet frame.
332 """
333 s = self.pprint_description(vnframe, indent) + '\n'
334 s += self.pprint_syntax(vnframe, indent+' Syntax: ') + '\n'
335 s += indent + ' Semantics:\n'
336 s += self.pprint_semantics(vnframe, indent+' ')
337 return s
338
340 """
341 Return a string containing a pretty-printed representation of
342 the given verbnet frame description.
343
344 @param vnframe: An ElementTree containing the xml contents of
345 a verbnet frame.
346 """
347 descr = vnframe.find('DESCRIPTION')
348 s = indent + descr.attrib['primary']
349 if descr.get('secondary', ''):
350 s += ' (%s)' % descr.get('secondary')
351 return s
352
354 """
355 Return a string containing a pretty-printed representation of
356 the given verbnet frame syntax.
357
358 @param vnframe: An ElementTree containing the xml contents of
359 a verbnet frame.
360 """
361 pieces = []
362 for elt in vnframe.find('SYNTAX'):
363 piece = elt.tag
364 modifiers = []
365 if 'value' in elt.attrib:
366 modifiers.append(elt.get('value'))
367 modifiers += ['%(Value)s%(type)s' % restr.attrib
368 for restr in (elt.findall('SELRESTRS/SELRESTR') +
369 elt.findall('SYNRESTRS/SYNRESTR'))]
370 if modifiers:
371 piece += '[%s]' % ' '.join(modifiers)
372 pieces.append(piece)
373
374 return indent + ' '.join(pieces)
375
377 """
378 Return a string containing a pretty-printed representation of
379 the given verbnet frame semantics.
380
381 @param vnframe: An ElementTree containing the xml contents of
382 a verbnet frame.
383 """
384 pieces = []
385 for pred in vnframe.findall('SEMANTICS/PRED'):
386 args = [arg.get('value') for arg in pred.findall('ARGS/ARG')]
387 pieces.append('%s(%s)' % (pred.get('value'), ', '.join(args)))
388 return '\n'.join(['%s* %s' % (indent, piece) for piece in pieces])
389