Package nltk :: Package corpus :: Package reader :: Module verbnet
[hide private]
[frames] | no frames]

Source Code for Module nltk.corpus.reader.verbnet

  1  # Natural Language Toolkit: Verbnet Corpus Reader 
  2  # 
  3  # Copyright (C) 2001-2011 NLTK Project 
  4  # Author: Edward Loper <edloper@gradient.cis.upenn.edu> 
  5  # URL: <http://www.nltk.org/> 
  6  # For license information, see LICENSE.TXT 
  7   
  8  import re 
  9  import textwrap 
 10   
 11  from nltk.compat import * 
 12   
 13  from util import * 
 14  from api import * 
 15  from xmldocs import * 
 16   
class VerbnetCorpusReader(XMLCorpusReader):
    """
    Corpus reader for the VerbNet lexicon, stored as a collection of
    XML files.

    On construction the reader scans every file once and builds four
    in-memory indexes: verb lemma -> class ids, wordnet id -> class
    ids, class id -> file id, and short class id -> long class id.
    The public methods answer queries from those indexes and parse a
    file's XML only when the contents of a class are requested.

    NOTE(review): this block was reconstructed from a listing in
    which runs of whitespace appear to have been collapsed.  The
    literal indentation strings used by the C{pprint*} methods are
    reproduced exactly as they appeared there (single spaces); confirm
    their intended widths against the upstream source.
    """

    # No unicode encoding param, since the data files are all XML.
    def __init__(self, root, fileids, wrap_etree=False):
        """
        Create the reader and build its indexes.

        @param root: The root location of the corpus; passed through
            to C{XMLCorpusReader}.
        @param fileids: The file identifiers that make up the corpus;
            passed through to C{XMLCorpusReader}.
        @param wrap_etree: Passed through to C{XMLCorpusReader}.
        """
        XMLCorpusReader.__init__(self, root, fileids, wrap_etree)

        self._lemma_to_class = defaultdict(list)
        """A dictionary mapping from verb lemma strings to lists of
        verbnet class identifiers."""

        self._wordnet_to_class = defaultdict(list)
        """A dictionary mapping from wordnet identifier strings to
        lists of verbnet class identifiers."""

        self._class_to_fileid = {}
        """A dictionary mapping from class identifiers to
        corresponding file identifiers.  The keys of this dictionary
        provide a complete list of all classes and subclasses."""

        self._shortid_to_longid = {}
        """A dictionary mapping from short class identifiers to the
        corresponding long class identifiers."""

        # Initialize the dictionaries.  Use the quick (regexp-based)
        # method instead of the slow (xml-based) method, because it
        # runs 2-30 times faster.
        self._quick_index()

    _LONGID_RE = re.compile(r'([^\-\.]*)-([\d+.\-]+)$')
    """Regular expression that matches (and decomposes) longids"""

    _SHORTID_RE = re.compile(r'[\d+.\-]+$')
    """Regular expression that matches shortids"""

    _INDEX_RE = re.compile(r'<MEMBER name="\??([^"]+)" wn="([^"]*)"[^>]+>|'
                           r'<VNSUBCLASS ID="([^"]+)"/?>')
    """Regular expression used by L{_quick_index()} to quickly scan the
    corpus for basic information."""

    def lemmas(self, classid=None):
        """
        Return a list of all verb lemmas that appear in any class, or
        in the C{classid} if specified.

        @param classid: An identifier accepted by L{vnclass()}, or
            C{None} to list the lemmas of the whole corpus (sorted).
        """
        if classid is None:
            return sorted(self._lemma_to_class.keys())
        # [xx] should this include subclass members?
        vnclass = self.vnclass(classid)
        return [member.get('name')
                for member in vnclass.findall('MEMBERS/MEMBER')]

    def wordnetids(self, classid=None):
        """
        Return a list of all wordnet identifiers that appear in any
        class, or in C{classid} if specified.

        @param classid: An identifier accepted by L{vnclass()}, or
            C{None} to list the wordnet ids of the whole corpus
            (sorted).
        """
        if classid is None:
            return sorted(self._wordnet_to_class.keys())
        # [xx] should this include subclass members?
        vnclass = self.vnclass(classid)
        # A member's 'wn' attribute holds zero or more whitespace
        # separated ids; flatten them in document order.
        return [wn
                for member in vnclass.findall('MEMBERS/MEMBER')
                for wn in member.get('wn', '').split()]

    def classids(self, lemma=None, wordnetid=None, fileid=None, classid=None):
        """
        Return a list of the verbnet class identifiers.  If a file
        identifier is specified, then return only the verbnet class
        identifiers for classes (and subclasses) defined by that file.
        If a lemma is specified, then return only verbnet class
        identifiers for classes that contain that lemma as a member.
        If a wordnetid is specified, then return only identifiers for
        classes that contain that wordnetid as a member.  If a classid
        is specified, then return only identifiers for subclasses of
        the specified verbnet class.

        @raise ValueError: If more than one of the arguments is given.
        """
        given = [x for x in (lemma, wordnetid, fileid, classid)
                 if x is not None]
        if len(given) > 1:
            raise ValueError('Specify at most one of: lemma, wordnetid, '
                             'fileid, classid')
        if fileid is not None:
            return [c for (c, f) in self._class_to_fileid.items()
                    if f == fileid]
        elif lemma is not None:
            # Use get() rather than indexing: indexing a defaultdict
            # with an unknown key would insert an empty entry that
            # then shows up in lemmas().  Return a copy so callers
            # cannot modify the index.
            return list(self._lemma_to_class.get(lemma, ()))
        elif wordnetid is not None:
            # Same reasoning as for the lemma lookup above.
            return list(self._wordnet_to_class.get(wordnetid, ()))
        elif classid is not None:
            xmltree = self.vnclass(classid)
            return [subclass.get('ID') for subclass in
                    xmltree.findall('SUBCLASSES/VNSUBCLASS')]
        else:
            return sorted(self._class_to_fileid.keys())

    def vnclass(self, fileid_or_classid):
        """
        Return an ElementTree containing the xml for the specified
        verbnet class.

        @param fileid_or_classid: An identifier specifying which class
            should be returned.  Can be a file identifier (such as
            C{'put-9.1.xml'}), or a verbnet class identifier (such as
            C{'put-9.1'}) or a short verbnet class identifier (such as
            C{'9.1'}).
        @raise ValueError: If the identifier is not a known file or
            class identifier.
        """
        # File identifier: just return the xml.
        if fileid_or_classid in self._fileids:
            return self.xml(fileid_or_classid)

        # Class identifier: get the xml, and find the right elt.
        classid = self.longid(fileid_or_classid)
        if classid not in self._class_to_fileid:
            raise ValueError('Unknown identifier %s' % fileid_or_classid)

        tree = self.xml(self._class_to_fileid[classid])
        if classid == tree.get('ID'):
            return tree
        for subclass in tree.findall('.//VNSUBCLASS'):
            if classid == subclass.get('ID'):
                return subclass

        # We saw this id while indexing, so it must be in the file.
        # Raise explicitly: a bare assert is stripped under "python -O"
        # and would make this method silently return None.
        raise AssertionError('class %r is indexed but was not found '
                             'in its xml file' % classid)

    def fileids(self, vnclass_ids=None):
        """
        Return a list of fileids that make up this corpus.  If
        C{vnclass_ids} is specified, then return the fileids that make
        up the specified verbnet class(es).

        @param vnclass_ids: A single class identifier (long or short),
            or an iterable of them.
        """
        if vnclass_ids is None:
            return self._fileids
        elif isinstance(vnclass_ids, basestring):
            return [self._class_to_fileid[self.longid(vnclass_ids)]]
        else:
            return [self._class_to_fileid[self.longid(vnclass_id)]
                    for vnclass_id in vnclass_ids]

    ######################################################################
    #{ Index Initialization
    ######################################################################

    def _index(self):
        """
        Initialize the indexes L{_lemma_to_class},
        L{_wordnet_to_class}, and L{_class_to_fileid} by scanning
        through the corpus fileids.  This is fast with cElementTree
        (<0.1 secs), but quite slow (>10 secs) with the python
        implementation of ElementTree.
        """
        for fileid in self._fileids:
            self._index_helper(self.xml(fileid), fileid)

    def _index_helper(self, xmltree, fileid):
        """
        Helper for L{_index()}: record the class rooted at C{xmltree},
        its members and their wordnet ids, then recurse into each of
        its subclasses.
        """
        vnclass = xmltree.get('ID')
        self._class_to_fileid[vnclass] = fileid
        self._shortid_to_longid[self.shortid(vnclass)] = vnclass
        for member in xmltree.findall('MEMBERS/MEMBER'):
            self._lemma_to_class[member.get('name')].append(vnclass)
            for wn in member.get('wn', '').split():
                self._wordnet_to_class[wn].append(vnclass)
        for subclass in xmltree.findall('SUBCLASSES/VNSUBCLASS'):
            self._index_helper(subclass, fileid)

    def _quick_index(self):
        """
        Initialize the indexes L{_lemma_to_class},
        L{_wordnet_to_class}, and L{_class_to_fileid} by scanning
        through the corpus fileids.  This doesn't do proper xml parsing,
        but is good enough to find everything in the standard verbnet
        corpus -- and it runs about 30 times faster than xml parsing
        (with the python ElementTree; only 2-3 times faster with
        cElementTree).

        The top-level class id of each file is taken from the file
        name.  Matches are processed in document order, and each
        C{<MEMBER>} is attributed to the most recently seen
        C{<VNSUBCLASS>} (or to the top-level class if none has been
        seen yet).
        """
        # nb: if we got rid of wordnet_to_class, this would run 2-3
        # times faster.
        for fileid in self._fileids:
            vnclass = fileid[:-4]  # strip the '.xml'
            self._class_to_fileid[vnclass] = fileid
            self._shortid_to_longid[self.shortid(vnclass)] = vnclass

            # Read the whole file, making sure the stream is closed
            # even if the read fails.
            stream = self.open(fileid)
            try:
                contents = stream.read()
            finally:
                stream.close()

            for m in self._INDEX_RE.finditer(contents):
                lemma, wordnetids, subclass = m.groups()
                if lemma is not None:
                    self._lemma_to_class[lemma].append(vnclass)
                    for wn in wordnetids.split():
                        self._wordnet_to_class[wn].append(vnclass)
                elif subclass is not None:
                    self._class_to_fileid[subclass] = fileid
                    vnclass = subclass  # for <MEMBER> elts.
                    self._shortid_to_longid[self.shortid(vnclass)] = vnclass
                else:
                    # One of the two alternatives must have matched.
                    raise AssertionError('unexpected match condition')

    ######################################################################
    #{ Identifier conversion
    ######################################################################

    def longid(self, shortid):
        """Given a short verbnet class identifier (eg '37.10'), map it
        to a long id (eg 'confess-37.10').  If C{shortid} is already a
        long id, then return it as-is.

        @raise ValueError: If C{shortid} is neither a long id nor a
            short id known to the index.
        """
        if self._LONGID_RE.match(shortid):
            return shortid  # it's already a longid.
        elif not self._SHORTID_RE.match(shortid):
            raise ValueError('vnclass identifier %r not found' % shortid)
        try:
            return self._shortid_to_longid[shortid]
        except KeyError:
            raise ValueError('vnclass identifier %r not found' % shortid)

    def shortid(self, longid):
        """Given a long verbnet class identifier (eg 'confess-37.10'),
        map it to a short id (eg '37.10').  If C{longid} is already a
        short id, then return it as-is.

        @raise ValueError: If C{longid} has neither the short nor the
            long identifier format.
        """
        if self._SHORTID_RE.match(longid):
            return longid  # it's already a shortid.
        m = self._LONGID_RE.match(longid)
        if m:
            return m.group(2)
        else:
            raise ValueError('vnclass identifier %r not found' % longid)

    ######################################################################
    #{ Pretty Printing
    ######################################################################

    def pprint(self, vnclass):
        """
        Return a string containing a pretty-printed representation of
        the given verbnet class.

        @param vnclass: A verbnet class identifier; or an ElementTree
            containing the xml contents of a verbnet class.
        """
        if isinstance(vnclass, basestring):
            vnclass = self.vnclass(vnclass)

        # NOTE(review): indent widths below are as found in the
        # listing; confirm them against the upstream source.
        s = vnclass.get('ID') + '\n'
        s += self.pprint_subclasses(vnclass, indent=' ') + '\n'
        s += self.pprint_members(vnclass, indent=' ') + '\n'
        s += ' Thematic roles:\n'
        s += self.pprint_themroles(vnclass, indent=' ') + '\n'
        s += ' Frames:\n'
        s += '\n'.join(self.pprint_frame(vnframe, indent=' ')
                       for vnframe in vnclass.findall('FRAMES/FRAME'))
        return s

    def pprint_subclasses(self, vnclass, indent=''):
        """
        Return a string containing a pretty-printed representation of
        the given verbnet class's subclasses.

        @param vnclass: A verbnet class identifier; or an ElementTree
            containing the xml contents of a verbnet class.
        @param indent: A string prepended to every output line.
        """
        if isinstance(vnclass, basestring):
            vnclass = self.vnclass(vnclass)

        subclasses = [subclass.get('ID') for subclass in
                      vnclass.findall('SUBCLASSES/VNSUBCLASS')]
        if not subclasses:
            subclasses = ['(none)']
        s = 'Subclasses: ' + ' '.join(subclasses)
        return textwrap.fill(s, 70, initial_indent=indent,
                             subsequent_indent=indent+' ')

    def pprint_members(self, vnclass, indent=''):
        """
        Return a string containing a pretty-printed representation of
        the given verbnet class's member verbs.

        @param vnclass: A verbnet class identifier; or an ElementTree
            containing the xml contents of a verbnet class.
        @param indent: A string prepended to every output line.
        """
        if isinstance(vnclass, basestring):
            vnclass = self.vnclass(vnclass)

        members = [member.get('name') for member in
                   vnclass.findall('MEMBERS/MEMBER')]
        if not members:
            members = ['(none)']
        s = 'Members: ' + ' '.join(members)
        return textwrap.fill(s, 70, initial_indent=indent,
                             subsequent_indent=indent+' ')

    def pprint_themroles(self, vnclass, indent=''):
        """
        Return a string containing a pretty-printed representation of
        the given verbnet class's thematic roles.

        @param vnclass: A verbnet class identifier; or an ElementTree
            containing the xml contents of a verbnet class.
        @param indent: A string prepended to every output line.
        """
        if isinstance(vnclass, basestring):
            vnclass = self.vnclass(vnclass)

        pieces = []
        for themrole in vnclass.findall('THEMROLES/THEMROLE'):
            piece = indent + '* ' + themrole.get('type')
            # Each selectional restriction is shown as its 'Value'
            # attribute immediately followed by its 'type' attribute.
            modifiers = ['%(Value)s%(type)s' % restr.attrib
                         for restr in themrole.findall('SELRESTRS/SELRESTR')]
            if modifiers:
                piece += '[%s]' % ' '.join(modifiers)
            pieces.append(piece)

        return '\n'.join(pieces)

    def pprint_frame(self, vnframe, indent=''):
        """
        Return a string containing a pretty-printed representation of
        the given verbnet frame.

        @param vnframe: An ElementTree containing the xml contents of
            a verbnet frame.
        @param indent: A string prepended to every output line.
        """
        s = self.pprint_description(vnframe, indent) + '\n'
        s += self.pprint_syntax(vnframe, indent+' Syntax: ') + '\n'
        s += indent + ' Semantics:\n'
        s += self.pprint_semantics(vnframe, indent+' ')
        return s

    def pprint_description(self, vnframe, indent=''):
        """
        Return a string containing a pretty-printed representation of
        the given verbnet frame description.

        @param vnframe: An ElementTree containing the xml contents of
            a verbnet frame.
        @param indent: A string prepended to the output line.
        """
        descr = vnframe.find('DESCRIPTION')
        s = indent + descr.attrib['primary']
        # The secondary description is optional; skip it when the
        # attribute is missing or empty.
        if descr.get('secondary', ''):
            s += ' (%s)' % descr.get('secondary')
        return s

    def pprint_syntax(self, vnframe, indent=''):
        """
        Return a string containing a pretty-printed representation of
        the given verbnet frame syntax.

        @param vnframe: An ElementTree containing the xml contents of
            a verbnet frame.
        @param indent: A string prepended to the output line.
        """
        pieces = []
        for elt in vnframe.find('SYNTAX'):
            piece = elt.tag
            modifiers = []
            if 'value' in elt.attrib:
                modifiers.append(elt.get('value'))
            modifiers += ['%(Value)s%(type)s' % restr.attrib
                          for restr in (elt.findall('SELRESTRS/SELRESTR') +
                                        elt.findall('SYNRESTRS/SYNRESTR'))]
            if modifiers:
                piece += '[%s]' % ' '.join(modifiers)
            pieces.append(piece)

        return indent + ' '.join(pieces)

    def pprint_semantics(self, vnframe, indent=''):
        """
        Return a string containing a pretty-printed representation of
        the given verbnet frame semantics.

        @param vnframe: An ElementTree containing the xml contents of
            a verbnet frame.
        @param indent: A string prepended to every output line.
        """
        pieces = []
        for pred in vnframe.findall('SEMANTICS/PRED'):
            args = [arg.get('value') for arg in pred.findall('ARGS/ARG')]
            pieces.append('%s(%s)' % (pred.get('value'), ', '.join(args)))
        return '\n'.join(['%s* %s' % (indent, piece) for piece in pieces])
389