
Source Code for Module nltk.corpus.reader.wordnet

   1  # Natural Language Toolkit: WordNet 
   2  # 
   3  # Copyright (C) 2001-2011 NLTK Project 
   4  # Author: Steven Bethard <Steven.Bethard@colorado.edu> 
   5  #         Steven Bird <sb@csse.unimelb.edu.au> 
   6  #         Edward Loper <edloper@gradient.cis.upenn.edu> 
   7  #         Nitin Madnani <nmadnani@ets.org> 
   8  # URL: <http://www.nltk.org/> 
   9  # For license information, see LICENSE.TXT 
  10   
  11  import math 
  12  import re 
  13  from itertools import islice, chain 
  14  from operator import itemgetter 
  15   
  16  from nltk.compat import defaultdict 
  17  from nltk.corpus.reader import CorpusReader 
  18  from nltk.util import binary_search_file as _binary_search_file 
  19  from nltk.probability import FreqDist 
  20   
  21  ###################################################################### 
  22  ## Table of Contents 
  23  ###################################################################### 
  24  ## - Constants 
  25  ## - Data Classes 
  26  ##   - WordNetError 
  27  ##   - Lemma 
  28  ##   - Synset 
  29  ## - WordNet Corpus Reader 
  30  ## - WordNet Information Content Corpus Reader 
  31  ## - Similarity Metrics 
  32  ## - Demo 
  33   
  34  ###################################################################### 
  35  ## Constants 
  36  ###################################################################### 
  37   
  38  #: Positive infinity (for similarity functions) 
  39  _INF = 1e300 
  40   
  41  #{ Part-of-speech constants 
  42  ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v' 
  43  #} 
  44   
  45  POS_LIST = [NOUN, VERB, ADJ, ADV] 
  46   
  47  #: A table of strings that are used to express verb frames. 
  48  VERB_FRAME_STRINGS = ( 
  49      None, 
  50      "Something %s", 
  51      "Somebody %s", 
  52      "It is %sing", 
  53      "Something is %sing PP", 
  54      "Something %s something Adjective/Noun", 
  55      "Something %s Adjective/Noun", 
  56      "Somebody %s Adjective", 
  57      "Somebody %s something", 
  58      "Somebody %s somebody", 
  59      "Something %s somebody", 
  60      "Something %s something", 
  61      "Something %s to somebody", 
  62      "Somebody %s on something", 
  63      "Somebody %s somebody something", 
  64      "Somebody %s something to somebody", 
  65      "Somebody %s something from somebody", 
  66      "Somebody %s somebody with something", 
  67      "Somebody %s somebody of something", 
  68      "Somebody %s something on somebody", 
  69      "Somebody %s somebody PP", 
  70      "Somebody %s something PP", 
  71      "Somebody %s PP", 
  72      "Somebody's (body part) %s", 
  73      "Somebody %s somebody to INFINITIVE", 
  74      "Somebody %s somebody INFINITIVE", 
  75      "Somebody %s that CLAUSE", 
  76      "Somebody %s to somebody", 
  77      "Somebody %s to INFINITIVE", 
  78      "Somebody %s whether INFINITIVE", 
  79      "Somebody %s somebody into V-ing something", 
  80      "Somebody %s something with something", 
  81      "Somebody %s INFINITIVE", 
  82      "Somebody %s VERB-ing", 
  83      "It %s that CLAUSE", 
  84      "Something %s INFINITIVE") 
  85   
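Each template above contains a single %s slot for a lemma name; the parser below (_synset_from_pos_and_line) uses them to fill in Lemma.frame_ids and Lemma.frame_strings for verb senses. A brief usage sketch, assuming the standard corpora/wordnet data is installed:

    from nltk.corpus import wordnet as wn

    eat = wn.lemma('eat.v.01.eat')
    print eat.frame_ids        # indices into VERB_FRAME_STRINGS for this verb sense
    print eat.frame_strings    # the corresponding templates with 'eat' substituted for %s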
  86  ###################################################################### 
  87  ## Data Classes 
  88  ###################################################################### 
  89   
  90  class WordNetError(Exception):
  91      """An exception class for wordnet-related errors."""
  92  
  93  
  94  class _WordNetObject(object):
  95      """A common base class for lemmas and synsets."""
  96  
97 - def hypernyms(self):
98 return self._related('@')
99
100 - def instance_hypernyms(self):
101 return self._related('@i')
102
103 - def hyponyms(self):
104 return self._related('~')
105
106 - def instance_hyponyms(self):
107 return self._related('~i')
108
109 - def member_holonyms(self):
110 return self._related('#m')
111
112 - def substance_holonyms(self):
113 return self._related('#s')
114
115 - def part_holonyms(self):
116 return self._related('#p')
117
118 - def member_meronyms(self):
119 return self._related('%m')
120
121 - def substance_meronyms(self):
122 return self._related('%s')
123
124 - def part_meronyms(self):
125 return self._related('%p')
126
127 - def topic_domains(self):
128 return self._related(';c')
129
130 - def region_domains(self):
131 return self._related(';r')
132
133 - def usage_domains(self):
134 return self._related(';u')
135
136 - def attributes(self):
137 return self._related('=')
138
139 - def entailments(self):
140 return self._related('*')
141
142 - def causes(self):
143 return self._related('>')
144
145 - def also_sees(self):
146 return self._related('^')
147
148 - def verb_groups(self):
149 return self._related('$')
150
151 - def similar_tos(self):
152 return self._related('&')
153
154 - def __hash__(self):
155 return hash(self.name)
156
157 - def __eq__(self, other):
158 return self.name == other.name
159
160 - def __ne__(self, other):
161 return self.name != other.name
162
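Each of the relation methods above simply calls self._related() with the WordNet pointer symbol shown; through the public reader they are typically used as in this sketch (standard corpora/wordnet data, WordNet 3.0, assumed):

    from nltk.corpus import wordnet as wn

    dog = wn.synset('dog.n.01')
    print dog.hypernyms()         # pointer '@', e.g. [Synset('canine.n.02'), Synset('domestic_animal.n.01')]
    print dog.member_holonyms()   # pointer '#m'
    print dog.part_meronyms()     # pointer '%p'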
163 -class Lemma(_WordNetObject):
164 """ 165 The lexical entry for a single morphological form of a 166 sense-disambiguated word. 167 168 Create a Lemma from a "<word>.<pos>.<number>.<lemma>" string where: 169 <word> is the morphological stem identifying the synset 170 <pos> is one of the module attributes ADJ, ADJ_SAT, ADV, NOUN or VERB 171 <number> is the sense number, counting from 0. 172 <lemma> is the morphological form of interest 173 174 Note that <word> and <lemma> can be different, e.g. the Synset 175 'salt.n.03' has the Lemmas 'salt.n.03.salt', 'salt.n.03.saltiness' and 176 'salt.n.03.salinity'. 177 178 Lemma attributes 179 ---------------- 180 name - The canonical name of this lemma. 181 synset - The synset that this lemma belongs to. 182 syntactic_marker - For adjectives, the WordNet string identifying the 183 syntactic position relative modified noun. See: 184 http://wordnet.princeton.edu/man/wninput.5WN.html#sect10 185 For all other parts of speech, this attribute is None. 186 187 Lemma methods 188 ------------- 189 Lemmas have the following methods for retrieving related Lemmas. They 190 correspond to the names for the pointer symbols defined here: 191 http://wordnet.princeton.edu/man/wninput.5WN.html#sect3 192 These methods all return lists of Lemmas. 193 194 antonyms 195 hypernyms 196 instance_hypernyms 197 hyponyms 198 instance_hyponyms 199 member_holonyms 200 substance_holonyms 201 part_holonyms 202 member_meronyms 203 substance_meronyms 204 part_meronyms 205 topic_domains 206 region_domains 207 usage_domains 208 attributes 209 derivationally_related_forms 210 entailments 211 causes 212 also_sees 213 verb_groups 214 similar_tos 215 pertainyms 216 """ 217 218 # formerly _from_synset_info
219 - def __init__(self, wordnet_corpus_reader, synset, name, 220 lexname_index, lex_id, syntactic_marker):
221 self._wordnet_corpus_reader = wordnet_corpus_reader 222 self.name = name 223 self.syntactic_marker = syntactic_marker 224 self.synset = synset 225 self.frame_strings = [] 226 self.frame_ids = [] 227 self._lexname_index = lexname_index 228 self._lex_id = lex_id 229 230 self.key = None # gets set later.
231
232 - def __repr__(self):
233 tup = type(self).__name__, self.synset.name, self.name 234 return "%s('%s.%s')" % tup
235 241
242 - def count(self):
243 """Return the frequency count for this Lemma""" 244 return self._wordnet_corpus_reader.lemma_count(self)
245
246 - def antonyms(self):
247 return self._related('!')
248 251
252 - def pertainyms(self):
253 return self._related('\\')
254 255
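A sketch of looking up a single Lemma through the reader and querying its lexical relations (standard corpora/wordnet data assumed):

    from nltk.corpus import wordnet as wn

    leader = wn.lemma('leader.n.01.leader')
    print leader.synset        # Synset('leader.n.01')
    print leader.antonyms()    # lexical pointer '!', e.g. [Lemma('follower.n.01.follower')]
    print leader.count()       # frequency from cntlist.rev; 0 if the key is not listed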
256 -class Synset(_WordNetObject):
257 """Create a Synset from a "<lemma>.<pos>.<number>" string where: 258 <lemma> is the word's morphological stem 259 <pos> is one of the module attributes ADJ, ADJ_SAT, ADV, NOUN or VERB 260 <number> is the sense number, counting from 0. 261 262 Synset attributes 263 ----------------- 264 name - The canonical name of this synset, formed using the first lemma 265 of this synset. Note that this may be different from the name 266 passed to the constructor if that string used a different lemma to 267 identify the synset. 268 pos - The synset's part of speech, matching one of the module level 269 attributes ADJ, ADJ_SAT, ADV, NOUN or VERB. 270 lemmas - A list of the Lemma objects for this synset. 271 definition - The definition for this synset. 272 examples - A list of example strings for this synset. 273 offset - The offset in the WordNet dict file of this synset. 274 #lexname - The name of the lexicographer file containing this synset. 275 276 Synset methods 277 -------------- 278 Synsets have the following methods for retrieving related Synsets. 279 They correspond to the names for the pointer symbols defined here: 280 http://wordnet.princeton.edu/man/wninput.5WN.html#sect3 281 These methods all return lists of Synsets. 282 283 hypernyms 284 instance_hypernyms 285 hyponyms 286 instance_hyponyms 287 member_holonyms 288 substance_holonyms 289 part_holonyms 290 member_meronyms 291 substance_meronyms 292 part_meronyms 293 attributes 294 entailments 295 causes 296 also_sees 297 verb_groups 298 similar_tos 299 300 Additionally, Synsets support the following methods specific to the 301 hypernym relation: 302 303 root_hypernyms 304 common_hypernyms 305 lowest_common_hypernyms 306 307 Note that Synsets do not support the following relations because 308 these are defined by WordNet as lexical relations: 309 310 antonyms 311 derivationally_related_forms 312 pertainyms 313 """ 314
315 - def __init__(self, wordnet_corpus_reader):
316 self._wordnet_corpus_reader = wordnet_corpus_reader 317 # All of these attributes get initialized by 318 # WordNetCorpusReader._synset_from_pos_and_line() 319 320 self.pos = None 321 self.offset = None 322 self.name = None 323 self.frame_ids = [] 324 self.lemmas = [] 325 self.lemma_names = [] 326 self.lemma_infos = [] # never used? 327 self.definition = None 328 self.examples = [] 329 self.lexname = None # lexicographer name 330 331 self._pointers = defaultdict(set) 332 self._lemma_pointers = defaultdict(set)
333
334 - def _needs_root(self):
335 if self.pos == NOUN: 336 if self._wordnet_corpus_reader.get_version() == '1.6': 337 return True 338 else: 339 return False 340 elif self.pos == VERB: 341 return True
342
343 - def root_hypernyms(self):
344 """Get the topmost hypernyms of this synset in WordNet.""" 345 346 result = [] 347 seen = set() 348 todo = [self] 349 while todo: 350 next_synset = todo.pop() 351 if next_synset not in seen: 352 seen.add(next_synset) 353 next_hypernyms = next_synset.hypernyms() + \ 354 next_synset.instance_hypernyms() 355 if not next_hypernyms: 356 result.append(next_synset) 357 else: 358 todo.extend(next_hypernyms) 359 return result
 360  
 361      # Simpler implementation which makes the incorrect assumption that
 362      # the hypernym hierarchy is acyclic:
 363      #
 364      #   if not self.hypernyms():
 365      #       return [self]
 366      #   else:
 367      #       return list(set(root for h in self.hypernyms()
 368      #                       for root in h.root_hypernyms()))
369 - def max_depth(self):
370 """ 371 @return: The length of the longest hypernym path from this 372 synset to the root. 373 """ 374 375 if "_max_depth" not in self.__dict__: 376 hypernyms = self.hypernyms() + self.instance_hypernyms() 377 if not hypernyms: 378 self._max_depth = 0 379 else: 380 self._max_depth = 1 + max(h.max_depth() for h in hypernyms) 381 return self._max_depth
382
383 - def min_depth(self):
384 """ 385 @return: The length of the shortest hypernym path from this 386 synset to the root. 387 """ 388 389 if "_min_depth" not in self.__dict__: 390 hypernyms = self.hypernyms() + self.instance_hypernyms() 391 if not hypernyms: 392 self._min_depth = 0 393 else: 394 self._min_depth = 1 + min(h.min_depth() for h in hypernyms) 395 return self._min_depth
396
397 - def closure(self, rel, depth=-1):
398 """Return the transitive closure of source under the rel 399 relationship, breadth-first 400 401 >>> from nltk.corpus import wordnet as wn 402 >>> dog = wn.synset('dog.n.01') 403 >>> hyp = lambda s:s.hypernyms() 404 >>> list(dog.closure(hyp)) 405 [Synset('domestic_animal.n.01'), Synset('canine.n.02'), 406 Synset('animal.n.01'), Synset('carnivore.n.01'), 407 Synset('organism.n.01'), Synset('placental.n.01'), 408 Synset('living_thing.n.01'), Synset('mammal.n.01'), 409 Synset('whole.n.02'), Synset('vertebrate.n.01'), 410 Synset('object.n.01'), Synset('chordate.n.01'), 411 Synset('physical_entity.n.01'), Synset('entity.n.01')] 412 """ 413 from nltk.util import breadth_first 414 synset_offsets = [] 415 for synset in breadth_first(self, rel, depth): 416 if synset.offset != self.offset: 417 if synset.offset not in synset_offsets: 418 synset_offsets.append(synset.offset) 419 yield synset
420
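closure() works with any relation function and honours the optional depth cut-off; for example (a sketch, standard data assumed):

    from nltk.corpus import wordnet as wn

    cat = wn.synset('cat.n.01')
    hypo = lambda s: s.hyponyms()
    immediate = list(cat.closure(hypo, depth=1))   # only the direct hyponyms
    everything = list(cat.closure(hypo))           # the whole hyponym subtree, breadth-first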
421 - def hypernym_paths(self):
422 """ 423 Get the path(s) from this synset to the root, where each path is a 424 list of the synset nodes traversed on the way to the root. 425 426 @return: A list of lists, where each list gives the node sequence 427 connecting the initial L{Synset} node and a root node. 428 """ 429 paths = [] 430 431 hypernyms = self.hypernyms() + self.instance_hypernyms() 432 if len(hypernyms) == 0: 433 paths = [[self]] 434 435 for hypernym in hypernyms: 436 for ancestor_list in hypernym.hypernym_paths(): 437 ancestor_list.append(self) 438 paths.append(ancestor_list) 439 return paths
440
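Each returned path runs from a root down to this synset, and a synset with more than one hypernym has more than one path; for example (standard data, WordNet 3.0):

    from nltk.corpus import wordnet as wn

    dog = wn.synset('dog.n.01')
    paths = dog.hypernym_paths()
    print len(paths)                   # more than one path, since dog.n.01 has two hypernyms
    print [s.name for s in paths[0]]   # from 'entity.n.01' down to 'dog.n.01'
    print dog.min_depth(), dog.max_depth()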
441 - def common_hypernyms(self, other):
442 """ 443 Find all synsets that are hypernyms of this synset and the 444 other synset. 445 446 @type other: L{Synset} 447 @param other: other input synset. 448 @return: The synsets that are hypernyms of both synsets. 449 """ 450 self_synsets = set(self_synset 451 for self_synsets in self._iter_hypernym_lists() 452 for self_synset in self_synsets) 453 other_synsets = set(other_synset 454 for other_synsets in other._iter_hypernym_lists() 455 for other_synset in other_synsets) 456 return list(self_synsets.intersection(other_synsets))
457
458 - def lowest_common_hypernyms(self, other, simulate_root=False):
459 """Get the lowest synset that both synsets have as a hypernym.""" 460 461 fake_synset = Synset(None) 462 fake_synset.name = '*ROOT*' 463 fake_synset.hypernyms = lambda: [] 464 fake_synset.instance_hypernyms = lambda: [] 465 466 if simulate_root: 467 self_hypernyms = chain(self._iter_hypernym_lists(), [[fake_synset]]) 468 other_hypernyms = chain(other._iter_hypernym_lists(), [[fake_synset]]) 469 else: 470 self_hypernyms = self._iter_hypernym_lists() 471 other_hypernyms = other._iter_hypernym_lists() 472 473 synsets = set(s for synsets in self_hypernyms for s in synsets) 474 others = set(s for synsets in other_hypernyms for s in synsets) 475 synsets.intersection_update(others) 476 477 try: 478 max_depth = max(s.min_depth() for s in synsets) 479 return [s for s in synsets if s.min_depth() == max_depth] 480 except ValueError: 481 return []
482
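For example (standard data, WordNet 3.0):

    from nltk.corpus import wordnet as wn

    dog, cat = wn.synset('dog.n.01'), wn.synset('cat.n.01')
    print dog.common_hypernyms(cat)          # every shared ancestor
    print dog.lowest_common_hypernyms(cat)   # deepest shared ancestor(s), e.g. [Synset('carnivore.n.01')]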
483 - def hypernym_distances(self, distance=0, simulate_root=False):
484 """ 485 Get the path(s) from this synset to the root, counting the distance 486 of each node from the initial node on the way. A set of 487 (synset, distance) tuples is returned. 488 489 @type distance: C{int} 490 @param distance: the distance (number of edges) from this hypernym to 491 the original hypernym L{Synset} on which this method was called. 492 @return: A set of (L{Synset}, int) tuples where each L{Synset} is 493 a hypernym of the first L{Synset}. 494 """ 495 distances = set([(self, distance)]) 496 for hypernym in self.hypernyms() + self.instance_hypernyms(): 497 distances |= hypernym.hypernym_distances(distance+1, simulate_root=False) 498 if simulate_root: 499 fake_synset = Synset(None) 500 fake_synset.name = '*ROOT*' 501 fake_synset_distance = max(distances, key=itemgetter(1))[1] 502 distances.add((fake_synset, fake_synset_distance+1)) 503 return distances
504
505 - def shortest_path_distance(self, other, simulate_root=False):
506 """ 507 Returns the distance of the shortest path linking the two synsets (if 508 one exists). For each synset, all the ancestor nodes and their 509 distances are recorded and compared. The ancestor node common to both 510 synsets that can be reached with the minimum number of traversals is 511 used. If no ancestor nodes are common, None is returned. If a node is 512 compared with itself 0 is returned. 513 514 @type other: L{Synset} 515 @param other: The Synset to which the shortest path will be found. 516 @return: The number of edges in the shortest path connecting the two 517 nodes, or None if no path exists. 518 """ 519 520 if self == other: 521 return 0 522 523 path_distance = None 524 525 dist_list1 = self.hypernym_distances(simulate_root=simulate_root) 526 dist_dict1 = {} 527 528 dist_list2 = other.hypernym_distances(simulate_root=simulate_root) 529 dist_dict2 = {} 530 531 # Transform each distance list into a dictionary. In cases where 532 # there are duplicate nodes in the list (due to there being multiple 533 # paths to the root) the duplicate with the shortest distance from 534 # the original node is entered. 535 536 for (l, d) in [(dist_list1, dist_dict1), (dist_list2, dist_dict2)]: 537 for (key, value) in l: 538 if key in d: 539 if value < d[key]: 540 d[key] = value 541 else: 542 d[key] = value 543 544 # For each ancestor synset common to both subject synsets, find the 545 # connecting path length. Return the shortest of these. 546 547 for synset1 in dist_dict1.keys(): 548 for synset2 in dist_dict2.keys(): 549 if synset1 == synset2: 550 new_distance = dist_dict1[synset1] + dist_dict2[synset2] 551 if path_distance < 0 or new_distance < path_distance: 552 path_distance = new_distance 553 554 return path_distance
555
556 - def tree(self, rel, depth=-1, cut_mark=None):
557 """ 558 >>> from nltk.corpus import wordnet as wn 559 >>> dog = wn.synset('dog.n.01') 560 >>> hyp = lambda s:s.hypernyms() 561 >>> from pprint import pprint 562 >>> pprint(dog.tree(hyp)) 563 [Synset('dog.n.01'), 564 [Synset('domestic_animal.n.01'), 565 [Synset('animal.n.01'), 566 [Synset('organism.n.01'), 567 [Synset('living_thing.n.01'), 568 [Synset('whole.n.02'), 569 [Synset('object.n.01'), 570 [Synset('physical_entity.n.01'), [Synset('entity.n.01')]]]]]]]], 571 [Synset('canine.n.02'), 572 [Synset('carnivore.n.01'), 573 [Synset('placental.n.01'), 574 [Synset('mammal.n.01'), 575 [Synset('vertebrate.n.01'), 576 [Synset('chordate.n.01'), 577 [Synset('animal.n.01'), 578 [Synset('organism.n.01'), 579 [Synset('living_thing.n.01'), 580 [Synset('whole.n.02'), 581 [Synset('object.n.01'), 582 [Synset('physical_entity.n.01'), 583 [Synset('entity.n.01')]]]]]]]]]]]]]] 584 """ 585 586 tree = [self] 587 if depth != 0: 588 tree += [x.tree(rel, depth-1, cut_mark) for x in rel(self)] 589 elif cut_mark: 590 tree += [cut_mark] 591 return tree
592 593 # interface to similarity methods
594 - def path_similarity(self, other, verbose=False, simulate_root=True):
595 """ 596 Path Distance Similarity: 597 Return a score denoting how similar two word senses are, based on the 598 shortest path that connects the senses in the is-a (hypernym/hypnoym) 599 taxonomy. The score is in the range 0 to 1, except in those cases where 600 a path cannot be found (will only be true for verbs as there are many 601 distinct verb taxonomies), in which case None is returned. A score of 602 1 represents identity i.e. comparing a sense with itself will return 1. 603 604 @type other: L{Synset} 605 @param other: The L{Synset} that this L{Synset} is being compared to. 606 @type simulate_root: L{bool} 607 @param simulate_root: The various verb taxonomies do not 608 share a single root which disallows this metric from working for 609 synsets that are not connected. This flag (True by default) 610 creates a fake root that connects all the taxonomies. Set it 611 to false to disable this behavior. For the noun taxonomy, 612 there is usually a default root except for WordNet version 1.6. 613 If you are using wordnet 1.6, a fake root will be added for nouns 614 as well. 615 @return: A score denoting the similarity of the two L{Synset}s, 616 normally between 0 and 1. None is returned if no connecting path 617 could be found. 1 is returned if a L{Synset} is compared with 618 itself. 619 """ 620 621 distance = self.shortest_path_distance(other, simulate_root=simulate_root and self._needs_root()) 622 if distance >= 0: 623 return 1.0 / (distance + 1) 624 else: 625 return None
626
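A usage sketch (standard data; the exact value depends on the WordNet version):

    from nltk.corpus import wordnet as wn

    dog, cat = wn.synset('dog.n.01'), wn.synset('cat.n.01')
    print dog.path_similarity(dog)   # identity, always 1.0
    print dog.path_similarity(cat)   # 1.0 / (shortest_path_distance + 1), e.g. 0.2 in WordNet 3.0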
627 - def lch_similarity(self, other, verbose=False, simulate_root=True):
628 """ 629 Leacock Chodorow Similarity: 630 Return a score denoting how similar two word senses are, based on the 631 shortest path that connects the senses (as above) and the maximum depth 632 of the taxonomy in which the senses occur. The relationship is given as 633 -log(p/2d) where p is the shortest path length and d is the taxonomy 634 depth. 635 636 @type other: L{Synset} 637 @param other: The L{Synset} that this L{Synset} is being compared to. 638 @type simulate_root: L{bool} 639 @param simulate_root: The various verb taxonomies do not 640 share a single root which disallows this metric from working for 641 synsets that are not connected. This flag (True by default) 642 creates a fake root that connects all the taxonomies. Set it 643 to false to disable this behavior. For the noun taxonomy, 644 there is usually a default root except for WordNet version 1.6. 645 If you are using wordnet 1.6, a fake root will be added for nouns 646 as well. 647 @return: A score denoting the similarity of the two L{Synset}s, 648 normally greater than 0. None is returned if no connecting path 649 could be found. If a L{Synset} is compared with itself, the 650 maximum score is returned, which varies depending on the taxonomy 651 depth. 652 """ 653 654 if self.pos != other.pos: 655 raise WordNetError('Computing the lch similarity requires ' + \ 656 '%s and %s to have the same part of speech.' % \ 657 (self, other)) 658 659 need_root = self._needs_root() 660 661 if self.pos not in self._wordnet_corpus_reader._max_depth: 662 self._wordnet_corpus_reader._compute_max_depth(self.pos, need_root) 663 664 depth = self._wordnet_corpus_reader._max_depth[self.pos] 665 666 distance = self.shortest_path_distance(other, simulate_root=simulate_root and need_root) 667 668 if distance >= 0: 669 return -math.log((distance + 1) / (2.0 * depth)) 670 else: 671 return None
672
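A usage sketch (standard data); note that both synsets must have the same part of speech or a WordNetError is raised:

    from nltk.corpus import wordnet as wn

    dog, cat = wn.synset('dog.n.01'), wn.synset('cat.n.01')
    print dog.lch_similarity(cat)    # -log((distance + 1) / (2.0 * depth)), roughly 2.03 in WordNet 3.0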
673 - def wup_similarity(self, other, verbose=False, simulate_root=True):
674 """ 675 Wu-Palmer Similarity: 676 Return a score denoting how similar two word senses are, based on the 677 depth of the two senses in the taxonomy and that of their Least Common 678 Subsumer (most specific ancestor node). Previously, the scores computed 679 by this implementation did _not_ always agree with those given by 680 Pedersen's Perl implementation of WordNet Similarity. However, with 681 the addition of the simulate_root flag (see below), the score for 682 verbs now almost always agree but not always for nouns. 683 684 The LCS does not necessarily feature in the shortest path connecting 685 the two senses, as it is by definition the common ancestor deepest in 686 the taxonomy, not closest to the two senses. Typically, however, it 687 will so feature. Where multiple candidates for the LCS exist, that 688 whose shortest path to the root node is the longest will be selected. 689 Where the LCS has multiple paths to the root, the longer path is used 690 for the purposes of the calculation. 691 692 @type other: L{Synset} 693 @param other: The L{Synset} that this L{Synset} is being compared to. 694 @type simulate_root: L{bool} 695 @param simulate_root: The various verb taxonomies do not 696 share a single root which disallows this metric from working for 697 synsets that are not connected. This flag (True by default) 698 creates a fake root that connects all the taxonomies. Set it 699 to false to disable this behavior. For the noun taxonomy, 700 there is usually a default root except for WordNet version 1.6. 701 If you are using wordnet 1.6, a fake root will be added for nouns 702 as well. 703 @return: A float score denoting the similarity of the two L{Synset}s, 704 normally greater than zero. If no connecting path between the two 705 senses can be found, None is returned. 706 707 """ 708 709 need_root = self._needs_root() 710 subsumers = self.lowest_common_hypernyms(other, simulate_root=simulate_root and need_root) 711 712 # If no LCS was found return None 713 if len(subsumers) == 0: 714 return None 715 716 subsumer = subsumers[0] 717 718 # Get the longest path from the LCS to the root, 719 # including a correction: 720 # - add one because the calculations include both the start and end 721 # nodes 722 depth = subsumer.max_depth() + 1 723 724 # Note: No need for an additional add-one correction for non-nouns 725 # to account for an imaginary root node because that is now automatically 726 # handled by simulate_root 727 # if subsumer.pos != NOUN: 728 # depth += 1 729 730 # Get the shortest path from the LCS to each of the synsets it is 731 # subsuming. Add this to the LCS path length to get the path 732 # length from each synset to the root. 733 len1 = self.shortest_path_distance(subsumer, simulate_root=simulate_root and need_root) 734 len2 = other.shortest_path_distance(subsumer, simulate_root=simulate_root and need_root) 735 if len1 is None or len2 is None: 736 return None 737 len1 += depth 738 len2 += depth 739 return (2.0 * depth) / (len1 + len2)
740
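A usage sketch (standard data):

    from nltk.corpus import wordnet as wn

    dog, cat = wn.synset('dog.n.01'), wn.synset('cat.n.01')
    print dog.wup_similarity(cat)    # (2.0 * depth) / (len1 + len2), roughly 0.86 in WordNet 3.0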
741 - def res_similarity(self, other, ic, verbose=False):
742 """ 743 Resnik Similarity: 744 Return a score denoting how similar two word senses are, based on the 745 Information Content (IC) of the Least Common Subsumer (most specific 746 ancestor node). 747 748 @type other: L{Synset} 749 @param other: The L{Synset} that this L{Synset} is being compared to. 750 @type ic: C{dict} 751 @param ic: an information content object (as returned by L{load_ic()}). 752 @return: A float score denoting the similarity of the two L{Synset}s. 753 Synsets whose LCS is the root node of the taxonomy will have a 754 score of 0 (e.g. N['dog'][0] and N['table'][0]). 755 """ 756 757 ic1, ic2, lcs_ic = _lcs_ic(self, other, ic) 758 return lcs_ic
759
760 - def jcn_similarity(self, other, ic, verbose=False):
761 """ 762 Jiang-Conrath Similarity: 763 Return a score denoting how similar two word senses are, based on the 764 Information Content (IC) of the Least Common Subsumer (most specific 765 ancestor node) and that of the two input Synsets. The relationship is 766 given by the equation 1 / (IC(s1) + IC(s2) - 2 * IC(lcs)). 767 768 @type other: L{Synset} 769 @param other: The L{Synset} that this L{Synset} is being compared to. 770 @type ic: C{dict} 771 @param ic: an information content object (as returned by L{load_ic()}). 772 @return: A float score denoting the similarity of the two L{Synset}s. 773 """ 774 775 if self == other: 776 return _INF 777 778 ic1, ic2, lcs_ic = _lcs_ic(self, other, ic) 779 780 # If either of the input synsets are the root synset, or have a 781 # frequency of 0 (sparse data problem), return 0. 782 if ic1 == 0 or ic2 == 0: 783 return 0 784 785 ic_difference = ic1 + ic2 - 2 * lcs_ic 786 787 if ic_difference == 0: 788 return _INF 789 790 return 1 / ic_difference
791
792 - def lin_similarity(self, other, ic, verbose=False):
793 """ 794 Lin Similarity: 795 Return a score denoting how similar two word senses are, based on the 796 Information Content (IC) of the Least Common Subsumer (most specific 797 ancestor node) and that of the two input Synsets. The relationship is 798 given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)). 799 800 @type other: L{Synset} 801 @param other: The L{Synset} that this L{Synset} is being compared to. 802 @type ic: C{dict} 803 @param ic: an information content object (as returned by L{load_ic()}). 804 @return: A float score denoting the similarity of the two L{Synset}s, 805 in the range 0 to 1. 806 """ 807 808 ic1, ic2, lcs_ic = _lcs_ic(self, other, ic) 809 return (2.0 * lcs_ic) / (ic1 + ic2)
810
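The three IC-based metrics above (res, jcn, lin) all need an information content dictionary; one can be loaded from the separate wordnet_ic corpus, if that data package is installed. A sketch:

    from nltk.corpus import wordnet as wn
    from nltk.corpus import wordnet_ic

    brown_ic = wordnet_ic.ic('ic-brown.dat')   # see WordNetICCorpusReader below
    dog, cat = wn.synset('dog.n.01'), wn.synset('cat.n.01')
    print dog.res_similarity(cat, brown_ic)    # IC of the most informative common subsumer
    print dog.jcn_similarity(cat, brown_ic)    # 1 / (IC(s1) + IC(s2) - 2 * IC(lcs))
    print dog.lin_similarity(cat, brown_ic)    # 2 * IC(lcs) / (IC(s1) + IC(s2))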
811 - def _iter_hypernym_lists(self):
812 """ 813 @return: An iterator over L{Synset}s that are either proper 814 hypernyms or instance of hypernyms of the synset. 815 """ 816 todo = [self] 817 seen = set() 818 while todo: 819 for synset in todo: 820 seen.add(synset) 821 yield todo 822 todo = [hypernym 823 for synset in todo 824 for hypernym in (synset.hypernyms() + \ 825 synset.instance_hypernyms()) 826 if hypernym not in seen]
827
828 - def __repr__(self):
829 return '%s(%r)' % (type(self).__name__, self.name)
830
835 836 837 ###################################################################### 838 ## WordNet Corpus Reader 839 ###################################################################### 840
841 -class WordNetCorpusReader(CorpusReader):
842 """ 843 A corpus reader used to access wordnet or its variants. 844 """ 845 846 _ENCODING = None # what encoding should we be using, if any? 847 848 #{ Part-of-speech constants 849 ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v' 850 #} 851 852 #{ Filename constants 853 _FILEMAP = {ADJ: 'adj', ADV: 'adv', NOUN: 'noun', VERB: 'verb'} 854 #} 855 856 #{ Part of speech constants 857 _pos_numbers = {NOUN: 1, VERB: 2, ADJ: 3, ADV: 4, ADJ_SAT: 5} 858 _pos_names = dict(tup[::-1] for tup in _pos_numbers.items()) 859 #} 860 861 #: A list of file identifiers for all the fileids used by this 862 #: corpus reader. 863 _FILES = ('cntlist.rev', 'lexnames', 'index.sense', 864 'index.adj', 'index.adv', 'index.noun', 'index.verb', 865 'data.adj', 'data.adv', 'data.noun', 'data.verb', 866 'adj.exc', 'adv.exc', 'noun.exc', 'verb.exc', ) 867
868 - def __init__(self, root):
869 """ 870 Construct a new wordnet corpus reader, with the given root 871 directory. 872 """ 873 CorpusReader.__init__(self, root, self._FILES, 874 encoding=self._ENCODING) 875 876 self._lemma_pos_offset_map = defaultdict(dict) 877 """A index that provides the file offset 878 879 Map from lemma -> pos -> synset_index -> offset""" 880 881 self._synset_offset_cache = defaultdict(dict) 882 """A cache so we don't have to reconstuct synsets 883 884 Map from pos -> offset -> synset""" 885 886 self._max_depth = defaultdict(dict) 887 """A lookup for the maximum depth of each part of speech. Useful for 888 the lch similarity metric. 889 """ 890 891 self._data_file_map = {} 892 self._exception_map = {} 893 self._lexnames = [] 894 self._key_count_file = None 895 self._key_synset_file = None 896 897 # Load the lexnames 898 for i, line in enumerate(self.open('lexnames')): 899 index, lexname, _ = line.split() 900 assert int(index) == i 901 self._lexnames.append(lexname) 902 903 # Load the indices for lemmas and synset offsets 904 self._load_lemma_pos_offset_map() 905 906 # load the exception file data into memory 907 self._load_exception_map()
908 909
910 - def _load_lemma_pos_offset_map(self):
911 for suffix in self._FILEMAP.values(): 912 913 # parse each line of the file (ignoring comment lines) 914 for i, line in enumerate(self.open('index.%s' % suffix)): 915 if line.startswith(' '): 916 continue 917 918 next = iter(line.split()).next 919 try: 920 921 # get the lemma and part-of-speech 922 lemma = next() 923 pos = next() 924 925 # get the number of synsets for this lemma 926 n_synsets = int(next()) 927 assert n_synsets > 0 928 929 # get the pointer symbols for all synsets of this lemma 930 n_pointers = int(next()) 931 _ = [next() for _ in xrange(n_pointers)] 932 933 # same as number of synsets 934 n_senses = int(next()) 935 assert n_synsets == n_senses 936 937 # get number of senses ranked according to frequency 938 _ = int(next()) 939 940 # get synset offsets 941 synset_offsets = [int(next()) for _ in xrange(n_synsets)] 942 943 # raise more informative error with file name and line number 944 except (AssertionError, ValueError), e: 945 tup = ('index.%s' % suffix), (i + 1), e 946 raise WordNetError('file %s, line %i: %s' % tup) 947 948 # map lemmas and parts of speech to synsets 949 self._lemma_pos_offset_map[lemma][pos] = synset_offsets 950 if pos == ADJ: 951 self._lemma_pos_offset_map[lemma][ADJ_SAT] = synset_offsets
952
953 - def _load_exception_map(self):
954 # load the exception file data into memory 955 for pos, suffix in self._FILEMAP.items(): 956 self._exception_map[pos] = {} 957 for line in self.open('%s.exc' % suffix): 958 terms = line.split() 959 self._exception_map[pos][terms[0]] = terms[1:] 960 self._exception_map[ADJ_SAT] = self._exception_map[ADJ]
961
962 - def _compute_max_depth(self, pos, simulate_root):
963 """ 964 Compute the max depth for the given part of speech. This is 965 used by the lch similarity metric. 966 """ 967 depth = 0 968 for ii in self.all_synsets(pos): 969 try: 970 depth = max(depth, ii.max_depth()) 971 except RuntimeError: 972 print ii 973 if simulate_root: 974 depth += 1 975 self._max_depth[pos] = depth
976
977 - def get_version(self):
978 fh = self._data_file(ADJ) 979 for line in fh: 980 match = re.search(r'WordNet (\d+\.\d+) Copyright', line) 981 if match is not None: 982 version = match.group(1) 983 fh.seek(0) 984 return version
985 986 #//////////////////////////////////////////////////////////// 987 # Loading Lemmas 988 #////////////////////////////////////////////////////////////
989 - def lemma(self, name):
990 synset_name, lemma_name = name.rsplit('.', 1) 991 synset = self.synset(synset_name) 992 for lemma in synset.lemmas: 993 if lemma.name == lemma_name: 994 return lemma 995 raise WordNetError('no lemma %r in %r' % (lemma_name, synset_name))
996
997 - def lemma_from_key(self, key):
 998          # Sense keys are always stored lower-case, so normalize the key first
 999          key = key.lower()
1000  
1001          lemma_name, lex_sense = key.split('%')
1002          pos_number, lexname_index, lex_id, _, _ = lex_sense.split(':')
1003          pos = self._pos_names[int(pos_number)]
1004  
1005          # open the key -> synset file if necessary
1006          if self._key_synset_file is None:
1007              self._key_synset_file = self.open('index.sense')
1008  
1009          # Find the synset for the lemma.
1010          synset_line = _binary_search_file(self._key_synset_file, key)
1011          if not synset_line:
1012              raise WordNetError("No synset found for key %r" % key)
1013          offset = int(synset_line.split()[1])
1014          synset = self._synset_from_pos_and_offset(pos, offset)
1015  
1016          # return the corresponding lemma
1017          for lemma in synset.lemmas:
1018              if lemma.key == key:
1019                  return lemma
1020          raise WordNetError("No lemma found for key %r" % key)
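A sketch of resolving a sense key through this method (standard data assumed; the key shown is the usual key for the first noun sense of 'dog'):

    from nltk.corpus import wordnet as wn

    lem = wn.lemma_from_key('dog%1:05:00::')   # <lemma>%<pos>:<lexname_index>:<lex_id>:<head>:<head_id>
    print lem        # e.g. Lemma('dog.n.01.dog')
    print lem.key    # the same key, rebuilt in _synset_from_pos_and_line()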
1021 1022 #//////////////////////////////////////////////////////////// 1023 # Loading Synsets 1024 #////////////////////////////////////////////////////////////
1025 - def synset(self, name):
1026 # split name into lemma, part of speech and synset number 1027 lemma, pos, synset_index_str = name.lower().rsplit('.', 2) 1028 synset_index = int(synset_index_str) - 1 1029 1030 # get the offset for this synset 1031 try: 1032 offset = self._lemma_pos_offset_map[lemma][pos][synset_index] 1033 except KeyError: 1034 message = 'no lemma %r with part of speech %r' 1035 raise WordNetError(message % (lemma, pos)) 1036 except IndexError: 1037 n_senses = len(self._lemma_pos_offset_map[lemma][pos]) 1038 message = "lemma %r with part of speech %r has only %i %s" 1039 if n_senses == 1: 1040 tup = lemma, pos, n_senses, "sense" 1041 else: 1042 tup = lemma, pos, n_senses, "senses" 1043 raise WordNetError(message % tup) 1044 1045 # load synset information from the appropriate file 1046 synset = self._synset_from_pos_and_offset(pos, offset) 1047 1048 # some basic sanity checks on loaded attributes 1049 if pos == 's' and synset.pos == 'a': 1050 message = ('adjective satellite requested but only plain ' 1051 'adjective found for lemma %r') 1052 raise WordNetError(message % lemma) 1053 assert synset.pos == pos or (pos == 'a' and synset.pos == 's') 1054 1055 # Return the synset object. 1056 return synset
1057
1058 - def _data_file(self, pos):
1059 """ 1060 Return an open file pointer for the data file for the given 1061 part of speech. 1062 """ 1063 if pos == ADJ_SAT: 1064 pos = ADJ 1065 if self._data_file_map.get(pos) is None: 1066 fileid = 'data.%s' % self._FILEMAP[pos] 1067 self._data_file_map[pos] = self.open(fileid) 1068 return self._data_file_map[pos]
1069
1070 - def _synset_from_pos_and_offset(self, pos, offset):
1071 # Check to see if the synset is in the cache 1072 if offset in self._synset_offset_cache[pos]: 1073 return self._synset_offset_cache[pos][offset] 1074 1075 data_file = self._data_file(pos) 1076 data_file.seek(offset) 1077 data_file_line = data_file.readline() 1078 synset = self._synset_from_pos_and_line(pos, data_file_line) 1079 assert synset.offset == offset 1080 self._synset_offset_cache[pos][offset] = synset 1081 return synset
1082
1083 - def _synset_from_pos_and_line(self, pos, data_file_line):
1084 # Construct a new (empty) synset. 1085 synset = Synset(self) 1086 1087 # parse the entry for this synset 1088 try: 1089 1090 # parse out the definitions and examples from the gloss 1091 columns_str, gloss = data_file_line.split('|') 1092 gloss = gloss.strip() 1093 definitions = [] 1094 for gloss_part in gloss.split(';'): 1095 gloss_part = gloss_part.strip() 1096 if gloss_part.startswith('"'): 1097 synset.examples.append(gloss_part.strip('"')) 1098 else: 1099 definitions.append(gloss_part) 1100 synset.definition = '; '.join(definitions) 1101 1102 # split the other info into fields 1103 next = iter(columns_str.split()).next 1104 1105 # get the offset 1106 synset.offset = int(next()) 1107 1108 # determine the lexicographer file name 1109 lexname_index = int(next()) 1110 synset.lexname = self._lexnames[lexname_index] 1111 1112 # get the part of speech 1113 synset.pos = next() 1114 1115 # create Lemma objects for each lemma 1116 n_lemmas = int(next(), 16) 1117 for _ in xrange(n_lemmas): 1118 # get the lemma name 1119 lemma_name = next() 1120 # get the lex_id (used for sense_keys) 1121 lex_id = int(next(), 16) 1122 # If the lemma has a syntactic marker, extract it. 1123 m = re.match(r'(.*?)(\(.*\))?$', lemma_name) 1124 lemma_name, syn_mark = m.groups() 1125 # create the lemma object 1126 lemma = Lemma(self, synset, lemma_name, lexname_index, 1127 lex_id, syn_mark) 1128 synset.lemmas.append(lemma) 1129 synset.lemma_names.append(lemma.name) 1130 1131 # collect the pointer tuples 1132 n_pointers = int(next()) 1133 for _ in xrange(n_pointers): 1134 symbol = next() 1135 offset = int(next()) 1136 pos = next() 1137 lemma_ids_str = next() 1138 if lemma_ids_str == '0000': 1139 synset._pointers[symbol].add((pos, offset)) 1140 else: 1141 source_index = int(lemma_ids_str[:2], 16) - 1 1142 target_index = int(lemma_ids_str[2:], 16) - 1 1143 source_lemma_name = synset.lemmas[source_index].name 1144 lemma_pointers = synset._lemma_pointers 1145 tups = lemma_pointers[source_lemma_name, symbol] 1146 tups.add((pos, offset, target_index)) 1147 1148 # read the verb frames 1149 try: 1150 frame_count = int(next()) 1151 except StopIteration: 1152 pass 1153 else: 1154 for _ in xrange(frame_count): 1155 # read the plus sign 1156 plus = next() 1157 assert plus == '+' 1158 # read the frame and lemma number 1159 frame_number = int(next()) 1160 frame_string_fmt = VERB_FRAME_STRINGS[frame_number] 1161 lemma_number = int(next(), 16) 1162 # lemma number of 00 means all words in the synset 1163 if lemma_number == 0: 1164 synset.frame_ids.append(frame_number) 1165 for lemma in synset.lemmas: 1166 lemma.frame_ids.append(frame_number) 1167 lemma.frame_strings.append(frame_string_fmt % 1168 lemma.name) 1169 # only a specific word in the synset 1170 else: 1171 lemma = synset.lemmas[lemma_number - 1] 1172 lemma.frame_ids.append(frame_number) 1173 lemma.frame_strings.append(frame_string_fmt % 1174 lemma.name) 1175 1176 # raise a more informative error with line text 1177 except ValueError, e: 1178 raise WordNetError('line %r: %s' % (data_file_line, e)) 1179 1180 # set sense keys for Lemma objects - note that this has to be 1181 # done afterwards so that the relations are available 1182 for lemma in synset.lemmas: 1183 if synset.pos is ADJ_SAT: 1184 head_lemma = synset.similar_tos()[0].lemmas[0] 1185 head_name = head_lemma.name 1186 head_id = '%02d' % head_lemma._lex_id 1187 else: 1188 head_name = head_id = '' 1189 tup = (lemma.name, WordNetCorpusReader._pos_numbers[synset.pos], 1190 lemma._lexname_index, lemma._lex_id, head_name, 
head_id) 1191 lemma.key = ('%s%%%d:%02d:%02d:%s:%s' % tup).lower() 1192 1193 # the canonical name is based on the first lemma 1194 lemma_name = synset.lemmas[0].name.lower() 1195 offsets = self._lemma_pos_offset_map[lemma_name][synset.pos] 1196 sense_index = offsets.index(synset.offset) 1197 tup = lemma_name, synset.pos, sense_index + 1 1198 synset.name = '%s.%s.%02i' % tup 1199 1200 return synset
1201 1202 #//////////////////////////////////////////////////////////// 1203 # Retrieve synsets and lemmas. 1204 #////////////////////////////////////////////////////////////
1205 - def synsets(self, lemma, pos=None):
1206 """Load all synsets with a given lemma and part of speech tag. 1207 If no pos is specified, all synsets for all parts of speech 1208 will be loaded. 1209 """ 1210 lemma = lemma.lower() 1211 get_synset = self._synset_from_pos_and_offset 1212 index = self._lemma_pos_offset_map 1213 1214 if pos is None: 1215 pos = POS_LIST 1216 1217 return [get_synset(p, offset) 1218 for p in pos 1219 for form in self._morphy(lemma, p) 1220 for offset in index[form].get(p, [])]
1221
1222 - def lemmas(self, lemma, pos=None):
1223 """Return all Lemma objects with a name matching the specified lemma 1224 name and part of speech tag. Matches any part of speech tag if none is 1225 specified.""" 1226 return [lemma_obj 1227 for synset in self.synsets(lemma, pos) 1228 for lemma_obj in synset.lemmas 1229 if lemma_obj.name == lemma]
1230
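synsets() passes each query form through Morphy (see _morphy() below) before consulting the index, so inflected forms resolve too. A sketch (standard data):

    from nltk.corpus import wordnet as wn

    print wn.synsets('dogs')               # 'dogs' is lemmatized to 'dog' before lookup
    print wn.synsets('dog', pos=wn.VERB)   # restricted to verbs, e.g. [Synset('chase.v.01')]
    print wn.lemmas('dog', pos=wn.NOUN)    # only Lemma objects whose name is exactly 'dog'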
1231 - def all_lemma_names(self, pos=None):
1232 """Return all lemma names for all synsets for the given 1233 part of speech tag. If pos is not specified, all synsets 1234 for all parts of speech will be used. 1235 """ 1236 if pos is None: 1237 return iter(self._lemma_pos_offset_map) 1238 else: 1239 return (lemma 1240 for lemma in self._lemma_pos_offset_map 1241 if pos in self._lemma_pos_offset_map[lemma])
1242
1243 - def all_synsets(self, pos=None):
1244 """Iterate over all synsets with a given part of speech tag. 1245 If no pos is specified, all synsets for all parts of speech 1246 will be loaded. 1247 """ 1248 if pos is None: 1249 pos_tags = self._FILEMAP.keys() 1250 else: 1251 pos_tags = [pos] 1252 1253 cache = self._synset_offset_cache 1254 from_pos_and_line = self._synset_from_pos_and_line 1255 1256 # generate all synsets for each part of speech 1257 for pos_tag in pos_tags: 1258 # Open the file for reading. Note that we can not re-use 1259 # the file poitners from self._data_file_map here, because 1260 # we're defining an iterator, and those file pointers might 1261 # be moved while we're not looking. 1262 if pos_tag == ADJ_SAT: 1263 pos_tag = ADJ 1264 fileid = 'data.%s' % self._FILEMAP[pos_tag] 1265 data_file = self.open(fileid) 1266 1267 try: 1268 # generate synsets for each line in the POS file 1269 offset = data_file.tell() 1270 line = data_file.readline() 1271 while line: 1272 if not line[0].isspace(): 1273 if offset in cache[pos_tag]: 1274 # See if the synset is cached 1275 synset = cache[pos_tag][offset] 1276 else: 1277 # Otherwise, parse the line 1278 synset = from_pos_and_line(pos_tag, line) 1279 cache[pos_tag][offset] = synset 1280 1281 # adjective satellites are in the same file as 1282 # adjectives so only yield the synset if it's actually 1283 # a satellite 1284 if pos_tag == ADJ_SAT: 1285 if synset.pos == pos_tag: 1286 yield synset 1287 1288 # for all other POS tags, yield all synsets (this means 1289 # that adjectives also include adjective satellites) 1290 else: 1291 yield synset 1292 offset = data_file.tell() 1293 line = data_file.readline() 1294 1295 # close the extra file handle we opened 1296 except: 1297 data_file.close() 1298 raise 1299 else: 1300 data_file.close()
1301 1302 #//////////////////////////////////////////////////////////// 1303 # Misc 1304 #////////////////////////////////////////////////////////////
1305 - def lemma_count(self, lemma):
1306 """Return the frequency count for this Lemma""" 1307 # open the count file if we haven't already 1308 if self._key_count_file is None: 1309 self._key_count_file = self.open('cntlist.rev') 1310 # find the key in the counts file and return the count 1311 line = _binary_search_file(self._key_count_file, lemma.key) 1312 if line: 1313 return int(line.rsplit(' ', 1)[-1]) 1314 else: 1315 return 0
1316
1317 - def path_similarity(self, synset1, synset2, verbose=False, simulate_root=True):
1318 return synset1.path_similarity(synset2, verbose, simulate_root)
1319 path_similarity.__doc__ = Synset.path_similarity.__doc__ 1320
1321 - def lch_similarity(self, synset1, synset2, verbose=False, simulate_root=True):
1322 return synset1.lch_similarity(synset2, verbose, simulate_root)
1323 lch_similarity.__doc__ = Synset.lch_similarity.__doc__ 1324
1325 - def wup_similarity(self, synset1, synset2, verbose=False, simulate_root=True):
1326 return synset1.wup_similarity(synset2, verbose, simulate_root)
1327 wup_similarity.__doc__ = Synset.wup_similarity.__doc__ 1328
1329 - def res_similarity(self, synset1, synset2, ic, verbose=False):
1330 return synset1.res_similarity(synset2, ic, verbose)
1331 res_similarity.__doc__ = Synset.res_similarity.__doc__ 1332
1333 - def jcn_similarity(self, synset1, synset2, ic, verbose=False):
1334 return synset1.jcn_similarity(synset2, ic, verbose)
1335 jcn_similarity.__doc__ = Synset.jcn_similarity.__doc__ 1336
1337 - def lin_similarity(self, synset1, synset2, ic, verbose=False):
1338 return synset1.lin_similarity(synset2, ic, verbose)
1339 lin_similarity.__doc__ = Synset.lin_similarity.__doc__ 1340 1341 #//////////////////////////////////////////////////////////// 1342 # Morphy 1343 #//////////////////////////////////////////////////////////// 1344 # Morphy, adapted from Oliver Steele's pywordnet
1345 - def morphy(self, form, pos=None):
1346 """ 1347 Find a possible base form for the given form, with the given 1348 part of speech, by checking WordNet's list of exceptional 1349 forms, and by recursively stripping affixes for this part of 1350 speech until a form in WordNet is found. 1351 1352 >>> from nltk.corpus import wordnet as wn 1353 >>> wn.morphy('dogs') 1354 'dog' 1355 >>> wn.morphy('churches') 1356 'church' 1357 >>> wn.morphy('aardwolves') 1358 'aardwolf' 1359 >>> wn.morphy('abaci') 1360 'abacus' 1361 >>> wn.morphy('hardrock', wn.ADV) 1362 >>> wn.morphy('book', wn.NOUN) 1363 'book' 1364 >>> wn.morphy('book', wn.ADJ) 1365 """ 1366 1367 if pos is None: 1368 morphy = self._morphy 1369 analyses = chain(a for p in POS_LIST for a in morphy(form, p)) 1370 else: 1371 analyses = self._morphy(form, pos) 1372 1373 # get the first one we find 1374 first = list(islice(analyses, 1)) 1375 if len(first) == 1: 1376 return first[0] 1377 else: 1378 return None
1379 1380 MORPHOLOGICAL_SUBSTITUTIONS = { 1381 NOUN: [('s', ''), ('ses', 's'), ('ves', 'f'), ('xes', 'x'), 1382 ('zes', 'z'), ('ches', 'ch'), ('shes', 'sh'), 1383 ('men', 'man'), ('ies', 'y')], 1384 VERB: [('s', ''), ('ies', 'y'), ('es', 'e'), ('es', ''), 1385 ('ed', 'e'), ('ed', ''), ('ing', 'e'), ('ing', '')], 1386 ADJ: [('er', ''), ('est', ''), ('er', 'e'), ('est', 'e')], 1387 ADV: []} 1388
1389 - def _morphy(self, form, pos):
1390 # from jordanbg: 1391 # Given an original string x 1392 # 1. Apply rules once to the input to get y1, y2, y3, etc. 1393 # 2. Return all that are in the database 1394 # 3. If there are no matches, keep applying rules until you either 1395 # find a match or you can't go any further 1396 1397 exceptions = self._exception_map[pos] 1398 substitutions = self.MORPHOLOGICAL_SUBSTITUTIONS[pos] 1399 1400 def apply_rules(forms): 1401 return [form[:-len(old)] + new 1402 for form in forms 1403 for old, new in substitutions 1404 if form.endswith(old)]
1405 1406 def filter_forms(forms): 1407 result = [] 1408 seen = set() 1409 for form in forms: 1410 if form in self._lemma_pos_offset_map: 1411 if pos in self._lemma_pos_offset_map[form]: 1412 if form not in seen: 1413 result.append(form) 1414 seen.add(form) 1415 return result
1416 1417 # 0. Check the exception lists 1418 if form in exceptions: 1419 return filter_forms([form] + exceptions[form]) 1420 1421 # 1. Apply rules once to the input to get y1, y2, y3, etc. 1422 forms = apply_rules([form]) 1423 1424 # 2. Return all that are in the database (and check the original too) 1425 results = filter_forms([form] + forms) 1426 if results: 1427 return results 1428 1429 # 3. If there are no matches, keep applying rules until we find a match 1430 while forms: 1431 forms = apply_rules(forms) 1432 results = filter_forms(forms) 1433 if results: 1434 return results 1435 1436 # Return an empty list if we can't find anything 1437 return [] 1438 1439 #//////////////////////////////////////////////////////////// 1440 # Create information content from corpus 1441 #////////////////////////////////////////////////////////////
1442 - def ic(self, corpus, weight_senses_equally = False, smoothing = 1.0):
1443 """ 1444 Creates an information content lookup dictionary from a corpus. 1445 1446 @type corpus: L{CorpusReader} 1447 @param corpus: The corpus from which we create an information 1448 content dictionary. 1449 @type weight_senses_equally: L{bool} 1450 @param weight_senses_equally: If this is True, gives all 1451 possible senses equal weight rather than dividing by the 1452 number of possible senses. (If a word has 3 synses, each 1453 sense gets 0.3333 per appearance when this is False, 1.0 when 1454 it is true.) 1455 @param smoothing: How much do we smooth synset counts (default is 1.0) 1456 @type smoothing: L{float} 1457 @return: An information content dictionary 1458 """ 1459 counts = FreqDist() 1460 for ww in corpus.words(): 1461 counts.inc(ww) 1462 1463 ic = {} 1464 for pp in POS_LIST: 1465 ic[pp] = defaultdict(float) 1466 1467 # Initialize the counts with the smoothing value 1468 if smoothing > 0.0: 1469 for ss in self.all_synsets(): 1470 pos = ss.pos 1471 if pos == ADJ_SAT: 1472 pos = ADJ 1473 ic[pos][ss.offset] = smoothing 1474 1475 for ww in counts: 1476 possible_synsets = self.synsets(ww) 1477 if len(possible_synsets) == 0: 1478 continue 1479 1480 # Distribute weight among possible synsets 1481 weight = float(counts[ww]) 1482 if not weight_senses_equally: 1483 weight /= float(len(possible_synsets)) 1484 1485 for ss in possible_synsets: 1486 pos = ss.pos 1487 if pos == ADJ_SAT: 1488 pos = ADJ 1489 for level in ss._iter_hypernym_lists(): 1490 for hh in level: 1491 ic[pos][hh.offset] += weight 1492 # Add the weight to the root 1493 ic[pos][0] += weight 1494 return ic
1495 1496 1497 ###################################################################### 1498 ## WordNet Information Content Corpus Reader 1499 ###################################################################### 1500
1501 -class WordNetICCorpusReader(CorpusReader):
1502 """ 1503 A corpus reader for the WordNet information content corpus. 1504 """ 1505
1506 - def __init__(self, root, fileids):
1507          CorpusReader.__init__(self, root, fileids)
1508  
1509      # this load function would be more efficient if the data was pickled
1510      # Note that we can't use NLTK's frequency distributions because
1511      # synsets are overlapping (each instance of a synset also counts
1512      # as an instance of its hypernyms)
1513 - def ic(self, icfile):
1514 """ 1515 Load an information content file from the wordnet_ic corpus 1516 and return a dictionary. This dictionary has just two keys, 1517 NOUN and VERB, whose values are dictionaries that map from 1518 synsets to information content values. 1519 1520 @type icfile: L{str} 1521 @param icfile: The name of the wordnet_ic file (e.g. "ic-brown.dat") 1522 @return: An information content dictionary 1523 """ 1524 ic = {} 1525 ic[NOUN] = defaultdict(float) 1526 ic[VERB] = defaultdict(float) 1527 for num, line in enumerate(self.open(icfile)): 1528 if num == 0: # skip the header 1529 continue 1530 fields = line.split() 1531 offset = int(fields[0][:-1]) 1532 value = float(fields[1]) 1533 pos = _get_pos(fields[0]) 1534 if len(fields) == 3 and fields[2] == "ROOT": 1535 # Store root count. 1536 ic[pos][0] += value 1537 if value != 0: 1538 ic[pos][offset] = value 1539 return ic
1540 1541 1542 ###################################################################### 1543 # Similarity metrics 1544 ###################################################################### 1545 1546 # TODO: Add in the option to manually add a new root node; this will be 1547 # useful for verb similarity as there exist multiple verb taxonomies. 1548 1549 # More information about the metrics is available at 1550 # http://marimba.d.umn.edu/similarity/measures.html 1551
1552 -def path_similarity(synset1, synset2, verbose=False, simulate_root=True):
1553 return synset1.path_similarity(synset2, verbose, simulate_root)
1554 path_similarity.__doc__ = Synset.path_similarity.__doc__ 1555 1556
1557 -def lch_similarity(synset1, synset2, verbose=False, simulate_root=True):
1558 return synset1.lch_similarity(synset2, verbose, simulate_root)
1559 lch_similarity.__doc__ = Synset.lch_similarity.__doc__ 1560 1561
1562 -def wup_similarity(synset1, synset2, verbose=False, simulate_root=True):
1563 return synset1.wup_similarity(synset2, verbose, simulate_root)
1564 wup_similarity.__doc__ = Synset.wup_similarity.__doc__ 1565 1566
1567 -def res_similarity(synset1, synset2, ic, verbose=False):
1568      return synset1.res_similarity(synset2, ic, verbose)
1569 res_similarity.__doc__ = Synset.res_similarity.__doc__ 1570 1571
1572 -def jcn_similarity(synset1, synset2, ic, verbose=False):
1573      return synset1.jcn_similarity(synset2, ic, verbose)
1574 jcn_similarity.__doc__ = Synset.jcn_similarity.__doc__ 1575 1576
1577 -def lin_similarity(synset1, synset2, ic, verbose=False):
1578      return synset1.lin_similarity(synset2, ic, verbose)
1579 lin_similarity.__doc__ = Synset.lin_similarity.__doc__ 1580 1581
1582 -def _lcs_by_depth(synset1, synset2, verbose=False):
1583 """ 1584 Finds the least common subsumer of two synsets in a WordNet taxonomy, 1585 where the least common subsumer is defined as the ancestor node common 1586 to both input synsets whose shortest path to the root node is the longest. 1587 1588 @type synset1: L{Synset} 1589 @param synset1: First input synset. 1590 @type synset2: L{Synset} 1591 @param synset2: Second input synset. 1592 @return: The ancestor synset common to both input synsets which is also the 1593 LCS. 1594 """ 1595 subsumer = None 1596 max_min_path_length = -1 1597 1598 subsumers = synset1.common_hypernyms(synset2) 1599 1600 if verbose: 1601 print "> Subsumers1:", subsumers 1602 1603 # Eliminate those synsets which are ancestors of other synsets in the 1604 # set of subsumers. 1605 1606 eliminated = set() 1607 hypernym_relation = lambda s: s.hypernyms() + s.instance_hypernyms() 1608 for s1 in subsumers: 1609 for s2 in subsumers: 1610 if s2 in s1.closure(hypernym_relation): 1611 eliminated.add(s2) 1612 if verbose: 1613 print "> Eliminated:", eliminated 1614 1615 subsumers = [s for s in subsumers if s not in eliminated] 1616 1617 if verbose: 1618 print "> Subsumers2:", subsumers 1619 1620 # Calculate the length of the shortest path to the root for each 1621 # subsumer. Select the subsumer with the longest of these. 1622 1623 for candidate in subsumers: 1624 1625 paths_to_root = candidate.hypernym_paths() 1626 min_path_length = -1 1627 1628 for path in paths_to_root: 1629 if min_path_length < 0 or len(path) < min_path_length: 1630 min_path_length = len(path) 1631 1632 if min_path_length > max_min_path_length: 1633 max_min_path_length = min_path_length 1634 subsumer = candidate 1635 1636 if verbose: 1637 print "> LCS Subsumer by depth:", subsumer 1638 return subsumer
1639 1640
1641 -def _lcs_ic(synset1, synset2, ic, verbose=False):
1642 """ 1643 Get the information content of the least common subsumer that has 1644 the highest information content value. If two nodes have no 1645 explicit common subsumer, assume that they share an artificial 1646 root node that is the hypernym of all explicit roots. 1647 1648 @type synset1: L{Synset} 1649 @param synset1: First input synset. 1650 @type synset2: L{Synset} 1651 @param synset2: Second input synset. Must be the same part of 1652 speech as the first synset. 1653 @type ic: C{dict} 1654 @param ic: an information content object (as returned by L{load_ic()}). 1655 @return: The information content of the two synsets and their most 1656 informative subsumer 1657 """ 1658 if synset1.pos != synset2.pos: 1659 raise WordNetError('Computing the least common subsumer requires ' + \ 1660 '%s and %s to have the same part of speech.' % \ 1661 (synset1, synset2)) 1662 1663 ic1 = information_content(synset1, ic) 1664 ic2 = information_content(synset2, ic) 1665 subsumers = synset1.common_hypernyms(synset2) 1666 if len(subsumers) == 0: 1667 subsumer_ic = 0 1668 else: 1669 subsumer_ic = max(information_content(s, ic) for s in subsumers) 1670 1671 if verbose: 1672 print "> LCS Subsumer by content:", subsumer_ic 1673 1674 return ic1, ic2, subsumer_ic
1675 1676 1677 # Utility functions 1678
1679 -def information_content(synset, ic):
1680 try: 1681 icpos = ic[synset.pos] 1682 except KeyError: 1683 msg = 'Information content file has no entries for part-of-speech: %s' 1684 raise WordNetError(msg % synset.pos) 1685 1686 counts = icpos[synset.offset] 1687 if counts == 0: 1688 return _INF 1689 else: 1690 return -math.log(counts / icpos[0])
1691 1692 1693 # get the part of speech (NOUN or VERB) from the information content record 1694 # (each identifier has a 'n' or 'v' suffix) 1695
1696 -def _get_pos(field):
1697 if field[-1] == 'n': 1698 return NOUN 1699 elif field[-1] == 'v': 1700 return VERB 1701 else: 1702 msg = "Unidentified part of speech in WordNet Information Content file" 1703 raise ValueError(msg)
1704 1705 1706 ###################################################################### 1707 # Demo 1708 ###################################################################### 1709
1710 -def demo():
1711 import nltk 1712 print 'loading wordnet' 1713 wn = WordNetCorpusReader(nltk.data.find('corpora/wordnet')) 1714 print 'done loading' 1715 S = wn.synset 1716 L = wn.lemma 1717 1718 print 'getting a synset for go' 1719 move_synset = S('go.v.21') 1720 print move_synset.name, move_synset.pos, move_synset.lexname 1721 print move_synset.lemma_names 1722 print move_synset.definition 1723 print move_synset.examples 1724 1725 zap_n = ['zap.n.01'] 1726 zap_v = ['zap.v.01', 'zap.v.02', 'nuke.v.01', 'microwave.v.01'] 1727 1728 def _get_synsets(synset_strings): 1729 return [S(synset) for synset in synset_strings]
1730 1731 zap_n_synsets = _get_synsets(zap_n) 1732 zap_v_synsets = _get_synsets(zap_v) 1733 zap_synsets = set(zap_n_synsets + zap_v_synsets) 1734 1735 print zap_n_synsets 1736 print zap_v_synsets 1737 1738 print "Navigations:" 1739 print S('travel.v.01').hypernyms() 1740 print S('travel.v.02').hypernyms() 1741 print S('travel.v.03').hypernyms() 1742 1743 print L('zap.v.03.nuke').derivationally_related_forms() 1744 print L('zap.v.03.atomize').derivationally_related_forms() 1745 print L('zap.v.03.atomise').derivationally_related_forms() 1746 print L('zap.v.03.zap').derivationally_related_forms() 1747 1748 print S('dog.n.01').member_holonyms() 1749 print S('dog.n.01').part_meronyms() 1750 1751 print S('breakfast.n.1').hypernyms() 1752 print S('meal.n.1').hyponyms() 1753 print S('Austen.n.1').instance_hypernyms() 1754 print S('composer.n.1').instance_hyponyms() 1755 1756 print S('faculty.n.2').member_meronyms() 1757 print S('copilot.n.1').member_holonyms() 1758 1759 print S('table.n.2').part_meronyms() 1760 print S('course.n.7').part_holonyms() 1761 1762 print S('water.n.1').substance_meronyms() 1763 print S('gin.n.1').substance_holonyms() 1764 1765 print L('leader.n.1.leader').antonyms() 1766 print L('increase.v.1.increase').antonyms() 1767 1768 print S('snore.v.1').entailments() 1769 print S('heavy.a.1').similar_tos() 1770 print S('light.a.1').attributes() 1771 print S('heavy.a.1').attributes() 1772 1773 print L('English.a.1.English').pertainyms() 1774 1775 print S('person.n.01').root_hypernyms() 1776 print S('sail.v.01').root_hypernyms() 1777 print S('fall.v.12').root_hypernyms() 1778 1779 print S('person.n.01').lowest_common_hypernyms(S('dog.n.01')) 1780 1781 print S('dog.n.01').path_similarity(S('cat.n.01')) 1782 print S('dog.n.01').lch_similarity(S('cat.n.01')) 1783 print S('dog.n.01').wup_similarity(S('cat.n.01')) 1784 1785 wnic = WordNetICCorpusReader(nltk.data.find('corpora/wordnet_ic'), 1786 '.*\.dat') 1787 ic = wnic.ic('ic-brown.dat') 1788 print S('dog.n.01').jcn_similarity(S('cat.n.01'), ic) 1789 1790 ic = wnic.ic('ic-semcor.dat') 1791 print S('dog.n.01').lin_similarity(S('cat.n.01'), ic) 1792 1793 print S('code.n.03').topic_domains() 1794 print S('pukka.a.01').region_domains() 1795 print S('freaky.a.01').usage_domains() 1796 1797 if __name__ == '__main__': 1798 demo() 1799