1
2
3
4
5
6
7
8
9
10
11 import math
12 import re
13 from itertools import islice, chain
14 from operator import itemgetter
15
16 from nltk.compat import defaultdict
17 from nltk.corpus.reader import CorpusReader
18 from nltk.util import binary_search_file as _binary_search_file
19 from nltk.probability import FreqDist
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# Stand-in for an infinite similarity score; jcn_similarity returns it when
# the two synsets are identical or their information-content difference is 0.
_INF = 1e300

# Single-letter part-of-speech tags used throughout this module.
ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'

# Parts of speech tried, in this order, when a caller passes pos=None to
# synsets() or morphy().  ADJ_SAT is not listed here.
POS_LIST = [NOUN, VERB, ADJ, ADV]

# Sentence-frame templates for verbs, indexed by the frame number read from
# the data file.  Index 0 is a None placeholder; each "%s" is filled with a
# lemma name when the frame strings are built.
VERB_FRAME_STRINGS = (
    None,
    "Something %s",
    "Somebody %s",
    "It is %sing",
    "Something is %sing PP",
    "Something %s something Adjective/Noun",
    "Something %s Adjective/Noun",
    "Somebody %s Adjective",
    "Somebody %s something",
    "Somebody %s somebody",
    "Something %s somebody",
    "Something %s something",
    "Something %s to somebody",
    "Somebody %s on something",
    "Somebody %s somebody something",
    "Somebody %s something to somebody",
    "Somebody %s something from somebody",
    "Somebody %s somebody with something",
    "Somebody %s somebody of something",
    "Somebody %s something on somebody",
    "Somebody %s somebody PP",
    "Somebody %s something PP",
    "Somebody %s PP",
    "Somebody's (body part) %s",
    "Somebody %s somebody to INFINITIVE",
    "Somebody %s somebody INFINITIVE",
    "Somebody %s that CLAUSE",
    "Somebody %s to somebody",
    "Somebody %s to INFINITIVE",
    "Somebody %s whether INFINITIVE",
    "Somebody %s somebody into V-ing something",
    "Somebody %s something with something",
    "Somebody %s INFINITIVE",
    "Somebody %s VERB-ing",
    "It %s that CLAUSE",
    "Something %s INFINITIVE")
85
86
87
88
89
91 """An exception class for wordnet-related errors."""
92
93
162
163 -class Lemma(_WordNetObject):
164 """
165 The lexical entry for a single morphological form of a
166 sense-disambiguated word.
167
168 Create a Lemma from a "<word>.<pos>.<number>.<lemma>" string where:
169 <word> is the morphological stem identifying the synset
170 <pos> is one of the module attributes ADJ, ADJ_SAT, ADV, NOUN or VERB
171 <number> is the sense number, counting from 0.
172 <lemma> is the morphological form of interest
173
174 Note that <word> and <lemma> can be different, e.g. the Synset
175 'salt.n.03' has the Lemmas 'salt.n.03.salt', 'salt.n.03.saltiness' and
176 'salt.n.03.salinity'.
177
178 Lemma attributes
179 ----------------
180 name - The canonical name of this lemma.
181 synset - The synset that this lemma belongs to.
182 syntactic_marker - For adjectives, the WordNet string identifying the
183 syntactic position relative modified noun. See:
184 http://wordnet.princeton.edu/man/wninput.5WN.html#sect10
185 For all other parts of speech, this attribute is None.
186
187 Lemma methods
188 -------------
189 Lemmas have the following methods for retrieving related Lemmas. They
190 correspond to the names for the pointer symbols defined here:
191 http://wordnet.princeton.edu/man/wninput.5WN.html#sect3
192 These methods all return lists of Lemmas.
193
194 antonyms
195 hypernyms
196 instance_hypernyms
197 hyponyms
198 instance_hyponyms
199 member_holonyms
200 substance_holonyms
201 part_holonyms
202 member_meronyms
203 substance_meronyms
204 part_meronyms
205 topic_domains
206 region_domains
207 usage_domains
208 attributes
209 derivationally_related_forms
210 entailments
211 causes
212 also_sees
213 verb_groups
214 similar_tos
215 pertainyms
216 """
217
218
def __init__(self, wordnet_corpus_reader, synset, name,
             lexname_index, lex_id, syntactic_marker):
    """
    Store the fields describing one lemma of a synset.

    @param wordnet_corpus_reader: the reader this lemma was loaded by;
        kept so that the lemma can ask it for its frequency count.
    @param synset: the synset that this lemma belongs to.
    @param name: the canonical name of this lemma.
    @param lexname_index: index of the lexicographer file, later used
        when the sense key is formatted.
    @param lex_id: the lex id, later used when the sense key is formatted.
    @param syntactic_marker: the syntactic marker split off the lemma
        name, or None when there was none.
    """
    # Owning objects.
    self._wordnet_corpus_reader = wordnet_corpus_reader
    self.synset = synset

    # Identity of the lemma itself.
    self.name = name
    self.syntactic_marker = syntactic_marker
    self._lexname_index = lexname_index
    self._lex_id = lex_id

    # Verb frames start out empty; the data-file parser appends to them.
    self.frame_strings, self.frame_ids = [], []

    # The sense key is assigned once the whole synset line has been parsed.
    self.key = None
231
235
241
243 """Return the frequency count for this Lemma"""
244 return self._wordnet_corpus_reader.lemma_count(self)
245
248
251
254
255
257 """Create a Synset from a "<lemma>.<pos>.<number>" string where:
258 <lemma> is the word's morphological stem
259 <pos> is one of the module attributes ADJ, ADJ_SAT, ADV, NOUN or VERB
260 <number> is the sense number, counting from 0.
261
262 Synset attributes
263 -----------------
264 name - The canonical name of this synset, formed using the first lemma
265 of this synset. Note that this may be different from the name
266 passed to the constructor if that string used a different lemma to
267 identify the synset.
268 pos - The synset's part of speech, matching one of the module level
269 attributes ADJ, ADJ_SAT, ADV, NOUN or VERB.
270 lemmas - A list of the Lemma objects for this synset.
271 definition - The definition for this synset.
272 examples - A list of example strings for this synset.
273 offset - The offset in the WordNet dict file of this synset.
274 #lexname - The name of the lexicographer file containing this synset.
275
276 Synset methods
277 --------------
278 Synsets have the following methods for retrieving related Synsets.
279 They correspond to the names for the pointer symbols defined here:
280 http://wordnet.princeton.edu/man/wninput.5WN.html#sect3
281 These methods all return lists of Synsets.
282
283 hypernyms
284 instance_hypernyms
285 hyponyms
286 instance_hyponyms
287 member_holonyms
288 substance_holonyms
289 part_holonyms
290 member_meronyms
291 substance_meronyms
292 part_meronyms
293 attributes
294 entailments
295 causes
296 also_sees
297 verb_groups
298 similar_tos
299
300 Additionally, Synsets support the following methods specific to the
301 hypernym relation:
302
303 root_hypernyms
304 common_hypernyms
305 lowest_common_hypernyms
306
307 Note that Synsets do not support the following relations because
308 these are defined by WordNet as lexical relations:
309
310 antonyms
311 derivationally_related_forms
312 pertainyms
313 """
314
def __init__(self, wordnet_corpus_reader):
    """
    Build an empty synset bound to the given corpus reader.

    Every public attribute starts out as None or an empty list; the
    reader's data-file parser is what fills them in afterwards.

    @param wordnet_corpus_reader: the reader that owns this synset.
    """
    self._wordnet_corpus_reader = wordnet_corpus_reader

    # Scalar fields, unknown until a data-file line has been parsed.
    self.pos = self.offset = self.name = None
    self.definition = self.lexname = None

    # List-valued fields, each with its own fresh list.
    self.frame_ids, self.lemmas = [], []
    self.lemma_names, self.lemma_infos = [], []
    self.examples = []

    # Relation tables: the parser adds (pos, offset) tuples to _pointers
    # and (pos, offset, target_index) tuples to _lemma_pointers.
    self._pointers = defaultdict(set)
    self._lemma_pointers = defaultdict(set)
333
335 if self.pos == NOUN:
336 if self._wordnet_corpus_reader.get_version() == '1.6':
337 return True
338 else:
339 return False
340 elif self.pos == VERB:
341 return True
342
344 """Get the topmost hypernyms of this synset in WordNet."""
345
346 result = []
347 seen = set()
348 todo = [self]
349 while todo:
350 next_synset = todo.pop()
351 if next_synset not in seen:
352 seen.add(next_synset)
353 next_hypernyms = next_synset.hypernyms() + \
354 next_synset.instance_hypernyms()
355 if not next_hypernyms:
356 result.append(next_synset)
357 else:
358 todo.extend(next_hypernyms)
359 return result
360
361
362
363
364
365
366
367
368
370 """
371 @return: The length of the longest hypernym path from this
372 synset to the root.
373 """
374
375 if "_max_depth" not in self.__dict__:
376 hypernyms = self.hypernyms() + self.instance_hypernyms()
377 if not hypernyms:
378 self._max_depth = 0
379 else:
380 self._max_depth = 1 + max(h.max_depth() for h in hypernyms)
381 return self._max_depth
382
384 """
385 @return: The length of the shortest hypernym path from this
386 synset to the root.
387 """
388
389 if "_min_depth" not in self.__dict__:
390 hypernyms = self.hypernyms() + self.instance_hypernyms()
391 if not hypernyms:
392 self._min_depth = 0
393 else:
394 self._min_depth = 1 + min(h.min_depth() for h in hypernyms)
395 return self._min_depth
396
398 """Return the transitive closure of source under the rel
399 relationship, breadth-first
400
401 >>> from nltk.corpus import wordnet as wn
402 >>> dog = wn.synset('dog.n.01')
403 >>> hyp = lambda s:s.hypernyms()
404 >>> list(dog.closure(hyp))
405 [Synset('domestic_animal.n.01'), Synset('canine.n.02'),
406 Synset('animal.n.01'), Synset('carnivore.n.01'),
407 Synset('organism.n.01'), Synset('placental.n.01'),
408 Synset('living_thing.n.01'), Synset('mammal.n.01'),
409 Synset('whole.n.02'), Synset('vertebrate.n.01'),
410 Synset('object.n.01'), Synset('chordate.n.01'),
411 Synset('physical_entity.n.01'), Synset('entity.n.01')]
412 """
413 from nltk.util import breadth_first
414 synset_offsets = []
415 for synset in breadth_first(self, rel, depth):
416 if synset.offset != self.offset:
417 if synset.offset not in synset_offsets:
418 synset_offsets.append(synset.offset)
419 yield synset
420
422 """
423 Get the path(s) from this synset to the root, where each path is a
424 list of the synset nodes traversed on the way to the root.
425
426 @return: A list of lists, where each list gives the node sequence
427 connecting the initial L{Synset} node and a root node.
428 """
429 paths = []
430
431 hypernyms = self.hypernyms() + self.instance_hypernyms()
432 if len(hypernyms) == 0:
433 paths = [[self]]
434
435 for hypernym in hypernyms:
436 for ancestor_list in hypernym.hypernym_paths():
437 ancestor_list.append(self)
438 paths.append(ancestor_list)
439 return paths
440
442 """
443 Find all synsets that are hypernyms of this synset and the
444 other synset.
445
446 @type other: L{Synset}
447 @param other: other input synset.
448 @return: The synsets that are hypernyms of both synsets.
449 """
450 self_synsets = set(self_synset
451 for self_synsets in self._iter_hypernym_lists()
452 for self_synset in self_synsets)
453 other_synsets = set(other_synset
454 for other_synsets in other._iter_hypernym_lists()
455 for other_synset in other_synsets)
456 return list(self_synsets.intersection(other_synsets))
457
482
484 """
485 Get the path(s) from this synset to the root, counting the distance
486 of each node from the initial node on the way. A set of
487 (synset, distance) tuples is returned.
488
489 @type distance: C{int}
490 @param distance: the distance (number of edges) from this hypernym to
491 the original hypernym L{Synset} on which this method was called.
492 @return: A set of (L{Synset}, int) tuples where each L{Synset} is
493 a hypernym of the first L{Synset}.
494 """
495 distances = set([(self, distance)])
496 for hypernym in self.hypernyms() + self.instance_hypernyms():
497 distances |= hypernym.hypernym_distances(distance+1, simulate_root=False)
498 if simulate_root:
499 fake_synset = Synset(None)
500 fake_synset.name = '*ROOT*'
501 fake_synset_distance = max(distances, key=itemgetter(1))[1]
502 distances.add((fake_synset, fake_synset_distance+1))
503 return distances
504
506 """
507 Returns the distance of the shortest path linking the two synsets (if
508 one exists). For each synset, all the ancestor nodes and their
509 distances are recorded and compared. The ancestor node common to both
510 synsets that can be reached with the minimum number of traversals is
511 used. If no ancestor nodes are common, None is returned. If a node is
512 compared with itself 0 is returned.
513
514 @type other: L{Synset}
515 @param other: The Synset to which the shortest path will be found.
516 @return: The number of edges in the shortest path connecting the two
517 nodes, or None if no path exists.
518 """
519
520 if self == other:
521 return 0
522
523 path_distance = None
524
525 dist_list1 = self.hypernym_distances(simulate_root=simulate_root)
526 dist_dict1 = {}
527
528 dist_list2 = other.hypernym_distances(simulate_root=simulate_root)
529 dist_dict2 = {}
530
531
532
533
534
535
536 for (l, d) in [(dist_list1, dist_dict1), (dist_list2, dist_dict2)]:
537 for (key, value) in l:
538 if key in d:
539 if value < d[key]:
540 d[key] = value
541 else:
542 d[key] = value
543
544
545
546
547 for synset1 in dist_dict1.keys():
548 for synset2 in dist_dict2.keys():
549 if synset1 == synset2:
550 new_distance = dist_dict1[synset1] + dist_dict2[synset2]
551 if path_distance < 0 or new_distance < path_distance:
552 path_distance = new_distance
553
554 return path_distance
555
def tree(self, rel, depth=-1, cut_mark=None):
    """
    Build a nested-list tree of the synsets reachable from this one.

    The result is a list whose first element is this synset and whose
    remaining elements are the trees of each synset in C{rel(self)}.

    @param rel: a function mapping a synset to a list of related synsets.
    @param depth: how many levels to expand; a negative value (the
        default) never reaches zero and so does not limit the expansion.
    @param cut_mark: if true, appended after a synset whose expansion was
        stopped because C{depth} reached zero.
    @return: the nested list described above.

    >>> from nltk.corpus import wordnet as wn
    >>> dog = wn.synset('dog.n.01')
    >>> hyp = lambda s:s.hypernyms()
    >>> from pprint import pprint
    >>> pprint(dog.tree(hyp))
    [Synset('dog.n.01'),
     [Synset('domestic_animal.n.01'),
      [Synset('animal.n.01'),
       [Synset('organism.n.01'),
        [Synset('living_thing.n.01'),
         [Synset('whole.n.02'),
          [Synset('object.n.01'),
           [Synset('physical_entity.n.01'), [Synset('entity.n.01')]]]]]]]],
     [Synset('canine.n.02'),
      [Synset('carnivore.n.01'),
       [Synset('placental.n.01'),
        [Synset('mammal.n.01'),
         [Synset('vertebrate.n.01'),
          [Synset('chordate.n.01'),
           [Synset('animal.n.01'),
            [Synset('organism.n.01'),
             [Synset('living_thing.n.01'),
              [Synset('whole.n.02'),
               [Synset('object.n.01'),
                [Synset('physical_entity.n.01'),
                 [Synset('entity.n.01')]]]]]]]]]]]]]]
    """
    if depth == 0:
        # Expansion budget used up: stop here, flagging the cut if asked to.
        if cut_mark:
            return [self, cut_mark]
        return [self]

    subtrees = [related.tree(rel, depth - 1, cut_mark)
                for related in rel(self)]
    return [self] + subtrees
592
593
595 """
596 Path Distance Similarity:
597 Return a score denoting how similar two word senses are, based on the
598 shortest path that connects the senses in the is-a (hypernym/hyponym)
599 taxonomy. The score is in the range 0 to 1, except in those cases where
600 a path cannot be found (will only be true for verbs as there are many
601 distinct verb taxonomies), in which case None is returned. A score of
602 1 represents identity i.e. comparing a sense with itself will return 1.
603
604 @type other: L{Synset}
605 @param other: The L{Synset} that this L{Synset} is being compared to.
606 @type simulate_root: L{bool}
607 @param simulate_root: The various verb taxonomies do not
608 share a single root which disallows this metric from working for
609 synsets that are not connected. This flag (True by default)
610 creates a fake root that connects all the taxonomies. Set it
611 to false to disable this behavior. For the noun taxonomy,
612 there is usually a default root except for WordNet version 1.6.
613 If you are using wordnet 1.6, a fake root will be added for nouns
614 as well.
615 @return: A score denoting the similarity of the two L{Synset}s,
616 normally between 0 and 1. None is returned if no connecting path
617 could be found. 1 is returned if a L{Synset} is compared with
618 itself.
619 """
620
621 distance = self.shortest_path_distance(other, simulate_root=simulate_root and self._needs_root())
622 if distance >= 0:
623 return 1.0 / (distance + 1)
624 else:
625 return None
626
628 """
629 Leacock Chodorow Similarity:
630 Return a score denoting how similar two word senses are, based on the
631 shortest path that connects the senses (as above) and the maximum depth
632 of the taxonomy in which the senses occur. The relationship is given as
633 -log(p/2d) where p is the shortest path length and d is the taxonomy
634 depth.
635
636 @type other: L{Synset}
637 @param other: The L{Synset} that this L{Synset} is being compared to.
638 @type simulate_root: L{bool}
639 @param simulate_root: The various verb taxonomies do not
640 share a single root which disallows this metric from working for
641 synsets that are not connected. This flag (True by default)
642 creates a fake root that connects all the taxonomies. Set it
643 to false to disable this behavior. For the noun taxonomy,
644 there is usually a default root except for WordNet version 1.6.
645 If you are using wordnet 1.6, a fake root will be added for nouns
646 as well.
647 @return: A score denoting the similarity of the two L{Synset}s,
648 normally greater than 0. None is returned if no connecting path
649 could be found. If a L{Synset} is compared with itself, the
650 maximum score is returned, which varies depending on the taxonomy
651 depth.
652 """
653
654 if self.pos != other.pos:
655 raise WordNetError('Computing the lch similarity requires ' + \
656 '%s and %s to have the same part of speech.' % \
657 (self, other))
658
659 need_root = self._needs_root()
660
661 if self.pos not in self._wordnet_corpus_reader._max_depth:
662 self._wordnet_corpus_reader._compute_max_depth(self.pos, need_root)
663
664 depth = self._wordnet_corpus_reader._max_depth[self.pos]
665
666 distance = self.shortest_path_distance(other, simulate_root=simulate_root and need_root)
667
668 if distance >= 0:
669 return -math.log((distance + 1) / (2.0 * depth))
670 else:
671 return None
672
674 """
675 Wu-Palmer Similarity:
676 Return a score denoting how similar two word senses are, based on the
677 depth of the two senses in the taxonomy and that of their Least Common
678 Subsumer (most specific ancestor node). Previously, the scores computed
679 by this implementation did _not_ always agree with those given by
680 Pedersen's Perl implementation of WordNet Similarity. However, with
681 the addition of the simulate_root flag (see below), the score for
682 verbs now almost always agree but not always for nouns.
683
684 The LCS does not necessarily feature in the shortest path connecting
685 the two senses, as it is by definition the common ancestor deepest in
686 the taxonomy, not closest to the two senses. Typically, however, it
687 will so feature. Where multiple candidates for the LCS exist, that
688 whose shortest path to the root node is the longest will be selected.
689 Where the LCS has multiple paths to the root, the longer path is used
690 for the purposes of the calculation.
691
692 @type other: L{Synset}
693 @param other: The L{Synset} that this L{Synset} is being compared to.
694 @type simulate_root: L{bool}
695 @param simulate_root: The various verb taxonomies do not
696 share a single root which disallows this metric from working for
697 synsets that are not connected. This flag (True by default)
698 creates a fake root that connects all the taxonomies. Set it
699 to false to disable this behavior. For the noun taxonomy,
700 there is usually a default root except for WordNet version 1.6.
701 If you are using wordnet 1.6, a fake root will be added for nouns
702 as well.
703 @return: A float score denoting the similarity of the two L{Synset}s,
704 normally greater than zero. If no connecting path between the two
705 senses can be found, None is returned.
706
707 """
708
709 need_root = self._needs_root()
710 subsumers = self.lowest_common_hypernyms(other, simulate_root=simulate_root and need_root)
711
712
713 if len(subsumers) == 0:
714 return None
715
716 subsumer = subsumers[0]
717
718
719
720
721
722 depth = subsumer.max_depth() + 1
723
724
725
726
727
728
729
730
731
732
733 len1 = self.shortest_path_distance(subsumer, simulate_root=simulate_root and need_root)
734 len2 = other.shortest_path_distance(subsumer, simulate_root=simulate_root and need_root)
735 if len1 is None or len2 is None:
736 return None
737 len1 += depth
738 len2 += depth
739 return (2.0 * depth) / (len1 + len2)
740
742 """
743 Resnik Similarity:
744 Return a score denoting how similar two word senses are, based on the
745 Information Content (IC) of the Least Common Subsumer (most specific
746 ancestor node).
747
748 @type other: L{Synset}
749 @param other: The L{Synset} that this L{Synset} is being compared to.
750 @type ic: C{dict}
751 @param ic: an information content object (as returned by L{load_ic()}).
752 @return: A float score denoting the similarity of the two L{Synset}s.
753 Synsets whose LCS is the root node of the taxonomy will have a
754 score of 0 (e.g. N['dog'][0] and N['table'][0]).
755 """
756
757 ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
758 return lcs_ic
759
761 """
762 Jiang-Conrath Similarity:
763 Return a score denoting how similar two word senses are, based on the
764 Information Content (IC) of the Least Common Subsumer (most specific
765 ancestor node) and that of the two input Synsets. The relationship is
766 given by the equation 1 / (IC(s1) + IC(s2) - 2 * IC(lcs)).
767
768 @type other: L{Synset}
769 @param other: The L{Synset} that this L{Synset} is being compared to.
770 @type ic: C{dict}
771 @param ic: an information content object (as returned by L{load_ic()}).
772 @return: A float score denoting the similarity of the two L{Synset}s.
773 """
774
775 if self == other:
776 return _INF
777
778 ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
779
780
781
782 if ic1 == 0 or ic2 == 0:
783 return 0
784
785 ic_difference = ic1 + ic2 - 2 * lcs_ic
786
787 if ic_difference == 0:
788 return _INF
789
790 return 1 / ic_difference
791
793 """
794 Lin Similarity:
795 Return a score denoting how similar two word senses are, based on the
796 Information Content (IC) of the Least Common Subsumer (most specific
797 ancestor node) and that of the two input Synsets. The relationship is
798 given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)).
799
800 @type other: L{Synset}
801 @param other: The L{Synset} that this L{Synset} is being compared to.
802 @type ic: C{dict}
803 @param ic: an information content object (as returned by L{load_ic()}).
804 @return: A float score denoting the similarity of the two L{Synset}s,
805 in the range 0 to 1.
806 """
807
808 ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
809 return (2.0 * lcs_ic) / (ic1 + ic2)
810
812 """
813 @return: An iterator over L{Synset}s that are either proper
814 hypernyms or instance of hypernyms of the synset.
815 """
816 todo = [self]
817 seen = set()
818 while todo:
819 for synset in todo:
820 seen.add(synset)
821 yield todo
822 todo = [hypernym
823 for synset in todo
824 for hypernym in (synset.hypernyms() + \
825 synset.instance_hypernyms())
826 if hypernym not in seen]
827
829 return '%s(%r)' % (type(self).__name__, self.name)
830
835
836
837
838
839
840
842 """
843 A corpus reader used to access wordnet or its variants.
844 """
845
846 _ENCODING = None
847
848
849 ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
850
851
852
853 _FILEMAP = {ADJ: 'adj', ADV: 'adv', NOUN: 'noun', VERB: 'verb'}
854
855
856
857 _pos_numbers = {NOUN: 1, VERB: 2, ADJ: 3, ADV: 4, ADJ_SAT: 5}
858 _pos_names = dict(tup[::-1] for tup in _pos_numbers.items())
859
860
861
862
863 _FILES = ('cntlist.rev', 'lexnames', 'index.sense',
864 'index.adj', 'index.adv', 'index.noun', 'index.verb',
865 'data.adj', 'data.adv', 'data.noun', 'data.verb',
866 'adj.exc', 'adv.exc', 'noun.exc', 'verb.exc', )
867
869 """
870 Construct a new wordnet corpus reader, with the given root
871 directory.
872 """
873 CorpusReader.__init__(self, root, self._FILES,
874 encoding=self._ENCODING)
875
876 self._lemma_pos_offset_map = defaultdict(dict)
877 """An index that provides the file offset
878
879 Map from lemma -> pos -> synset_index -> offset"""
880
881 self._synset_offset_cache = defaultdict(dict)
882 """A cache so we don't have to reconstruct synsets
883
884 Map from pos -> offset -> synset"""
885
886 self._max_depth = defaultdict(dict)
887 """A lookup for the maximum depth of each part of speech. Useful for
888 the lch similarity metric.
889 """
890
891 self._data_file_map = {}
892 self._exception_map = {}
893 self._lexnames = []
894 self._key_count_file = None
895 self._key_synset_file = None
896
897
898 for i, line in enumerate(self.open('lexnames')):
899 index, lexname, _ = line.split()
900 assert int(index) == i
901 self._lexnames.append(lexname)
902
903
904 self._load_lemma_pos_offset_map()
905
906
907 self._load_exception_map()
908
909
911 for suffix in self._FILEMAP.values():
912
913
914 for i, line in enumerate(self.open('index.%s' % suffix)):
915 if line.startswith(' '):
916 continue
917
918 next = iter(line.split()).next
919 try:
920
921
922 lemma = next()
923 pos = next()
924
925
926 n_synsets = int(next())
927 assert n_synsets > 0
928
929
930 n_pointers = int(next())
931 _ = [next() for _ in xrange(n_pointers)]
932
933
934 n_senses = int(next())
935 assert n_synsets == n_senses
936
937
938 _ = int(next())
939
940
941 synset_offsets = [int(next()) for _ in xrange(n_synsets)]
942
943
944 except (AssertionError, ValueError), e:
945 tup = ('index.%s' % suffix), (i + 1), e
946 raise WordNetError('file %s, line %i: %s' % tup)
947
948
949 self._lemma_pos_offset_map[lemma][pos] = synset_offsets
950 if pos == ADJ:
951 self._lemma_pos_offset_map[lemma][ADJ_SAT] = synset_offsets
952
954
955 for pos, suffix in self._FILEMAP.items():
956 self._exception_map[pos] = {}
957 for line in self.open('%s.exc' % suffix):
958 terms = line.split()
959 self._exception_map[pos][terms[0]] = terms[1:]
960 self._exception_map[ADJ_SAT] = self._exception_map[ADJ]
961
963 """
964 Compute the max depth for the given part of speech. This is
965 used by the lch similarity metric.
966 """
967 depth = 0
968 for ii in self.all_synsets(pos):
969 try:
970 depth = max(depth, ii.max_depth())
971 except RuntimeError:
972 print ii
973 if simulate_root:
974 depth += 1
975 self._max_depth[pos] = depth
976
985
986
987
988
996
1021
1022
1023
1024
1026
1027 lemma, pos, synset_index_str = name.lower().rsplit('.', 2)
1028 synset_index = int(synset_index_str) - 1
1029
1030
1031 try:
1032 offset = self._lemma_pos_offset_map[lemma][pos][synset_index]
1033 except KeyError:
1034 message = 'no lemma %r with part of speech %r'
1035 raise WordNetError(message % (lemma, pos))
1036 except IndexError:
1037 n_senses = len(self._lemma_pos_offset_map[lemma][pos])
1038 message = "lemma %r with part of speech %r has only %i %s"
1039 if n_senses == 1:
1040 tup = lemma, pos, n_senses, "sense"
1041 else:
1042 tup = lemma, pos, n_senses, "senses"
1043 raise WordNetError(message % tup)
1044
1045
1046 synset = self._synset_from_pos_and_offset(pos, offset)
1047
1048
1049 if pos == 's' and synset.pos == 'a':
1050 message = ('adjective satellite requested but only plain '
1051 'adjective found for lemma %r')
1052 raise WordNetError(message % lemma)
1053 assert synset.pos == pos or (pos == 'a' and synset.pos == 's')
1054
1055
1056 return synset
1057
1059 """
1060 Return an open file pointer for the data file for the given
1061 part of speech.
1062 """
1063 if pos == ADJ_SAT:
1064 pos = ADJ
1065 if self._data_file_map.get(pos) is None:
1066 fileid = 'data.%s' % self._FILEMAP[pos]
1067 self._data_file_map[pos] = self.open(fileid)
1068 return self._data_file_map[pos]
1069
1082
1084
1085 synset = Synset(self)
1086
1087
1088 try:
1089
1090
1091 columns_str, gloss = data_file_line.split('|')
1092 gloss = gloss.strip()
1093 definitions = []
1094 for gloss_part in gloss.split(';'):
1095 gloss_part = gloss_part.strip()
1096 if gloss_part.startswith('"'):
1097 synset.examples.append(gloss_part.strip('"'))
1098 else:
1099 definitions.append(gloss_part)
1100 synset.definition = '; '.join(definitions)
1101
1102
1103 next = iter(columns_str.split()).next
1104
1105
1106 synset.offset = int(next())
1107
1108
1109 lexname_index = int(next())
1110 synset.lexname = self._lexnames[lexname_index]
1111
1112
1113 synset.pos = next()
1114
1115
1116 n_lemmas = int(next(), 16)
1117 for _ in xrange(n_lemmas):
1118
1119 lemma_name = next()
1120
1121 lex_id = int(next(), 16)
1122
1123 m = re.match(r'(.*?)(\(.*\))?$', lemma_name)
1124 lemma_name, syn_mark = m.groups()
1125
1126 lemma = Lemma(self, synset, lemma_name, lexname_index,
1127 lex_id, syn_mark)
1128 synset.lemmas.append(lemma)
1129 synset.lemma_names.append(lemma.name)
1130
1131
1132 n_pointers = int(next())
1133 for _ in xrange(n_pointers):
1134 symbol = next()
1135 offset = int(next())
1136 pos = next()
1137 lemma_ids_str = next()
1138 if lemma_ids_str == '0000':
1139 synset._pointers[symbol].add((pos, offset))
1140 else:
1141 source_index = int(lemma_ids_str[:2], 16) - 1
1142 target_index = int(lemma_ids_str[2:], 16) - 1
1143 source_lemma_name = synset.lemmas[source_index].name
1144 lemma_pointers = synset._lemma_pointers
1145 tups = lemma_pointers[source_lemma_name, symbol]
1146 tups.add((pos, offset, target_index))
1147
1148
1149 try:
1150 frame_count = int(next())
1151 except StopIteration:
1152 pass
1153 else:
1154 for _ in xrange(frame_count):
1155
1156 plus = next()
1157 assert plus == '+'
1158
1159 frame_number = int(next())
1160 frame_string_fmt = VERB_FRAME_STRINGS[frame_number]
1161 lemma_number = int(next(), 16)
1162
1163 if lemma_number == 0:
1164 synset.frame_ids.append(frame_number)
1165 for lemma in synset.lemmas:
1166 lemma.frame_ids.append(frame_number)
1167 lemma.frame_strings.append(frame_string_fmt %
1168 lemma.name)
1169
1170 else:
1171 lemma = synset.lemmas[lemma_number - 1]
1172 lemma.frame_ids.append(frame_number)
1173 lemma.frame_strings.append(frame_string_fmt %
1174 lemma.name)
1175
1176
1177 except ValueError, e:
1178 raise WordNetError('line %r: %s' % (data_file_line, e))
1179
1180
1181
1182 for lemma in synset.lemmas:
1183 if synset.pos is ADJ_SAT:
1184 head_lemma = synset.similar_tos()[0].lemmas[0]
1185 head_name = head_lemma.name
1186 head_id = '%02d' % head_lemma._lex_id
1187 else:
1188 head_name = head_id = ''
1189 tup = (lemma.name, WordNetCorpusReader._pos_numbers[synset.pos],
1190 lemma._lexname_index, lemma._lex_id, head_name, head_id)
1191 lemma.key = ('%s%%%d:%02d:%02d:%s:%s' % tup).lower()
1192
1193
1194 lemma_name = synset.lemmas[0].name.lower()
1195 offsets = self._lemma_pos_offset_map[lemma_name][synset.pos]
1196 sense_index = offsets.index(synset.offset)
1197 tup = lemma_name, synset.pos, sense_index + 1
1198 synset.name = '%s.%s.%02i' % tup
1199
1200 return synset
1201
1202
1203
1204
def synsets(self, lemma, pos=None):
    """
    Load every synset for a lemma, optionally restricted by part of speech.

    @param lemma: the word to look up; it is lower-cased before lookup.
    @param pos: an iterable of part-of-speech tags to search, or None to
        search every tag in C{POS_LIST}.
    @return: a list of synsets, one per offset recorded in the index for
        each base form that C{_morphy} yields for the lemma under each tag.
    """
    lemma = lemma.lower()
    tags = POS_LIST if pos is None else pos
    offset_index = self._lemma_pos_offset_map
    load_synset = self._synset_from_pos_and_offset

    found = []
    for tag in tags:
        for base_form in self._morphy(lemma, tag):
            for offset in offset_index[base_form].get(tag, []):
                found.append(load_synset(tag, offset))
    return found
1221
def lemmas(self, lemma, pos=None):
    """
    Return all Lemma objects whose name matches the given lemma name.

    The match is case-insensitive.  C{synsets} already lower-cases the
    query before looking it up, so comparing names case-sensitively here
    would drop lemmas stored with capitals (e.g. 'AIDS') and would return
    nothing at all for a capitalised query.

    @param lemma: the lemma name to look for (any letter case).
    @param pos: part-of-speech restriction passed through to C{synsets};
        None matches any part of speech.
    @return: a list of the matching Lemma objects, in synset order.
    """
    lemma = lemma.lower()
    return [lemma_obj
            for synset in self.synsets(lemma, pos)
            for lemma_obj in synset.lemmas
            if lemma_obj.name.lower() == lemma]
1230
1232 """Return all lemma names for all synsets for the given
1233 part of speech tag. If pos is not specified, all synsets
1234 for all parts of speech will be used.
1235 """
1236 if pos is None:
1237 return iter(self._lemma_pos_offset_map)
1238 else:
1239 return (lemma
1240 for lemma in self._lemma_pos_offset_map
1241 if pos in self._lemma_pos_offset_map[lemma])
1242
1301
1302
1303
1304
1306 """Return the frequency count for this Lemma"""
1307
1308 if self._key_count_file is None:
1309 self._key_count_file = self.open('cntlist.rev')
1310
1311 line = _binary_search_file(self._key_count_file, lemma.key)
1312 if line:
1313 return int(line.rsplit(' ', 1)[-1])
1314 else:
1315 return 0
1316
1317 - def path_similarity(self, synset1, synset2, verbose=False, simulate_root=True):
1319 path_similarity.__doc__ = Synset.path_similarity.__doc__
1320
1321 - def lch_similarity(self, synset1, synset2, verbose=False, simulate_root=True):
1323 lch_similarity.__doc__ = Synset.lch_similarity.__doc__
1324
1325 - def wup_similarity(self, synset1, synset2, verbose=False, simulate_root=True):
1327 wup_similarity.__doc__ = Synset.wup_similarity.__doc__
1328
1331 res_similarity.__doc__ = Synset.res_similarity.__doc__
1332
1335 jcn_similarity.__doc__ = Synset.jcn_similarity.__doc__
1336
1339 lin_similarity.__doc__ = Synset.lin_similarity.__doc__
1340
1341
1342
1343
1344
def morphy(self, form, pos=None):
    """
    Return the first base form that C{_morphy} finds for a word, or None.

    @param form: the (possibly inflected) word form to analyse.
    @param pos: the part of speech to analyse under; when None, each tag
        in C{POS_LIST} is tried in order and the first hit wins.
    @return: the first candidate base form, or None if there is none.

    >>> from nltk.corpus import wordnet as wn
    >>> wn.morphy('dogs')
    'dog'
    >>> wn.morphy('churches')
    'church'
    >>> wn.morphy('aardwolves')
    'aardwolf'
    >>> wn.morphy('abaci')
    'abacus'
    >>> wn.morphy('hardrock', wn.ADV)
    >>> wn.morphy('book', wn.NOUN)
    'book'
    >>> wn.morphy('book', wn.ADJ)
    """
    if pos is None:
        tags = POS_LIST
    else:
        tags = [pos]

    # Stop at the very first analysis; later tags are never consulted
    # once one has produced a candidate.
    for tag in tags:
        for candidate in self._morphy(form, tag):
            return candidate
    return None
1379
# Suffix rewrite rules, per part of speech, looked up by _morphy: each
# pair is (suffix to strip, text to append in its place).
MORPHOLOGICAL_SUBSTITUTIONS = {
    NOUN: [('s', ''), ('ses', 's'), ('ves', 'f'), ('xes', 'x'),
           ('zes', 'z'), ('ches', 'ch'), ('shes', 'sh'),
           ('men', 'man'), ('ies', 'y')],
    VERB: [('s', ''), ('ies', 'y'), ('es', 'e'), ('es', ''),
           ('ed', 'e'), ('ed', ''), ('ing', 'e'), ('ing', '')],
    ADJ: [('er', ''), ('est', ''), ('er', 'e'), ('est', 'e')],
    ADV: []}
1388
1390
1391
1392
1393
1394
1395
1396
1397 exceptions = self._exception_map[pos]
1398 substitutions = self.MORPHOLOGICAL_SUBSTITUTIONS[pos]
1399
1400 def apply_rules(forms):
1401 return [form[:-len(old)] + new
1402 for form in forms
1403 for old, new in substitutions
1404 if form.endswith(old)]
1405
1406 def filter_forms(forms):
1407 result = []
1408 seen = set()
1409 for form in forms:
1410 if form in self._lemma_pos_offset_map:
1411 if pos in self._lemma_pos_offset_map[form]:
1412 if form not in seen:
1413 result.append(form)
1414 seen.add(form)
1415 return result
1416
1417
1418 if form in exceptions:
1419 return filter_forms([form] + exceptions[form])
1420
1421
1422 forms = apply_rules([form])
1423
1424
1425 results = filter_forms([form] + forms)
1426 if results:
1427 return results
1428
1429
1430 while forms:
1431 forms = apply_rules(forms)
1432 results = filter_forms(forms)
1433 if results:
1434 return results
1435
1436
1437 return []
1438
1439
1440
1441
def ic(self, corpus, weight_senses_equally=False, smoothing=1.0):
    """
    Creates an information content lookup dictionary from a corpus.

    @type corpus: L{CorpusReader}
    @param corpus: The corpus from which we create an information
        content dictionary.
    @type weight_senses_equally: L{bool}
    @param weight_senses_equally: If this is True, gives all
        possible senses equal weight rather than dividing by the
        number of possible senses.  (If a word has 3 senses, each
        sense gets 0.3333 per appearance when this is False, 1.0 when
        it is true.)
    @param smoothing: How much do we smooth synset counts (default is 1.0)
    @type smoothing: L{float}
    @return: An information content dictionary: one entry per tag in
        C{POS_LIST}, each mapping synset offsets to accumulated
        weight, with key 0 holding the total added for that part of
        speech.
    """
    counts = FreqDist()
    for word in corpus.words():
        counts.inc(word)

    ic = dict((pos, defaultdict(float)) for pos in POS_LIST)

    def table_for(synset):
        # Satellite adjectives are counted in the plain adjective table.
        pos = synset.pos
        if pos == ADJ_SAT:
            pos = ADJ
        return ic[pos]

    # Seed every synset with the smoothing value.
    if smoothing > 0.0:
        for synset in self.all_synsets():
            table_for(synset)[synset.offset] = smoothing

    for word in counts:
        senses = self.synsets(word)
        if not senses:
            continue

        # Share the word's count between its senses unless told otherwise.
        weight = float(counts[word])
        if not weight_senses_equally:
            weight /= float(len(senses))

        for synset in senses:
            table = table_for(synset)
            for level in synset._iter_hypernym_lists():
                for hypernym in level:
                    table[hypernym.offset] += weight
            # Key 0 accumulates the weight of every counted sense.
            table[0] += weight
    return ic
1495
1496
1497
1498
1499
1500
1502 """
1503 A corpus reader for the WordNet information content corpus.
1504 """
1505
1508
1509
1510
1511
1512
def ic(self, icfile):
    """
    Load an information content file from the wordnet_ic corpus
    and return a dictionary.  This dictionary has just two keys,
    NOUN and VERB, whose values are dictionaries that map from
    synset offsets to information content values.  Key 0 of each
    inner dictionary accumulates the values of the entries marked
    C{ROOT}.

    @type icfile: L{str}
    @param icfile: The name of the wordnet_ic file (e.g. "ic-brown.dat")
    @return: An information content dictionary
    @raise ValueError: If an entry's part-of-speech suffix is not
        recognised by C{_get_pos}.
    """
    ic = {NOUN: defaultdict(float), VERB: defaultdict(float)}

    stream = self.open(icfile)
    try:
        for num, line in enumerate(stream):
            if num == 0:
                # The first line is not an entry.
                continue
            fields = line.split()
            if not fields:
                # Blank lines carry no entry.
                continue
            # The first field is the offset followed by a one-letter tag.
            offset = int(fields[0][:-1])
            value = float(fields[1])
            pos = _get_pos(fields[0])
            if len(fields) == 3 and fields[2] == "ROOT":
                ic[pos][0] += value
            if value != 0:
                ic[pos][offset] = value
    finally:
        # Release the file handle even if a line fails to parse.
        stream.close()
    return ic
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1554 path_similarity.__doc__ = Synset.path_similarity.__doc__
1555
1556
1557 -def lch_similarity(synset1, synset2, verbose=False, simulate_root=True):
1559 lch_similarity.__doc__ = Synset.lch_similarity.__doc__
1560
1561
1562 -def wup_similarity(synset1, synset2, verbose=False, simulate_root=True):
1564 wup_similarity.__doc__ = Synset.wup_similarity.__doc__
1565
1566
1569 res_similarity.__doc__ = Synset.res_similarity.__doc__
1570
1571
1574 jcn_similarity.__doc__ = Synset.jcn_similarity.__doc__
1575
1576
1579 lin_similarity.__doc__ = Synset.lin_similarity.__doc__
1580
1581
1583 """
1584 Finds the least common subsumer of two synsets in a WordNet taxonomy,
1585 where the least common subsumer is defined as the ancestor node common
1586 to both input synsets whose shortest path to the root node is the longest.
1587
1588 @type synset1: L{Synset}
1589 @param synset1: First input synset.
1590 @type synset2: L{Synset}
1591 @param synset2: Second input synset.
1592 @return: The ancestor synset common to both input synsets which is also the
1593 LCS.
1594 """
1595 subsumer = None
1596 max_min_path_length = -1
1597
1598 subsumers = synset1.common_hypernyms(synset2)
1599
1600 if verbose:
1601 print "> Subsumers1:", subsumers
1602
1603
1604
1605
1606 eliminated = set()
1607 hypernym_relation = lambda s: s.hypernyms() + s.instance_hypernyms()
1608 for s1 in subsumers:
1609 for s2 in subsumers:
1610 if s2 in s1.closure(hypernym_relation):
1611 eliminated.add(s2)
1612 if verbose:
1613 print "> Eliminated:", eliminated
1614
1615 subsumers = [s for s in subsumers if s not in eliminated]
1616
1617 if verbose:
1618 print "> Subsumers2:", subsumers
1619
1620
1621
1622
1623 for candidate in subsumers:
1624
1625 paths_to_root = candidate.hypernym_paths()
1626 min_path_length = -1
1627
1628 for path in paths_to_root:
1629 if min_path_length < 0 or len(path) < min_path_length:
1630 min_path_length = len(path)
1631
1632 if min_path_length > max_min_path_length:
1633 max_min_path_length = min_path_length
1634 subsumer = candidate
1635
1636 if verbose:
1637 print "> LCS Subsumer by depth:", subsumer
1638 return subsumer
1639
1640
def _lcs_ic(synset1, synset2, ic, verbose=False):
    """
    Get the information content of the least common subsumer that has
    the highest information content value.  If two nodes have no
    explicit common subsumer, assume that they share an artificial
    root node that is the hypernym of all explicit roots.

    @type synset1: L{Synset}
    @param synset1: First input synset.
    @type synset2: L{Synset}
    @param synset2: Second input synset.  Must be the same part of
        speech as the first synset.
    @type ic: C{dict}
    @param ic: an information content object, passed unchanged to
        L{information_content}.
    @param verbose: If true, print the subsumer's information content.
    @return: A tuple of the information content of C{synset1}, of
        C{synset2}, and of their most informative subsumer (0 when
        they have no common hypernym).
    @raise WordNetError: If the two synsets differ in part of speech.
    """
    if synset1.pos != synset2.pos:
        raise WordNetError('Computing the least common subsumer requires '
                           '%s and %s to have the same part of speech.' %
                           (synset1, synset2))

    ic1 = information_content(synset1, ic)
    ic2 = information_content(synset2, ic)

    subsumers = synset1.common_hypernyms(synset2)
    if subsumers:
        subsumer_ic = max(information_content(s, ic) for s in subsumers)
    else:
        # No explicit common subsumer: fall back to 0.
        subsumer_ic = 0

    if verbose:
        print("> LCS Subsumer by content: %s" % (subsumer_ic,))

    return ic1, ic2, subsumer_ic
1675
1676
1677
1678
def information_content(synset, ic):
    """
    Look up the information content of a synset.

    @param synset: The synset to look up; its C{pos} selects the table
        and its C{offset} the entry.
    @param ic: An information content dictionary keyed by part of
        speech, whose tables hold their total under key 0.
    @return: C{-log(count / total)}, or C{_INF} when the synset's
        count is 0.
    @raise WordNetError: If C{ic} has no table for the synset's part
        of speech.
    """
    try:
        icpos = ic[synset.pos]
    except KeyError:
        msg = 'Information content file has no entries for part-of-speech: %s'
        raise WordNetError(msg % synset.pos)

    counts = icpos[synset.offset]
    if counts == 0:
        return _INF
    # float() keeps the ratio exact under Python 2 even if the table
    # happens to hold integers.
    return -math.log(float(counts) / icpos[0])
1691
1692
1693
1694
1695
1697 if field[-1] == 'n':
1698 return NOUN
1699 elif field[-1] == 'v':
1700 return VERB
1701 else:
1702 msg = "Unidentified part of speech in WordNet Information Content file"
1703 raise ValueError(msg)
1704
1705
1706
1707
1708
1709
def demo():
    """
    Load the WordNet and WordNet information content corpora and print
    the results of a series of sample lookups, relation queries and
    similarity computations.
    """
    import nltk
    print('loading wordnet')
    wn = WordNetCorpusReader(nltk.data.find('corpora/wordnet'))
    print('done loading')
    S = wn.synset
    L = wn.lemma

    print('getting a synset for go')
    move_synset = S('go.v.21')
    print('%s %s %s' % (move_synset.name, move_synset.pos,
                        move_synset.lexname))
    print(move_synset.lemma_names)
    print(move_synset.definition)
    print(move_synset.examples)

    zap_n = ['zap.n.01']
    zap_v = ['zap.v.01', 'zap.v.02', 'nuke.v.01', 'microwave.v.01']

    def _get_synsets(synset_strings):
        return [S(synset) for synset in synset_strings]

    zap_n_synsets = _get_synsets(zap_n)
    zap_v_synsets = _get_synsets(zap_v)
    zap_synsets = set(zap_n_synsets + zap_v_synsets)

    print(zap_n_synsets)
    print(zap_v_synsets)

    print("Navigations:")
    print(S('travel.v.01').hypernyms())
    print(S('travel.v.02').hypernyms())
    print(S('travel.v.03').hypernyms())

    print(L('zap.v.03.nuke').derivationally_related_forms())
    print(L('zap.v.03.atomize').derivationally_related_forms())
    print(L('zap.v.03.atomise').derivationally_related_forms())
    print(L('zap.v.03.zap').derivationally_related_forms())

    print(S('dog.n.01').member_holonyms())
    print(S('dog.n.01').part_meronyms())

    print(S('breakfast.n.1').hypernyms())
    print(S('meal.n.1').hyponyms())
    print(S('Austen.n.1').instance_hypernyms())
    print(S('composer.n.1').instance_hyponyms())

    print(S('faculty.n.2').member_meronyms())
    print(S('copilot.n.1').member_holonyms())

    print(S('table.n.2').part_meronyms())
    print(S('course.n.7').part_holonyms())

    print(S('water.n.1').substance_meronyms())
    print(S('gin.n.1').substance_holonyms())

    print(L('leader.n.1.leader').antonyms())
    print(L('increase.v.1.increase').antonyms())

    print(S('snore.v.1').entailments())
    print(S('heavy.a.1').similar_tos())
    print(S('light.a.1').attributes())
    print(S('heavy.a.1').attributes())

    print(L('English.a.1.English').pertainyms())

    print(S('person.n.01').root_hypernyms())
    print(S('sail.v.01').root_hypernyms())
    print(S('fall.v.12').root_hypernyms())

    print(S('person.n.01').lowest_common_hypernyms(S('dog.n.01')))

    print(S('dog.n.01').path_similarity(S('cat.n.01')))
    print(S('dog.n.01').lch_similarity(S('cat.n.01')))
    print(S('dog.n.01').wup_similarity(S('cat.n.01')))

    # Raw string so the backslash reaches the regex engine untouched.
    wnic = WordNetICCorpusReader(nltk.data.find('corpora/wordnet_ic'),
                                 r'.*\.dat')
    ic = wnic.ic('ic-brown.dat')
    print(S('dog.n.01').jcn_similarity(S('cat.n.01'), ic))

    ic = wnic.ic('ic-semcor.dat')
    print(S('dog.n.01').lin_similarity(S('cat.n.01'), ic))

    print(S('code.n.03').topic_domains())
    print(S('pukka.a.01').region_domains())
    print(S('freaky.a.01').usage_domains())
1796
# Run the demonstration only when this file is executed as a script.
if __name__ == '__main__':
    demo()
1799