1
2
3
4
5
6
7
8
9
10 import math
11 import pickle
12 import string
13 import re
14
15 from nltk import defaultdict
16 from nltk.util import binary_search_file
17 from nltk.internals import deprecated
18
19 from util import *
20 import dictionary
21 import similarity
22 from frequency import *
23 from lexname import Lexname
24
25 -class Word(object):
26
27 @deprecated("Use nltk.corpus.wordnet.Lemma() instead.")
29 """
30 Extract a word from a line of a WordNet POS file.
31 @type line: C{string}
32 @param line: The appropriate line taken from the Wordnet data files.
33 """
34
35 tokens = line.split()
36 ints = map(int, tokens[int(tokens[3]) + 4:])
37
38 self.form = tokens[0].replace('_', ' ')
39 self.pos = normalizePOS(tokens[1])
40 self.taggedSenseCount = ints[1]
41 self._synsetOffsets = ints[2:ints[0]+2]
42
def synsets(self):
    """
    Get a sequence of the L{Synset}s of this word.

    The synsets are resolved through C{dictionary.synset} on the first
    call and cached in C{self._synsets}; the raw offsets are discarded
    once they have been resolved.

    >>> from nltk.wordnet import *
    >>> N['dog'].synsets()
    [{noun: dog, domestic dog, Canis familiaris}, {noun: frump, dog}, {noun: dog}, {noun: cad, bounder, blackguard, dog, hound, heel}, {noun: frank, frankfurter, hotdog, hot dog, dog, wiener, wienerwurst, weenie}, {noun: pawl, detent, click, dog}, {noun: andiron, firedog, dog, dog-iron}]

    @return: A list of this L{Word}'s L{Synset}s
    """
    # Fast path: the list has already been built by an earlier call.
    try:
        return self._synsets
    except AttributeError:
        pass

    resolved = [dictionary.synset(self.pos, offset)
                for offset in self._synsetOffsets]
    self._synsets = resolved
    # The offsets are no longer needed once the synsets are cached.
    del self._synsetOffsets
    return resolved
61
63 """
64 Return a list of WordSense objects corresponding to this word's L{synset}s.
65 """
66 return [s.wordSense(self.form) for s in self.synsets()]
67
69 """
70 Return the frequencies of each sense of this word in a tagged concordance.
71 """
72 return [s.count() for s in self.senses()]
73
def isTagged(self):
    """
    Tell whether any sense of this word is tagged.

    >>> from nltk.wordnet import *
    >>> N['dog'].isTagged()
    True

    @return: C{True} if C{self.taggedSenseCount} is greater than zero,
        C{False} otherwise.
    """
    return self.taggedSenseCount > 0
83
84
85
86
87
88
89
90
91
92
93
94
95
96
99
102
105
108
111
115
117 return self.form + ' (' + self.pos + ")"
118
121
123 return hash((self.form, self.pos))
124
127 """
128 A single word-sense pairing, indicated by in WordNet by a sense key of
129 the form::
130 lemma%ss_type:lex_filenum:lex_id:head_word:head_id
131 """
132
133 _ssTypeMap = {'n': 1, 'v': 2, 'a': 3, 'r': 4, 's':5}
134 _ssTypeRevMap = dict((v,k) for k,v in _ssTypeMap.iteritems())
135
136 @deprecated("Use nltk.corpus.wordnet.Lemma() instead.")
138 self.senseKey = senseKey
139 self.lemma, remainder = senseKey.split('%', 1)
140 (ssType, lexFilenum, lexId,
141 self.headWord, headId) = remainder.split(':')
142
143 self.ssType = self._ssTypeRevMap[int(ssType)]
144 self.lexFilenum = int(lexFilenum)
145 self.lexId = int(lexId)
146 try:
147 self.headId = int(headId)
148 except ValueError:
149 self.headId = None
150
153
166
170
173
177
180
182 return ('%s (%s) %d'
183 % (self.lemma, normalizePOS(self.ssType), self.senseNo()))
184
187
189 return hash(self.senseKey)
190
191
192 __repr__ = __str__
193
194 @staticmethod
207
@staticmethod
def fromKeyParams(lemma, ss_type, lex_filenum, lex_id,
                  head_word='', head_id=''):
    """
    Build a L{WordSense} from the individual fields of a sense key.

    The fields are assembled into a key of the form
    C{lemma%ss_type:lex_filenum:lex_id:head_word:head_id} and handed to
    the L{WordSense} constructor.

    @type lemma: C{string}
    @param lemma: The lemma part of the sense key.
    @type ss_type: C{int}
    @param ss_type: The numeric synset type (formatted with C{%d}).
    @type lex_filenum: C{int}
    @param lex_filenum: The lexicographer file number (two digits).
    @type lex_id: C{int}
    @param lex_id: The lexical id (two digits).
    @type head_word: C{string}
    @param head_word: The head word; empty when there is none.
    @param head_id: The head id.  It is only formatted (as a two-digit
        integer) when C{head_word} is non-empty, so it must be an
        integer in that case; otherwise it is inserted unchanged.
    @return: The L{WordSense} for the assembled sense key.
    """
    if head_word:
        head_id = '%02d' % head_id

    fields = (lemma, ss_type, lex_filenum, lex_id, head_word, head_id)
    return WordSense('%s%%%d:%02d:%02d:%s:%s' % fields)
217
220 """
221 A set of synonyms.
222
223 Each synset contains one or more Senses, which represent a
224 specific sense of a specific word. Senses can be retrieved via
225 synset.senses() or through the index notations synset[0],
226 synset[string], or synset[word]. Synsets participate in
227 lexical relations, which can be accessed via synset.relations().
228
229 >>> from nltk.wordnet import *
230 >>> N['dog'][0]
231 {noun: dog, domestic_dog, Canis_familiaris}
232 >>> N['dog'][0][HYPERNYM]
233 [{noun: canine, canid}, {noun: domestic_animal, domesticated_animal}]
234 >>> V['think'][0].verbFrameStrings
235 ['Something think something Adjective/Noun', 'Somebody think somebody']
236
237 @type pos: C{string}
238 @ivar pos: The part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB.
239
240 @type offset: C{int}
241 @ivar offset: An integer offset into the part-of-speech file. Together
242 with pos, this can be used as a unique id.
243
244 @type gloss: C{string}
245 @ivar gloss: A gloss (dictionary definition) for the sense.
246
247 @type verbFrames: C{list} of C{integer}
248 @ivar verbFrames: A sequence of integers that index into
249 VERB_FRAME_STRINGS. These list the verb frames that any
250 Sense in this synset participates in. (See also
251 Sense.verbFrames.) Defined only for verbs.
252 """
253
254 @deprecated("Use nltk.corpus.wordnet.Synset() instead.")
256 """Initialize the synset from a line in a WordNet lexicographer file."""
257
258
259 self.pos = pos
260
261
262
263 self.offset = offset
264
265
266 self._min_depth = self._max_depth = None
267
268
269
270
271
272 dividerIndex = line.index('|')
273 tokens = line[:dividerIndex].split()
274 self.ssType = tokens[2]
275 self.gloss = line[dividerIndex + 1:].strip()
276 self.lexname = Lexname.lexnames[int(tokens[1])]
277
278
279
280
281
282 synset_cnt = int(tokens[3], 16)
283
284
285 (senseTuples, remainder1) = _partition(tokens[4:], 2, synset_cnt)
286 self.words = [form for form, lex_id in senseTuples]
287
288
289 (self._pointerTuples, remainder2) = _partition(remainder1[1:], 4, int(remainder1[0]))
290
291
292 if self.ssType == 's':
293
294 self.headSynset = self.relation('similar')[0]
295 self.wordSenses = [WordSense.fromSynset(self, form, int(lex_id, 16))
296 for form, lex_id in senseTuples]
297
298
299
300
301
302
303
304
305
306
307
308
309
310 if pos == VERB:
311 (vfTuples, remainder3) = _partition(remainder2[1:], 3, int(remainder2[0]))
312
313
314 def extractVerbFrames(index, vfTuples):
315 return tuple(map(lambda t:int(t[1]), filter(lambda t,i=index:int(t[2],16) in (0, i), vfTuples)))
316
317 senseVerbFrames = []
318 for index in range(1, len(self.words) + 1):
319 senseVerbFrames.append(extractVerbFrames(index, vfTuples))
320 self._senseVerbFrames = senseVerbFrames
321
322
323
324
325
326 self.verbFrames = tuple(extractVerbFrames(None, vfTuples))
327
328
329 self.verbFrameStrings = self.extractVerbFrameStrings(vfTuples)
330
def wordSense(self, word):
    """
    Return the WordSense object for the given word in this synset.

    Spaces in C{word} are replaced by underscores before the lookup.
    The exact form is tried first, then its title-cased variant.

    @type word: C{string}
    @param word: The word form to look up.
    @return: The entry of C{self.wordSenses} at the position where the
        form was found in C{self.words}.
    @raise ValueError: If neither variant occurs in C{self.words}.
    """
    target = word.replace(' ', '_')
    for candidate in (target, target.title()):
        try:
            position = self.words.index(candidate)
        except ValueError:
            continue
        return self.wordSenses[position]
    raise ValueError(
        "Could not find word '%s' for this synset." % target)
347
349 """
350 Return a list of verb frame strings for this synset.
351 """
352
353 frame_indices = [int(t[1]) for t in vfTuples if int(t[2], 16) == 0]
354 try:
355 verbFrames = [VERB_FRAME_STRINGS[i] for i in frame_indices]
356 except IndexError:
357 return []
358
359 form = self[0]
360 verbFrameStrings = [vf % form for vf in verbFrames]
361 return verbFrameStrings
362
389
392
393
395 """
396 >>> from nltk.wordnet import *
397 >>> N['dog'][0].isTagged()
398 True
399
400 >>> N['dog'][1].isTagged()
401 False
402
403 @return: True/false (1/0) if one of this L{Word}'s senses is tagged.
404 """
405 return len(filter(Word.isTagged, self.words)) > 0
406
408 """
409 Return a human-readable representation.
410
411 >>> from nltk.wordnet import *
412 >>> str(N['dog'][0].synset)
413 '{noun: dog, domestic dog, Canis familiaris}'
414 """
415 return "{" + self.pos + ": " + string.join(self.words, ", ") + "}"
416
418 return "{" + self.pos + ": " + string.join(self.words, ", ") + "}"
419
422
424 return hash((self.pos, self.offset))
425
427 return not (self==other)
428
430 try:
431 return self.words[idx]
432 except TypeError:
433 return self.relation(idx)
434
436 return iter(self.words)
437
439 return item in self.words
440
443
446
448 """
449 >>> from nltk.wordnet import *
450 >>> len(N['dog'][0].synset)
451 3
452 """
453 return len(self.words)
454
def max_depth(self):
    """
    Compute (and cache) the maximum hypernym depth of this synset.

    @return: The length of the longest hypernym path from this synset to the root.
    """
    # Compare against None rather than testing truthiness: a cached
    # depth of 0 is a valid result and must not trigger recomputation.
    if self._max_depth is None:
        hypernyms = self[HYPERNYM]
        if not hypernyms:
            self._max_depth = 0
        else:
            self._max_depth = 1 + max(h.max_depth() for h in hypernyms)
    return self._max_depth
466
def min_depth(self):
    """
    Compute (and cache) the minimum hypernym depth of this synset.

    @return: The length of the shortest hypernym path from this synset to the root.
    """
    # Compare against None rather than testing truthiness: a cached
    # depth of 0 is a valid result and must not trigger recomputation.
    if self._min_depth is None:
        hypernyms = self[HYPERNYM]
        if not hypernyms:
            self._min_depth = 0
        else:
            self._min_depth = 1 + min(h.min_depth() for h in hypernyms)
    return self._min_depth
478
def closure(self, rel, depth=-1):
    """Yield the transitive closure of this synset under the rel relationship, breadth-first

    This is a generator.  The starting synset itself is never yielded,
    and each synset is yielded at most once (synsets are identified by
    their C{offset}).

    >>> dog = N['dog'][0]
    >>> list(dog.closure(HYPERNYM))
    [{noun: canine, canid}, {noun: carnivore}, {noun: placental, placental mammal, eutherian, eutherian mammal}, {noun: mammal, mammalian}, {noun: vertebrate, craniate}, {noun: chordate}, {noun: animal, animate being, beast, brute, creature, fauna}, {noun: organism, being}, {noun: living thing, animate thing}, {noun: object, physical object}, {noun: physical entity}, {noun: entity}]

    @param rel: The relation to follow; each visited synset is indexed
        with it to obtain its neighbours.
    @param depth: Passed unchanged to C{nltk.util.breadth_first}.
        NOTE(review): -1 presumably means "no depth limit" -- confirm
        against C{breadth_first}.
    """
    from nltk.util import breadth_first
    # A set keeps the "already yielded" test constant-time.
    seen_offsets = set()
    for synset in breadth_first(self, lambda s: s[rel], depth):
        offset = synset.offset
        if offset != self.offset and offset not in seen_offsets:
            seen_offsets.add(offset)
            yield synset
492
493
def hypernym_paths(self):
    """
    Get the path(s) linking this synset to the root, where each path is
    a list of the synset nodes traversed.

    Each path is ordered from the root down to this synset: the paths
    of every hypernym are computed recursively and this synset is
    appended to the end of each of them.

    @return: A list of lists, where each list gives the node sequence
       connecting a root node and the initial L{Synset} node.  A synset
       without hypernyms yields the single path C{[self]}.
    """
    hypernyms = self[HYPERNYM]
    if len(hypernyms) == 0:
        # No hypernyms: this synset is itself a root.
        return [[self]]

    paths = []
    for hypernym in hypernyms:
        for ancestor_list in hypernym.hypernym_paths():
            ancestor_list.append(self)
            paths.append(ancestor_list)
    return paths
513
def hypernym_distances(self, distance, verbose=False):
    """
    Get the path(s) from this synset to the root, counting the distance
    of each node from the initial node on the way. A set of
    (synset, distance) tuples is returned.

    @type distance: C{int}
    @param distance: the distance (number of edges) from this hypernym to
        the original hypernym L{Synset} on which this method was called.
    @type verbose: C{bool}
    @param verbose: if true, print the collected distances; recursive
        calls are always made with C{verbose=False}.
    @return: A set of (L{Synset}, int) tuples where each L{Synset} is
       this synset or a hypernym of it.
    """
    distances = set([(self, distance)])

    for hypernym in self[HYPERNYM]:
        distances |= hypernym.hypernym_distances(distance + 1, verbose=False)

    if verbose:
        listing = " ".join(str(synset) + ":" + repr(dist)
                           for synset, dist in distances)
        # Single-argument print gives the same output on Python 2 and 3.
        print(" ".join(["> Hypernym Distances:", str(self), listing]))
    return distances
533
def shortest_path_distance(self, other):
    """
    Returns the distance of the shortest path linking the two synsets (if
    one exists). For each synset, all the ancestor nodes and their distances
    are recorded and compared. The ancestor node common to both synsets that
    can be reached with the minimum number of traversals is used. If no
    ancestor nodes are common, -1 is returned. If a node is compared with
    itself 0 is returned.

    @type other: L{Synset}
    @param other: The Synset to which the shortest path will be found.
    @return: The number of edges in the shortest path connecting the two
        nodes, or -1 if no path exists.
    """
    if self == other:
        return 0

    def nearest(pairs):
        # A node may be reached along several paths; keep only the
        # smallest distance recorded for each node.
        best = {}
        for node, dist in pairs:
            if node not in best or dist < best[node]:
                best[node] = dist
        return best

    own = nearest(self.hypernym_distances(0))
    theirs = nearest(other.hypernym_distances(0))

    # Look each of our ancestors up directly in the other mapping
    # instead of comparing every pair of keys.
    path_distance = -1
    for node, dist in own.items():
        if node in theirs:
            candidate = dist + theirs[node]
            if path_distance < 0 or candidate < path_distance:
                path_distance = candidate

    return path_distance
583
def tree(self, rel, depth=-1, cut_mark=None):
    """
    Build a nested-list tree of the synsets reachable through C{rel}.

    The result is a list whose first item is this synset, followed by
    one sub-tree (built the same way) for every synset in C{self[rel]}.

    >>> dog = N['dog'][0]
    >>> from pprint import pprint
    >>> pprint(dog.tree(HYPERNYM))
    ['dog' in {noun: dog, domestic dog, Canis familiaris},
     [{noun: canine, canid},
      [{noun: carnivore},
       [{noun: placental, placental mammal, eutherian, eutherian mammal},
        [{noun: mammal, mammalian},
         [{noun: vertebrate, craniate},
          [{noun: chordate},
           [{noun: animal, animate being, beast, brute, creature, fauna},
            [{noun: organism, being},
             [{noun: living thing, animate thing},
              [{noun: object, physical object},
               [{noun: physical entity}, [{noun: entity}]]]]]]]]]]]]]

    @param rel: The relation to follow from each synset.
    @type depth: C{int}
    @param depth: Number of levels to descend.  It is decremented on
        each level and expansion stops when it reaches 0, so a negative
        value never stops the descent.
    @param cut_mark: If true-valued, appended to a node's list at the
        point where the depth limit stopped the expansion.
    @return: The nested list described above.
    """
    branch = [self]
    if depth != 0:
        for child in self[rel]:
            branch.append(child.tree(rel, depth - 1, cut_mark))
    elif cut_mark:
        branch.append(cut_mark)
    return branch
609
610
611
614
617
620
623
626
629
630
631
632
# Maps each relation (pointer) symbol to the corresponding relation
# constant.
_RELATION_TABLE = {
    '!': ANTONYM,
    '@': HYPERNYM,
    '~': HYPONYM,
    '=': ATTRIBUTE,
    '^': ALSO_SEE,
    '*': ENTAILMENT,
    '>': CAUSE,
    '$': VERB_GROUP,
    '#m': MEMBER_MERONYM,
    '#s': SUBSTANCE_MERONYM,
    '#p': PART_MERONYM,
    '%m': MEMBER_HOLONYM,
    '%s': SUBSTANCE_HOLONYM,
    '%p': PART_HOLONYM,
    '&': SIMILAR,
    '<': PARTICIPLE_OF,
    '\\': PERTAINYM,
    '+': FRAMES,
    ';c': CLASSIF_CATEGORY,
    ';u': CLASSIF_USAGE,
    ';r': CLASSIF_REGIONAL,
    '-c': CLASS_CATEGORY,
    '-u': CLASS_USAGE,
    '-r': CLASS_REGIONAL,
    '@i': INSTANCE_HYPERNYM,
    '~i': INSTANCE_HYPONYM,
}
643
644
645
646 -def _index(key, sequence, testfn=None, keyfn=None):
647 """
648 Return the index of key within sequence, using testfn for
649 comparison and transforming items of sequence by keyfn first.
650
651 >>> _index('e', 'hello')
652 1
653 >>> _index('E', 'hello', testfn=_equalsIgnoreCase)
654 1
655 >>> _index('x', 'hello')
656 """
657 index = 0
658 for element in sequence:
659 value = element
660 if keyfn:
661 value = keyfn(value)
662 if (not testfn and value == key) or (testfn and testfn(value, key)):
663 return index
664 index = index + 1
665 return None
666
668 """
669 Partition sequence into C{count} subsequences of
670 length C{size}, and a remainder.
671
672 Return C{(partitions, remainder)}, where C{partitions} is a sequence of
673 C{count} subsequences of cardinality C{size}, and
674 C{apply(append, partitions) + remainder == sequence}.
675 """
676
677 partitions = []
678 for index in range(0, size * count, size):
679 partitions.append(sequence[index:index + size])
680 return (partitions, sequence[size * count:])
681
683 """
684 Return -1, 0, or 1 according to a comparison first by type,
685 then by class, and finally by each of fields. Used when comparing two
686 Wordnet objects (Synsets, Words, or Senses) to each other.
687 """
688 if not hasattr(b, '__class__'):
689 return cmp(type(a), type(b))
690 elif a.__class__ != b.__class__:
691 return cmp(a.__class__, b.__class__)
692
693 for field in fields:
694 diff = cmp(getattr(a, field), getattr(b, field))
695 if diff: return diff
696
697 return 0
698
700 """
701 Return true iff a and b have the same lowercase representation.
702
703 >>> _equalsIgnoreCase('dog', 'Dog')
704 True
705 >>> _equalsIgnoreCase('dOg', 'DOG')
706 True
707 """
708 return a == b or a.lower() == b.lower()
709
def demo():
    """
    Print a tour of the WordNet interface to standard output: word and
    synset lookups, verb frames, relations, glosses, hypernym paths and
    distances, closures, trees and the similarity measures.

    Every C{print} call takes a single argument so that the output is
    the same whether C{print} is a statement or a function.
    """
    from nltk import wordnet
    from pprint import pprint

    dog = wordnet.N['dog']
    cat = wordnet.N['cat']

    print("wordnet.N['dog']")
    print('dog' in wordnet.N)
    print(dog)
    print("%s %s" % (dog.pos, dog.form))
    print(dog.taggedSenseCount)
    print(dog.synsets())
    print(dog.isTagged())

    print("Verb Frames: %s" % (wordnet.V['think'][0].verbFrameStrings,))

    print("Relations:")
    print(dog[0].relations())
    print(dog[0][wordnet.HYPERNYM])

    print("Glosses:")
    print(dog[0].gloss)
    print(dog[0].relation(wordnet.HYPERNYM)[0].gloss)

    print("")
    print("Paths and Distances:")
    print("")

    print(dog[0].hypernym_paths())
    print(dog[0].hypernym_distances(0))
    print(dog[0].shortest_path_distance(cat[0]))

    print("")
    print("Closures and Trees:")
    print("")

    # closure() is a generator; materialise it so that the synsets are
    # printed rather than the generator object.
    pprint(list(wordnet.ADJ['red'][0].closure(wordnet.SIMILAR, depth=1)))
    pprint(list(wordnet.ADJ['red'][0].closure(wordnet.SIMILAR, depth=2)))
    pprint(dog[0].tree(wordnet.HYPERNYM))
    pprint(dog[0].tree(wordnet.HYPERNYM, depth=2, cut_mark='...'))

    entity = wordnet.N["entity"]
    print("%s %s" % (entity, entity[0]))
    print(entity[0][wordnet.HYPONYM])
    pprint(entity[0].tree(wordnet.HYPONYM, depth=1), indent=4)
    abstract_entity = wordnet.N["abstract entity"]
    print("%s %s" % (abstract_entity, abstract_entity[0]))
    print(abstract_entity[0][wordnet.HYPONYM])
    pprint(abstract_entity[0].tree(wordnet.HYPONYM, depth=1), indent=4)

    print("All the words in the hyponym synsets of dog[0]")
    print([word for synset in dog[0][wordnet.HYPONYM] for word in synset])

    print("Hyponyms of the first (and only) sense of 'animal' that are homophonous with verbs:")
    print([word
           for synset in wordnet.N['animal'][0].closure(wordnet.HYPONYM)
           for word in synset
           if word in wordnet.V])

    print("Senses of 'raise'(v.) and 'lower'(v.) that are antonyms:")
    print([(r, l)
           for r in wordnet.V['raise']
           for l in wordnet.V['lower']
           if r in l[wordnet.ANTONYM]])

    print("")
    print("Similarity: dog~cat")
    print("")

    print("Path Distance Similarity: %s" % (dog[0].path_similarity(cat[0]),))
    print("Leacock Chodorow Similarity: %s" % (dog[0].lch_similarity(cat[0]),))
    print("Wu Palmer Similarity: %s" % (dog[0].wup_similarity(cat[0]),))


if __name__ == '__main__':
    demo()
801