Package nltk :: Package classify :: Module rte_classify
[hide private]
[frames | no frames]

Source Code for Module nltk.classify.rte_classify

  1  # Natural Language Toolkit: RTE Classifier 
  2  # 
  3  # Copyright (C) 2001-2011 NLTK Project 
  4  # Author: Ewan Klein <ewan@inf.ed.ac.uk> 
  5  # URL: <http://www.nltk.org/> 
  6  # For license information, see LICENSE.TXT 
  7   
  8  """ 
  9  Simple classifier for RTE corpus. 
 10   
 11  It calculates the overlap in words and named entities between text and 
 12  hypothesis, and also whether there are words / named entities in the 
 13  hypothesis which fail to occur in the text, since this is an indicator that 
 14  the hypothesis is more informative than (i.e. not entailed by) the text. 
 15   
 16  TO DO: better Named Entity classification 
 17  TO DO: add lemmatization 
 18  """ 
 19   
 20  import nltk 
 21  from util import accuracy 
 22   
def ne(token):
    """
    Crude named-entity test based purely on capitalisation.

    A token is treated as a named entity when it is title-cased
    (e.g. C{'London'}) or written entirely in upper case (e.g. C{'NATO'}).

    @param token: the token to inspect
    @type token: C{str}
    @return: C{True} if the token looks like a named entity
    @rtype: C{bool}
    """
    return token.istitle() or token.isupper()
34
def lemmatize(word):
    """
    Use morphy from WordNet to find the base form of verbs.

    @param word: the surface form to reduce
    @type word: C{str}
    @return: the verb lemma found by WordNet's morphy, or C{word} itself
        when morphy returns C{None}
    """
    wordnet = nltk.corpus.wordnet
    # morphy() expects one of the WordNet reader's own part-of-speech tags;
    # the reader exposes the verb tag as the constant VERB.  The literal
    # string 'verb' used previously is not one of those tags.
    lemma = wordnet.morphy(word, pos=wordnet.VERB)
    if lemma is not None:
        return lemma
    return word
43
class RTEFeatureExtractor(object):
    """
    This builds a bag of words for both the text and the hypothesis after
    throwing away some stopwords, then calculates overlap and difference.
    """

    def __init__(self, rtepair, stop=True, lemmatize=False):
        """
        @param rtepair: a L{RTEPair} from which features should be extracted
        @param stop: if C{True}, stopwords are thrown away.
        @type stop: C{bool}
        @param lemmatize: if C{True}, every token is passed through the
            module-level C{lemmatize()} function before the bags of words
            are built.
        @type lemmatize: C{bool}
        """
        self.stop = stop
        self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to',
                              'have', 'is', 'are', 'were', 'and', 'very',
                              '.', ','])

        # Each word needs its own comma: adjacent string literals are
        # silently concatenated ('failed' 'rejected' -> 'failedrejected').
        self.negwords = set(['no', 'not', 'never', 'failed', 'rejected',
                             'denied'])

        # Try to tokenize so that abbreviations like U.S. and monetary
        # amounts like "$23.00" are kept as tokens.
        from nltk.tokenize import RegexpTokenizer
        tokenizer = RegexpTokenizer(r'([A-Z]\.)+|\w+|\$[\d\.]+')

        # Get the set of word types for text and hypothesis
        self.text_tokens = tokenizer.tokenize(rtepair.text)
        self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
        self.text_words = set(self.text_tokens)
        self.hyp_words = set(self.hyp_tokens)

        if lemmatize:
            # Inside this method the name ``lemmatize`` is the boolean
            # flag, so the function has to be reached through the helper.
            self.text_words = set([self._lemmatize_token(token)
                                   for token in self.text_tokens])
            self.hyp_words = set([self._lemmatize_token(token)
                                  for token in self.hyp_tokens])

        if self.stop:
            self.text_words = self.text_words - self.stopwords
            self.hyp_words = self.hyp_words - self.stopwords

        self._overlap = self.hyp_words & self.text_words
        self._hyp_extra = self.hyp_words - self.text_words
        self._txt_extra = self.text_words - self.hyp_words

    @staticmethod
    def _lemmatize_token(token):
        """
        Apply the module-level C{lemmatize()} function to one token.

        The name is resolved here, outside C{__init__}, because the
        C{lemmatize} parameter of C{__init__} shadows the function there.
        """
        return lemmatize(token)

    def overlap(self, toktype, debug=False):
        """
        Compute the overlap between text and hypothesis.

        @param toktype: distinguish Named Entities from ordinary words
        @type toktype: 'ne' or 'word'
        @param debug: if C{True}, print the set that is about to be returned
        @type debug: C{bool}
        @return: the shared tokens of the requested kind
        @rtype: C{set}
        @raise ValueError: if C{toktype} is neither 'ne' nor 'word'
        """
        ne_overlap = set([token for token in self._overlap if ne(token)])
        if toktype == 'ne':
            if debug:
                print("%s %s" % ("ne overlap", ne_overlap))
            return ne_overlap
        elif toktype == 'word':
            if debug:
                print("%s %s" % ("word overlap", self._overlap - ne_overlap))
            return self._overlap - ne_overlap
        else:
            raise ValueError("Type not recognized:'%s'" % toktype)

    def hyp_extra(self, toktype, debug=True):
        """
        Compute the extraneous material in the hypothesis.

        @param toktype: distinguish Named Entities from ordinary words
        @type toktype: 'ne' or 'word'
        @param debug: accepted for signature compatibility; this method
            does not currently use it.
        @type debug: C{bool}
        @return: hypothesis tokens of the requested kind that are absent
            from the text
        @rtype: C{set}
        @raise ValueError: if C{toktype} is neither 'ne' nor 'word'
        """
        ne_extra = set([token for token in self._hyp_extra if ne(token)])
        if toktype == 'ne':
            return ne_extra
        elif toktype == 'word':
            return self._hyp_extra - ne_extra
        else:
            raise ValueError("Type not recognized: '%s'" % toktype)
115 116
def rte_features(rtepair):
    """
    Build the feature dictionary for a single RTE pair.

    The features are a constant bias flag, the sizes of the word and
    named-entity overlap between text and hypothesis, the sizes of the
    word and named-entity material found only in the hypothesis, and the
    number of negation words present in the text and in the hypothesis.

    @param rtepair: the pair handed to L{RTEFeatureExtractor}
    @return: a mapping from feature name to feature value
    @rtype: C{dict}
    """
    extractor = RTEFeatureExtractor(rtepair)
    features = {'alwayson': True}
    # Ordinary words first, then named entities, for both measures.
    for toktype in ('word', 'ne'):
        features[toktype + '_overlap'] = len(extractor.overlap(toktype))
        features[toktype + '_hyp_extra'] = len(extractor.hyp_extra(toktype))
    negations = extractor.negwords
    features['neg_txt'] = len(negations & extractor.text_words)
    features['neg_hyp'] = len(negations & extractor.hyp_words)
    return features
128 129
def rte_classifier(trainer, features=rte_features):
    """
    Classify RTEPairs

    Trains on the RTE 1-3 development files, prints the accuracy obtained
    on the RTE 1-3 test files, and returns the trained classifier.

    @param trainer: callable that takes a list of C{(featureset, label)}
        tuples and returns a classifier
    @param features: callable that maps an RTE pair to its featureset
    @return: whatever C{trainer} returned
    """
    dev_files = ['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml']
    test_files = ['rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml']

    def labelled_pairs(fileids):
        # Pair every corpus item with its gold-standard value.
        return [(pair, pair.value) for pair in nltk.corpus.rte.pairs(fileids)]

    def featurize(labelled):
        # Swap each pair for its featureset, keeping the label.
        return [(features(pair), label) for (pair, label) in labelled]

    # Both corpora are read before any training output is produced.
    train = labelled_pairs(dev_files)
    test = labelled_pairs(test_files)

    # Train up a classifier.
    print('Training classifier...')
    classifier = trainer(featurize(train))

    # Run the classifier on the test data.
    print('Testing classifier...')
    acc = accuracy(classifier, featurize(test))
    print('Accuracy: %6.4f' % acc)

    # Return the classifier
    return classifier
148 149
def demo_features():
    """
    Print the features of the first six pairs in C{rte1_dev.xml}.

    For each pair a blank line is printed, followed by one
    C{name => value} line per feature, sorted by feature name.
    """
    pairs = nltk.corpus.rte.pairs(['rte1_dev.xml'])[:6]
    for pair in pairs:
        print("")
        # Extract the features once per pair instead of re-running the
        # extractor for every key that is printed.
        features = rte_features(pair)
        for key in sorted(features):
            print("%-15s => %s" % (key, features[key]))
156 157
def demo_feature_extractor():
    """
    Show what L{RTEFeatureExtractor} produces for one pair.

    Uses the pair at index 33 of C{rte3_dev.xml} and prints, in order, the
    hypothesis words, the word overlap, the named-entity overlap and the
    extra words in the hypothesis.
    """
    sample = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]
    extractor = RTEFeatureExtractor(sample)
    results = (
        extractor.hyp_words,
        extractor.overlap('word'),
        extractor.overlap('ne'),
        extractor.hyp_extra('word'),
    )
    for result in results:
        print(result)
165 166
def demo():
    """
    Train and evaluate the RTE classifier with a maximum entropy trainer.

    megam is configured from C{/usr/local/bin/megam} and used as the
    training algorithm; if that configuration raises C{ValueError}, the
    'BFGS' algorithm is selected instead.
    """
    import nltk
    try:
        nltk.config_megam('/usr/local/bin/megam')

        def trainer(train_toks):
            return nltk.MaxentClassifier.train(train_toks, 'megam')
    except ValueError:
        try:
            def trainer(train_toks):
                return nltk.MaxentClassifier.train(train_toks, 'BFGS')
        except ValueError:
            # NOTE(review): merely defining the trainer above cannot raise
            # ValueError, so this last-resort fallback looks unreachable.
            trainer = nltk.MaxentClassifier.train
    nltk.classify.rte_classifier(trainer)


if __name__ == '__main__':
    demo_features()
    demo_feature_extractor()
    demo()