1
2
3
4
5
6
7
"""
Simple classifier for RTE corpus.

It calculates the overlap in words and named entities between text and
hypothesis, and also whether there are words / named entities in the
hypothesis which fail to occur in the text, since this is an indicator that
the hypothesis is more informative than (i.e. not entailed by) the text.

TO DO: better Named Entity classification
TO DO: add lemmatization
"""
19
20 import nltk
21 from util import accuracy
22
def ne(token):
    """
    Crude named-entity test: a token counts as a named entity when it is
    title-cased or written entirely in upper case.

    @param token: the token to inspect
    @type token: C{str}
    @return: C{True} if C{token} looks like a named entity
    @rtype: C{bool}
    """
    # NOTE(review): the ``def`` line is missing from the listing this was
    # recovered from; name and argument come from the ``ne(token)`` call
    # sites and the docstring.
    return token.istitle() or token.isupper()
34
43
class RTEFeatureExtractor(object):
    """
    This builds a bag of words for both the text and the hypothesis after
    throwing away some stopwords, then calculates overlap and difference.
    """
    # NOTE(review): the ``class`` and ``def`` header lines are missing from
    # the listing this was recovered from.  Names come from the visible call
    # sites (``RTEFeatureExtractor(rtepair)``, ``extractor.overlap('word')``,
    # ``extractor.hyp_extra('ne')``); the defaults of ``stop``, ``lemmatize``
    # and ``debug`` are assumptions -- confirm against the real module.

    # Function words and punctuation dropped when ``stop`` is true.
    _STOPWORDS = frozenset(['a', 'the', 'it', 'they', 'of', 'in', 'to',
                            'have', 'is', 'are', 'were', 'and', 'very',
                            '.', ','])

    # Negation cues.  The listing had ``'failed' 'rejected'`` without a
    # comma, which Python silently concatenates into 'failedrejected'.
    _NEGWORDS = frozenset(['no', 'not', 'never', 'failed', 'rejected',
                           'denied'])

    # Initialisms (U.S.), plain words, and dollar amounts.  Raw string so the
    # backslashes are unambiguous; the group is non-capturing because
    # RegexpTokenizer patterns must not contain capturing parentheses.
    _TOKEN_PATTERN = r'(?:[A-Z]\.)+|\w+|\$[\d\.]+'

    def __init__(self, rtepair, stop=True, lemmatize=False):
        """
        @param rtepair: a L{RTEPair} from which features should be extracted
        @param stop: if C{True}, stopwords are thrown away.
        @type stop: C{bool}
        @param lemmatize: a callable mapping a token to its lemma, applied
            to every token before the bags of words are built; any false
            value disables lemmatization.
        @raise TypeError: if C{lemmatize} is true but not callable
        """
        # The original used ``lemmatize`` both as a flag and as a function,
        # so a bare ``True`` failed with an opaque "'bool' object is not
        # callable".  Fail early with the same exception type instead.
        if lemmatize and not callable(lemmatize):
            raise TypeError("lemmatize must be a callable mapping a token "
                            "to its lemma, got %r" % (lemmatize,))

        self.stop = stop
        # Per-instance copies, so callers may still mutate them freely.
        self.stopwords = set(self._STOPWORDS)
        self.negwords = set(self._NEGWORDS)

        from nltk.tokenize import RegexpTokenizer
        tokenizer = RegexpTokenizer(self._TOKEN_PATTERN)

        self.text_tokens = tokenizer.tokenize(rtepair.text)
        self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)

        if lemmatize:
            self.text_words = set(lemmatize(token)
                                  for token in self.text_tokens)
            self.hyp_words = set(lemmatize(token)
                                 for token in self.hyp_tokens)
        else:
            self.text_words = set(self.text_tokens)
            self.hyp_words = set(self.hyp_tokens)

        if self.stop:
            self.text_words = self.text_words - self.stopwords
            self.hyp_words = self.hyp_words - self.stopwords

        # Words shared by both sides, and words unique to each side.
        self._overlap = self.hyp_words & self.text_words
        self._hyp_extra = self.hyp_words - self.text_words
        self._txt_extra = self.text_words - self.hyp_words

    def overlap(self, toktype, debug=False):
        """
        Compute the overlap between text and hypothesis.

        @param toktype: distinguish Named Entities from ordinary words
        @type toktype: 'ne' or 'word'
        @param debug: if true, print the selected set before returning it
        @rtype: C{set}
        @raise ValueError: if C{toktype} is neither 'ne' nor 'word'
        """
        ne_overlap = set(token for token in self._overlap if ne(token))
        if toktype == 'ne':
            selected, label = ne_overlap, "ne overlap"
        elif toktype == 'word':
            selected, label = self._overlap - ne_overlap, "word overlap"
        else:
            raise ValueError("Type not recognized:'%s'" % toktype)
        if debug:
            # Single-argument form prints the same text on Python 2 and 3.
            print("%s %s" % (label, selected))
        return selected

    def hyp_extra(self, toktype, debug=False):
        """
        Compute the extraneous material in the hypothesis.

        @param toktype: distinguish Named Entities from ordinary words
        @type toktype: 'ne' or 'word'
        @param debug: if true, print the selected set before returning it
            (added for parity with L{overlap}; off by default)
        @rtype: C{set}
        @raise ValueError: if C{toktype} is neither 'ne' nor 'word'
        """
        ne_extra = set(token for token in self._hyp_extra if ne(token))
        if toktype == 'ne':
            selected, label = ne_extra, "ne hyp_extra"
        elif toktype == 'word':
            selected, label = self._hyp_extra - ne_extra, "word hyp_extra"
        else:
            raise ValueError("Type not recognized: '%s'" % toktype)
        if debug:
            print("%s %s" % (label, selected))
        return selected
115
116
def rte_features(rtepair):
    """
    Build the feature dictionary for one text/hypothesis pair.

    The counts come from an L{RTEFeatureExtractor} built with its default
    options: shared and hypothesis-only tokens, split into named entities
    and ordinary words, plus the number of negation words on each side.

    @param rtepair: the pair handed to L{RTEFeatureExtractor}
    @return: a mapping from feature name to value
    @rtype: C{dict}
    """
    # NOTE(review): the ``def`` line is missing from the listing this was
    # recovered from.  The argument name comes from the body; the function
    # name is inferred by analogy with ``rte_classifier`` -- confirm.
    extractor = RTEFeatureExtractor(rtepair)
    return {
        'alwayson': True,
        'word_overlap': len(extractor.overlap('word')),
        'word_hyp_extra': len(extractor.hyp_extra('word')),
        'ne_overlap': len(extractor.overlap('ne')),
        'ne_hyp_extra': len(extractor.hyp_extra('ne')),
        'neg_txt': len(extractor.negwords & extractor.text_words),
        'neg_hyp': len(extractor.negwords & extractor.hyp_words),
    }
128
129
def rte_classifier(trainer, features=None):
    """
    Classify RTEPairs

    Trains on the RTE1-3 development files, prints the accuracy measured on
    the RTE1-3 test files, and returns the trained classifier.

    @param trainer: callable that takes a list of C{(featureset, label)}
        tuples and returns a classifier
    @param features: callable mapping a pair to its featureset; when
        omitted, the module's C{rte_features} is used
    @return: whatever C{trainer} returned
    """
    # NOTE(review): the ``def`` line is missing from the listing this was
    # recovered from.  ``trainer`` and ``features`` are the two free names in
    # the body; the default for ``features`` is an assumption.  It is
    # resolved lazily so this definition does not depend on definition order.
    if features is None:
        features = rte_features

    dev_files = ['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml']
    test_files = ['rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml']
    train = [(pair, pair.value) for pair in nltk.corpus.rte.pairs(dev_files)]
    test = [(pair, pair.value) for pair in nltk.corpus.rte.pairs(test_files)]

    # Parenthesised single-argument prints behave the same on Python 2 and 3.
    print('Training classifier...')
    classifier = trainer([(features(pair), label) for (pair, label) in train])

    print('Testing classifier...')
    acc = accuracy(classifier,
                   [(features(pair), label) for (pair, label) in test])
    print('Accuracy: %6.4f' % acc)

    return classifier
148
149
156
157
165
166
def demo():
    """
    Train and evaluate a maximum-entropy RTE classifier.

    Uses the megam optimizer when C{/usr/local/bin/megam} can be configured,
    and asks for the 'BFGS' algorithm otherwise.
    """
    # NOTE(review): the ``def`` line is missing from the listing this was
    # recovered from; the name comes from the ``demo()`` call in the
    # ``__main__`` guard.
    import nltk

    try:
        nltk.config_megam('/usr/local/bin/megam')
        algorithm = 'megam'
    except (ValueError, LookupError):
        # ValueError is what the original caught.  LookupError is added on
        # the assumption that NLTK reports a missing binary that way --
        # NOTE(review): confirm against the installed NLTK version.
        algorithm = 'BFGS'

    # The original wrapped the *creation* of the BFGS lambda in a second
    # try/except ValueError; defining a lambda cannot raise, so that branch
    # was unreachable and has been dropped without changing behaviour.
    def trainer(train_toks):
        return nltk.MaxentClassifier.train(train_toks, algorithm)

    nltk.classify.rte_classifier(trainer)
178
# Run all three demos when the module is executed as a script.
if __name__ == '__main__':
    demo_features()
    demo_feature_extractor()
    demo()
183