1
2
3
4
5
6
7
8
9 """
10 Read from the Senseval 2 Corpus.
11
12 SENSEVAL [http://www.senseval.org/]
13 Evaluation exercises for Word Sense Disambiguation.
14 Organized by ACL-SIGLEX [http://www.siglex.org/]
15
16 Prepared by Ted Pedersen <tpederse@umn.edu>, University of Minnesota,
17 http://www.d.umn.edu/~tpederse/data.html
18 Distributed with permission.
19
20 The NLTK version of the Senseval 2 files uses well-formed XML.
21 Each instance of the ambiguous words "hard", "interest", "line", and "serve"
22 is tagged with a sense identifier, and supplied with context.
23 """
24
25 import os
26 import re
27 import xml.sax
28 from xmldocs import XMLCorpusReader
29
30 from nltk.tokenize import *
31 from nltk.etree import ElementTree
32
33 from util import *
34 from api import *
35
class SensevalInstance(object):
    """
    One sense-tagged occurrence of an ambiguous word, with its context.

    Attributes (all set by the constructor):
      word     -- the ``word`` argument, stored unchanged.
      position -- the ``position`` argument, stored unchanged.  The reader
                  in this file passes the index of the head word within
                  ``context``, or None when no head was found.
      context  -- the ``context`` argument, stored unchanged (not copied).
                  The reader in this file passes a list whose items are
                  plain strings or ``(text, pos)`` tuples.
      senses   -- ``tuple(senses)``; the sense identifiers for this
                  occurrence.
    """

    def __init__(self, word, position, context, senses):
        """
        Store the four fields of the instance.

        ``senses`` may be any iterable; it is frozen into a tuple so the
        stored value cannot be changed through the caller's own list.
        """
        self.word = word
        self.senses = tuple(senses)
        self.position = position
        self.context = context

    def __repr__(self):
        """Return a constructor-style string showing all four fields."""
        return ('SensevalInstance(word=%r, position=%r, '
                'context=%r, senses=%r)' %
                (self.word, self.position, self.context, self.senses))
46
69
70
78
80
# NOTE(review): the `def` line that opens this method is missing from this
# listing; the body below uses `self` and `stream`, so it is presumably a
# method taking an open, seekable stream -- confirm against the full file.
#
# Purpose: read lines from `stream` until one complete
# <instance>...</instance> element has been collected, and return a
# one-element list holding the parsed instance.  Returns [] once the stream
# is exhausted.
#
# Work out which lexical element the current stream position belongs to:
# take the last entry of self._lexelt_starts that is <= stream.tell().
# (assumes self._lexelt_starts is sorted ascending -- TODO confirm)
81 lexelt_num = bisect.bisect_right(self._lexelt_starts, stream.tell())-1
82 lexelt = self._lexelts[lexelt_num]
83
# instance_lines accumulates the raw lines of the <instance> element being
# collected; in_instance is True while we are between its start and end tags.
84 instance_lines = []
85 in_instance = False
86 while True:
87 line = stream.readline()
# readline() returning '' means end of stream.  No instance may be left
# half-collected at that point.
88 if line == '':
89 assert instance_lines == []
90 return []
91
92
# A line starting with <lexelt opens the next lexical element.  Its name is
# the quoted value of the item= attribute; [1:-1] strips the quote marks.
93 if line.lstrip().startswith('<lexelt'):
94 lexelt_num += 1
95 m = re.search('item=("[^"]+"|\'[^\']+\')', line)
96 assert m is not None
97 lexelt = m.group(1)[1:-1]
# If this element was already recorded, it must carry the same name as
# before; otherwise record its name together with the stream position just
# past this line.
98 if lexelt_num < len(self._lexelts):
99 assert lexelt == self._lexelts[lexelt_num]
100 else:
101 self._lexelts.append(lexelt)
102 self._lexelt_starts.append(stream.tell())
103
104
# A line starting with <instance begins a new instance; the previous one must
# already have been returned.
105 if line.lstrip().startswith('<instance'):
106 assert instance_lines == []
107 in_instance = True
108
109
# Collect every line of the instance, including its opening and closing tags.
110 if in_instance:
111 instance_lines.append(line)
112
113
# A line starting with </instance completes the element: clean up the
# pseudo-XML, parse it, and return the single parsed instance in a list.
# Note that the collected lines still carry whatever line ending readline()
# returned, so the '\n'.join adds a further newline between them.
114 if line.lstrip().startswith('</instance'):
115 xml_block = '\n'.join(instance_lines)
116 xml_block = _fixXML(xml_block)
117 inst = ElementTree.fromstring(xml_block)
118 return [self._parse_instance(inst, lexelt)]
119
def _parse_instance(self, instance, lexelt):
    """
    Convert one parsed <instance> element into a SensevalInstance.

    Parameters:
      instance -- an ElementTree element whose children are <answer> and
                  <context> elements.
      lexelt   -- passed through unchanged as the first argument of the
                  returned SensevalInstance.

    Returns SensevalInstance(lexelt, position, context, senses), where:
      senses   -- the 'senseid' attribute of every <answer> child, in order.
      context  -- a list of tokens.  Free text is split with
                  self._word_tokenizer.tokenize() and contributes plain
                  strings; each <wf> element contributes a
                  (text, pos-attribute) tuple.
      position -- the index in ``context`` of the <head> word, or None if
                  the context contains no <head>.

    Raises AssertionError when the markup is not of the expected shape
    (unknown tag, two heads, a head that has both text and a child or
    neither), and KeyError when a required 'senseid' or 'pos' attribute is
    missing.
    """
    senses = []
    context = []
    position = None
    for child in instance:
        if child.tag == 'answer':
            senses.append(child.attrib['senseid'])
        elif child.tag == 'context':
            # ElementTree reports .text as None when the element begins
            # directly with a child tag; there is nothing to tokenize then.
            if child.text is not None:
                context += self._word_tokenizer.tokenize(child.text)
            for cword in child:
                if cword.tag == 'compound':
                    # Only the first child of a <compound> is examined; its
                    # tail (not the compound's) is what gets tokenized below.
                    cword = cword[0]

                if cword.tag == 'head':
                    # Sanity checks: at most one head per instance, and the
                    # head holds either literal text or exactly one child
                    # element, never both and never neither.
                    assert position is None, 'head specified twice'
                    # .text is None when <head> wraps only a child element.
                    head_text = (cword.text or '').strip()
                    assert head_text or len(cword) == 1
                    assert not (head_text and len(cword) == 1)

                    # The head is the next token to be appended.
                    position = len(context)

                    if head_text:
                        context.append(head_text)
                    elif cword[0].tag == 'wf':
                        context.append((cword[0].text,
                                        cword[0].attrib['pos']))
                        if cword[0].tail:
                            context += self._word_tokenizer.tokenize(
                                cword[0].tail)
                    else:
                        raise AssertionError(
                            'expected CDATA or wf in <head>')
                elif cword.tag == 'wf':
                    context.append((cword.text, cword.attrib['pos']))
                elif cword.tag == 's':
                    # <s> contributes no token of its own; only its tail
                    # text (handled below) is used.
                    pass
                else:
                    # Raised explicitly (rather than `assert False`) so the
                    # check survives `python -O`; the offending tag is put in
                    # the message instead of being printed to stdout.
                    raise AssertionError(
                        'expected CDATA or <wf> or <head>, got <%s>'
                        % cword.tag)
                # Text that follows the element, up to the next sibling.
                if cword.tail:
                    context += self._word_tokenizer.tokenize(cword.tail)
        else:
            raise AssertionError('unexpected tag %s' % child.tag)
    return SensevalInstance(lexelt, position, context, senses)
164
def _fixXML(text):
    """
    Fix the various issues with Senseval pseudo-XML.

    Takes the raw text of one block of the corpus file and returns a string
    with the substitutions below applied.  The substitutions are applied in
    the order written, and later ones rely on the output of earlier ones,
    so the order must not be changed.
    """
    # <~> or <^>  =>  the bare character.
    text = re.sub(r'<([~\^])>', r'\1', text)
    # A lone ampersand surrounded by whitespace is escaped as &amp;.  This
    # must happen before the stray-ampersand removal further down, which
    # would otherwise delete it.
    text = re.sub(r'(\s+)\&(\s+)', r'\1&amp;\2', text)
    # Three double quotes in a row become a double quote wrapped in single
    # quotes, e.g. <p="""/> becomes <p='"'/>, which the last substitution
    # in this function recognises.
    text = re.sub(r'"""', '\'"\'', text)
    # An unquoted snum value: quote it and make the tag self-closing,
    # e.g. <s snum=1>  =>  <s snum="1"/>.
    text = re.sub(r'(<[^<]*snum=)([^">]+)>', r'\1"\2"/>', text)
    # <&frasl> followed by a <p...> tag is replaced by the word FRASL.
    text = re.sub(r'<\&frasl>\s*<p[^>]*>', 'FRASL', text)
    # Drop any tag that starts with <&I.
    text = re.sub(r'<\&I[^>]*>', '', text)
    # <{word}>  =>  word.
    text = re.sub(r'<{([^}]+)}>', r'\1', text)
    # Drop <@>, <p> and </p>.
    text = re.sub(r'<(@|/?p)>', r'', text)
    # Drop tags of the form <&name .>.
    text = re.sub(r'<&\w+ \.>', r'', text)
    # Drop any DOCTYPE declaration.
    text = re.sub(r'<!DOCTYPE[^>]*>', r'', text)
    # Drop tags that begin with <[ or <[/.
    text = re.sub(r'<\[\/?[^>]+\]*>', r'', text)
    # <&name;>  =>  &name;  (take the entity out of its angle brackets).
    text = re.sub(r'<(\&\w+;)>', r'\1', text)
    # Delete every remaining ampersand that does not begin one of the five
    # predefined XML entity names.
    text = re.sub(r'&(?!amp|gt|lt|apos|quot)', r'', text)
    # word <p="POS"/>  =>  <wf pos="POS">word</wf>.
    text = re.sub(r'[ \t]*([^<>\s]+?)[ \t]*<p="([^"]*"?)"/>',
                  r' <wf pos="\2">\1</wf>', text)
    # Same conversion for a double-quote token, whose tag is <p='"'/>.
    text = re.sub(r'\s*"\s*<p=\'"\'/>', " <wf pos='\"'>\"</wf>", text)
    return text
200