Package nltk :: Package corpus :: Package reader :: Module senseval
[hide private]
[frames | no frames]

Source Code for Module nltk.corpus.reader.senseval

  1  # Natural Language Toolkit: Senseval 2 Corpus Reader 
  2  # 
  3  # Copyright (C) 2001-2011 NLTK Project 
  4  # Author: Trevor Cohn <tacohn@cs.mu.oz.au> 
  5  #         Steven Bird <sb@csse.unimelb.edu.au> (modifications) 
  6  # URL: <http://www.nltk.org/> 
  7  # For license information, see LICENSE.TXT 
  8   
  9  """ 
 10  Read from the Senseval 2 Corpus. 
 11   
 12  SENSEVAL [http://www.senseval.org/] 
 13  Evaluation exercises for Word Sense Disambiguation. 
 14  Organized by ACL-SIGLEX [http://www.siglex.org/] 
 15   
 16  Prepared by Ted Pedersen <tpederse@umn.edu>, University of Minnesota, 
 17  http://www.d.umn.edu/~tpederse/data.html 
 18  Distributed with permission. 
 19   
 20  The NLTK version of the Senseval 2 files uses well-formed XML. 
 21  Each instance of the ambiguous words "hard", "interest", "line", and "serve" 
 22  is tagged with a sense identifier, and supplied with context. 
 23  """        
 24   
 25  import os 
 26  import re 
 27  import xml.sax 
 28  from xmldocs import XMLCorpusReader 
 29   
 30  from nltk.tokenize import * 
 31  from nltk.etree import ElementTree 
 32   
 33  from util import * 
 34  from api import * 
 35   
class SensevalInstance(object):
    """
    One sense-tagged occurrence of an ambiguous word.

    @ivar word: The value passed as C{word}, stored unchanged.
    @ivar position: The value passed as C{position}, stored unchanged.
    @ivar context: The value passed as C{context}, stored unchanged.
    @ivar senses: The sense identifiers, frozen into a tuple.
    """

    def __init__(self, word, position, context, senses):
        """
        @param word: The ambiguous word this instance is about.
        @param position: Location of the word within C{context}.
        @param context: The tokens surrounding (and including) the word.
        @param senses: Any iterable of sense identifiers; it is copied
            into a tuple.
        """
        self.word = word
        # Copy into a tuple so the stored senses cannot be mutated
        # through the caller's original sequence.
        self.senses = tuple(senses)
        self.position = position
        self.context = context

    def __repr__(self):
        """
        @return: A constructor-like string showing all four attributes.
        """
        template = ('SensevalInstance(word=%r, position=%r, '
                    'context=%r, senses=%r)')
        values = (self.word, self.position, self.context, self.senses)
        return template % values
46
class SensevalCorpusReader(CorpusReader):
    """
    Corpus reader for the Senseval 2 files.

    L{instances()} exposes the corpus as L{SensevalInstance} objects;
    L{raw()} returns the unparsed file contents.
    """

    def instances(self, fileids=None):
        """
        @param fileids: Passed through to C{self.abspaths}; C{None}
            presumably selects every file in the corpus (confirm against
            C{CorpusReader.abspaths}).
        @return: The concatenation of one L{SensevalCorpusView} per
            C{(fileid, encoding)} pair yielded by C{self.abspaths}.
        """
        return concat([SensevalCorpusView(fileid, enc)
                       for (fileid, enc) in self.abspaths(fileids, True)])

    def raw(self, fileids=None):
        """
        @param fileids: A single file identifier, a list of them, or
            C{None} to use C{self._fileids}.
        @return: the text contents of the given fileids, as a single string.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, basestring):
            fileids = [fileids]

        contents = []
        for f in fileids:
            stream = self.open(f)
            # Close each stream explicitly instead of relying on garbage
            # collection, so handles are released even if read() raises.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def _entry(self, tree):
        """
        Collect C{(sense, context)} pairs from a parsed corpus tree.

        For every C{instance} element under every C{lexelt} element, the
        sense is the C{senseid} attribute of the instance's first child
        and the context is a list of C{(text, pos)} pairs built from the
        children of the instance's second child.

        @param tree: An element supporting C{findall('lexelt')}.
        @return: A list of C{(sense, context)} tuples.
        """
        elts = []
        for lexelt in tree.findall('lexelt'):
            for inst in lexelt.findall('instance'):
                sense = inst[0].attrib['senseid']
                context = [(w.text, w.attrib['pos'])
                           for w in inst[1]]
                elts.append((sense, context))
        return elts
69 70
class SensevalCorpusView(StreamBackedCorpusView):
    """
    Stream-backed view of one Senseval file.

    Each call to L{read_block()} consumes the stream up to the end of the
    next C{<instance>} element and returns it as a L{SensevalInstance}.
    """

    def __init__(self, fileid, encoding):
        """
        @param fileid: The file to read, passed to the base class.
        @param encoding: The file's encoding, passed to the base class.
        """
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)

        self._word_tokenizer = WhitespaceTokenizer()
        # Parallel lists filled in as <lexelt> lines are encountered:
        # _lexelts[i] is the name in force from stream position
        # _lexelt_starts[i] onwards.  Entry 0 covers the part of the file
        # that precedes the first <lexelt>.
        self._lexelt_starts = [0]  # list of streampos
        self._lexelts = [None]  # list of lexelt names

    def read_block(self, stream):
        """
        Read lines up to and including the next C{</instance>} line.

        @param stream: The stream to read from; must support C{tell()}
            and C{readline()}.
        @return: A one-element list holding the parsed
            L{SensevalInstance}, or an empty list at end of file.
        """
        # Decide which lexical element we're in.
        # NOTE(review): ``bisect`` is presumably brought into scope by one
        # of the module's star imports -- confirm.
        lexelt_num = bisect.bisect_right(self._lexelt_starts, stream.tell())-1
        lexelt = self._lexelts[lexelt_num]

        instance_lines = []
        in_instance = False
        while True:
            line = stream.readline()
            if line == '':
                # End of file: there must be no half-read instance.
                assert instance_lines == []
                return []

            # Start of a lexical element?
            if line.lstrip().startswith('<lexelt'):
                lexelt_num += 1
                m = re.search('item=("[^"]+"|\'[^\']+\')', line)
                assert m is not None  # <lexelt> has no 'item=...'
                # Drop the surrounding quote characters.
                lexelt = m.group(1)[1:-1]
                if lexelt_num < len(self._lexelts):
                    # Seen on an earlier pass; it must not have changed.
                    assert lexelt == self._lexelts[lexelt_num]
                else:
                    self._lexelts.append(lexelt)
                    self._lexelt_starts.append(stream.tell())

            # Start of an instance?
            if line.lstrip().startswith('<instance'):
                assert instance_lines == []
                in_instance = True

            # Body of an instance?
            if in_instance:
                instance_lines.append(line)

            # End of an instance?
            if line.lstrip().startswith('</instance'):
                # NOTE(review): lines from readline() presumably keep
                # their own newline, so this join doubles the line breaks;
                # that looks harmless because the text is later split on
                # whitespace -- confirm before changing.
                xml_block = '\n'.join(instance_lines)
                xml_block = _fixXML(xml_block)
                inst = ElementTree.fromstring(xml_block)
                return [self._parse_instance(inst, lexelt)]

    def _parse_instance(self, instance, lexelt):
        """
        Convert one parsed C{<instance>} element to a L{SensevalInstance}.

        The context is a flat list in which plain text contributes
        whitespace-separated strings and each C{<wf>} element contributes
        a C{(text, pos)} tuple.  C{position} is the index in that list of
        the token taken from C{<head>}, or C{None} if no head was found.

        @param instance: The C{<instance>} element.
        @param lexelt: The lexical element name, stored as the instance's
            C{word}.
        @return: The resulting L{SensevalInstance}.
        @raise AssertionError: If the element contains an unexpected tag,
            more than one head, or a malformed head.
        """
        senses = []
        context = []
        position = None
        for child in instance:
            if child.tag == 'answer':
                senses.append(child.attrib['senseid'])
            elif child.tag == 'context':
                # ElementTree leaves ``text`` as None when the element
                # starts directly with a child element.
                if child.text:
                    context += self._word_tokenizer.tokenize(child.text)
                for cword in child:
                    if cword.tag == 'compound':
                        # NOTE(review): only the first child of the
                        # compound is examined, and the tail handled below
                        # is that child's tail, not the compound's.
                        cword = cword[0]  # is this ok to do?

                    if cword.tag == 'head':
                        # A missing text node counts as empty text.
                        head_text = (cword.text or '').strip()
                        # Some sanity checks: exactly one of "literal
                        # text" and "a single child element" must hold.
                        assert position is None, 'head specified twice'
                        assert head_text or len(cword) == 1
                        assert not (head_text and len(cword) == 1)
                        # Record the position of the head:
                        position = len(context)
                        # Add on the head word itself:
                        if head_text:
                            context.append(head_text)
                        elif cword[0].tag == 'wf':
                            context.append((cword[0].text,
                                            cword[0].attrib['pos']))
                            if cword[0].tail:
                                context += self._word_tokenizer.tokenize(
                                    cword[0].tail)
                        else:
                            assert False, 'expected CDATA or wf in <head>'
                    elif cword.tag == 'wf':
                        context.append((cword.text, cword.attrib['pos']))
                    elif cword.tag == 's':
                        pass  # Sentence boundary marker.
                    else:
                        # Same output as the former ``print 'ACK', tag``,
                        # written so that it parses whether ``print`` is a
                        # statement or a function.
                        print('ACK %s' % cword.tag)
                        assert False, 'expected CDATA or <wf> or <head>'
                    if cword.tail:
                        context += self._word_tokenizer.tokenize(cword.tail)
            else:
                assert False, 'unexpected tag %s' % child.tag
        return SensevalInstance(lexelt, position, context, senses)
164
165 -def _fixXML(text):
166 """ 167 Fix the various issues with Senseval pseudo-XML. 168 """ 169 # <~> or <^> => ~ or ^ 170 text = re.sub(r'<([~\^])>', r'\1', text) 171 # fix lone & 172 text = re.sub(r'(\s+)\&(\s+)', r'\1&amp;\2', text) 173 # fix """ 174 text = re.sub(r'"""', '\'"\'', text) 175 # fix <s snum=dd> => <s snum="dd"/> 176 text = re.sub(r'(<[^<]*snum=)([^">]+)>', r'\1"\2"/>', text) 177 # fix foreign word tag 178 text = re.sub(r'<\&frasl>\s*<p[^>]*>', 'FRASL', text) 179 # remove <&I .> 180 text = re.sub(r'<\&I[^>]*>', '', text) 181 # fix <{word}> 182 text = re.sub(r'<{([^}]+)}>', r'\1', text) 183 # remove <@>, <p>, </p> 184 text = re.sub(r'<(@|/?p)>', r'', text) 185 # remove <&M .> and <&T .> and <&Ms .> 186 text = re.sub(r'<&\w+ \.>', r'', text) 187 # remove <!DOCTYPE... > lines 188 text = re.sub(r'<!DOCTYPE[^>]*>', r'', text) 189 # remove <[hi]> and <[/p]> etc 190 text = re.sub(r'<\[\/?[^>]+\]*>', r'', text) 191 # take the thing out of the brackets: <&hellip;> 192 text = re.sub(r'<(\&\w+;)>', r'\1', text) 193 # and remove the & for those patterns that aren't regular XML 194 text = re.sub(r'&(?!amp|gt|lt|apos|quot)', r'', text) 195 # fix 'abc <p="foo"/>' style tags - now <wf pos="foo">abc</wf> 196 text = re.sub(r'[ \t]*([^<>\s]+?)[ \t]*<p="([^"]*"?)"/>', 197 r' <wf pos="\2">\1</wf>', text) 198 text = re.sub(r'\s*"\s*<p=\'"\'/>', " <wf pos='\"'>\"</wf>", text) 199 return text
200