Package nltk :: Package corpus :: Package reader :: Module rte
[hide private]
[frames | no frames]

Source Code for Module nltk.corpus.reader.rte

  1  # Natural Language Toolkit: RTE Corpus Reader 
  2  # 
  3  # Copyright (C) 2001-2011 NLTK Project 
  4  # Author:  Ewan Klein <ewan@inf.ed.ac.uk> 
  5  # URL: <http://www.nltk.org/> 
  6  # For license information, see LICENSE.TXT 
  7   
  8  """ 
  9  Corpus reader for the Recognizing Textual Entailment (RTE) Challenge Corpora. 
 10   
 11  The files were taken from the RTE1, RTE2 and RTE3 datasets and the files 
 12  were regularized.  
 13   
 14  Filenames are of the form rte*_dev.xml and rte*_test.xml. The latter are the 
 15  gold standard annotated files. 
 16   
 17  Each entailment corpus is a list of 'text'/'hypothesis' pairs. The following 
 18  example is taken from RTE3:: 
 19   
 20   <pair id="1" entailment="YES" task="IE" length="short" > 
 21    
 22      <t>The sale was made to pay Yukos' US$ 27.5 billion tax bill, 
 23      Yuganskneftegaz was originally sold for US$ 9.4 billion to a little known 
 24      company Baikalfinansgroup which was later bought by the Russian 
 25      state-owned oil company Rosneft .</t> 
 26       
 27     <h>Baikalfinansgroup was sold to Rosneft.</h> 
 28   </pair> 
 29   
 30  In order to provide globally unique IDs for each pair, a new attribute 
 31  C{challenge} has been added to the root element C{entailment-corpus} of each 
 32  file, taking values 1, 2 or 3. The GID is formatted 'm-n', where 'm' is the 
 33  challenge number and 'n' is the pair ID. 
 34  """ 
 35   
 36  from util import * 
 37  from api import * 
 38  from xmldocs import * 
 39   
 40   
def norm(value_string):
    """
    Normalize the string value in an RTE pair's C{value} or C{entailment}
    attribute as an integer (1, 0).

    Matching is case-insensitive and ignores surrounding whitespace:
    C{"TRUE"} and C{"YES"} map to 1, C{"FALSE"} and C{"NO"} map to 0.

    @param value_string: the label used to classify a text/hypothesis pair
    @type value_string: C{str}
    @rtype: C{int}
    @raise KeyError: if the label is not one of the four recognised values
    """
    valdict = {"TRUE": 1,
               "FALSE": 0,
               "YES": 1,
               "NO": 0}
    # Trim before upper-casing so that a padded attribute value such as
    # " yes" is treated exactly like "YES" instead of raising KeyError.
    return valdict[value_string.strip().upper()]
56
class RTEPair(object):
    """
    Container for RTE text-hypothesis pairs.

    The entailment relation is signalled by the C{value} attribute in RTE1, and by
    C{entailment} in RTE2 and RTE3. These both get mapped on to the C{value}
    attribute of this class (normalized to 1 or 0 by L{norm}).
    """

    def __init__(self, pair, challenge=None, id=None, text=None, hyp=None,
                 value=None, task=None, length=None):
        """
        Build a pair from a parsed C{<pair>} element.

        Information found on the element always takes precedence; the
        keyword arguments C{id}, C{text}, C{hyp}, C{value}, C{task} and
        C{length} are only used as fallbacks when the element does not
        supply the corresponding attribute or child.

        @param pair: the C{<pair>} element; its C{attrib} mapping and its
            first two children (text, then hypothesis) are read
        @param challenge: version of the RTE challenge (i.e., RTE1, RTE2 or RTE3)
        @param id: identifier for the pair
        @param text: the text component of the pair
        @param hyp: the hypothesis component of the pair
        @param value: classification label for the pair
        @param task: attribute for the particular NLP task that the data was drawn from
        @param length: attribute for the length of the text of the pair
        """
        attrib = pair.attrib
        self.challenge = challenge
        self.id = attrib.get("id", id)
        # The GID is always 'm-n'; note that it is built even when no
        # challenge number is known, in which case 'm' is the string "None".
        self.gid = "%s-%s" % (self.challenge, self.id)
        # First child holds the text, second child the hypothesis.
        self.text = pair[0].text if len(pair) > 0 else text
        self.hyp = pair[1].text if len(pair) > 1 else hyp

        # RTE1 labels the pair with "value", RTE2/RTE3 with "entailment";
        # "value" wins if both are present.
        if "value" in attrib:
            self.value = norm(attrib["value"])
        elif "entailment" in attrib:
            self.value = norm(attrib["entailment"])
        else:
            self.value = value
        self.task = attrib.get("task", task)
        self.length = attrib.get("length", length)

    def __repr__(self):
        """
        Return C{<RTEPair: gid=m-n>} when the challenge is known (truthy),
        and C{<RTEPair: id=n>} otherwise.
        """
        if self.challenge:
            return '<RTEPair: gid=%s-%s>' % (self.challenge, self.id)
        else:
            return '<RTEPair: id=%s>' % self.id
102 103
class RTECorpusReader(XMLCorpusReader):
    """
    Corpus reader for corpora in RTE challenges.

    This is just a wrapper around the XMLCorpusReader. See module docstring above for the expected
    structure of input documents.
    """

    def _read_etree(self, doc):
        """
        Map the XML input into a list of RTEPairs.

        This uses the C{iter()} method from the ElementTree package (or
        the older C{getiterator()} where C{iter()} is unavailable) to find
        all the C{<pair>} elements.

        @param doc: a parsed XML document; its C{challenge} attribute, if
            present, is passed on to every pair
        @rtype: C{list} of L{RTEPair}s
        """
        # A missing 'challenge' attribute simply yields pairs without a
        # challenge number.
        challenge = doc.attrib.get('challenge')
        # getiterator() is deprecated in favour of iter() and is absent from
        # recent ElementTree releases; keep it only as a fallback for
        # ElementTree versions that predate iter().
        find_all = getattr(doc, 'iter', None) or doc.getiterator
        return [RTEPair(pair, challenge=challenge)
                for pair in find_all("pair")]

    def pairs(self, fileids):
        """
        Build a list of RTEPairs from a RTE corpus.

        @param fileids: a list of RTE corpus fileids; a single fileid
            given as a string is also accepted
        @type fileids: C{list}
        @rtype: C{list} of L{RTEPair}s
        """
        # Accept a bare fileid by wrapping it in a one-element list.
        if isinstance(fileids, basestring):
            fileids = [fileids]
        # One list of pairs per file, combined with concat().
        return concat([self._read_etree(self.xml(fileid))
                       for fileid in fileids])
140