1
2
3
4
5
6
7
8 """
9 Corpus reader for the Recognizing Textual Entailment (RTE) Challenge Corpora.
10
11 The files were taken from the RTE1, RTE2 and RTE3 datasets and the files
12 were regularized.
13
14 Filenames are of the form rte*_dev.xml and rte*_test.xml. The latter are the
15 gold standard annotated files.
16
17 Each entailment corpus is a list of 'text'/'hypothesis' pairs. The following
18 example is taken from RTE3::
19
20 <pair id="1" entailment="YES" task="IE" length="short" >
21
22 <t>The sale was made to pay Yukos' US$ 27.5 billion tax bill,
23 Yuganskneftegaz was originally sold for US$ 9.4 billion to a little known
24 company Baikalfinansgroup which was later bought by the Russian
25 state-owned oil company Rosneft .</t>
26
27 <h>Baikalfinansgroup was sold to Rosneft.</h>
28 </pair>
29
30 In order to provide globally unique IDs for each pair, a new attribute
31 C{challenge} has been added to the root element C{entailment-corpus} of each
32 file, taking values 1, 2 or 3. The GID is formatted 'm-n', where 'm' is the
33 challenge number and 'n' is the pair ID.
34 """
35
36 from util import *
37 from api import *
38 from xmldocs import *
39
40
def norm(value_string):
    """
    Normalize the string value in an RTE pair's C{value} or C{entailment}
    attribute as an integer (1, 0).

    Matching is case-insensitive and ignores leading/trailing whitespace:
    C{TRUE} and C{YES} map to 1, C{FALSE} and C{NO} map to 0.

    @param value_string: the label used to classify a text/hypothesis pair
    @type value_string: C{str}
    @rtype: C{int}
    @raise KeyError: if the label is not one of TRUE, FALSE, YES or NO
    """
    valdict = {
        "TRUE": 1,
        "FALSE": 0,
        "YES": 1,
        "NO": 0,
    }
    # Attribute values copied out of hand-edited XML may carry stray
    # whitespace; strip it before the case-insensitive lookup.
    return valdict[value_string.strip().upper()]
56
# NOTE(review): the ``class`` line was absent from the reviewed text; the name
# is taken from the ``RTEPair(...)`` call in the corpus reader -- confirm the
# intended base class.
class RTEPair:
    """
    Container for RTE text-hypothesis pairs.

    The entailment relation is signalled by the C{value} attribute in RTE1, and by
    C{entailment} in RTE2 and RTE3. These both get mapped on to the C{value}
    attribute of this class.
    """

    def __init__(self, pair, challenge=None, id=None, text=None, hyp=None,
                 value=None, task=None, length=None):
        """
        Data found on C{pair} always takes precedence; the keyword arguments
        other than C{challenge} are only used as fallbacks when C{pair} does
        not supply the corresponding piece of information.

        @param pair: a parsed C{<pair>} element; its first child is read as
            the text and its second child as the hypothesis
        @param challenge: version of the RTE challenge (i.e., RTE1, RTE2 or RTE3)
        @param id: identifier for the pair, used if C{pair} has no C{id} attribute
        @param text: the text component of the pair, used if the first child
            of C{pair} has no text
        @param hyp: the hypothesis component of the pair, used if the second
            child of C{pair} has no text
        @param value: classification label for the pair, used if C{pair} has
            neither a C{value} nor an C{entailment} attribute
        @param task: attribute for the particular NLP task that the data was
            drawn from, used if C{pair} has no C{task} attribute
        @param length: attribute for the length of the text of the pair, used
            if C{pair} has no C{length} attribute
        @raise IndexError: if C{pair} has fewer than two children
        @raise KeyError: if the entailment label is not recognised by C{norm}
        """
        attrs = pair.attrib
        self.challenge = challenge
        self.id = attrs.get("id", id)
        # The global ID is always built, even when no challenge was given
        # (it then starts with "None-"); __repr__ checks for that case itself.
        self.gid = "%s-%s" % (self.challenge, self.id)

        text_elem, hyp_elem = pair[0], pair[1]
        self.text = text if text_elem.text is None else text_elem.text
        self.hyp = hyp if hyp_elem.text is None else hyp_elem.text

        # RTE1 stores the label under "value", RTE2/RTE3 under "entailment".
        if "value" in attrs:
            self.value = norm(attrs["value"])
        elif "entailment" in attrs:
            self.value = norm(attrs["entailment"])
        else:
            self.value = value
        self.task = attrs.get("task", task)
        self.length = attrs.get("length", length)

    # NOTE(review): the ``def`` line of this method was absent from the
    # reviewed text; ``__repr__`` is inferred from the '<RTEPair: ...>'
    # format of the returned strings -- confirm.
    def __repr__(self):
        """
        @return: a short tag showing the global ID when a challenge is set,
            and the plain pair ID otherwise
        @rtype: C{str}
        """
        if self.challenge:
            return '<RTEPair: gid=%s-%s>' % (self.challenge, self.id)
        else:
            return '<RTEPair: id=%s>' % self.id
102
103
105 """
106 Corpus reader for corpora in RTE challenges.
107
108 This is just a wrapper around the XMLCorpusReader. See module docstring above for the expected
109 structure of input documents.
110 """
111
def _read_etree(self, doc):
    """
    Map the XML input into a list of RTEPairs.

    Every C{<pair>} element found by iterating over C{doc} is wrapped in an
    L{RTEPair}, tagged with the C{challenge} attribute of C{doc} (C{None}
    when that attribute is missing).

    @param doc: a parsed XML document
    @rtype: C{list} of L{RTEPair}s
    """
    challenge = doc.attrib.get('challenge')
    # Element.getiterator() is deprecated and no longer exists on Python 3.9+;
    # use iter() where available and fall back for old ElementTree releases.
    iter_elements = getattr(doc, 'iter', None) or doc.getiterator
    return [RTEPair(pair, challenge=challenge)
            for pair in iter_elements("pair")]
128
129
def pairs(self, fileids):
    """
    Build a list of RTEPairs from a RTE corpus.

    @param fileids: a single RTE corpus fileid, or a list of them
    @type fileids: C{str} or C{list}
    @rtype: C{list} of L{RTEPair}s
    """
    try:
        string_types = basestring  # Python 2: matches both str and unicode
    except NameError:
        string_types = str  # basestring does not exist on Python 3
    # A lone fileid is treated as a one-element list.
    if isinstance(fileids, string_types):
        fileids = [fileids]
    return concat([self._read_etree(self.xml(fileid)) for fileid in fileids])
140