1
2
3
4
5
6
7
8 """
9 Corpus reader for the XML version of the British National Corpus.
10 """
11 __docformat__ = 'epytext en'
12
13 import re
14
15 import nltk.etree.ElementTree as ET
16
17 from api import *
18 from util import *
19 from xmldocs import *
20
22 """
23 Corpus reader for the XML version of the British National Corpus.
24 For access to the complete XML data structure, use the L{xml()}
25 method. For access to simple word lists and tagged word lists, use
26 L{words()}, L{sents()}, L{tagged_words()}, and L{tagged_sents()}.
27 """
28 - def __init__(self, root, fileids, lazy=True):
31
def words(self, fileids=None, strip_space=True, stem=False):
    """
    Read the given file(s) as one flat sequence of words and
    punctuation symbols.

    @param fileids: Forwarded to C{self.abspaths()} to pick the
        files that are read.
    @param strip_space: If true, then strip spaces from word tokens.
        Otherwise, leave the spaces on the tokens.
    @param stem: If true, then use word stems instead of word strings.
    @return: the per-file results joined with C{concat}: one
        L{BNCWordView} per file when the reader is lazy, otherwise
        one list built by L{_words()} per file.
    @rtype: C{list} of C{str}
    """
    # Lazy readers stream through a corpus view; eager readers parse
    # each file completely.
    if self._lazy:
        build = BNCWordView
    else:
        build = self._words
    # False: no sentence bracketing.  None: no tags.
    return concat([build(path, False, None, strip_space, stem)
                   for path in self.abspaths(fileids)])
50
def tagged_words(self, fileids=None, c5=False, strip_space=True, stem=False):
    """
    Read the given file(s) as one flat sequence of tagged words and
    punctuation symbols, each encoded as a C{(word,tag)} tuple.

    @param fileids: Forwarded to C{self.abspaths()} to pick the
        files that are read.
    @param c5: If true, then the tags used will be the more detailed
        c5 tags.  Otherwise, the simplified tags will be used.
    @param strip_space: If true, then strip spaces from word tokens.
        Otherwise, leave the spaces on the tokens.
    @param stem: If true, then use word stems instead of word strings.
    @return: the per-file results joined with C{concat}: one
        L{BNCWordView} per file when the reader is lazy, otherwise
        one list built by L{_words()} per file.
    @rtype: C{list} of C{(str,str)}
    """
    # Name of the XML attribute the tag is read from.
    tagset = 'c5' if c5 else 'pos'
    if self._lazy:
        build = BNCWordView
    else:
        build = self._words
    # False: no sentence bracketing.
    return concat([build(path, False, tagset, strip_space, stem)
                   for path in self.abspaths(fileids)])
72
def sents(self, fileids=None, strip_space=True, stem=False):
    """
    Read the given file(s) as a sequence of sentences or utterances,
    each encoded as a list of word strings.

    @param fileids: Forwarded to C{self.abspaths()} to pick the
        files that are read.
    @param strip_space: If true, then strip spaces from word tokens.
        Otherwise, leave the spaces on the tokens.
    @param stem: If true, then use word stems instead of word strings.
    @return: the per-file results joined with C{concat}: one
        L{BNCWordView} per file when the reader is lazy, otherwise
        one list built by L{_words()} per file.
    @rtype: C{list} of (C{list} of C{str})
    """
    if self._lazy:
        build = BNCWordView
    else:
        build = self._words
    # True: keep sentence bracketing.  None: no tags.
    return concat([build(path, True, None, strip_space, stem)
                   for path in self.abspaths(fileids)])
90
def tagged_sents(self, fileids=None, c5=False, strip_space=True,
                 stem=False):
    """
    Read the given file(s) as a sequence of sentences, each encoded
    as a list of C{(word,tag)} tuples.

    @param fileids: Forwarded to C{self.abspaths()} to pick the
        files that are read.
    @param c5: If true, then the tags used will be the more detailed
        c5 tags.  Otherwise, the simplified tags will be used.
    @param strip_space: If true, then strip spaces from word tokens.
        Otherwise, leave the spaces on the tokens.
    @param stem: If true, then use word stems instead of word strings.
    @return: the per-file results joined with C{concat}: one
        L{BNCWordView} per file when the reader is lazy, otherwise
        one list built by L{_words()} per file.
    @rtype: C{list} of (C{list} of C{(str,str)})
    """
    # Name of the XML attribute the tag is read from.
    tagset = 'c5' if c5 else 'pos'
    if self._lazy:
        build = BNCWordView
    else:
        build = self._words
    # True: keep sentence bracketing.
    return concat([build(path, True, tagset, strip_space, stem)
                   for path in self.abspaths(fileids)])
112
def _words(self, fileid, bracket_sent, tag, strip_space, stem):
    """
    Helper used to implement the non-lazy view methods: parse the
    whole file and return a list of words or a list of sentences,
    optionally tagged.

    @param fileid: The name of the underlying file.
    @param bracket_sent: If true, return one L{BNCSentence} per C{s}
        element; otherwise return a single flat list of tokens.
    @param tag: The name of the tagset to use (C{'c5'} or C{'pos'}),
        or None for no tags.  With C{'pos'}, the C{c5} attribute is
        used as a fallback when an element has no C{pos} attribute.
    @param strip_space: If true, strip spaces from word tokens.
    @param stem: If true, then substitute stems (the C{hw} attribute,
        when present) for words.  Stemming also strips the token.
    @return: a list of tokens or of L{BNCSentence} objects.  Tokens
        are strings, or C{(word,tag)} tuples when C{tag} is given.
    @raise KeyError: if C{bracket_sent} is true and an C{s} element
        has no C{n} attribute.
    """
    # Loop-invariant: a token is stripped if either flag is set.
    strip = strip_space or stem

    result = []
    xmldoc = ElementTree.parse(fileid).getroot()
    for xmlsent in xmldoc.findall('.//s'):
        sent = []
        for xmlword in _all_xmlwords_in(xmlsent):
            # Elements without text yield an empty token, not None.
            word = xmlword.text or ""
            if strip:
                word = word.strip()
            if stem:
                word = xmlword.get('hw', word)
            if tag == 'c5':
                word = (word, xmlword.get('c5'))
            elif tag == 'pos':
                word = (word, xmlword.get('pos', xmlword.get('c5')))
            sent.append(word)
        if bracket_sent:
            result.append(BNCSentence(xmlsent.attrib['n'], sent))
        else:
            result.extend(sent)

    # Every item added above is a string, a tuple or a BNCSentence,
    # so the result can never contain None; no O(n) re-scan is needed.
    return result
147
154
156 """
157 A list of words, augmented by an attribute C{num} used to record
158 the sentence identifier (the C{n} attribute from the XML).
159 """
163
165 """
166 A stream backed corpus view specialized for use with the BNC corpus.
167 """
def __init__(self, fileid, sent, tag, strip_space, stem):
    """
    Create a corpus view over one BNC XML file and pre-read its
    header.

    @param fileid: The name of the underlying file.
    @param sent: If true, include sentence bracketing.
    @param tag: The name of the tagset to use, or None for no tags.
    @param strip_space: If true, strip spaces from word tokens.
    @param stem: If true, then substitute stems for words.
    """
    # With sentence bracketing the view matches whole C{s} elements;
    # otherwise it matches the C{c}/C{w} elements at any depth below
    # an C{s} element.
    tagspec = '.*/s' if sent else '.*/s/(.*/)?(c|w)'
    self._sent = sent
    self._tag = tag
    self._strip_space = strip_space
    self._stem = stem

    XMLCorpusView.__init__(self, fileid, tagspec)

    # Read the teiHeader block once, handing it to handle_header.
    # The stream is closed even if reading or the handler raises, so
    # a failed construction does not leave the file open.
    self._open()
    try:
        self.read_block(self._stream, '.*/teiHeader$', self.handle_header)
    finally:
        self.close()

    # Reset the tag context after the header pre-read.
    # NOTE(review): presumably XMLCorpusView keeps per-position
    # element context here -- confirm against XMLCorpusView.
    self._tag_context = {0: ()}
192
193
194 title = None
195 author = None
196 editor = None
197 resps = None
198
217
221
223 word = elt.text
224 if not word:
225 word = ""
226 if self._strip_space or self._stem:
227 word = word.strip()
228 if self._stem:
229 word = elt.get('hw', word)
230 if self._tag == 'c5':
231 word = (word, elt.get('c5'))
232 elif self._tag == 'pos':
233 word = (word, elt.get('pos', elt.get('c5')))
234 return word
235
246