1
2
3
4
5
6
7
8 """
9 Corpus reader for corpora whose documents are xml files.
10
11 (note -- not named 'xml' to avoid conflicting w/ standard xml package)
12 """
13
14 import codecs
15
16
17 try: from xml.etree import cElementTree as ElementTree
18 except ImportError: from nltk.etree import ElementTree
19
20 from nltk.data import SeekableUnicodeStreamReader
21 from nltk.tokenize import WordPunctTokenizer
22 from nltk.internals import ElementWrapper
23
24 from nltk.corpus.reader.api import CorpusReader
25 from nltk.corpus.reader.util import *
26
28 """
29 Corpus reader for corpora whose documents are xml files.
30
31 Note that the C{XMLCorpusReader} constructor does not take an
32 C{encoding} argument, because the unicode encoding is specified by
33 the XML files themselves. See the XML specs for more info.
34 """
35 - def __init__(self, root, fileids, wrap_etree=False):
38
39 - def xml(self, fileid=None):
52
def words(self, fileid=None):
    """
    Tokenize the text nodes of a single XML file, ignoring all tags.

    The file is loaded with the xml() method, so C{fileid} can only
    specify one file.  Every node of the resulting tree is visited,
    and the C{text} attribute of each node (when it has one) is split
    into tokens with a C{WordPunctTokenizer}.

    @return: the given file's text nodes as a list of words and
        punctuation symbols
    @rtype: C{list} of C{str}
    """
    root = self.xml(fileid)
    tokenizer = WordPunctTokenizer()

    tokens = []
    for element in root.getiterator():
        content = element.text
        if content is None:
            # Nodes without any text contribute no tokens.
            continue
        tokens.extend(tokenizer.tokenize(content))
    return tokens
74
75 - def raw(self, fileids=None):
79
80
82 """
83 A corpus view that selects out specified elements from an XML
84 file, and provides a flat list-like interface for accessing them.
85 (Note: C{XMLCorpusView} is not used by L{XMLCorpusReader} itself,
86 but may be used by subclasses of L{XMLCorpusReader}.)
87
88 Every XML corpus view has a X{tag specification}, indicating what
89 XML elements should be included in the view; and each (non-nested)
90 element that matches this specification corresponds to one item in
91 the view. Tag specifications are regular expressions over tag
92 paths, where a tag path is a list of element tag names, separated
93 by '/', indicating the ancestry of the element. Some examples:
94
95 - C{'foo'}: A top-level element whose tag is C{foo}.
96 - C{'foo/bar'}: An element whose tag is C{bar} and whose parent
97 is a top-level element whose tag is C{foo}.
98 - C{'.*/foo'}: An element whose tag is C{foo}, appearing anywhere
99 in the xml tree.
100 - C{'.*/(foo|bar)'}: An element whose tag is C{foo} or C{bar},
101 appearing anywhere in the xml tree.
102
103 The view items are generated from the selected XML elements via
104 the method L{handle_elt()}. By default, this method returns the
105 element as-is (i.e., as an ElementTree object); but it can be
106 overridden, either via subclassing or via the C{elt_handler}
107 constructor parameter.
108 """
109
110
111
# If true, read_block() prints each XML piece it scans (together with
# the current tag context) and a note whenever it backtracks.
_DEBUG = False

# The size argument passed to stream.read() each time
# _read_xml_fragment() needs more data.
_BLOCK_SIZE = 1024
116
def __init__(self, fileid, tagspec, elt_handler=None):
    """
    Build a corpus view over the XML file C{fileid}.

    No C{encoding} argument is accepted: the encoding is worked out
    from the file itself by C{_detect_encoding()} and handed on to
    the C{StreamBackedCorpusView} constructor.

    @type tagspec: C{str}
    @param tagspec: A tag specification, indicating what XML
        elements should be included in the view.  Each non-nested
        element that matches this specification corresponds to one
        item in the view.

    @param elt_handler: A function used to transform each element
        to a value for the view.  If no handler is specified, then
        L{self.handle_elt()} is called, which returns the element
        as an ElementTree object.  The signature of elt_handler is::

            elt_handler(elt, tagspec) -> value
    """
    # A caller-supplied handler replaces handle_elt on this instance.
    if elt_handler:
        self.handle_elt = elt_handler

    # The tag specification for this corpus view, compiled with a
    # trailing \Z so that it has to match a tag path up to its end.
    self._tagspec = re.compile(tagspec + r'\Z')

    # A dictionary mapping from file positions (as returned by
    # stream.tell()) to XML contexts.  An XML context is a tuple of
    # XML tag names, indicating which tags have not yet been closed.
    # Position 0 starts out with the empty context.
    self._tag_context = {0: ()}

    detected = self._detect_encoding(fileid)
    StreamBackedCorpusView.__init__(self, fileid, encoding=detected)
151
def _detect_encoding(self, fileid):
    """
    Work out the character encoding of an XML file from its first line.

    The first line is checked for a unicode byte order mark and then
    for an C{encoding="..."} (or C{encoding='...'}) pseudo-attribute
    in the XML declaration.  If neither is present, C{'utf-8'} is
    returned.

    @param fileid: The file to inspect.  A C{PathPointer} is opened
        with its own C{open()} method; anything else is passed to the
        builtin C{open()} in binary mode.
    @return: The name of the detected encoding.
    @rtype: C{str}
    """
    if isinstance(fileid, PathPointer):
        infile = fileid.open()
    else:
        infile = open(fileid, 'rb')
    # Release the handle as soon as the first line has been read,
    # instead of leaving it open until it is garbage collected.
    # NOTE(review): assumes the stream returned by PathPointer.open()
    # provides close() -- confirm.
    try:
        s = infile.readline()
    finally:
        infile.close()

    # The UTF-32 marks have to be tested before the UTF-16 ones:
    # BOM_UTF32_LE begins with the same two bytes as BOM_UTF16_LE, so
    # testing UTF-16 first would misreport UTF-32-LE files.
    if s.startswith(codecs.BOM_UTF32_BE):
        return 'utf-32-be'
    if s.startswith(codecs.BOM_UTF32_LE):
        return 'utf-32-le'
    if s.startswith(codecs.BOM_UTF16_BE):
        return 'utf-16-be'
    if s.startswith(codecs.BOM_UTF16_LE):
        return 'utf-16-le'
    if s.startswith(codecs.BOM_UTF8):
        return 'utf-8'

    # Look for an XML declaration.  The '?' must be escaped; an
    # unescaped '<?' means "an optional '<'" and never matches the
    # literal '<?xml' that opens a declaration.
    m = re.match(r'\s*<\?xml\b.*\bencoding="([^"]+)"', s)
    if m:
        return m.group(1)
    m = re.match(r"\s*<\?xml\b.*\bencoding='([^']+)'", s)
    if m:
        return m.group(1)

    # Neither a byte order mark nor a declared encoding was found.
    return 'utf-8'
173
def handle_elt(self, elt, context):
    """
    Turn a selected element into the value that the view exposes.

    This default implementation hands C{elt} back untouched.  A
    subclass may override it, and the C{elt_handler} constructor
    argument replaces it on a single instance.

    @type elt: C{ElementTree}
    @param elt: The element that should be converted.

    @type context: C{str}
    @param context: A string composed of element tags separated by
        forward slashes, indicating the XML context of the given
        element.  For example, the string C{'foo/bar/baz'}
        indicates that the element is a C{baz} element whose
        parent is a C{bar} element and whose grandparent is a
        top-level C{foo} element.

    @return: The view value corresponding to C{elt}.
    """
    # The context is not needed by the default handler.
    return elt
195
196
197
# A regular expression that matches XML fragments that do not contain
# any un-closed tags; _read_xml_fragment() uses it to decide whether
# it has read enough.  The CDATA alternative escapes its square
# brackets (unescaped, '[CDATA[.*?]' is a character class) and
# includes the closing '>' of the ']]>' terminator.
_VALID_XML_RE = re.compile(r"""
    [^<]*
    (
      ((<!--.*?-->)                         |  # comment
       (<!\[CDATA\[.*?\]\]>)                |  # raw character data
       (<!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*>) |  # doctype decl
       (<[^>]*>))                              # tag or PI
      [^<]*)*
    \Z""",
    re.DOTALL|re.VERBOSE)
208
209
210
# A regular expression used to extract the tag name from a start tag,
# end tag, or empty-elt tag string.  It is written as a raw string so
# that '\s' reaches the regex engine unchanged, and '/' is excluded
# from the name so that '<br/>' yields 'br' rather than 'br/'.
_XML_TAG_NAME = re.compile(r'<\s*/?\s*([^\s>/]+)')
212
213
214
215
216
# A regular expression used to find all start-tags, end-tags, and
# empty-elt tags in an XML fragment.  Comments, CDATA sections,
# processing instructions and doctype declarations are matched too,
# so that markup-like text inside them is skipped rather than being
# mistaken for tags.  The square brackets of the CDATA delimiters are
# escaped; unescaped, '[CDATA[.*?]' would be a character class and
# real CDATA sections would never match.
_XML_PIECE = re.compile(r"""
    # Include these so we can skip them:
    (?P<COMMENT>        <!--.*?-->                          )|
    (?P<CDATA>          <!\[CDATA\[.*?\]\]>                 )|
    (?P<PI>             <\?.*?\?>                           )|
    (?P<DOCTYPE>        <!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*>  )|
    # These are the ones we actually care about:
    (?P<EMPTY_ELT_TAG>  <\s*[^>/\?!\s][^>]*/\s*>            )|
    (?P<START_TAG>      <\s*[^>/\?!\s][^>]*>                )|
    (?P<END_TAG>        <\s*/[^>/\?!\s][^>]*>               )""",
    re.DOTALL|re.VERBOSE)
228
def _read_xml_fragment(self, stream):
    """
    Read a string from the given stream that does not contain any
    un-closed tags.  In particular, this function first reads a
    block from the stream of size L{self._BLOCK_SIZE}.  It then
    checks if that block contains an un-closed tag.  If it does,
    then this function either backtracks to the last '<', or reads
    another block.

    @param stream: The stream to read from.  It is repositioned with
        C{seek()} when this function backtracks.
    @return: The text that was read, up to the point the stream is
        left at.
    @raise ValueError: If the first bracket in the text is a C{'>'},
        or if the stream ends while a tag is still open.
    """
    fragment = ''

    # Remember where the fragment starts *before* the first read.
    # last_open_bracket (below) is an offset into the whole
    # accumulated fragment, so it has to be applied to the position
    # of the fragment's first character, not to the start of the
    # most recently read block.
    if isinstance(stream, SeekableUnicodeStreamReader):
        startpos = stream.tell()

    while True:
        # Read a block and add it to the fragment.
        xml_block = stream.read(self._BLOCK_SIZE)
        fragment += xml_block

        # Do we have a fragment without any un-closed tags?
        if self._VALID_XML_RE.match(fragment):
            return fragment

        # Is the first bracket a stray '>'?  (The fragment failed the
        # check above, so it contains at least one '<' and the search
        # cannot come back empty.)
        first_bracket = re.search('[<>]', fragment)
        if first_bracket.group(0) == '>':
            pos = stream.tell() - (
                len(fragment)-first_bracket.end())
            raise ValueError('Unexpected ">" near char %s' % pos)

        # Nothing more to read, and a tag is still open.
        if not xml_block:
            raise ValueError('Unexpected end of file: tag not closed')

        # Otherwise the fragment stops part-way through some markup.
        # If everything before the last '<' is acceptable, return
        # that much and rewind the stream to the '<'.
        last_open_bracket = fragment.rfind('<')
        if last_open_bracket > 0:
            if self._VALID_XML_RE.match(fragment[:last_open_bracket]):
                if isinstance(stream, SeekableUnicodeStreamReader):
                    stream.seek(startpos)
                    stream.char_seek_forward(last_open_bracket)
                else:
                    stream.seek(-(len(fragment)-last_open_bracket), 1)
                return fragment[:last_open_bracket]

        # No usable prefix yet: go round again and read another block.
273
274
275
276
def read_block(self, stream, tagspec=None, elt_handler=None):
    """
    Read from C{stream} until we find at least one element that
    matches C{tagspec}, and return the result of applying
    C{elt_handler} to each element found.

    @param stream: The stream to read from.  Its current position
        must already have an entry in C{self._tag_context}.
    @param tagspec: The tag specification that tag paths are matched
        against; defaults to C{self._tagspec}.
    @param elt_handler: Called as C{elt_handler(elt, context)} for
        each matching element, where C{elt} is the parsed element
        and C{context} is its slash-separated tag path; defaults to
        C{self.handle_elt}.
    @return: A list of the handler's return values.  The list is
        empty if the end of the stream is reached before any
        matching element starts.
    @raise ValueError: If an end tag does not match the innermost
        open tag, or if the stream ends inside a matching element.
    """
    if tagspec is None:
        tagspec = self._tagspec
    if elt_handler is None:
        elt_handler = self.handle_elt

    # Use a stack of strings to keep track of our context.  The
    # lookup is checked *before* it is copied into a list, so an
    # unknown position trips the assertion instead of making list()
    # fail on None.
    context = self._tag_context.get(stream.tell())
    assert context is not None
    context = list(context)

    elts = []            # (element text, tag path) pairs found so far

    elt_start = None     # offset in xml_fragment where the open elt starts
    elt_depth = None     # len(context) at which the open elt started
    elt_text = ''        # text of the open elt taken from earlier fragments

    while elts==[] or elt_start is not None:
        if isinstance(stream, SeekableUnicodeStreamReader):
            startpos = stream.tell()
        xml_fragment = self._read_xml_fragment(stream)

        # End of file.
        if not xml_fragment:
            if elt_start is None:
                break
            else:
                raise ValueError('Unexpected end of file')

        # Process each piece of markup in the xml fragment.
        for piece in self._XML_PIECE.finditer(xml_fragment):
            if self._DEBUG:
                print('%25s %s' % ('/'.join(context)[-20:], piece.group()))

            if piece.group('START_TAG'):
                name = self._XML_TAG_NAME.match(piece.group()).group(1)
                # Keep context up-to-date.
                context.append(name)
                # Is this one of the elts we're looking for?
                if elt_start is None:
                    if re.match(tagspec, '/'.join(context)):
                        elt_start = piece.start()
                        elt_depth = len(context)

            elif piece.group('END_TAG'):
                name = self._XML_TAG_NAME.match(piece.group()).group(1)
                # Sanity checks: the end tag must close the innermost
                # open tag.
                if not context:
                    raise ValueError('Unmatched tag </%s>' % name)
                if name != context[-1]:
                    raise ValueError('Unmatched tag <%s>...</%s>' %
                                     (context[-1], name))
                # Is this the end of the element we are collecting?
                if elt_start is not None and elt_depth == len(context):
                    elt_text += xml_fragment[elt_start:piece.end()]
                    elts.append( (elt_text, '/'.join(context)) )
                    elt_start = elt_depth = None
                    elt_text = ''
                # Keep context up-to-date.
                context.pop()

            elif piece.group('EMPTY_ELT_TAG'):
                name = self._XML_TAG_NAME.match(piece.group()).group(1)
                # An empty element never enters the context; it is
                # only tested (and collected) when it is not nested
                # inside an element that is already being collected.
                if elt_start is None:
                    if re.match(tagspec, '/'.join(context)+'/'+name):
                        elts.append((piece.group(),
                                     '/'.join(context)+'/'+name))

        if elt_start is not None:
            # The fragment ended inside a matching element.
            if elts == []:
                # Nothing has been completed yet: keep the partial
                # text and carry on with the next fragment, where the
                # element continues from offset 0.
                elt_text += xml_fragment[elt_start:]
                elt_start = 0

            else:
                # At least one element is complete: move the stream
                # back to the start of the unfinished element, drop
                # it (and anything nested in it) from the context,
                # and return what has been collected so far.
                if self._DEBUG:
                    print(' '*36+'(backtrack)')
                if isinstance(stream, SeekableUnicodeStreamReader):
                    stream.seek(startpos)
                    stream.char_seek_forward(elt_start)
                else:
                    stream.seek(-(len(xml_fragment)-elt_start), 1)
                context = context[:elt_depth-1]
                elt_start = elt_depth = None
                elt_text = ''

    # Record the context at the position where the stream now is, or
    # check it against what was recorded for that position before.
    pos = stream.tell()
    if pos in self._tag_context:
        assert tuple(context) == self._tag_context[pos]
    else:
        self._tag_context[pos] = tuple(context)

    # Parse each element's text and pass it through the handler.  The
    # text is encoded as ascii with xml character references for
    # anything outside that range before it is parsed.
    return [elt_handler(ElementTree.fromstring(
                            elt.encode('ascii', 'xmlcharrefreplace')),
                        context)
            for (elt, context) in elts]
380