Package nltk :: Package corpus :: Package reader :: Module xmldocs
[hide private]
[frames | no frames]

Source Code for Module nltk.corpus.reader.xmldocs

  1  # Natural Language Toolkit: XML Corpus Reader 
  2  # 
  3  # Copyright (C) 2001-2011 NLTK Project 
  4  # Author: Steven Bird <sb@csse.unimelb.edu.au> 
  5  # URL: <http://www.nltk.org/> 
  6  # For license information, see LICENSE.TXT 
  7   
  8  """ 
  9  Corpus reader for corpora whose documents are xml files. 
 10   
 11  (note -- not named 'xml' to avoid conflicting w/ standard xml package) 
 12  """ 
 13   
 14  import codecs 
 15   
 16  # Use the c version of ElementTree, which is faster, if possible: 
 17  try: from xml.etree import cElementTree as ElementTree 
 18  except ImportError: from nltk.etree import ElementTree 
 19   
 20  from nltk.data import SeekableUnicodeStreamReader 
 21  from nltk.tokenize import WordPunctTokenizer 
 22  from nltk.internals import ElementWrapper 
 23   
 24  from nltk.corpus.reader.api import CorpusReader 
 25  from nltk.corpus.reader.util import * 
 26   
class XMLCorpusReader(CorpusReader):
    """
    Corpus reader for corpora whose documents are xml files.

    Note that the C{XMLCorpusReader} constructor does not take an
    C{encoding} argument, because the unicode encoding is specified by
    the XML files themselves.  See the XML specs for more info.
    """

    def __init__(self, root, fileids, wrap_etree=False):
        """
        Create a new XML corpus reader.

        @param root: Passed through unchanged to
            C{CorpusReader.__init__}.
        @param fileids: Passed through unchanged to
            C{CorpusReader.__init__}.
        @param wrap_etree: If true, then the element returned by
            L{xml()} is wrapped in an C{ElementWrapper}.
        """
        self._wrap_etree = wrap_etree
        CorpusReader.__init__(self, root, fileids)

    def xml(self, fileid=None):
        """
        Parse a single file of the corpus and return its root element.

        @param fileid: The identifier of the file to parse.  It may be
            omitted only if the corpus contains exactly one file.
        @return: The root element of the parsed document (wrapped in
            an C{ElementWrapper} if C{wrap_etree} was requested).
        @raise TypeError: If C{fileid} is not a single string.
        """
        # Make sure we have exactly one file -- no concatenating XML.
        if fileid is None and len(self._fileids) == 1:
            fileid = self._fileids[0]
        if not isinstance(fileid, basestring):
            raise TypeError('Expected a single file identifier string')
        # Read the XML in using ElementTree.  The stream is closed
        # explicitly so that the file handle is not leaked, even if
        # parsing fails.
        stream = self.abspath(fileid).open()
        try:
            elt = ElementTree.parse(stream).getroot()
        finally:
            stream.close()
        # If requested, wrap it.
        if self._wrap_etree:
            elt = ElementWrapper(elt)
        # Return the ElementTree element.
        return elt

    def words(self, fileid=None):
        """
        Returns all of the words and punctuation symbols in the specified file
        that were in text nodes -- ie, tags are ignored.  Like the xml() method,
        fileid can only specify one file.

        Only the C{text} attribute of each element is tokenized; text
        that follows a child element's end tag (the element's C{tail})
        is not included.

        @return: the given file's text nodes as a list of words and punctuation symbols
        @rtype: C{list} of C{str}
        """
        elt = self.xml(fileid)
        word_tokenizer = WordPunctTokenizer()
        # Prefer getiterator() (the original behaviour); fall back to
        # iter() for ElementTree versions where getiterator() no
        # longer exists.
        walk = getattr(elt, 'getiterator', None)
        if walk is None:
            walk = elt.iter

        out = []
        for node in walk():
            text = node.text
            if text is not None:
                out.extend(word_tokenizer.tokenize(text))
        return out

    def raw(self, fileids=None):
        """
        Return the concatenated, unparsed contents of the given files.

        @param fileids: A single file identifier, a list of file
            identifiers, or C{None} to use every file in the corpus.
        @return: The result of C{concat()} applied to the contents of
            the files, in order.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, basestring):
            fileids = [fileids]

        contents = []
        for f in fileids:
            stream = self.open(f)
            # Close each stream as soon as it has been read, rather
            # than leaving it to the garbage collector.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)
79 80
class XMLCorpusView(StreamBackedCorpusView):
    """
    A corpus view that selects out specified elements from an XML
    file, and provides a flat list-like interface for accessing them.
    (Note: C{XMLCorpusView} is not used by L{XMLCorpusReader} itself,
    but may be used by subclasses of L{XMLCorpusReader}.)

    Every XML corpus view has a X{tag specification}, indicating what
    XML elements should be included in the view; and each (non-nested)
    element that matches this specification corresponds to one item in
    the view.  Tag specifications are regular expressions over tag
    paths, where a tag path is a list of element tag names, separated
    by '/', indicating the ancestry of the element.  Some examples:

      - C{'foo'}: A top-level element whose tag is C{foo}.
      - C{'foo/bar'}: An element whose tag is C{bar} and whose parent
        is a top-level element whose tag is C{foo}.
      - C{'.*/foo'}: An element whose tag is C{foo}, appearing anywhere
        in the xml tree.
      - C{'.*/(foo|bar)'}: An element whose tag is C{foo} or C{bar},
        appearing anywhere in the xml tree.

    The view items are generated from the selected XML elements via
    the method L{handle_elt()}.  By default, this method returns the
    element as-is (i.e., as an ElementTree object); but it can be
    overridden, either via subclassing or via the C{elt_handler}
    constructor parameter.
    """

    #: If true, then display debugging output to stdout when reading
    #: blocks.
    _DEBUG = False

    #: The number of characters read at a time by this corpus reader.
    _BLOCK_SIZE = 1024

    def __init__(self, fileid, tagspec, elt_handler=None):
        """
        Create a new corpus view based on a specified XML file.

        Note that the C{XMLCorpusView} constructor does not take an
        C{encoding} argument, because the unicode encoding is
        specified by the XML files themselves.

        @type tagspec: C{str}
        @param tagspec: A tag specification, indicating what XML
            elements should be included in the view.  Each non-nested
            element that matches this specification corresponds to one
            item in the view.

        @param elt_handler: A function used to transform each element
            to a value for the view.  If no handler is specified, then
            L{self.handle_elt()} is called, which returns the element
            as an ElementTree object.  The signature of elt_handler is::

                elt_handler(elt, context) -> value
        """
        if elt_handler: self.handle_elt = elt_handler

        #: The tag specification for this corpus view (a compiled
        #: regular expression anchored at the end of the tag path).
        self._tagspec = re.compile(tagspec+r'\Z')

        #: A dictionary mapping from file positions (as returned by
        #: C{stream.tell()}) to XML contexts.  An XML context is a
        #: tuple of XML tag names, indicating which tags have not yet
        #: been closed.
        self._tag_context = {0: ()}

        encoding = self._detect_encoding(fileid)
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)

    def _detect_encoding(self, fileid):
        """
        Guess the encoding of an XML file from its first line.

        A byte-order mark takes precedence; otherwise the C{encoding}
        pseudo-attribute of the XML declaration is used; otherwise
        C{'utf-8'} is returned.

        @param fileid: Either a C{PathPointer} or a filename.
        @return: The name of the detected encoding.
        """
        if isinstance(fileid, PathPointer):
            stream = fileid.open()
        else:
            stream = open(fileid, 'rb')
        # Only the first line is needed; close the file right away
        # instead of leaking the handle.
        try:
            s = stream.readline()
        finally:
            stream.close()

        # The UTF-32 marks must be tested before the UTF-16 marks:
        # BOM_UTF32_LE begins with the same two bytes as BOM_UTF16_LE,
        # so testing UTF-16 first would misreport UTF-32-LE files.
        bom_encodings = (
            (codecs.BOM_UTF32_BE, 'utf-32-be'),
            (codecs.BOM_UTF32_LE, 'utf-32-le'),
            (codecs.BOM_UTF16_BE, 'utf-16-be'),
            (codecs.BOM_UTF16_LE, 'utf-16-le'),
            (codecs.BOM_UTF8, 'utf-8'),
        )
        for bom, encoding in bom_encodings:
            if s.startswith(bom):
                return encoding

        # Look for an encoding declaration.  The '?' of '<?xml' must
        # be escaped; unescaped it makes the '<' optional and the
        # pattern can never match a real XML declaration.
        m = re.match(r'\s*<\?xml\b.*\bencoding="([^"]+)"', s)
        if m: return m.group(1)
        m = re.match(r"\s*<\?xml\b.*\bencoding='([^']+)'", s)
        if m: return m.group(1)
        # No encoding found -- what should the default be?
        return 'utf-8'

    def handle_elt(self, elt, context):
        """
        Convert an element into an appropriate value for inclusion in
        the view.  Unless overridden by a subclass or by the
        C{elt_handler} constructor argument, this method simply
        returns C{elt}.

        @return: The view value corresponding to C{elt}.

        @type elt: C{ElementTree}
        @param elt: The element that should be converted.

        @type context: C{str}
        @param context: A string composed of element tags separated by
            forward slashes, indicating the XML context of the given
            element.  For example, the string C{'foo/bar/baz'}
            indicates that the element is a C{baz} element whose
            parent is a C{bar} element and whose grandparent is a
            top-level C{foo} element.
        """
        return elt

    #: A regular expression that matches XML fragments that do not
    #: contain any un-closed tags.
    _VALID_XML_RE = re.compile(r"""
        [^<]*
        (
          ((<!--.*?-->)                         |  # comment
           (<!\[CDATA\[.*?\]\]>)                |  # raw character data
           (<!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*>) |  # doctype decl
           (<[^>]*>))                              # tag or PI
          [^<]*)*
        \Z""",
        re.DOTALL|re.VERBOSE)

    #: A regular expression used to extract the tag name from a start tag,
    #: end tag, or empty-elt tag string.  The '/' is excluded from the
    #: name so that C{<foo/>} yields C{foo} rather than C{foo/}.
    _XML_TAG_NAME = re.compile(r'<\s*/?\s*([^\s/>]+)')

    #: A regular expression used to find all start-tags, end-tags, and
    #: empty-elt tags in an XML file.  This regexp is more lenient than
    #: the XML spec -- e.g., it allows spaces in some places where the
    #: spec does not.
    _XML_PIECE = re.compile(r"""
        # Include these so we can skip them:
        (?P<COMMENT>        <!--.*?-->                          )|
        (?P<CDATA>          <!\[CDATA\[.*?\]\]>                 )|
        (?P<PI>             <\?.*?\?>                           )|
        (?P<DOCTYPE>        <!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*>  )|
        # These are the ones we actually care about:
        (?P<EMPTY_ELT_TAG>  <\s*[^>/\?!\s][^>]*/\s*>            )|
        (?P<START_TAG>      <\s*[^>/\?!\s][^>]*>                )|
        (?P<END_TAG>        <\s*/[^>/\?!\s][^>]*>               )""",
        re.DOTALL|re.VERBOSE)

    def _read_xml_fragment(self, stream):
        """
        Read a string from the given stream that does not contain any
        un-closed tags.  In particular, this function first reads a
        block from the stream of size L{self._BLOCK_SIZE}.  It then
        checks if that block contains an un-closed tag.  If it does,
        then this function either backtracks to the last '<', or reads
        another block.

        @return: The fragment that was read (the empty string at end
            of file).
        @raise ValueError: If a '>' appears before any '<' in a
            fragment that is not well-formed, or if the end of the
            file is reached inside a tag.
        """
        fragment = ''

        # Remember where the *fragment* starts.  This must be recorded
        # once, before the loop: the backtracking offset computed
        # below is relative to the start of the whole fragment, not to
        # the start of the most recently read block.
        seekable = isinstance(stream, SeekableUnicodeStreamReader)
        if seekable:
            startpos = stream.tell()

        while True:
            # Read a block and add it to the fragment.
            xml_block = stream.read(self._BLOCK_SIZE)
            fragment += xml_block

            # Do we have a well-formed xml fragment?
            if self._VALID_XML_RE.match(fragment):
                return fragment

            # Do we have a fragment that will never be well-formed?
            # (The fragment must contain a '<' here, since a fragment
            # without one always matches _VALID_XML_RE.)
            first_bracket = re.search('[<>]', fragment)
            if first_bracket.group(0) == '>':
                pos = stream.tell() - (
                    len(fragment)-first_bracket.end())
                raise ValueError('Unexpected ">" near char %s' % pos)

            # End of file?
            if not xml_block:
                raise ValueError('Unexpected end of file: tag not closed')

            # If not, then we must be in the middle of a <..tag..>.
            # If appropriate, backtrack to the most recent '<'
            # character.
            last_open_bracket = fragment.rfind('<')
            if last_open_bracket > 0:
                if self._VALID_XML_RE.match(fragment[:last_open_bracket]):
                    if seekable:
                        stream.seek(startpos)
                        stream.char_seek_forward(last_open_bracket)
                    else:
                        stream.seek(-(len(fragment)-last_open_bracket), 1)
                    return fragment[:last_open_bracket]

            # Otherwise, read another block.  (i.e., return to the
            # top of the loop.)

    def read_block(self, stream, tagspec=None, elt_handler=None):
        """
        Read from C{stream} until we find at least one element that
        matches C{tagspec}, and return the result of applying
        C{elt_handler} to each element found.

        @param stream: The stream to read from.
        @param tagspec: The tag specification to match; defaults to
            the one given to the constructor.
        @param elt_handler: The function applied to each matching
            element; defaults to L{self.handle_elt}.
        @return: A list of the values produced by C{elt_handler}.
        @raise ValueError: If the XML is malformed (unmatched tags or
            unexpected end of file), or if no tag context has been
            recorded for the stream's current position.
        """
        if tagspec is None: tagspec = self._tagspec
        if elt_handler is None: elt_handler = self.handle_elt

        # Use a stack of strings to keep track of our context.  The
        # lookup is checked before it is converted to a list, so that
        # an unknown position gives a meaningful error.
        start_context = self._tag_context.get(stream.tell())
        if start_context is None:
            raise ValueError('No XML context recorded for stream '
                             'position %s' % stream.tell())
        context = list(start_context)

        elts = []

        elt_start = None # where does the elt start
        elt_depth = None # what context depth
        elt_text = ''

        while elts==[] or elt_start is not None:
            if isinstance(stream, SeekableUnicodeStreamReader):
                startpos = stream.tell()
            xml_fragment = self._read_xml_fragment(stream)

            # End of file.
            if not xml_fragment:
                if elt_start is None: break
                else: raise ValueError('Unexpected end of file')

            # Process each <tag> in the xml fragment.
            for piece in self._XML_PIECE.finditer(xml_fragment):
                if self._DEBUG:
                    print('%25s %s' % ('/'.join(context)[-20:],
                                       piece.group()))

                if piece.group('START_TAG'):
                    name = self._XML_TAG_NAME.match(piece.group()).group(1)
                    # Keep context up-to-date.
                    context.append(name)
                    # Is this one of the elts we're looking for?
                    if elt_start is None:
                        if re.match(tagspec, '/'.join(context)):
                            elt_start = piece.start()
                            elt_depth = len(context)

                elif piece.group('END_TAG'):
                    name = self._XML_TAG_NAME.match(piece.group()).group(1)
                    # sanity checks:
                    if not context:
                        raise ValueError('Unmatched tag </%s>' % name)
                    if name != context[-1]:
                        raise ValueError('Unmatched tag <%s>...</%s>' %
                                         (context[-1], name))
                    # Is this the end of an element?
                    if elt_start is not None and elt_depth == len(context):
                        elt_text += xml_fragment[elt_start:piece.end()]
                        elts.append( (elt_text, '/'.join(context)) )
                        elt_start = elt_depth = None
                        elt_text = ''
                    # Keep context up-to-date
                    context.pop()

                elif piece.group('EMPTY_ELT_TAG'):
                    name = self._XML_TAG_NAME.match(piece.group()).group(1)
                    if elt_start is None:
                        # Build the path the same way as for start
                        # tags, so that a top-level empty element has
                        # the path 'name' rather than '/name'.
                        elt_path = '/'.join(context + [name])
                        if re.match(tagspec, elt_path):
                            elts.append((piece.group(), elt_path))

            if elt_start is not None:
                # If we haven't found any elements yet, then keep
                # looping until we do.
                if elts == []:
                    elt_text += xml_fragment[elt_start:]
                    elt_start = 0

                # If we've found at least one element, then try
                # backtracking to the start of the element that we're
                # inside of.
                else:
                    # take back the last start-tag, and return what
                    # we've gotten so far (elts is non-empty).
                    if self._DEBUG:
                        print(' '*36+'(backtrack)')
                    if isinstance(stream, SeekableUnicodeStreamReader):
                        stream.seek(startpos)
                        stream.char_seek_forward(elt_start)
                    else:
                        stream.seek(-(len(xml_fragment)-elt_start), 1)
                    context = context[:elt_depth-1]
                    elt_start = elt_depth = None
                    elt_text = ''

        # Update the _tag_context dict.
        pos = stream.tell()
        if pos in self._tag_context:
            assert tuple(context) == self._tag_context[pos]
        else:
            self._tag_context[pos] = tuple(context)

        # Non-ASCII characters are turned into character references so
        # that the element text can be handed to the parser as ASCII.
        return [elt_handler(ElementTree.fromstring(
                                elt_xml.encode('ascii', 'xmlcharrefreplace')),
                            elt_context)
                for (elt_xml, elt_context) in elts]
380