Package nltk :: Package corpus :: Package reader :: Module bnc
[hide private]
[frames | no frames]

Source Code for Module nltk.corpus.reader.bnc

  1  # Natural Language Toolkit: BNC Corpus Reader 
  2  # 
  3  # Copyright (C) 2001-2011 NLTK Project 
  4  # Author: Edward Loper <edloper@gradient.cis.upenn.edu> 
  5  # URL: <http://www.nltk.org/> 
  6  # For license information, see LICENSE.TXT 
  7   
  8  """ 
  9  Corpus reader for the XML version of the British National Corpus. 
 10  """ 
 11  __docformat__ = 'epytext en' 
 12   
 13  import re 
 14   
 15  import nltk.etree.ElementTree as ET 
 16   
 17  from api import * 
 18  from util import * 
 19  from xmldocs import * 
 20   
class BNCCorpusReader(XMLCorpusReader):
    """
    Corpus reader for the XML version of the British National Corpus.
    For access to the complete XML data structure, use the L{xml()}
    method.  For access to simple word lists and tagged word lists, use
    L{words()}, L{sents()}, L{tagged_words()}, and L{tagged_sents()}.

    When the reader is created with C{lazy=True} (the default), those
    four methods build one L{BNCWordView} per file; otherwise each file
    is parsed completely by L{_words}.  In both cases the per-file
    results are combined with C{concat}.
    """

    def __init__(self, root, fileids, lazy=True):
        """
        @param root: Passed through unchanged to C{XMLCorpusReader}.
        @param fileids: Passed through unchanged to C{XMLCorpusReader}.
        @param lazy: If true, the view methods use L{BNCWordView};
            if false, they use L{_words}.
        """
        XMLCorpusReader.__init__(self, root, fileids)
        self._lazy = lazy

    def words(self, fileids=None, strip_space=True, stem=False):
        """
        @return: the given file(s) as a list of words
            and punctuation symbols.
        @rtype: C{list} of C{str}

        @param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        @param stem: If true, then use word stems instead of word strings.
        """
        return self._views(fileids, False, None, strip_space, stem)

    def tagged_words(self, fileids=None, c5=False, strip_space=True,
                     stem=False):
        """
        @return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            C{(word,tag)}.
        @rtype: C{list} of C{(str,str)}

        @param c5: If true, then the tags used will be the more detailed
            c5 tags.  Otherwise, the simplified tags will be used.
        @param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        @param stem: If true, then use word stems instead of word strings.
        """
        if c5: tag = 'c5'
        else: tag = 'pos'
        return self._views(fileids, False, tag, strip_space, stem)

    def sents(self, fileids=None, strip_space=True, stem=False):
        """
        @return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        @rtype: C{list} of (C{list} of C{str})

        @param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        @param stem: If true, then use word stems instead of word strings.
        """
        return self._views(fileids, True, None, strip_space, stem)

    def tagged_sents(self, fileids=None, c5=False, strip_space=True,
                     stem=False):
        """
        @return: the given file(s) as a list of
            sentences, each encoded as a list of C{(word,tag)} tuples.
        @rtype: C{list} of (C{list} of C{(str,str)})

        @param c5: If true, then the tags used will be the more detailed
            c5 tags.  Otherwise, the simplified tags will be used.
        @param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        @param stem: If true, then use word stems instead of word strings.
        """
        if c5: tag = 'c5'
        else: tag = 'pos'
        return self._views(fileids, True, tag, strip_space, stem)

    def _views(self, fileids, bracket_sent, tag, strip_space, stem):
        """
        Shared implementation of the four public view methods: build
        one per-file result with either L{BNCWordView} (lazy) or
        L{_words} (eager) and combine the results with C{concat}.

        @param fileids: The file identifiers to read, as accepted by
            C{self.abspaths}.
        @param bracket_sent: If true, include sentence bracketing.
        @param tag: The name of the tagset to use, or None for no tags.
        @param strip_space: If true, strip spaces from word tokens.
        @param stem: If true, then substitute stems for words.
        """
        # Both callables take the same positional arguments, so the
        # lazy/eager choice is the only thing that differs.
        if self._lazy:
            make_view = BNCWordView
        else:
            make_view = self._words
        return concat([make_view(fileid, bracket_sent, tag,
                                 strip_space, stem)
                       for fileid in self.abspaths(fileids)])

    def _words(self, fileid, bracket_sent, tag, strip_space, stem):
        """
        Helper used to implement the view methods -- returns a list of
        words or a list of sentences, optionally tagged.

        @param fileid: The name of the underlying file.
        @param bracket_sent: If true, include sentence bracketing.
        @param tag: The name of the tagset to use, or None for no tags.
        @param strip_space: If true, strip spaces from word tokens.
        @param stem: If true, then substitute stems for words.
        @return: a flat list of tokens, or (if C{bracket_sent} is true)
            a list of L{BNCSentence} objects.
        """
        result = []

        xmldoc = ElementTree.parse(fileid).getroot()
        for xmlsent in xmldoc.findall('.//s'):
            sent = []
            for xmlword in _all_xmlwords_in(xmlsent):
                word = xmlword.text
                if not word:
                    # An element without text would otherwise leave a
                    # non-string token in the output.
                    word = ""  # fixes issue 337?
                if strip_space or stem: word = word.strip()
                # The 'hw' attribute replaces the surface form; the
                # stripped surface form is the fallback.
                if stem: word = xmlword.get('hw', word)
                if tag == 'c5':
                    word = (word, xmlword.get('c5'))
                elif tag == 'pos':
                    # Fall back to the c5 tag when there is no 'pos'.
                    word = (word, xmlword.get('pos', xmlword.get('c5')))
                sent.append(word)
            if bracket_sent:
                result.append(BNCSentence(xmlsent.attrib['n'], sent))
            else:
                result.extend(sent)

        assert None not in result
        return result
147
def _all_xmlwords_in(elt, result=None):
    """
    Collect, in document order, every C{c} and C{w} element found
    below C{elt}.  Elements with any other tag are searched in turn;
    the children of a C{c} or C{w} element are not searched.

    @param elt: The element whose descendants are searched.
    @param result: An optional list to append to; a new list is
        created when it is None.
    @return: C{result}, with the matching elements appended.
    """
    if result is None:
        result = []

    # Stack of child iterators: the top entry is the element currently
    # being scanned, the entries below it are its suspended ancestors.
    pending = [iter(elt)]
    while pending:
        for node in pending[-1]:
            if node.tag in ('c', 'w'):
                result.append(node)
            else:
                # Scan this subtree first, then resume the parent.
                pending.append(iter(node))
                break
        else:
            # Iterator exhausted: go back to the enclosing element.
            pending.pop()
    return result
154
class BNCSentence(list):
    """
    A plain list of word tokens that additionally carries the sentence
    identifier (the C{n} attribute from the XML) in its C{num}
    attribute.
    """

    def __init__(self, num, items):
        """
        @param num: The sentence identifier to store as C{self.num}.
        @param items: The tokens used to initialise the list.
        """
        super(BNCSentence, self).__init__(items)
        self.num = num
163
class BNCWordView(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with the BNC corpus.

    Depending on the C{sent} flag given to the constructor, each
    matched element is turned into either a single word token
    (L{handle_word}) or a L{BNCSentence} (L{handle_sent}).  When the
    view is created, the file's C{teiHeader} element is read once and
    passed to L{handle_header}, which fills in the C{title},
    C{author}, C{editor} and C{resps} attributes.
    """

    title = None  #: Title of the document.
    author = None  #: Author of the document.
    editor = None  #: Editor
    resps = None  #: Statement of responsibility

    def __init__(self, fileid, sent, tag, strip_space, stem):
        """
        @param fileid: The name of the underlying file.
        @param sent: If true, include sentence bracketing.
        @param tag: The name of the tagset to use, or None for no tags.
        @param strip_space: If true, strip spaces from word tokens.
        @param stem: If true, then substitute stems for words.
        """
        # Tag-path pattern handed to XMLCorpusView: whole sentences, or
        # c/w elements directly or indirectly below a sentence.
        if sent: tagspec = '.*/s'
        else: tagspec = '.*/s/(.*/)?(c|w)'
        self._sent = sent
        self._tag = tag
        self._strip_space = strip_space
        self._stem = stem

        XMLCorpusView.__init__(self, fileid, tagspec)

        # Read in a tasty header.  The stream is closed even if
        # reading the header fails, so a broken file does not leave
        # an open file handle behind.
        self._open()
        try:
            self.read_block(self._stream, '.*/teiHeader$',
                            self.handle_header)
        finally:
            self.close()

        # Reset tag context.
        self._tag_context = {0: ()}

    def _stripped_texts(self, elts):
        """
        Return the stripped text of each element in C{elts}, using
        the empty string for an element whose C{text} is None.
        """
        return [(e.text or '').strip() for e in elts]

    def handle_header(self, elt, context):
        """
        Fill in the C{title}, C{author}, C{editor} and C{resps}
        attributes from the header element.  An attribute is left at
        its class-level default of None when no matching element is
        found.

        @param elt: The header element.
        @param context: Unused.
        """
        # NOTE(review): these paths are relative to C{elt}, so only a
        # titleStmt that is a direct child of the header is found --
        # confirm against the layout of the BNC header.
        # Set up some metadata!
        titles = elt.findall('titleStmt/title')
        if titles:
            self.title = '\n'.join(self._stripped_texts(titles))

        authors = elt.findall('titleStmt/author')
        if authors:
            self.author = '\n'.join(self._stripped_texts(authors))

        editors = elt.findall('titleStmt/editor')
        if editors:
            self.editor = '\n'.join(self._stripped_texts(editors))

        # One paragraph per respStmt, one line per child element.
        resps = elt.findall('titleStmt/respStmt')
        if resps:
            self.resps = '\n\n'.join(
                ['\n'.join(self._stripped_texts(resp)) for resp in resps])

    def handle_elt(self, elt, context):
        """
        Convert a matched element into a L{BNCSentence} (sentence
        mode) or a single word token (word mode).
        """
        if self._sent: return self.handle_sent(elt)
        else: return self.handle_word(elt)

    def handle_word(self, elt):
        """
        Build the token for one word or punctuation element.

        @return: the word string, or a C{(word, tag)} tuple when a
            tagset was requested.
        """
        word = elt.text
        if not word:
            word = ""  # fixes issue 337?
        if self._strip_space or self._stem:
            word = word.strip()
        if self._stem:
            # The 'hw' attribute replaces the surface form; the
            # stripped surface form is the fallback.
            word = elt.get('hw', word)
        if self._tag == 'c5':
            word = (word, elt.get('c5'))
        elif self._tag == 'pos':
            # Fall back to the c5 tag when there is no 'pos'.
            word = (word, elt.get('pos', elt.get('c5')))
        return word

    def handle_sent(self, elt):
        """
        Build a L{BNCSentence} from one sentence element.

        The words are gathered with C{_all_xmlwords_in}, the same
        helper the non-lazy C{BNCCorpusReader._words} uses, so every
        C{c} and C{w} element nested anywhere below the sentence is
        included.  This keeps the lazy sentence view in agreement with
        the non-lazy reader and with the word-level tagspec, instead
        of failing on sentences that wrap words in other elements.

        @raise KeyError: if the sentence element has no C{n} attribute.
        """
        sent = [self.handle_word(xmlword)
                for xmlword in _all_xmlwords_in(elt)]
        return BNCSentence(elt.attrib['n'], sent)
246