Package nltk :: Package corpus :: Package reader :: Module pl196x
[hide private]
[frames] | [no frames]

Source Code for Module nltk.corpus.reader.pl196x

  1  # Natural Language Toolkit: Pl196x Corpus Reader
  2  # 
  3  # Copyright (C) 2001-2011 NLTK Project 
  4  # Author: Piotr Kasprzyk <p.j.kasprzyk@gmail.com> 
  5  # URL: <http://www.nltk.org/> 
  6  # For license information, see LICENSE.TXT 
  7   
  8  import os 
  9  import re 
 10   
 11  from nltk import tokenize, tree 
 12   
 13  from util import * 
 14  from api import * 
 15  from xmldocs import XMLCorpusReader 
 16   
 17  # NOTE: (?:...) is a non-capturing group, so only the plain (...) groups below show up in findall() results.
 18   
 19  PARA = re.compile(r'<p(?: [^>]*){0,1}>(.*?)</p>')  # content of a <p> element (attributes optional)
 20  SENT = re.compile(r'<s(?: [^>]*){0,1}>(.*?)</s>')  # content of an <s> element (attributes optional)
 21   
 22  TAGGEDWORD = re.compile(r'<([wc](?: [^>]*){0,1}>)(.*?)</[wc]>')  # (opening tag text after '<', up to and including '>'; element content) of <w>/<c>
 23  WORD   =   re.compile(r'<[wc](?: [^>]*){0,1}>(.*?)</[wc]>')  # element content only of <w>/<c>
 24   
 25  TYPE = re.compile(r'type="(.*?)"')  # value of a type="..." attribute
 26  ANA  = re.compile(r'ana="(.*?)"')  # value of an ana="..." attribute
 27   
 28  TEXTID = re.compile(r'text id="(.*?)"')  # id value found in a 'text id="..."' attribute
 29   
 30   
class TEICorpusView(StreamBackedCorpusView):
    """
    Corpus view over a single XML file of the pl196x corpus.

    Every block read from the underlying stream is scanned with the
    module-level regular expressions (``TEXTID``, ``PARA``, ``SENT``,
    ``WORD``, ``TAGGEDWORD``) and converted into a list of words,
    sentences or paragraphs, depending on the grouping flags passed to
    the constructor.
    """

    # Size hint handed to ``stream.readlines()`` when a block is started.
    _pagesize = 4096

    def __init__(self, corpus_file,
                 tagged, group_by_sent, group_by_para,
                 tag_mapping_function=None, headLen=0,
                 textids=None):
        """
        :param corpus_file: the file to read; passed unchanged to
            ``StreamBackedCorpusView.__init__``.
        :param tagged: if true, tokens are ``(word, tag)`` tuples,
            otherwise plain word strings.
        :param group_by_sent: if true, tokens are grouped into one list
            per sentence.
        :param group_by_para: if true, sentences (or tokens) are grouped
            into one list per paragraph.
        :param tag_mapping_function: accepted for signature
            compatibility; it is not used by this class.
        :param headLen: start position in the file, used to skip the
            header (passed as ``startpos`` to the base class).
        :param textids: collection of text ids to keep, or ``None``
            (or any empty collection) to keep every text.
        """
        self._tagged = tagged
        self._textids = textids

        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        # WARNING -- the header is skipped by starting at ``headLen``.
        StreamBackedCorpusView.__init__(self, corpus_file, startpos=headLen)

    def read_block(self, stream):
        """
        Read one block from *stream* and return the items found in it.

        A block is extended line by line until it contains at least one
        ``<text id`` marker and no more such markers than ``</text>``
        closers, or until the end of the stream is reached.

        :param stream: stream positioned at the start of the block.
        :return: a list of words, sentences or paragraphs, according to
            the flags given to the constructor.
        """
        # ''.join() keeps this independent of helper functions and also
        # copes with an empty readlines() result.
        block = ''.join(stream.readlines(self._pagesize))
        while (block.count('<text id') > block.count('</text>')
               or block.count('<text id') == 0):
            line = stream.readline()
            if not line:
                break  # end of stream
            block += line

        # The regular expressions do not match across newlines.
        block = block.replace('\n', '')

        if self._textids:
            block = self._drop_unwanted_texts(block)

        output = []
        for para_str in PARA.findall(block):
            para = []
            for sent_str in SENT.findall(para_str):
                if self._tagged:
                    # A real list (not a lazy map object) so that the
                    # result is the same on Python 2 and Python 3.
                    sent = [self._parse_tag(pair)
                            for pair in TAGGEDWORD.findall(sent_str)]
                else:
                    sent = WORD.findall(sent_str)
                if self._group_by_sent:
                    para.append(sent)
                else:
                    para.extend(sent)
            if self._group_by_para:
                output.append(para)
            else:
                output.extend(para)
        return output

    def _drop_unwanted_texts(self, block):
        """Return *block* without the texts whose id is not selected."""
        closer = '</text>'
        for tid in TEXTID.findall(block):
            if tid in self._textids:
                continue
            # Anchor on the complete attribute: a bare search for the id
            # could hit the same characters inside another id or inside
            # the running text.
            pos = block.find('text id="%s"' % tid)
            if pos < 0:
                continue  # already removed together with an earlier text
            beg = block.rfind('<', 0, pos)
            if beg < 0:
                beg = pos
            end = block.find(closer, pos)
            if end < 0:
                # Unterminated text (end of stream): drop the remainder.
                block = block[:beg]
            else:
                block = block[:beg] + block[end + len(closer):]
        return block

    def _parse_tag(self, tagged_word):
        """
        Convert one ``TAGGEDWORD`` match into a ``(word, tag)`` tuple.

        :param tagged_word: a ``(tag, word)`` pair where ``tag`` is the
            text of the opening tag after ``<``.
        :return: ``(word, tag)``; the tag is the ``ana`` attribute for
            ``w`` elements and the ``type`` attribute otherwise.
        :raise AttributeError: if the expected attribute is missing from
            the opening tag.
        """
        # Unpacked here because tuple parameters in the signature are a
        # syntax error on Python 3.
        tag, word = tagged_word
        if tag.startswith('w'):
            tag = ANA.search(tag).group(1)
        else:  # tag.startswith('c')
            tag = TYPE.search(tag).group(1)
        return (word, tag)
class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader):
    """
    Reader for the pl196x corpus.

    Besides the usual selection by file ids or categories, the accessor
    methods accept ``textids``, which restrict the result to individual
    texts inside the corpus files.  The mapping between files and text
    ids is read from the optional ``textid_file`` keyword argument.
    """

    # Start position handed to ``TEICorpusView`` so that the header of
    # each corpus file is skipped.
    headLen = 2770

    def __init__(self, *args, **kwargs):
        """
        :param args: positional arguments for ``XMLCorpusReader``.
        :param kwargs: keyword arguments for ``CategorizedCorpusReader``
            plus the optional ``textid_file``, the file that maps each
            file id to its text ids.
        """
        self._textids = kwargs.get('textid_file')

        XMLCorpusReader.__init__(self, *args)
        CategorizedCorpusReader.__init__(self, kwargs)

        self._init_textids()

    def _init_textids(self):
        """
        Build the file-id <-> text-id maps from the text-id mapping file.

        Each non-blank line holds a file id, one space, and the text ids
        separated by ``self._delimiter`` (presumably set by
        ``CategorizedCorpusReader.__init__`` -- confirm against that
        class).

        :raise ValueError: if a line names a file that is not part of
            the corpus.
        """
        self._f2t = defaultdict(list)
        self._t2f = defaultdict(list)
        if self._textids is None:
            return

        stream = self.open(self._textids)
        try:
            lines = stream.readlines()
        finally:
            stream.close()

        known_fileids = set(self.fileids())
        for line in lines:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines in the mapping file
            file_id, text_ids = line.split(' ', 1)
            if file_id not in known_fileids:
                raise ValueError('In text_id mapping file %s: %s '
                                 'not found' % (self._textids, file_id))
            for text_id in text_ids.split(self._delimiter):
                self._add_textids(file_id, text_id)

    def _add_textids(self, file_id, text_id):
        """Record that *text_id* is stored in *file_id*."""
        self._f2t[file_id].append(text_id)
        self._t2f[text_id].append(file_id)

    def _resolve(self, fileids, categories, textids=None):
        """
        Turn a selection into ``(fileids, textids_by_file)``.

        At most one of the three arguments may be given.

        :return: ``(None, None)`` if nothing was specified;
            ``(fileids, None)`` for a selection by file ids or
            categories; for a selection by text ids, the list of files
            holding them and a dict mapping each of those files to the
            set of requested text ids it holds.
        :raise ValueError: if more than one argument is not ``None``.
        """
        given = [arg for arg in (fileids, categories, textids)
                 if arg is not None]
        if len(given) > 1:
            raise ValueError('Specify only fileids, categories or textids')

        if fileids is not None:
            return fileids, None
        if categories is not None:
            return self.fileids(categories), None
        if textids is None:
            return None, None

        if isinstance(textids, basestring):
            textids = [textids]
        else:
            textids = list(textids)
        wanted = set(textids)

        files = []
        tdict = {}
        for text_id in textids:
            # .get() avoids inserting unknown ids into the defaultdict.
            for f in self._t2f.get(text_id, ()):
                if f not in tdict:
                    # List each file once, even when several of the
                    # requested texts live in it.
                    tdict[f] = set(self._f2t.get(f, ())) & wanted
                    files.append(f)
        return files, tdict

    def _as_fileid_list(self, fileids):
        """Default to all files and wrap a single file id in a list."""
        if fileids is None:
            return self._fileids
        if isinstance(fileids, basestring):
            return [fileids]
        return fileids

    def _views(self, fileids, categories, textids,
               tagged, group_by_sent, group_by_para):
        """Concatenate one ``TEICorpusView`` per selected file."""
        fileids, textids = self._resolve(fileids, categories, textids)
        fileids = self._as_fileid_list(fileids)

        views = []
        for fileid in fileids:
            if textids:
                selected = textids[fileid]
            else:
                selected = None
            views.append(TEICorpusView(self.abspath(fileid),
                                       tagged, group_by_sent, group_by_para,
                                       headLen=self.headLen,
                                       textids=selected))
        return concat(views)

    def decode_tag(self, tag):
        """Return *tag* unchanged (decoding is not implemented yet)."""
        # to be implemented
        return tag

    def textids(self, fileids=None, categories=None):
        """
        Return the sorted text ids of the selected files.

        In the pl196x corpus each category is stored in a single file,
        so selecting by file id and selecting by category are
        equivalent.  To allow a finer granularity, this non-standard
        method exposes the ids of the individual texts; the main
        accessor methods accept a list of such ids.

        :param fileids: a file id or a list of file ids.
        :param categories: categories, as accepted by ``fileids()``.
        :return: all known text ids if nothing is specified, otherwise
            the text ids stored in the selected files.
        """
        fileids, _ = self._resolve(fileids, categories)
        if fileids is None:
            return sorted(self._t2f)

        fileids = self._as_fileid_list(fileids)
        return sorted(text_id
                      for fileid in fileids
                      for text_id in self._f2t.get(fileid, ()))

    def words(self, fileids=None, categories=None, textids=None):
        """Return the selected texts as a flat list of words."""
        return self._views(fileids, categories, textids,
                           False, False, False)

    def sents(self, fileids=None, categories=None, textids=None):
        """Return the selected texts as a list of sentences."""
        return self._views(fileids, categories, textids,
                           False, True, False)

    def paras(self, fileids=None, categories=None, textids=None):
        """Return the selected texts as a list of paragraphs."""
        return self._views(fileids, categories, textids,
                           False, True, True)

    def tagged_words(self, fileids=None, categories=None, textids=None):
        """Return the selected texts as a flat list of (word, tag)."""
        return self._views(fileids, categories, textids,
                           True, False, False)

    def tagged_sents(self, fileids=None, categories=None, textids=None):
        """Return the selected texts as sentences of (word, tag)."""
        return self._views(fileids, categories, textids,
                           True, True, False)

    def tagged_paras(self, fileids=None, categories=None, textids=None):
        """Return the selected texts as paragraphs of (word, tag)."""
        return self._views(fileids, categories, textids,
                           True, True, True)

    def xml(self, fileids=None, categories=None):
        """
        Return the XML contents of the single selected file.

        :raise TypeError: if the selection does not name exactly one
            file.
        """
        fileids, _ = self._resolve(fileids, categories)
        # A plain string is one file id, not a sequence of them.
        fileids = self._as_fileid_list(fileids)
        if len(fileids) != 1:
            raise TypeError('Expected a single file')
        return XMLCorpusReader.xml(self, fileids[0])

    def raw(self, fileids=None, categories=None):
        """Return the concatenated raw contents of the selected files."""
        fileids, _ = self._resolve(fileids, categories)
        fileids = self._as_fileid_list(fileids)

        contents = []
        for fileid in fileids:
            stream = self.open(fileid)
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)