| Home | Trees | Indices | Help |
|
|---|
|
|
1 # Natural Language Toolkit:
2 #
3 # Copyright (C) 2001-2011 NLTK Project
4 # Author: Piotr Kasprzyk <p.j.kasprzyk@gmail.com>
5 # URL: <http://www.nltk.org/>
6 # For license information, see LICENSE.TXT
7
8 import os
9 import re
10
11 from nltk import tokenize, tree
12
13 from util import *
14 from api import *
15 from xmldocs import XMLCorpusReader
16
17 # (?:something) -- non-grouping parentheses!
18
# Paragraph (<p>) and sentence (<s>) elements, with or without attributes.
# Group 1 is the (non-greedy) element content.
PARA = re.compile(r'<p(?: [^>]*){0,1}>(.*?)</p>')
SENT = re.compile(r'<s(?: [^>]*){0,1}>(.*?)</s>')

# <w> and <c> elements.  TAGGEDWORD captures the opening-tag text (element
# name, optional attributes and the closing '>') as group 1 and the element
# content as group 2; WORD captures only the element content.
TAGGEDWORD = re.compile(r'<([wc](?: [^>]*){0,1}>)(.*?)</[wc]>')
WORD = re.compile(r'<[wc](?: [^>]*){0,1}>(.*?)</[wc]>')

# Values of the type="..." and ana="..." attributes.
TYPE = re.compile(r'type="(.*?)"')
ANA = re.compile(r'ana="(.*?)"')

# Value of the id attribute as it appears in 'text id="..."'.
TEXTID = re.compile(r'text id="(.*?)"')
29
30
def __init__(self, corpus_file,
             tagged, group_by_sent, group_by_para,
             tag_mapping_function=None, headLen=0,
             textids=None):
    """
    Set up a view over ``corpus_file``.

    ``tagged``, ``group_by_sent``, ``group_by_para`` and ``textids`` are
    stored on the instance as ``_tagged``, ``_group_by_sent``,
    ``_group_by_para`` and ``_textids``.  ``headLen`` is forwarded to
    ``StreamBackedCorpusView.__init__`` as ``startpos`` so that reading
    starts after the file header.  ``tag_mapping_function`` is accepted
    but not used by this constructor.
    """
    # Output-shaping switches.
    self._group_by_para = group_by_para
    self._group_by_sent = group_by_sent
    self._tagged = tagged

    # Optional restriction on text ids (None means no restriction is set).
    self._textids = textids

    # WARNING -- skip header
    StreamBackedCorpusView.__init__(self, corpus_file, startpos=headLen)
43
44 _pagesize = 4096
45
47 block = stream.readlines(self._pagesize)
48 block = concat(block)
49 while (block.count('<text id') > block.count('</text>')) \
50 or block.count('<text id') == 0:
51 tmp = stream.readline()
52 if len(tmp) <= 0:
53 break
54 block += tmp
55
56 block = block.replace('\n','')
57
58 textids = TEXTID.findall(block)
59 if self._textids:
60 for tid in textids:
61 if tid not in self._textids:
62 beg = block.find(tid)-1
63 end = block[beg: ].find('</text>')+len('</text>')
64 block = block[ :beg]+block[beg+end: ]
65
66 output = []
67 for para_str in PARA.findall(block):
68 para = []
69 for sent_str in SENT.findall(para_str):
70 if not self._tagged:
71 sent = WORD.findall(sent_str)
72 else:
73 sent = map(self._parse_tag, TAGGEDWORD.findall(sent_str))
74 if self._group_by_sent:
75 para.append(sent)
76 else:
77 para.extend(sent)
78 if self._group_by_para:
79 output.append(para)
80 else:
81 output.extend(para)
82 return output
83
90
91
93
94 headLen = 2770
95
97 if 'textid_file' in kwargs: self._textids = kwargs['textid_file']
98 else: self._textids = None
99
100 XMLCorpusReader.__init__(self, *args)
101 CategorizedCorpusReader.__init__(self, kwargs)
102
103 self._init_textids()
104
def _init_textids(self):
    """
    (Re)build the lookup tables between file ids and text ids.

    ``self._f2t`` and ``self._t2f`` are reset to empty
    ``defaultdict(list)`` objects.  If ``self._textids`` is not None it is
    opened with ``self.open`` and read line by line.  Each non-blank line
    holds a file id, one space, and then the text ids separated by
    ``self._delimiter``; every (file id, text id) pair is passed to
    ``self._add_textids``.

    :raise ValueError: if a line names a file id that is not among
        ``self.fileids()``.
    """
    self._f2t = defaultdict(list)
    self._t2f = defaultdict(list)
    if self._textids is None:
        return

    # Read everything up front so the stream is closed even if a later
    # line turns out to be invalid.
    stream = self.open(self._textids)
    try:
        lines = stream.readlines()
    finally:
        stream.close()

    # Loop-invariant: the set of known file ids.
    known_fileids = set(self.fileids())

    for line in lines:
        line = line.strip()
        if not line:
            # A blank line carries no mapping; skipping it avoids an
            # unpacking error on the split below.
            continue
        file_id, text_ids = line.split(' ', 1)
        if file_id not in known_fileids:
            # Report the mapping file actually being read.  The message
            # previously formatted an undefined name, which turned this
            # error into a NameError.
            raise ValueError('In text_id mapping file %s: %s '
                             'not found' % (self._textids, file_id))
        for text_id in text_ids.split(self._delimiter):
            self._add_textids(file_id, text_id)
117
121
def _resolve(self, fileids, categories, textids=None):
    """
    Turn one selector into a ``(fileids, textids_by_file)`` pair.

    At most one of ``fileids``, ``categories`` and ``textids`` may be
    given (i.e. be not None):

    - ``fileids``: returned unchanged, paired with None.
    - ``categories``: ``self.fileids(categories)``, paired with None.
    - ``textids`` (a single string or an iterable of strings): the files
      that ``self._t2f`` lists for those text ids, each file once and in
      first-seen order, paired with a dict mapping every such file to the
      set of requested text ids that ``self._f2t`` lists for it.
    - nothing given: ``(None, None)``.

    :raise ValueError: if more than one selector is given.
    """
    selectors = (fileids, categories, textids)
    if sum(1 for selector in selectors if selector is not None) > 1:
        raise ValueError('Specify only fileids, categories or textids')

    if fileids is not None:
        return fileids, None

    if categories is not None:
        return self.fileids(categories), None

    if textids is not None:
        if isinstance(textids, basestring):
            textids = [textids]
        else:
            # Materialise once: the ids are walked and also turned into a set.
            textids = list(textids)
        wanted = set(textids)

        files = []
        tdict = dict()
        for text_id in textids:
            # .get() keeps an unknown text id from adding an empty entry
            # to the lookup table.
            for fileid in self._t2f.get(text_id, ()):
                if fileid in tdict:
                    # Already selected through another text id; listing
                    # the file twice would duplicate it for callers.
                    continue
                files.append(fileid)
                tdict[fileid] = set(self._f2t.get(fileid, ())) & wanted
        return files, tdict

    return None, None
145
149
151 """
152 In the pl196x corpus each category is stored in single
153 file and thus both methods provide identical functionality. In order
154 to accommodate finer granularity, a non-standard textids() method was
155 implemented. All the main functions can be supplied with a list
156 of required chunks---giving much more control to the user.
157 """
158 fileids, _ = self._resolve(fileids, categories)
159 if fileids is None: return sorted(self._t2f)
160
161 if isinstance(fileids, basestring):
162 fileids = [fileids]
163 return sorted(sum((self._f2t[d] for d in fileids), []))
164
166 fileids, textids = self._resolve(fileids, categories, textids)
167 if fileids is None: fileids = self._fileids
168 elif isinstance(fileids, basestring): fileids = [fileids]
169
170 if textids:
171 return concat([TEICorpusView(self.abspath(fileid),
172 False, False, False,
173 headLen=self.headLen,
174 textids=textids[fileid])
175 for fileid in fileids])
176 else:
177 return concat([TEICorpusView(self.abspath(fileid),
178 False, False, False,
179 headLen=self.headLen)
180 for fileid in fileids])
181
183 fileids, textids = self._resolve(fileids, categories, textids)
184 if fileids is None: fileids = self._fileids
185 elif isinstance(fileids, basestring): fileids = [fileids]
186
187 if textids:
188 return concat([TEICorpusView(self.abspath(fileid),
189 False, True, False,
190 headLen=self.headLen,
191 textids=textids[fileid])
192 for fileid in fileids])
193 else:
194 return concat([TEICorpusView(self.abspath(fileid),
195 False, True, False,
196 headLen=self.headLen)
197 for fileid in fileids])
198
200 fileids, textids = self._resolve(fileids, categories, textids)
201 if fileids is None: fileids = self._fileids
202 elif isinstance(fileids, basestring): fileids = [fileids]
203
204 if textids:
205 return concat([TEICorpusView(self.abspath(fileid),
206 False, True, True,
207 headLen=self.headLen,
208 textids=textids[fileid])
209 for fileid in fileids])
210 else:
211 return concat([TEICorpusView(self.abspath(fileid),
212 False, True, True,
213 headLen=self.headLen)
214 for fileid in fileids])
215
217 fileids, textids = self._resolve(fileids, categories, textids)
218 if fileids is None: fileids = self._fileids
219 elif isinstance(fileids, basestring): fileids = [fileids]
220
221 if textids:
222 return concat([TEICorpusView(self.abspath(fileid),
223 True, False, False,
224 headLen=self.headLen,
225 textids=textids[fileid])
226 for fileid in fileids])
227 else:
228 return concat([TEICorpusView(self.abspath(fileid),
229 True, False, False,
230 headLen=self.headLen)
231 for fileid in fileids])
232
234 fileids, textids = self._resolve(fileids, categories, textids)
235 if fileids is None: fileids = self._fileids
236 elif isinstance(fileids, basestring): fileids = [fileids]
237
238 if textids:
239 return concat([TEICorpusView(self.abspath(fileid),
240 True, True, False,
241 headLen=self.headLen,
242 textids=textids[fileid])
243 for fileid in fileids])
244 else:
245 return concat([TEICorpusView(self.abspath(fileid),
246 True, True, False,
247 headLen=self.headLen)
248 for fileid in fileids])
249
251 fileids, textids = self._resolve(fileids, categories, textids)
252 if fileids is None: fileids = self._fileids
253 elif isinstance(fileids, basestring): fileids = [fileids]
254
255 if textids:
256 return concat([TEICorpusView(self.abspath(fileid),
257 True, True, True,
258 headLen=self.headLen,
259 textids=textids[fileid])
260 for fileid in fileids])
261 else:
262 return concat([TEICorpusView(self.abspath(fileid),
263 True, True, True,
264 headLen=self.headLen)
265 for fileid in fileids])
266
268 fileids, _ = self._resolve(fileids, categories)
269 if len(fileids) == 1: return XMLCorpusReader.xml(self, fileids[0])
270 else: raise TypeError('Expected a single file')
271
277
| Home | Trees | Indices | Help |
|
|---|
| Generated by Epydoc 3.0.1 on Mon Apr 11 14:40:01 2011 | http://epydoc.sourceforge.net |