1
2
3
4
5
6
7
8
9 from nltk.compat import *
10
11 from nltk.corpus.reader.util import StreamBackedCorpusView, concat
12 from nltk.corpus.reader.api import CorpusReader
def decorator(self, fileids=None, **kwargs):
    """Call the wrapped function with normalised arguments.

    Any ``tags`` keyword supplied by the caller is discarded, and an
    empty or missing ``fileids`` is replaced by ``self.fileids()`` before
    the wrapped function ``fun`` is invoked.
    """
    # Drop a caller-supplied 'tags' option, if any, before forwarding.
    if 'tags' in kwargs:
        del kwargs['tags']
    # Fall back to every file known to the reader when none were given.
    selected = fileids or self.fileids()
    return fun(self, selected, **kwargs)
20 decorator.__name__ = fun.__name__
21 decorator.__doc__ = fun.__doc__
22 decorator.__module__ = fun.__module__
23 return decorator
24
36 """Corpus reader designed to work with corpus created by IPI PAN.
37 See http://korpus.pl/en/ for more details about IPI PAN corpus.
38
39 The corpus includes information about text domain, channel and categories.
40 You can access possible values using ipipan.domains(), ipipan.channels() and
41 ipipan.categories(). You can also use this metadata to filter files, e.g.:
42 ipipan.fileids(channel='prasa')
43 ipipan.fileids(categories='publicystyczny')
44
45 The reader supports methods: words, sents, paras and their tagged versions.
46 You can get part of speech instead of full tag by giving "simplify_tags=True"
47 parameter, e.g.:
48 ipipan.tagged_sents(simplify_tags=True)
49
50 You can also get all disambiguated tags by specifying the parameter
51 "one_tag=False", e.g.:
52 ipipan.tagged_paras(one_tag=False)
53
54 You can get all tags that were assigned by a morphological analyzer specifying
55 parameter "disamb_only=False", e.g.
56 ipipan.tagged_words(disamb_only=False)
57
58 The IPIPAN Corpus contains tags indicating if there is a space between two
59 tokens. To add special "no space" markers, you should specify parameter
60 "append_no_space=True", e.g.
61 ipipan.tagged_words(append_no_space=True)
62 As a result in place where there should be no space between two tokens new
63 pair ('', 'no-space') will be inserted (for tagged data) and just '' for
64 methods without tags.
65
66 The corpus reader can also try to append spaces between words. To enable this
67 option, specify parameter "append_space=True", e.g.
68 ipipan.words(append_space=True)
69 As a result either ' ' or (' ', 'space') will be inserted between tokens.
70
71 By default, XML entities like &quot; and &amp; are replaced by the corresponding
72 characters. You can turn off this feature, specifying parameter
73 "replace_xmlentities=False", e.g.
74 ipipan.words(replace_xmlentities=False)
75 """
76
79
80 - def raw(self, fileids=None):
85
90
def domains(self, fileids=None):
    """Return the 'domain' header values for the given files.

    :param fileids: files to inspect; when empty or ``None``, every file
        reported by ``self.fileids()`` is used instead.
    :return: the result of ``self._parse_header`` for the 'domain' tag.
    """
    selected = fileids or self.fileids()
    return self._parse_header(selected, 'domain')
95
101
102 - def fileids(self, channels=None, domains=None, categories=None):
123
124 @_parse_args
125 - def sents(self, fileids=None, **kwargs):
129
130 @_parse_args
131 - def paras(self, fileids=None, **kwargs):
135
136 @_parse_args
137 - def words(self, fileids=None, **kwargs):
140
141 @_parse_args
146
147 @_parse_args
152
153 @_parse_args
157
160
164
172
174 fileids = self.fileids()
175 ret_fileids = set()
176 for f in fileids:
177 fp = self.abspath(f).replace('morph.xml', 'header.xml')
178 values_list = self._get_tag(fp, tag)
179 for value in values_list:
180 if map is not None:
181 value = map(value)
182 if value in values:
183 ret_fileids.add(f)
184 return list(ret_fileids)
185
187 tags = []
188 header = open(f, 'r').read()
189 tag_end = 0
190 while True:
191 tag_pos = header.find('<'+tag, tag_end)
192 if tag_pos < 0: return tags
193 tag_end = header.find('</'+tag+'>', tag_pos)
194 tags.append(header[tag_pos+len(tag)+2:tag_end])
195
197 pos = cat.find('>')
198 if pos == -1:
199 return cat
200 else:
201 return cat[pos+1:]
202
203 - def _view(self, filename, **kwargs):
204 tags = kwargs.pop('tags', True)
205 mode = kwargs.pop('mode', 0)
206 simplify_tags = kwargs.pop('simplify_tags', False)
207 one_tag = kwargs.pop('one_tag', True)
208 disamb_only = kwargs.pop('disamb_only', True)
209 append_no_space = kwargs.pop('append_no_space', False)
210 append_space = kwargs.pop('append_space', False)
211 replace_xmlentities = kwargs.pop('replace_xmlentities', True)
212
213 if len(kwargs) > 0:
214 raise ValueError('Unexpected arguments: %s' % kwargs.keys())
215 if not one_tag and not disamb_only:
216 raise ValueError('You cannot specify both one_tag=False and '
217 'disamb_only=False')
218 if not tags and (simplify_tags or not one_tag or not disamb_only):
219 raise ValueError('You cannot specify simplify_tags, one_tag or '
220 'disamb_only with functions other than tagged_*')
221
222 return IPIPANCorpusView(filename,
223 tags=tags, mode=mode, simplify_tags=simplify_tags,
224 one_tag=one_tag, disamb_only=disamb_only,
225 append_no_space=append_no_space,
226 append_space=append_space,
227 replace_xmlentities=replace_xmlentities
228 )
229
232
233 WORDS_MODE = 0
234 SENTS_MODE = 1
235 PARAS_MODE = 2
236
def __init__(self, filename, startpos=0, **kwargs):
    """Set up the view's parsing state and store its display options.

    :param filename: forwarded to ``StreamBackedCorpusView.__init__``.
    :param startpos: forwarded to ``StreamBackedCorpusView.__init__``.
    :param kwargs: optional settings ``tags``, ``disamb_only``, ``mode``,
        ``simplify_tags``, ``one_tag``, ``append_no_space``,
        ``append_space`` and ``replace_xmlentities``; each is stored on
        the instance, falling back to the default listed below.
    """
    StreamBackedCorpusView.__init__(self, filename, None, startpos, None)
    self.position = 0
    self.in_sentence = False

    # (attribute name, keyword option, default value)
    option_table = (
        ('show_tags', 'tags', True),
        ('disamb_only', 'disamb_only', True),
        ('mode', 'mode', IPIPANCorpusView.WORDS_MODE),
        ('simplify_tags', 'simplify_tags', False),
        ('one_tag', 'one_tag', True),
        ('append_no_space', 'append_no_space', False),
        ('append_space', 'append_space', False),
        ('replace_xmlentities', 'replace_xmlentities', True),
    )
    for attribute, option, default in option_table:
        setattr(self, attribute, kwargs.pop(option, default))
250
252 sentence = []
253 sentences = []
254 space = False
255 no_space = False
256
257 tags = set()
258
259 lines = self._read_data(stream)
260
261 while True:
262
263
264 if len(lines) <= 1:
265 self._seek(stream)
266 lines = self._read_data(stream)
267
268 if lines == ['']:
269 assert not sentences
270 return []
271
272 line = lines.pop()
273 self.position += len(line) + 1
274
275 if line.startswith('<chunk type="s"'):
276 self.in_sentence = True
277 elif line.startswith('<chunk type="p"'):
278 pass
279 elif line.startswith('<tok'):
280 if self.append_space and space and not no_space:
281 self._append_space(sentence)
282 space = True
283 no_space = False
284 orth = ""
285 tags = set()
286 elif line.startswith('</chunk'):
287 if self.in_sentence:
288 self.in_sentence = False
289 self._seek(stream)
290 if self.mode == self.SENTS_MODE:
291 return [sentence]
292 elif self.mode == self.WORDS_MODE:
293 if self.append_space:
294 self._append_space(sentence)
295 return sentence
296 else:
297 sentences.append(sentence)
298 elif self.mode == self.PARAS_MODE:
299 self._seek(stream)
300 return [sentences]
301 elif line.startswith('<orth'):
302 orth = line[6:-7]
303 if self.replace_xmlentities:
304 orth = orth.replace('"', '"').replace('&', '&')
305 elif line.startswith('<lex'):
306 if not self.disamb_only or line.find('disamb=') != -1:
307 tag = line[line.index('<ctag')+6 : line.index('</ctag') ]
308 tags.add(tag)
309 elif line.startswith('</tok'):
310 if self.show_tags:
311 if self.simplify_tags:
312 tags = [t.split(':')[0] for t in tags]
313 if not self.one_tag or not self.disamb_only:
314 sentence.append((orth, tuple(tags)))
315 else:
316 sentence.append((orth, tags.pop()))
317 else:
318 sentence.append(orth)
319 elif line.startswith('<ns/>'):
320 if self.append_space:
321 no_space = True
322 if self.append_no_space:
323 if self.show_tags:
324 sentence.append(('', 'no-space'))
325 else:
326 sentence.append('')
327 elif line.startswith('</cesAna'):
328 pass
329
336
337 - def _seek(self, stream):
338 stream.seek(self.position)
339
341 if self.show_tags:
342 sentence.append((' ', 'space'))
343 else:
344 sentence.append(' ')
345