1
2
3
4
5
6
7
8
9 """
10 Indian Language POS-Tagged Corpus
11 Collected by A Kumaran, Microsoft Research, India
12 Distributed with permission
13
14 Contents:
15 - Bangla: IIT Kharagpur
16 - Hindi: Microsoft Research India
17 - Marathi: IIT Bombay
18 - Telugu: IIIT Hyderabad
19 """
20
21 import codecs
22
23 from nltk.tag.util import str2tuple
24
25 from util import *
26 from api import *
27
29 """
30 List of words, one per line. Blank lines are ignored.
31 """
32 - def words(self, fileids=None):
36
# Build one IndianCorpusView per (fileid, enc) pair yielded by
# self.abspaths(fileids, True) and join the views with concat().
# NOTE(review): the def line that owns these statements is not visible here;
# `simplify_tags` is presumably one of its parameters -- confirm.
# A truthy simplify_tags hands the reader's own _tag_mapping_function to every
# view; otherwise None is passed and no mapping function is supplied.
38 if simplify_tags:
39 tag_mapping_function = self._tag_mapping_function
40 else:
41 tag_mapping_function = None
# The positional flags True, False presumably fill `tagged` and
# `group_by_sent` of the view constructor (tagged tokens, not grouped into
# sentences) -- TODO confirm against the IndianCorpusView signature.
42 return concat([IndianCorpusView(fileid, enc,
43 True, False, tag_mapping_function)
44 for (fileid, enc) in self.abspaths(fileids, True)])
45
46 - def sents(self, fileids=None):
50
# Build one IndianCorpusView per (fileid, enc) pair yielded by
# self.abspaths(fileids, True) and join the views with concat().
# NOTE(review): the def line that owns these statements is not visible here;
# `simplify_tags` is presumably one of its parameters -- confirm.
# A truthy simplify_tags hands the reader's own _tag_mapping_function to every
# view; otherwise None is passed and no mapping function is supplied.
52 if simplify_tags:
53 tag_mapping_function = self._tag_mapping_function
54 else:
55 tag_mapping_function = None
# The positional flags True, True presumably fill `tagged` and
# `group_by_sent` of the view constructor (tagged tokens, grouped into
# sentences) -- TODO confirm against the IndianCorpusView signature.
56 return concat([IndianCorpusView(fileid, enc,
57 True, True, tag_mapping_function)
58 for (fileid, enc) in self.abspaths(fileids, True)])
59
60 - def raw(self, fileids=None):
64
65
67 - def __init__(self, corpus_file, encoding, tagged,
68 group_by_sent, tag_mapping_function=None):
73
# Consume exactly one line from `stream` and turn it into a list of tokens.
# NOTE(review): the def line that owns these statements is not visible here;
# `stream` is presumably its parameter -- confirm.
75 line = stream.readline()
# A line starting with '<' yields no tokens at all (presumably a markup line
# -- confirm). This empty list is returned even when grouping by sentence.
76 if line.startswith('<'):
77 return []
# Split on whitespace, then let str2tuple split each token at '_' into a
# (word, tag) pair.
78 sent = [str2tuple(word, sep='_') for word in line.split()]
# If a tag mapping function was configured, rewrite every tag through it.
79 if self._tag_mapping_function:
80 sent = [(w, self._tag_mapping_function(t)) for (w,t) in sent]
# Untagged mode: drop the tags and keep only the words. This runs after the
# mapping step, so the mapping is applied (and discarded) in untagged mode too.
81 if not self._tagged: sent = [w for (w,t) in sent]
# Grouped mode wraps the line's tokens in a one-element list (one sentence per
# line read); otherwise the tokens are returned as a flat list.
82 if self._group_by_sent:
83 return [sent]
84 else:
85 return sent
86