1
2
3
4
5
6
7
8
9 """
10 NLTK corpus readers. The modules in this package provide functions
11 that can be used to read corpus files in a variety of formats. These
12 functions can be used to read both the corpus files that are
13 distributed in the NLTK corpus package, and corpus files that are part
14 of external corpora.
15
16 Available Corpora
17 =================
18
19 Please see http://nltk.googlecode.com/svn/trunk/nltk_data/index.xml
20 for a complete list. Install corpora using nltk.download().
21
22 Corpus Reader Functions
23 =======================
24 Each corpus module defines one or more X{corpus reader functions},
25 which can be used to read documents from that corpus. These functions
26 take an argument, C{item}, which is used to indicate which document
27 should be read from the corpus:
28
29 - If C{item} is one of the unique identifiers listed in the corpus
30 module's C{items} variable, then the corresponding document will
31 be loaded from the NLTK corpus package.
32
33 - If C{item} is a filename, then that file will be read.
34
Additionally, corpus reader functions can be given lists of item
names, in which case they will return a concatenation of the
corresponding documents.
38
39 Corpus reader functions are named based on the type of information
40 they return. Some common examples, and their return types, are:
41
42 - I{corpus}.words(): list of str
43 - I{corpus}.sents(): list of (list of str)
44 - I{corpus}.paras(): list of (list of (list of str))
45 - I{corpus}.tagged_words(): list of (str,str) tuple
46 - I{corpus}.tagged_sents(): list of (list of (str,str))
47 - I{corpus}.tagged_paras(): list of (list of (list of (str,str)))
48 - I{corpus}.chunked_sents(): list of (Tree w/ (str,str) leaves)
49 - I{corpus}.parsed_sents(): list of (Tree with str leaves)
50 - I{corpus}.parsed_paras(): list of (list of (Tree with str leaves))
51 - I{corpus}.xml(): A single xml ElementTree
52 - I{corpus}.raw(): unprocessed corpus contents
53
54 For example, to read a list of the words in the Brown Corpus, use
55 C{nltk.corpus.brown.words()}:
56
57 >>> from nltk.corpus import brown
58 >>> print brown.words()
59 ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]
60
61 Corpus Metadata
62 ===============
63 Metadata about the NLTK corpora, and their individual documents, is
64 stored using U{Open Language Archives Community (OLAC)
65 <http://www.language-archives.org/>} metadata records. These records
66 can be accessed using C{nltk.corpus.I{corpus}.olac()}.
67 """
68
69 import re
70
71 from nltk.tokenize import RegexpTokenizer
72 from nltk.tag import simplify_brown_tag, simplify_wsj_tag,\
73 simplify_alpino_tag, simplify_indian_tag,\
74 simplify_tag
75
76 from util import LazyCorpusLoader
77 from reader import *
78
# Corpus loaders, 'abc' through 'gutenberg'.  Each module-level name is bound
# to a LazyCorpusLoader built from a corpus name, a reader class, and the
# arguments for that reader (presumably forwarded when the corpus is first
# used -- confirm against util.LazyCorpusLoader).
# Every file-selection regex is written as a raw string so that regex escapes
# reach the regex engine untouched and never trigger invalid-escape warnings.
abc = LazyCorpusLoader(
    'abc', PlaintextCorpusReader, r'(?!\.).*\.txt')
alpino = LazyCorpusLoader(
    'alpino', AlpinoCorpusReader, tag_mapping_function=simplify_alpino_tag)
brown = LazyCorpusLoader(
    'brown', CategorizedTaggedCorpusReader, r'c[a-z]\d\d',
    cat_file='cats.txt', tag_mapping_function=simplify_brown_tag)
cess_cat = LazyCorpusLoader(
    'cess_cat', BracketParseCorpusReader, r'(?!\.).*\.tbf',
    tag_mapping_function=simplify_tag)
cess_esp = LazyCorpusLoader(
    'cess_esp', BracketParseCorpusReader, r'(?!\.).*\.tbf',
    tag_mapping_function=simplify_tag)
cmudict = LazyCorpusLoader(
    'cmudict', CMUDictCorpusReader, ['cmudict'])
comtrans = LazyCorpusLoader(
    'comtrans', AlignedCorpusReader, r'(?!\.).*\.txt')
# The tuple lists the chunk types passed to the CoNLL chunk reader.
conll2000 = LazyCorpusLoader(
    'conll2000', ConllChunkCorpusReader,
    ['train.txt', 'test.txt'], ('NP', 'VP', 'PP'),
    tag_mapping_function=simplify_wsj_tag)
conll2002 = LazyCorpusLoader(
    'conll2002', ConllChunkCorpusReader, r'.*\.(test|train).*',
    ('LOC', 'PER', 'ORG', 'MISC'), encoding='utf-8')
conll2007 = LazyCorpusLoader(
    'conll2007', DependencyCorpusReader, r'.*\.(test|train).*',
    encoding='utf-8')
dependency_treebank = LazyCorpusLoader(
    'dependency_treebank', DependencyCorpusReader, r'.*\.dp')
floresta = LazyCorpusLoader(
    'floresta', BracketParseCorpusReader, r'(?!\.).*\.ptb', '#',
    tag_mapping_function=simplify_tag)
gazetteers = LazyCorpusLoader(
    'gazetteers', WordListCorpusReader, r'(?!LICENSE|\.).*\.txt')
# The encoding is given as (filename pattern, codec) pairs; the final '.*'
# entry acts as the catch-all.
genesis = LazyCorpusLoader(
    'genesis', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding=[
        ('finnish|french|german', 'latin_1'),
        ('swedish', 'cp865'),
        ('.*', 'utf_8')])
gutenberg = LazyCorpusLoader(
    'gutenberg', PlaintextCorpusReader, r'(?!\.).*\.txt')
120
121
122
# Corpus loaders, 'ieer' through 'ppattach'.  Arguments are laid out one per
# line: corpus name, reader class, then the file selection (a regex or an
# explicit list) followed by any reader-specific keyword options.
ieer = LazyCorpusLoader(
    'ieer',
    IEERCorpusReader,
    r'(?!README|\.).*'
)

inaugural = LazyCorpusLoader(
    'inaugural',
    PlaintextCorpusReader,
    r'(?!\.).*\.txt'
)

indian = LazyCorpusLoader(
    'indian',
    IndianCorpusReader,
    r'(?!\.).*\.pos',
    tag_mapping_function=simplify_indian_tag
)

ipipan = LazyCorpusLoader(
    'ipipan',
    IPIPANCorpusReader,
    r'(?!\.).*morph\.xml'
)

jeita = LazyCorpusLoader(
    'jeita',
    ChasenCorpusReader,
    r'.*\.chasen',
    encoding='utf-8'
)

knbc = LazyCorpusLoader(
    'knbc/corpus1',
    KNBCorpusReader,
    r'.*/KN.*',
    encoding='euc-jp'
)

mac_morpho = LazyCorpusLoader(
    'mac_morpho',
    MacMorphoCorpusReader,
    r'(?!\.).*\.txt',
    tag_mapping_function=simplify_tag,
    encoding='latin-1'
)

machado = LazyCorpusLoader(
    'machado',
    PortugueseCategorizedPlaintextCorpusReader,
    r'(?!\.).*\.txt',
    cat_pattern=r'([a-z]*)/.*',
    encoding='latin-1'
)

movie_reviews = LazyCorpusLoader(
    'movie_reviews',
    CategorizedPlaintextCorpusReader,
    r'(?!\.).*\.txt',
    cat_pattern=r'(neg|pos)/.*'
)

names = LazyCorpusLoader(
    'names',
    WordListCorpusReader,
    r'(?!\.).*\.txt'
)

nps_chat = LazyCorpusLoader(
    'nps_chat',
    NPSChatCorpusReader,
    r'(?!README|\.).*\.xml',
    tag_mapping_function=simplify_wsj_tag
)

pl196x = LazyCorpusLoader(
    'pl196x',
    Pl196xCorpusReader,
    r'[a-z]-.*\.xml',
    cat_file='cats.txt',
    textid_file='textids.txt'
)

ppattach = LazyCorpusLoader(
    'ppattach',
    PPAttachmentCorpusReader,
    ['training', 'test', 'devset']
)
156
157
158
# Corpus loaders, 'qc' through 'udhr'.  Each module-level name is bound to a
# LazyCorpusLoader built from a corpus name, a reader class, and the
# arguments for that reader.
# Every file-selection regex is written as a raw string so that regex escapes
# reach the regex engine untouched and never trigger invalid-escape warnings.
qc = LazyCorpusLoader(
    'qc', StringCategoryCorpusReader, ['train.txt', 'test.txt'])
reuters = LazyCorpusLoader(
    'reuters', CategorizedPlaintextCorpusReader, r'(training|test).*',
    cat_file='cats.txt')
rte = LazyCorpusLoader(
    'rte', RTECorpusReader, r'(?!\.).*\.xml')
semcor = LazyCorpusLoader(
    'semcor', XMLCorpusReader, r'brown./tagfiles/br-.*\.xml')
senseval = LazyCorpusLoader(
    'senseval', SensevalCorpusReader, r'(?!\.).*\.pos')
shakespeare = LazyCorpusLoader(
    'shakespeare', XMLCorpusReader, r'(?!\.).*\.xml')
sinica_treebank = LazyCorpusLoader(
    'sinica_treebank', SinicaTreebankCorpusReader, ['parsed'],
    tag_mapping_function=simplify_tag)
state_union = LazyCorpusLoader(
    'state_union', PlaintextCorpusReader, r'(?!\.).*\.txt')
stopwords = LazyCorpusLoader(
    'stopwords', WordListCorpusReader, r'(?!README|\.).*')
swadesh = LazyCorpusLoader(
    'swadesh', SwadeshCorpusReader, r'(?!README|\.).*')
switchboard = LazyCorpusLoader(
    'switchboard', SwitchboardCorpusReader,
    tag_mapping_function=simplify_wsj_tag)
# 'timit' and 'timit_tagged' are two loaders over the same 'timit' corpus
# name, using different reader classes.
timit = LazyCorpusLoader(
    'timit', TimitCorpusReader)
timit_tagged = LazyCorpusLoader(
    'timit', TimitTaggedCorpusReader, r'.+\.tags',
    tag_mapping_function=simplify_wsj_tag)
toolbox = LazyCorpusLoader(
    'toolbox', ToolboxCorpusReader, r'(?!.*(README|\.)).*\.(dic|txt)')
# Three loaders over sub-directories of the 'treebank' corpus: combined
# (.mrg), tagged (.pos) and raw.
treebank = LazyCorpusLoader(
    'treebank/combined', BracketParseCorpusReader, r'wsj_.*\.mrg',
    tag_mapping_function=simplify_wsj_tag)
treebank_chunk = LazyCorpusLoader(
    'treebank/tagged', ChunkedCorpusReader, r'wsj_.*\.pos',
    sent_tokenizer=RegexpTokenizer(r'(?<=/\.)\s*(?![^\[]*\])', gaps=True),
    para_block_reader=tagged_treebank_para_block_reader)
treebank_raw = LazyCorpusLoader(
    'treebank/raw', PlaintextCorpusReader, r'wsj_.*')
# The encoding is given as (filename pattern, codec) pairs keyed on the
# filename suffix.
udhr = LazyCorpusLoader(
    'udhr', PlaintextCorpusReader, r'(?!README|\.).*',
    encoding=[('.*-UTF8$', 'utf-8'), ('.*-Latin1$', 'latin-1'),
              ('.*-Hebrew$', 'hebrew'), ('.*-Arabic$', 'arabic'),
              ('.*-Cyrillic$', 'cyrillic'), ('.*-SJIS$', 'SJIS'),
              ('.*-GB2312$', 'GB2312'), ('.*-Latin2$', 'ISO-8859-2'),
              # NOTE(review): 'UFT8' looks like a typo but presumably matches
              # a misspelled filename in the corpus -- confirm before changing.
              ('.*-Greek$', 'greek'), ('.*-UFT8$', 'utf-8'),
              ('Hungarian_Magyar-Unicode', 'utf-16-le')]
    )
# Corpus loaders, 'verbnet' through 'ycoe'.  Each module-level name is bound
# to a LazyCorpusLoader built from a corpus name, a reader class, and the
# arguments for that reader.
# Every file-selection regex is written as a raw string so that regex escapes
# reach the regex engine untouched and never trigger invalid-escape warnings.
verbnet = LazyCorpusLoader(
    'verbnet', VerbnetCorpusReader, r'(?!\.).*\.xml')
webtext = LazyCorpusLoader(
    'webtext', PlaintextCorpusReader, r'(?!README|\.).*\.txt')
wordnet = LazyCorpusLoader(
    'wordnet', WordNetCorpusReader)
wordnet_ic = LazyCorpusLoader(
    'wordnet_ic', WordNetICCorpusReader, r'.*\.dat')
words = LazyCorpusLoader(
    'words', WordListCorpusReader, r'(?!README|\.).*')
ycoe = LazyCorpusLoader(
    'ycoe', YCOECorpusReader)
225
# 'propbank' and 'nombank' are declared after 'treebank' because both pass the
# treebank loader as their final argument.  The lambda strips a leading
# 'wsj/NN/' directory prefix from a filename; presumably it maps annotation
# file ids onto treebank file ids -- confirm against the reader classes.
# The frame-file regexes are raw strings so that the regex escape for a
# literal dot is not parsed as a string escape.
propbank = LazyCorpusLoader(
    'propbank', PropbankCorpusReader,
    'prop.txt', r'frames/.*\.xml', 'verbs.txt',
    lambda filename: re.sub(r'^wsj/\d\d/', '', filename),
    treebank)
nombank = LazyCorpusLoader(
    'nombank.1.0', NombankCorpusReader,
    'nombank.1.0', r'frames/.*\.xml', 'nombank.1.0.words',
    lambda filename: re.sub(r'^wsj/\d\d/', '', filename),
    treebank)
236
263
264
# Running this module directly as a script is a deliberate no-op.
if __name__ == "__main__":
    pass
268