NLTK corpus readers. The modules in this package provide functions
that can be used to read corpus files in a variety of formats. These
functions can be used to read both the corpus files that are distributed
in the NLTK corpus package, and corpus files that are part of external
corpora.
Please see http://nltk.googlecode.com/svn/trunk/nltk_data/index.xml
for a complete list of the available corpora. Install corpora using
nltk.download().
Corpus reader functions can also be given a list of item names, in
which case they return a concatenation of the corresponding
documents.
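Corpus reader functions are named based on the type of information
they return. Some common examples, and their return types, are:
- words(): list of str
- sents(): list of (list of str)
- paras(): list of (list of (list of str))
- tagged_words(): list of (str, str) tuples
- tagged_sents(): list of (list of (str, str) tuples)
- raw(): unprocessed corpus contents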
Metadata about the NLTK corpora, and their individual documents, is
stored using Open Language Archives Community (OLAC) metadata
records. These records can be accessed using
nltk.corpus.<corpus>.olac(), where <corpus> is the name of the
corpus.
|
|
abc = <PlaintextCorpusReader in '/usr/share/nltk_data/corpora/...
|
|
|
alpino = <AlpinoCorpusReader in '/usr/share/nltk_data/corpora/...
|
|
|
brown = <CategorizedTaggedCorpusReader in '.../corpora/brown' ...
|
|
|
cess_cat = <BracketParseCorpusReader in '/usr/share/nltk_data/...
|
|
|
cess_esp = <BracketParseCorpusReader in '/usr/share/nltk_data/...
|
|
|
cmudict = <CMUDictCorpusReader in '/usr/share/nltk_data/corpor...
|
|
|
comtrans = <AlignedCorpusReader in '/usr/share/nltk_data/corpo...
|
|
|
conll2000 = <ConllChunkCorpusReader in '/usr/share/nltk_data/c...
|
|
|
conll2002 = <ConllChunkCorpusReader in '/usr/share/nltk_data/c...
|
|
|
conll2007 = <DependencyCorpusReader in '.../corpora/conll2007'...
|
|
|
dependency_treebank = <DependencyCorpusReader in '.../corpora/...
|
|
|
floresta = <BracketParseCorpusReader in '/usr/share/nltk_data/...
|
|
|
gazetteers = <WordListCorpusReader in '/usr/share/nltk_data/co...
|
|
|
genesis = <PlaintextCorpusReader in '.../corpora/genesis' (not...
|
|
|
gutenberg = <PlaintextCorpusReader in '.../corpora/gutenberg' ...
|
|
|
ieer = <IEERCorpusReader in '/usr/share/nltk_data/corpora/ieer...
|
|
|
inaugural = <PlaintextCorpusReader in '.../corpora/inaugural' ...
|
|
|
indian = <IndianCorpusReader in '/usr/share/nltk_data/corpora/...
|
|
|
ipipan = <IPIPANCorpusReader in '.../corpora/ipipan' (not load...
|
|
|
jeita = <ChasenCorpusReader in '/usr/share/nltk_data/corpora/j...
|
|
|
knbc = <KNBCorpusReader in '/usr/share/nltk_data/corpora/knbc....
|
|
|
mac_morpho = <MacMorphoCorpusReader in '/usr/share/nltk_data/c...
|
|
|
machado = <PortugueseCategorizedPlaintextCorpusReader in '/usr...
|
|
|
movie_reviews = <CategorizedPlaintextCorpusReader in '/usr/sha...
|
|
|
names = <WordListCorpusReader in '/usr/share/nltk_data/corpora...
|
|
|
nps_chat = <NPSChatCorpusReader in '/usr/share/nltk_data/corpo...
|
|
|
pl196x = <Pl196xCorpusReader in '.../corpora/pl196x' (not load...
|
|
|
ppattach = <PPAttachmentCorpusReader in '/usr/share/nltk_data/...
|
|
|
qc = <StringCategoryCorpusReader in '/usr/share/nltk_data/corp...
|
|
|
reuters = <CategorizedPlaintextCorpusReader in '.../corpora/re...
|
|
|
rte = <RTECorpusReader in '/usr/share/nltk_data/corpora/rte.zi...
|
|
|
semcor = <XMLCorpusReader in '.../corpora/semcor' (not loaded ...
|
|
|
senseval = <SensevalCorpusReader in '/usr/share/nltk_data/corp...
|
|
|
shakespeare = <XMLCorpusReader in '.../corpora/shakespeare' (n...
|
|
|
sinica_treebank = <SinicaTreebankCorpusReader in '/usr/share/n...
|
|
|
state_union = <PlaintextCorpusReader in '/usr/share/nltk_data/...
|
|
|
stopwords = <WordListCorpusReader in '.../corpora/stopwords' (...
|
|
|
swadesh = <SwadeshCorpusReader in '/usr/share/nltk_data/corpor...
|
|
|
switchboard = <SwitchboardCorpusReader in '/usr/share/nltk_dat...
|
|
|
timit = <TimitCorpusReader in '.../corpora/timit' (not loaded ...
|
|
|
timit_tagged = <TimitTaggedCorpusReader in '.../corpora/timit'...
|
|
|
toolbox = <ToolboxCorpusReader in '.../corpora/toolbox' (not l...
|
|
|
treebank = <BracketParseCorpusReader in '/usr/share/nltk_data/...
|
|
|
treebank_chunk = <ChunkedCorpusReader in '/usr/share/nltk_data...
|
|
|
treebank_raw = <PlaintextCorpusReader in '/usr/share/nltk_data...
|
|
|
udhr = <PlaintextCorpusReader in '.../corpora/udhr' (not loade...
|
|
|
verbnet = <VerbnetCorpusReader in '/usr/share/nltk_data/corpor...
|
|
|
webtext = <PlaintextCorpusReader in '/usr/share/nltk_data/corp...
|
|
|
wordnet = <WordNetCorpusReader in '/usr/share/nltk_data/corpor...
|
|
|
wordnet_ic = <WordNetICCorpusReader in '/usr/share/nltk_data/c...
|
|
|
words = <WordListCorpusReader in '/usr/share/nltk_data/corpora...
|
|
|
ycoe = <YCOECorpusReader in '.../corpora/ycoe' (not loaded yet)>
|
|
|
propbank = <PropbankCorpusReader in '.../corpora/propbank' (no...
|
|
|
nombank = <NombankCorpusReader in '/usr/share/nltk_data/corpor...
|