Package nltk :: Package corpus :: Package reader :: Module string_category
[hide private]
[frames] | no frames]

Source Code for Module nltk.corpus.reader.string_category

 1  # Natural Language Toolkit: String Category Corpus Reader 
 2  # 
 3  # Copyright (C) 2001-2011 NLTK Project 
 4  # Author: Steven Bird <sb@ldc.upenn.edu> 
 5  #         Edward Loper <edloper@gradient.cis.upenn.edu> 
 6  # URL: <http://www.nltk.org/> 
 7  # For license information, see LICENSE.TXT 
 8   
 9  """ 
10  Read tuples from a corpus consisting of categorized strings. 
11  For example, from the question classification corpus: 
12   
13  NUM:dist How far is it from Denver to Aspen ? 
14  LOC:city What county is Modesto , California in ? 
15  HUM:desc Who was Galileo ? 
16  DESC:def What is an atom ? 
17  NUM:date When did Hawaii become a state ? 
18  """        
19   
20  # based on PPAttachmentCorpusReader 
21   
22  import os 
23   
24  from util import * 
25  from api import * 
26   
27  # [xx] Should the order of the tuple be reversed -- in most other places 
28  # in nltk, we use the form (data, tag) -- e.g., tagged words and 
29  # labeled texts for classifiers. 
class StringCategoryCorpusReader(CorpusReader):
    """
    Corpus reader for files in which every non-blank line holds two
    fields separated by a delimiter, for example::

        NUM:dist How far is it from Denver to Aspen ?

    Each such line is exposed as a tuple produced by splitting the
    line once on the delimiter, so with the default delimiter the
    example above becomes
    C{('NUM:dist', 'How far is it from Denver to Aspen ?')}.
    """

    def __init__(self, root, fileids, delimiter=' ', encoding=None):
        """
        @param root: The root directory for this corpus.
        @param fileids: A list or regexp specifying the fileids in this corpus.
        @param delimiter: Field delimiter
        @param encoding: Passed unchanged to C{CorpusReader.__init__}.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._delimiter = delimiter

    def tuples(self, fileids=None):
        """
        @param fileids: A single fileid, a list of fileids, or C{None}
            to use every fileid of this corpus.
        @return: the concatenation of one C{StreamBackedCorpusView}
            per requested file; each view yields the tuples read by
            L{_read_tuple_block}.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, basestring):
            # A bare string names a single file; wrap it so the code
            # below can always iterate over a list.
            fileids = [fileids]
        return concat([StreamBackedCorpusView(fileid, self._read_tuple_block,
                                              encoding=enc)
                       for (fileid, enc) in self.abspaths(fileids, True)])

    def raw(self, fileids=None):
        """
        @param fileids: A single fileid, a list of fileids, or C{None}
            to use every fileid of this corpus.
        @return: the text contents of the given fileids, as a single string.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, basestring):
            fileids = [fileids]

        contents = []
        for f in fileids:
            stream = self.open(f)
            # Close every stream explicitly instead of leaving the
            # handle open until the garbage collector reclaims it.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def _read_tuple_block(self, stream):
        """
        Read one line from C{stream} and turn it into a block.

        @param stream: An object with a C{readline()} method that
            returns a string.
        @return: a one-element list holding the tuple obtained by
            splitting the stripped line once on the delimiter, or an
            empty list when the stripped line is empty.  A line that
            does not contain the delimiter yields a 1-tuple.
        """
        line = stream.readline().strip()
        if line:
            # maxsplit=1 keeps any further delimiters inside the
            # second field.
            return [tuple(line.split(self._delimiter, 1))]
        else:
            return []
61