Package nltk :: Package corpus :: Package reader :: Module ieer
[hide private]
[frames | no frames]

Source Code for Module nltk.corpus.reader.ieer

  1  # Natural Language Toolkit: IEER Corpus Reader 
  2  # 
  3  # Copyright (C) 2001-2011 NLTK Project 
  4  # Author: Steven Bird <sb@csse.unimelb.edu.au> 
  5  #         Edward Loper <edloper@gradient.cis.upenn.edu> 
  6  # URL: <http://www.nltk.org/> 
  7  # For license information, see LICENSE.TXT 
  8   
  9  """ 
 10  Corpus reader for the Information Extraction and Entity Recognition Corpus. 
 11   
 12  NIST 1999 Information Extraction: Entity Recognition Evaluation 
 13  http://www.itl.nist.gov/iad/894.01/tests/ie-er/er_99/er_99.htm 
 14   
 15  This corpus contains the NEWSWIRE development test data for the 
 16  NIST 1999 IE-ER Evaluation.  The files were taken from the 
 17  subdirectory: /ie_er_99/english/devtest/newswire/*.ref.nwt 
 18  and filenames were shortened. 
 19   
 20  The corpus contains the following files: APW_19980314, APW_19980424, 
 21  APW_19980429, NYT_19980315, NYT_19980403, and NYT_19980407. 
 22  """ 
 23   
 24  import codecs 
 25   
 26  import nltk 
 27   
 28  from api import * 
 29  from util import * 
 30   
 31  #: A dictionary whose keys are the names of documents in this corpus; 
 32  #: and whose values are descriptions of those documents' contents. 
 33  titles = { 
 34      'APW_19980314': 'Associated Press Weekly, 14 March 1998', 
 35      'APW_19980424': 'Associated Press Weekly, 24 April 1998', 
 36      'APW_19980429': 'Associated Press Weekly, 29 April 1998', 
 37      'NYT_19980315': 'New York Times, 15 March 1998', 
 38      'NYT_19980403': 'New York Times, 3 April 1998', 
 39      'NYT_19980407': 'New York Times, 7 April 1998', 
 40      } 
 41   
 42  #: A list of all documents in this corpus. 
 43  documents = sorted(titles) 
 44   
class IEERDocument:
    """
    A single document from the IEER corpus together with its header
    fields.

    ``text`` (and ``headline``, when one is given) must provide a
    ``leaves()`` method returning a sequence of word strings, since
    ``__repr__`` calls it on whichever of the two it displays.  The
    remaining fields are stored exactly as passed in.
    """

    def __init__(self, text, docno=None, doctype=None,
                 date_time=None, headline=''):
        self.headline = headline
        self.date_time = date_time
        self.doctype = doctype
        self.docno = docno
        self.text = text

    def __repr__(self):
        """
        Return ``<IEERDocument DOCNO: 'summary'>``, or
        ``<IEERDocument: 'summary'>`` when there is no document number.

        The summary is the headline's words if a headline is set;
        otherwise it is the first twelve words of the text that do not
        start with ``<``, followed by ``...``.
        """
        if self.headline:
            summary = ' '.join(self.headline.leaves())
        else:
            # Leaves beginning with '<' are left out of the preview.
            words = [leaf for leaf in self.text.leaves()
                     if leaf[:1] != '<']
            summary = ' '.join(words[:12]) + '...'

        if self.docno is None:
            return '<IEERDocument: %r>' % summary
        return '<IEERDocument %s: %r>' % (self.docno, summary)
63
class IEERCorpusReader(CorpusReader):
    """
    Corpus reader for the IEER corpus files.

    Each file is read as a sequence of documents delimited by ``<DOC>``
    and ``</DOC>`` lines.  ``raw()`` returns the unprocessed file
    contents, ``docs()`` returns each document as a string, and
    ``parsed_docs()`` returns each document as an ``IEERDocument``.
    """

    def raw(self, fileids=None):
        """
        Return the concatenated contents of the given files.

        :param fileids: a single file identifier, a list of them, or
            None to use every file of this reader.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, basestring):
            fileids = [fileids]

        contents = []
        for fileid in fileids:
            stream = self.open(fileid)
            try:
                contents.append(stream.read())
            finally:
                # Close each stream as soon as it has been read instead
                # of leaving the handle open until garbage collection.
                stream.close()
        return concat(contents)

    def docs(self, fileids=None):
        """
        Return the documents of the given files as strings, one corpus
        view per file, concatenated.
        """
        return concat([StreamBackedCorpusView(fileid, self._read_block,
                                              encoding=enc)
                       for (fileid, enc) in self.abspaths(fileids, True)])

    def parsed_docs(self, fileids=None):
        """
        Return the documents of the given files as ``IEERDocument``
        objects, one corpus view per file, concatenated.
        """
        return concat([StreamBackedCorpusView(fileid,
                                              self._read_parsed_block,
                                              encoding=enc)
                       for (fileid, enc) in self.abspaths(fileids, True)])

    def _read_parsed_block(self, stream):
        """
        Read the next block from ``stream`` and return its parsed
        documents, leaving out any whose ``docno`` is None.
        """
        # Parse each document once; the filter and the result share the
        # same parsed object.
        parsed = [self._parse(doc) for doc in self._read_block(stream)]
        return [doc for doc in parsed if doc.docno is not None]

    def _parse(self, doc):
        """
        Parse one document string into an ``IEERDocument``.

        ``nltk.chunk.ieerstr2tree`` may return a dict, which is passed
        on as keyword arguments, or some other value, which is used as
        the document text.
        """
        val = nltk.chunk.ieerstr2tree(doc, top_node="DOCUMENT")
        if isinstance(val, dict):
            return IEERDocument(**val)
        else:
            return IEERDocument(val)

    def _read_block(self, stream):
        """
        Read one document from ``stream``.

        :return: a one-element list holding the document text, from its
            ``<DOC>`` line through its ``</DOC>`` line (or through the
            end of the stream if no ``</DOC>`` line follows); or an
            empty list if the stream ends before a ``<DOC>`` line.
        """
        # Skip any preamble.
        while True:
            line = stream.readline()
            if not line:
                # End of stream without a <DOC> line (for example
                # trailing blank lines after the last document): return
                # no documents rather than a spurious empty one.
                return []
            if line.strip() == '<DOC>':
                break
        out = [line]
        # Read the document.
        while True:
            line = stream.readline()
            if not line:
                break
            out.append(line)
            if line.strip() == '</DOC>':
                break
        # Return the document.
        # NOTE(review): each line still ends with its own newline, so
        # joining with '\n' doubles the line breaks; kept as-is so the
        # returned text is unchanged for existing callers.
        return ['\n'.join(out)]
111