| Home | Trees | Indices | Help |
|
|---|
|
|
1 # Natural Language Toolkit: IEER Corpus Reader
2 #
3 # Copyright (C) 2001-2011 NLTK Project
4 # Author: Steven Bird <sb@csse.unimelb.edu.au>
5 # Edward Loper <edloper@gradient.cis.upenn.edu>
6 # URL: <http://www.nltk.org/>
7 # For license information, see LICENSE.TXT
8
9 """
10 Corpus reader for the Information Extraction and Entity Recognition Corpus.
11
12 NIST 1999 Information Extraction: Entity Recognition Evaluation
13 http://www.itl.nist.gov/iad/894.01/tests/ie-er/er_99/er_99.htm
14
15 This corpus contains the NEWSWIRE development test data for the
16 NIST 1999 IE-ER Evaluation. The files were taken from the
17 subdirectory: /ie_er_99/english/devtest/newswire/*.ref.nwt
18 and filenames were shortened.
19
20 The corpus contains the following files: APW_19980314, APW_19980424,
21 APW_19980429, NYT_19980315, NYT_19980403, and NYT_19980407.
22 """
23
24 import codecs
25
26 import nltk
27
28 from api import *
29 from util import *
30
31 #: A dictionary whose keys are the names of documents in this corpus,
32 #: and whose values are descriptions of those documents' contents.
33 titles = {
34 'APW_19980314': 'Associated Press Weekly, 14 March 1998',
35 'APW_19980424': 'Associated Press Weekly, 24 April 1998',
36 'APW_19980429': 'Associated Press Weekly, 29 April 1998',
37 'NYT_19980315': 'New York Times, 15 March 1998',
38 'NYT_19980403': 'New York Times, 3 April 1998',
39 'NYT_19980407': 'New York Times, 7 April 1998',
40 }
41
42 #: A list of all documents in this corpus.
43 documents = sorted(titles)
44
48 self.text = text
49 self.docno = docno
50 self.doctype = doctype
51 self.date_time = date_time
52 self.headline = headline
54 if self.headline:
55 headline = ' '.join(self.headline.leaves())
56 else:
57 headline = ' '.join([w for w in self.text.leaves()
58 if w[:1] != '<'][:12])+'...'
59 if self.docno is not None:
60 return '<IEERDocument %s: %r>' % (self.docno, headline)
61 else:
62 return '<IEERDocument: %r>' % headline
63
65 """
66 """
68 if fileids is None: fileids = self._fileids
69 elif isinstance(fileids, basestring): fileids = [fileids]
70 return concat([self.open(f).read() for f in fileids])
71
73 return concat([StreamBackedCorpusView(fileid, self._read_block,
74 encoding=enc)
75 for (fileid, enc) in self.abspaths(fileids, True)])
76
78 return concat([StreamBackedCorpusView(fileid,
79 self._read_parsed_block,
80 encoding=enc)
81 for (fileid, enc) in self.abspaths(fileids, True)])
82
84 # TODO: figure out why empty documents are being returned
85 return [self._parse(doc) for doc in self._read_block(stream)
86 if self._parse(doc).docno is not None]
87
89 val = nltk.chunk.ieerstr2tree(doc, top_node="DOCUMENT")
90 if isinstance(val, dict):
91 return IEERDocument(**val)
92 else:
93 return IEERDocument(val)
94
96 out = []
97 # Skip any preamble.
98 while True:
99 line = stream.readline()
100 if not line: break
101 if line.strip() == '<DOC>': break
102 out.append(line)
103 # Read the document
104 while True:
105 line = stream.readline()
106 if not line: break
107 out.append(line)
108 if line.strip() == '</DOC>': break
109 # Return the document
110 return ['\n'.join(out)]
111
| Home | Trees | Indices | Help |
|
|---|
| Generated by Epydoc 3.0.1 on Mon Apr 11 14:40:06 2011 | http://epydoc.sourceforge.net |