Package nltk :: Package corpus :: Package reader :: Module ppattach
[hide private]
[frames | no frames]

Source Code for Module nltk.corpus.reader.ppattach

 1  # Natural Language Toolkit: PP Attachment Corpus Reader 
 2  # 
 3  # Copyright (C) 2001-2011 NLTK Project 
 4  # Author: Steven Bird <sb@ldc.upenn.edu> 
 5  #         Edward Loper <edloper@gradient.cis.upenn.edu> 
 6  # URL: <http://www.nltk.org/> 
 7  # For license information, see LICENSE.TXT 
 8   
 9  """ 
10  Read lines from the Prepositional Phrase Attachment Corpus. 
11   
12  The PP Attachment Corpus contains several files having the format: 
13   
14  sentence_id verb noun1 preposition noun2 attachment 
15   
16  For example: 
17   
18  42960 gives authority to administration V 
19  46742 gives inventors of microchip N 
20   
21  The PP attachment is to the verb phrase (V) or noun phrase (N), i.e.: 
22   
23  (VP gives (NP authority) (PP to administration)) 
24  (VP gives (NP inventors (PP of microchip))) 
25   
26  The corpus contains the following files: 
27   
28  training:   training set 
29  devset:     development test set, used for algorithm development. 
30  test:       test set, used to report results 
31  bitstrings: word classes derived from Mutual Information Clustering for the Wall Street Journal. 
32   
33  Ratnaparkhi, Adwait (1994). A Maximum Entropy Model for Prepositional 
34  Phrase Attachment.  Proceedings of the ARPA Human Language Technology 
35  Conference.  [http://www.cis.upenn.edu/~adwait/papers/hlt94.ps] 
36   
37  The PP Attachment Corpus is distributed with NLTK with the permission 
38  of the author. 
39  """        
40   
41  import codecs 
42   
43  from util import * 
44  from api import * 
45   
46 -class PPAttachment:
47 - def __init__(self, sent, verb, noun1, prep, noun2, attachment):
48 self.sent = sent 49 self.verb = verb 50 self.noun1 = noun1 51 self.prep = prep 52 self.noun2 = noun2 53 self.attachment = attachment
54
55 - def __repr__(self):
56 return ('PPAttachment(sent=%r, verb=%r, noun1=%r, prep=%r, ' 57 'noun2=%r, attachment=%r)' % 58 (self.sent, self.verb, self.noun1, self.prep, 59 self.noun2, self.attachment))
60
61 -class PPAttachmentCorpusReader(CorpusReader):
62 """ 63 sentence_id verb noun1 preposition noun2 attachment 64 """
65 - def attachments(self, fileids):
66 return concat([StreamBackedCorpusView(fileid, self._read_obj_block, 67 encoding=enc) 68 for (fileid, enc) in self.abspaths(fileids, True)])
69
70 - def tuples(self, fileids):
71 return concat([StreamBackedCorpusView(fileid, self._read_tuple_block, 72 encoding=enc) 73 for (fileid, enc) in self.abspaths(fileids, True)])
74
75 - def raw(self, fileids=None):
76 if fileids is None: fileids = self._fileids 77 elif isinstance(fileids, basestring): fileids = [fileids] 78 return concat([self.open(f).read() for f in fileids])
79
80 - def _read_tuple_block(self, stream):
81 line = stream.readline() 82 if line: 83 return [tuple(line.split())] 84 else: 85 return []
86
87 - def _read_obj_block(self, stream):
88 line = stream.readline() 89 if line: 90 return [PPAttachment(*line.split())] 91 else: 92 return []
93