Package nltk :: Package corpus :: Package reader :: Module dependency
[hide private]
[frames | no frames]

Source Code for Module nltk.corpus.reader.dependency

 1  # Natural Language Toolkit: Dependency Corpus Reader 
 2  # 
 3  # Copyright (C) 2001-2011 NLTK Project 
 4  # Author: Kepa Sarasola <kepa.sarasola@ehu.es> 
 5  #         Iker Manterola <returntothehangar@hotmail.com> 
 6  #          
 7  # URL: <http://www.nltk.org/> 
 8  # For license information, see LICENSE.TXT 
 9   
10  from nltk.parse import DependencyGraph 
11  from nltk.tokenize import * 
12   
13  from util import * 
14  from api import * 
15   
16 -class DependencyCorpusReader(SyntaxCorpusReader):
17
18 - def __init__(self, root, fileids, encoding=None, 19 word_tokenizer=TabTokenizer(), 20 sent_tokenizer=RegexpTokenizer('\n', gaps=True), 21 para_block_reader=read_blankline_block):
24 25 ######################################################### 26
def raw(self, fileids=None):
    """
    Read the given file(s) and return their contents joined together.

    @param fileids: the file identifier(s) to read; handed unchanged
        to C{self.abspaths} to obtain the paths to open.
    @return: the given file(s) as a single string.
    @rtype: C{str}
    """
    contents = []
    for path in self.abspaths(fileids):
        # Close every file as soon as it has been read instead of
        # leaving the handle open until it is garbage collected.
        stream = open(path)
        try:
            contents.append(stream.read())
        finally:
            stream.close()
    return concat(contents)
34
def words(self, fileids=None):
    """
    Build a view over the words of the given file(s).

    @param fileids: the file identifier(s) to read; handed unchanged
        to C{self.abspaths}.
    @return: the C{concat} of one L{DependencyCorpusView} per file,
        each created with tagging, sentence grouping and dependency
        output all switched off.
    """
    views = []
    for path in self.abspaths(fileids):
        views.append(DependencyCorpusView(path,
                                          tagged=False,
                                          group_by_sent=False,
                                          dependencies=False))
    return concat(views)
38
def tagged_words(self, fileids=None):
    """
    Build a view over the (word, tag) pairs of the given file(s).

    @param fileids: the file identifier(s) to read; handed unchanged
        to C{self.abspaths}.
    @return: the C{concat} of one L{DependencyCorpusView} per file,
        each created with tagging switched on and with sentence
        grouping and dependency output switched off.
    """
    views = []
    for path in self.abspaths(fileids):
        views.append(DependencyCorpusView(path,
                                          tagged=True,
                                          group_by_sent=False,
                                          dependencies=False))
    return concat(views)
42
def sents(self, fileids=None):
    """
    Build a view over the sentences of the given file(s).

    @param fileids: the file identifier(s) to read; handed unchanged
        to C{self.abspaths}.
    @return: the C{concat} of one L{DependencyCorpusView} per file,
        each created with sentence grouping switched on and with
        tagging and dependency output switched off.
    """
    views = []
    for path in self.abspaths(fileids):
        views.append(DependencyCorpusView(path,
                                          tagged=False,
                                          group_by_sent=True,
                                          dependencies=False))
    return concat(views)
46
def tagged_sents(self, fileids=None):
    """
    Build a view over the tagged sentences of the given file(s).

    @param fileids: the file identifier(s) to read; handed unchanged
        to C{self.abspaths}.
    @return: the C{concat} of one L{DependencyCorpusView} per file,
        each created with tagging and sentence grouping switched on
        and with dependency output switched off.
    """
    views = []
    for path in self.abspaths(fileids):
        views.append(DependencyCorpusView(path,
                                          tagged=True,
                                          group_by_sent=True,
                                          dependencies=False))
    return concat(views)
50
def parsed_sents(self, fileids=None):
    """
    Build one dependency graph per sentence of the given file(s).

    @param fileids: the file identifier(s) to read; handed unchanged
        to C{self.abspaths}.
    @return: a list holding one C{DependencyGraph} for every item
        produced by the per-file L{DependencyCorpusView} objects,
        which are created with sentence grouping and dependency
        output switched on and tagging switched off.
    @rtype: C{list} of C{DependencyGraph}
    """
    views = []
    for path in self.abspaths(fileids):
        views.append(DependencyCorpusView(path,
                                          tagged=False,
                                          group_by_sent=True,
                                          dependencies=True))

    # Each item of the concatenated views is handed as-is to the
    # DependencyGraph constructor.
    graphs = []
    for block in concat(views):
        graphs.append(DependencyGraph(block))
    return graphs
55 56
class DependencyCorpusView(StreamBackedCorpusView):
    """
    A stream-backed view of one dependency corpus file that yields
    the file one blank-line-separated sentence block at a time.

    Depending on the flags given to the constructor, each block is
    returned as raw text (for dependency parsing), as a list of
    words, or as a list of (word, tag) pairs.
    """

    # Marker that defines the start of a document; it is stripped from
    # the beginning of a sentence block when present.
    _DOCSTART = '-DOCSTART- -DOCSTART- O\n'

    def __init__(self, corpus_file, tagged, group_by_sent, dependencies,
                 chunk_types=None):
        """
        @param corpus_file: the corpus file to read; passed on to
            C{StreamBackedCorpusView.__init__}.
        @param tagged: if true, keep the tag of every word, so that
            tokens are (word, tag) pairs instead of bare words.
        @param group_by_sent: if true, C{read_block} returns each
            sentence as a single item instead of flattening it.
        @param dependencies: if true, sentence blocks are left as
            unparsed text and no word/tag extraction is done.
        @param chunk_types: stored on the instance; not used by any
            method of this class.
        """
        self._tagged = tagged
        self._dependencies = dependencies
        self._group_by_sent = group_by_sent
        self._chunk_types = chunk_types
        StreamBackedCorpusView.__init__(self, corpus_file)

    def read_block(self, stream):
        """
        Read the next sentence block from C{stream}.

        When dependencies were not requested, every line of the block
        is split on tabs and reduced to a (word, tag) pair: fields 0
        and 1 for 3- or 4-field lines, fields 1 and 4 for 10-field
        lines.  The format is decided from the first line only.  Tags
        are then dropped unless tagging was requested.

        @param stream: the stream to read from; handed to
            C{read_blankline_block}.
        @return: C{[sent]} if grouping by sentence was requested,
            otherwise C{list(sent)}; an empty list if no block, or
            only whitespace, could be read.
        @rtype: C{list}
        @raise ValueError: if the first line of the block has a number
            of tab-separated fields other than 3, 4 or 10.
        """
        # Read the next sentence.  Guard against an empty result
        # (presumably returned when only blank lines remain before the
        # end of the file -- confirm against read_blankline_block)
        # instead of failing with an IndexError.
        blocks = read_blankline_block(stream)
        if not blocks:
            return []
        sent = blocks[0].strip()

        # Strip off the docstart marker, if present.
        if sent.startswith(self._DOCSTART):
            sent = sent[len(self._DOCSTART):].lstrip()

        # Nothing left to tokenize: report an empty block rather than
        # raising a misleading "unexpected number of fields" error.
        if not sent:
            return []

        # Extract word and tag from any of the supported formats.
        if not self._dependencies:
            lines = [line.split('\t') for line in sent.split('\n')]
            num_fields = len(lines[0])
            if num_fields == 3 or num_fields == 4:
                sent = [(line[0], line[1]) for line in lines]
            elif num_fields == 10:
                sent = [(line[1], line[4]) for line in lines]
            else:
                raise ValueError('Unexpected number of fields in dependency tree file')

            # Discard tags if they weren't requested.  This only makes
            # sense here, where sent is a list of pairs; in dependency
            # mode sent is still a plain string.
            if not self._tagged:
                sent = [word for (word, tag) in sent]

        # Return the result.
        if self._group_by_sent:
            return [sent]
        else:
            return list(sent)
94