| Home | Trees | Indices | Help |
|
|---|
|
|
# Natural Language Toolkit: Dependency Corpus Reader
#
# Copyright (C) 2001-2011 NLTK Project
# Author: Kepa Sarasola <kepa.sarasola@ehu.es>
#         Iker Manterola <returntothehangar@hotmail.com>
#
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

from nltk.parse import DependencyGraph
from nltk.tokenize import *

from util import *
from api import *

# NOTE(review): this module was recovered from a flattened documentation
# listing.  The ``class``/``def`` header lines were missing from that listing
# and have been reconstructed from the bodies and call sites that survived;
# every reconstructed piece is flagged below and should be confirmed against
# the upstream file before merging.


# NOTE(review): class name taken from the module title; the base class was
# not visible in the listing -- ``SyntaxCorpusReader`` is presumed, confirm.
class DependencyCorpusReader(SyntaxCorpusReader):
    """
    Corpus reader for tab-separated dependency files.  Each accessor
    builds one L{DependencyCorpusView} per file and concatenates them.
    """

    def __init__(self, root, fileids, encoding=None,
                 word_tokenizer=TabTokenizer(),
                 sent_tokenizer=RegexpTokenizer('\n', gaps=True),
                 para_block_reader=read_blankline_block):
        """
        @param root: passed through to the base corpus reader.
        @param fileids: passed through to the base corpus reader.
        @param encoding: passed through to the base corpus reader.
        """
        # NOTE(review): the constructor body (listing lines 22-23) was
        # missing; delegating to CorpusReader is presumed -- confirm.
        # The tokenizer/block-reader arguments are not used by any code
        # visible in this listing.
        CorpusReader.__init__(self, root, fileids, encoding)

    #########################################################

    # NOTE(review): the names of the five accessors below were missing from
    # the listing.  They are inferred from the (tagged, group_by_sent) flags
    # each one passes to DependencyCorpusView -- confirm.

    def raw(self, fileids=None):
        """
        @return: the given file(s) as a single string.
        @rtype: C{str}
        """
        contents = []
        for fileid in self.abspaths(fileids):
            stream = open(fileid)
            # Close every handle explicitly instead of leaving it to the
            # garbage collector; try/finally keeps this valid on
            # interpreters that predate the ``with`` statement.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def words(self, fileids=None):
        """
        @return: the concatenated views of the given file(s), untagged and
            not grouped by sentence.
        """
        return concat([DependencyCorpusView(fileid, False, False, False)
                       for fileid in self.abspaths(fileids)])

    def tagged_words(self, fileids=None):
        """
        @return: the concatenated views of the given file(s), as
            C{(word, tag)} tuples, not grouped by sentence.
        """
        return concat([DependencyCorpusView(fileid, True, False, False)
                       for fileid in self.abspaths(fileids)])

    def sents(self, fileids=None):
        """
        @return: the concatenated views of the given file(s), untagged and
            grouped by sentence.
        """
        return concat([DependencyCorpusView(fileid, False, True, False)
                       for fileid in self.abspaths(fileids)])

    def tagged_sents(self, fileids=None):
        """
        @return: the concatenated views of the given file(s), as
            C{(word, tag)} tuples grouped by sentence.
        """
        return concat([DependencyCorpusView(fileid, True, True, False)
                       for fileid in self.abspaths(fileids)])

    # NOTE(review): listing lines 51-56 (one further definition, presumably
    # the consumer of the DependencyGraph import) were missing from the
    # extracted text and have NOT been reconstructed here -- restore them
    # from the upstream file.


# NOTE(review): class header reconstructed from the constructor calls above
# and the explicit StreamBackedCorpusView.__init__ call below.
class DependencyCorpusView(StreamBackedCorpusView):
    """
    Corpus view that yields one blank-line-delimited sentence per block.
    """

    # Marker line that defines the start of a document.
    _DOCSTART = '-DOCSTART- -DOCSTART- O\n'

    # NOTE(review): signature reconstructed from the four positional
    # arguments used by the reader and the attributes assigned below;
    # the ``chunk_types=None`` default is presumed -- confirm.
    def __init__(self, corpus_file, tagged, group_by_sent, dependencies,
                 chunk_types=None):
        """
        @param corpus_file: file backing this view.
        @param tagged: if true, keep C{(word, tag)} tuples; otherwise
            keep only the words.
        @param group_by_sent: if true, each block is returned as a single
            sentence item; otherwise its elements are returned flat.
        @param dependencies: if true, the sentence text is left unparsed
            (no word/tag extraction is done).
        @param chunk_types: stored but not used by the code visible here.
        """
        self._tagged = tagged
        self._dependencies = dependencies
        self._group_by_sent = group_by_sent
        self._chunk_types = chunk_types
        StreamBackedCorpusView.__init__(self, corpus_file)

    # NOTE(review): header reconstructed from the use of ``stream`` in the
    # body -- confirm the name against the base class hook.
    def read_block(self, stream):
        """
        Read one sentence (a blank-line-delimited block) from C{stream}.

        @return: C{[sent]} when grouping by sentence, otherwise the
            elements of the sentence as a flat list.
        @raise ValueError: if the first line of the sentence does not
            have 3, 4 or 10 tab-separated fields (only checked when
            dependencies were not requested).
        """
        # Read the next sentence.
        sent = read_blankline_block(stream)[0].strip()
        # Strip off the docstart marker, if present.
        if sent.startswith(self._DOCSTART):
            sent = sent[len(self._DOCSTART):].lstrip()

        # Extract word and tag from any of the supported formats.  The
        # format is decided from the field count of the first line only.
        if not self._dependencies:
            lines = [line.split('\t') for line in sent.split('\n')]
            field_count = len(lines[0])
            if field_count in (3, 4):
                # Word and tag are the first two columns.
                sent = [(line[0], line[1]) for line in lines]
            elif field_count == 10:
                # Word in column 2, tag in column 5 (presumably the
                # CoNLL-X layout -- confirm).
                sent = [(line[1], line[4]) for line in lines]
            else:
                raise ValueError('Unexpected number of fields in dependency tree file')

            # Discard tags if they weren't requested.
            if not self._tagged:
                sent = [word for (word, tag) in sent]

        # Return the result.
        if self._group_by_sent:
            return [sent]
        else:
            return list(sent)
| Home | Trees | Indices | Help |
|
|---|
| Generated by Epydoc 3.0.1 on Mon Apr 11 14:39:55 2011 | http://epydoc.sourceforge.net |