1
2
3
4
5
6
7
8
9 """
10 Interface for tagging each token in a sentence with supplementary
11 information, such as its part of speech.
12 """
13
14 from nltk.internals import overridden
15 from nltk.metrics import accuracy as _accuracy
16 from util import untag
17
19 """
20 A processing interface for assigning a tag to each token in a list.
Tags are case-sensitive strings that identify some property of each
22 token, such as its part of speech or its sense.
23
24 Some taggers require specific types for their tokens. This is
25 generally indicated by the use of a sub-interface to C{TaggerI}.
26 For example, I{featureset taggers}, which are subclassed from
27 L{FeaturesetTaggerI}, require that each token be a I{featureset}.
28
29 Subclasses must define:
30 - either L{tag()} or L{batch_tag()} (or both)
31 """
def tag(self, tokens):
    """
    Compute the most appropriate tag sequence for C{tokens} and
    return it as a list of tagged tokens, each one encoded as a
    C{(token, tag)} tuple.

    This implementation only delegates: when
    C{overridden(self.batch_tag)} is true, the tokens are handed to
    C{batch_tag} as a single-sentence batch and the lone result is
    returned.

    @rtype: C{list} of C{(token, tag)}
    @raise NotImplementedError: if C{overridden(self.batch_tag)} is
        false, since there is then nothing to delegate to.
    """
    # Guard clause: bail out early when delegation is not possible.
    if not overridden(self.batch_tag):
        raise NotImplementedError()
    # Wrap the tokens in a one-element batch, then unwrap the one result.
    single_sentence_batch = [tokens]
    return self.batch_tag(single_sentence_batch)[0]
44
46 """
Apply L{self.tag()} to each element of C{sentences}. I.e.::

    return [self.tag(sent) for sent in sentences]
50 """
51 return [self.tag(sent) for sent in sentences]
52
54 """
55 Score the accuracy of the tagger against the gold standard.
56 Strip the tags from the gold standard text, retag it using
57 the tagger, then compute the accuracy score.
58
59 @type gold: C{list} of C{list} of C{(token, tag)}
60 @param gold: The list of tagged sentences to score the tagger on.
61 @rtype: C{float}
62 """
63
64 tagged_sents = self.batch_tag([untag(sent) for sent in gold])
65 gold_tokens = sum(gold, [])
66 test_tokens = sum(tagged_sents, [])
67 return _accuracy(gold_tokens, test_tokens)
68
70 if (train and model) or (not train and not model):
71 raise ValueError('Must specify either training data or trained model.')
72
74 """
75 A tagger that requires tokens to be I{featuresets}. A featureset
76 is a dictionary that maps from I{feature names} to I{feature
77 values}. See L{nltk.classify} for more information about features
78 and featuresets.
79 """
80
81
100