1
2
3
4
5
6
7
8 import sys
9 import subprocess
10
11 from nltk.internals import find_binary
12 try:
13 import numpy
14 except ImportError:
15 numpy = None
16
# Path of the tadm executable; used as the first element of the tadm
# command line.  While it is still None at call time, config_tadm() is
# invoked first (presumably to locate the binary and set this -- confirm
# against config_tadm).
_tadm_bin = None
25
def write_tadm_file(train_toks, encoding, stream):
    """
    Write a C{tadm} input file for the given corpus of classified
    tokens to C{stream}.

    One event block is written per training token: a line holding the
    number of labels known to the encoding, followed by one line per
    label.  Each label line starts with C{1} if that label is the
    token's own label (C{0} otherwise), then the number of entries in
    the encoded vector, then two integers for every entry of that
    vector.

    @type train_toks: C{list} of C{tuples} of (C{dict}, C{str})
    @param train_toks: Training data, represented as a list of
        pairs, the first member of which is a feature dictionary,
        and the second of which is a classification label.

    @type encoding: L{TadmEventMaxentFeatureEncoding}
    @param encoding: A feature encoding, used to convert featuresets
        into feature vectors.

    @type stream: C{stream}
    @param stream: The stream to which the C{tadm} input file should be
        written.
    """
    candidates = encoding.labels()
    for featureset, gold_label in train_toks:
        # Block header: how many candidate-label lines follow.
        stream.write('%d\n' % len(candidates))
        for candidate in candidates:
            vector = encoding.encode(featureset, candidate)
            is_gold = int(gold_label == candidate)
            size = len(vector)
            pairs = ' '.join('%d %d' % entry for entry in vector)
            stream.write('%d %d %s\n' % (is_gold, size, pairs))
55
def parse_tadm_weights(paramfile):
    """
    Convert the parameter output that C{tadm} produces when training a
    model into a weight vector.

    Every line of C{paramfile} is stripped of surrounding whitespace
    and parsed as a float, in order.

    @param paramfile: An iterable of lines (e.g. an open file), each
        holding a single number.
    @return: A C{numpy} array of typecode C{'d'} containing the
        weights.
    """
    return numpy.array([float(row.strip()) for row in paramfile], 'd')
66
def call_tadm(args):
    """
    Call the C{tadm} binary with the given arguments.

    The child's standard output is passed through to C{sys.stdout}.
    Its standard error is captured and, if the command fails, printed
    to standard output before the exception is raised.

    @type args: C{list} of C{str}
    @param args: The command-line arguments for C{tadm}; the path of
        the binary itself is prepended automatically.
    @raise TypeError: If C{args} is a single string instead of a list.
    @raise OSError: If C{tadm} exits with a non-zero return code.
    """
    # ``basestring`` exists only on Python 2; fall back to ``str``
    # so the argument check also works on interpreters without it.
    try:
        string_types = basestring
    except NameError:
        string_types = str
    if isinstance(args, string_types):
        raise TypeError('args should be a list of strings')
    if _tadm_bin is None:
        config_tadm()

    # Call tadm via a subprocess.  stderr has to be piped explicitly:
    # communicate() returns None for any stream not opened with PIPE,
    # so without it the failure report below would just print "None".
    cmd = [_tadm_bin] + args
    p = subprocess.Popen(cmd, stdout=sys.stdout, stderr=subprocess.PIPE)
    stderr = p.communicate()[1]

    # Check the return code.
    if p.returncode != 0:
        # On Python 3 the captured stream is bytes; decode for display.
        if isinstance(stderr, bytes) and not isinstance(stderr, str):
            stderr = stderr.decode('utf-8', 'replace')
        sys.stdout.write('\n%s\n' % stderr)
        raise OSError('tadm command failed!')
86
91
def encoding_demo():
    """
    Demonstrate the C{tadm} input format on a three-token toy corpus.

    Trains a L{TadmEventMaxentFeatureEncoding} on the corpus, writes
    the corresponding C{tadm} input file to standard output, and then
    prints the description of every feature id next to its index.
    """
    import sys
    from nltk.classify.maxent import TadmEventMaxentFeatureEncoding
    from nltk.classify.tadm import write_tadm_file

    toy_corpus = [
        ({'f0': 1, 'f1': 1, 'f3': 1}, 'A'),
        ({'f0': 1, 'f2': 1, 'f4': 1}, 'B'),
        ({'f0': 2, 'f2': 1, 'f3': 1, 'f4': 1}, 'A'),
    ]
    encoding = TadmEventMaxentFeatureEncoding.train(toy_corpus)

    write_tadm_file(toy_corpus, encoding, sys.stdout)
    print('')
    for index in range(encoding.length()):
        description = encoding.describe(index)
        print('%s --> %d' % (description, index))
    print('')
105
# When this module is executed as a script, run both demos in turn.
if __name__ == '__main__':
    encoding_demo()
    names_demo()
109