Package nltk :: Package classify :: Module tadm
[hide private]
[frames | no frames]

Source Code for Module nltk.classify.tadm

  1  # Natural Language Toolkit: Interface to TADM Classifier 
  2  # 
  3  # Copyright (C) 2001-2011 NLTK Project  
  4  # Author: Joseph Frazee <jfrazee@mail.utexas.edu> 
  5  # URL: <http://www.nltk.org/> 
  6  # For license information, see LICENSE.TXT 
  7   
  8  import sys 
  9  import subprocess 
 10   
 11  from nltk.internals import find_binary 
 12  try: 
 13      import numpy 
 14  except ImportError: 
 15      numpy = None 
 16   
 17  _tadm_bin = None 
def config_tadm(bin=None):
    """
    Look up the C{tadm} binary with C{find_binary} and store the result
    in the module-level C{_tadm_bin} variable.

    @param bin: Forwarded unchanged as the second argument of
        C{find_binary}; presumably an explicit path to the binary
        (confirm against C{nltk.internals.find_binary}).
    """
    global _tadm_bin
    lookup_options = {
        'env_vars': ['TADM_DIR'],
        'binary_names': ['tadm'],
        'url': 'http://tadm.sf.net',
    }
    _tadm_bin = find_binary('tadm', bin, **lookup_options)
25
def write_tadm_file(train_toks, encoding, stream):
    """
    Write a C{tadm} input file for the given corpus of classified tokens.

    For every training token one event block is emitted: a line holding
    the number of labels, followed by one line per label of the form
    C{<is-correct-label> <vector-length> <index> <value> ...}.

    @type train_toks: C{list} of C{tuples} of (C{dict}, C{str})
    @param train_toks: Training data, represented as a list of
        pairs, the first member of which is a feature dictionary,
        and the second of which is a classification label.

    @type encoding: L{TadmEventMaxentFeatureEncoding}
    @param encoding: A feature encoding, used to convert featuresets
        into feature vectors.

    @type stream: C{stream}
    @param stream: The stream to which the C{tadm} input file should be
        written.
    """
    # See the following for a file format description:
    #
    # http://sf.net/forum/forum.php?thread_id=1391502&forum_id=473054
    # http://sf.net/forum/forum.php?thread_id=1675097&forum_id=473054
    labels = encoding.labels()
    for featureset, gold_label in train_toks:
        stream.write('%d\n' % len(labels))
        for candidate in labels:
            vector = encoding.encode(featureset, candidate)
            is_gold = int(gold_label == candidate)
            pairs = ['%d %d' % pair for pair in vector]
            stream.write(
                '%d %d %s\n' % (is_gold, len(vector), ' '.join(pairs)))
55
def parse_tadm_weights(paramfile):
    """
    Turn the stdout output generated by C{tadm} when training a model
    into the corresponding weight vector.

    @param paramfile: An iterable of lines, each holding a single
        number; surrounding whitespace is stripped before conversion.
    @return: A C{numpy} array of doubles with one entry per input
        line, in input order.
    """
    values = [float(raw.strip()) for raw in paramfile]
    return numpy.array(values, 'd')
66
def call_tadm(args):
    """
    Call the C{tadm} binary with the given arguments.

    The child's standard output goes to C{sys.stdout}.  Its standard
    error is captured: on failure it is printed to standard output
    before the exception is raised, on success it is forwarded to
    C{sys.stderr} once the process has finished.

    @type args: C{list} of C{str}
    @param args: Command-line arguments placed after the binary path.
    @raise TypeError: If C{args} is a single string instead of a list.
    @raise OSError: If C{tadm} exits with a non-zero return code.
    """
    # basestring only exists on Python 2; fall back to str elsewhere.
    try:
        string_types = basestring
    except NameError:
        string_types = str
    if isinstance(args, string_types):
        raise TypeError('args should be a list of strings')
    if _tadm_bin is None:
        config_tadm()

    # Call tadm via a subprocess.  stderr has to be piped explicitly:
    # communicate() returns None for any stream that was not redirected,
    # so without the pipe the failure report would just show "None".
    cmd = [_tadm_bin] + args
    p = subprocess.Popen(cmd, stdout=sys.stdout, stderr=subprocess.PIPE)
    (_, stderr) = p.communicate()
    if not isinstance(stderr, str):
        # Python 3 hands back bytes; never let a bad byte mask the error.
        stderr = stderr.decode('utf-8', 'replace')

    # Check the return code.
    if p.returncode != 0:
        sys.stdout.write('\n')
        sys.stdout.write(stderr + '\n')
        raise OSError('tadm command failed!')

    # Successful run: pass along whatever tadm reported on stderr.
    if stderr:
        sys.stderr.write(stderr)
86
def names_demo():
    """
    Run C{nltk.classify.util.names_demo}, passing
    C{TadmMaxentClassifier.train} as its argument.
    """
    # Alias the import so it does not shadow this function's own name.
    from nltk.classify.util import names_demo as run_names_demo
    from nltk.classify.maxent import TadmMaxentClassifier

    run_names_demo(TadmMaxentClassifier.train)
91
def encoding_demo():
    """
    Train a C{TadmEventMaxentFeatureEncoding} on three toy tokens, write
    the corresponding C{tadm} input file to standard output, then print
    each feature description returned by the encoding next to its index.
    """
    import sys
    from nltk.classify.maxent import TadmEventMaxentFeatureEncoding
    from nltk.classify.tadm import write_tadm_file

    tokens = [
        ({'f0': 1, 'f1': 1, 'f3': 1}, 'A'),
        ({'f0': 1, 'f2': 1, 'f4': 1}, 'B'),
        ({'f0': 2, 'f2': 1, 'f3': 1, 'f4': 1}, 'A'),
    ]
    encoding = TadmEventMaxentFeatureEncoding.train(tokens)

    out = sys.stdout
    write_tadm_file(tokens, encoding, out)
    out.write('\n')
    for index in range(encoding.length()):
        out.write('%s --> %d\n' % (encoding.describe(index), index))
    out.write('\n')
# Run both demos when this module is executed as a script.
if __name__ == '__main__':
    encoding_demo()
    names_demo()