Package nltk :: Package metrics :: Module distance
[hide private]
[frames] | no frames]

Source Code for Module nltk.metrics.distance

  1  # Natural Language Toolkit: Distance Metrics 
  2  # 
  3  # Copyright (C) 2001-2011 NLTK Project 
  4  # Author: Edward Loper <edloper@gradient.cis.upenn.edu> 
  5  #         Steven Bird <sb@csse.unimelb.edu.au> 
  6  #         Tom Lippincott <tom@cs.columbia.edu> 
  7  # URL: <http://www.nltk.org/> 
  8  # For license information, see LICENSE.TXT 
  9  # 
 10   
 11  """ 
 12  Distance Metrics. 
 13   
 14  Compute the distance between two items (usually strings). 
 15  As metrics, they must satisfy the following three requirements: 
 16   
 17  1. d(a, a) = 0 
 18  2. d(a, b) >= 0 
 19  3. d(a, c) <= d(a, b) + d(b, c) 
 20    
 21  """ 
 22   
23 -def _edit_dist_init(len1, len2):
24 lev = [] 25 for i in range(len1): 26 lev.append([0] * len2) # initialize 2-D array to zero 27 for i in range(len1): 28 lev[i][0] = i # column 0: 0,1,2,3,4,... 29 for j in range(len2): 30 lev[0][j] = j # row 0: 0,1,2,3,4,... 31 return lev
32
33 -def _edit_dist_step(lev, i, j, c1, c2):
34 a = lev[i-1][j ] + 1 # skipping s1[i] 35 b = lev[i-1][j-1] + (c1 != c2) # matching s1[i] with s2[j] 36 c = lev[i ][j-1] + 1 # skipping s2[j] 37 lev[i][j] = min(a,b,c) # pick the cheapest
38
39 -def edit_distance(s1, s2):
40 """ 41 Calculate the Levenshtein edit-distance between two strings. 42 The edit distance is the number of characters that need to be 43 substituted, inserted, or deleted, to transform s1 into s2. For 44 example, transforming "rain" to "shine" requires three steps, 45 consisting of two substitutions and one insertion: 46 "rain" -> "sain" -> "shin" -> "shine". These operations could have 47 been done in other orders, but at least three steps are needed. 48 49 @param s1, s2: The strings to be analysed 50 @type s1: C{string} 51 @type s2: C{string} 52 @rtype C{int} 53 """ 54 # set up a 2-D array 55 len1 = len(s1); len2 = len(s2) 56 lev = _edit_dist_init(len1+1, len2+1) 57 58 # iterate over the array 59 for i in range(len1): 60 for j in range (len2): 61 _edit_dist_step(lev, i+1, j+1, s1[i], s2[j]) 62 return lev[len1][len2]
63 64
65 -def binary_distance(label1, label2):
66 """Simple equality test. 67 68 0.0 if the labels are identical, 1.0 if they are different. 69 70 >>> binary_distance(1,1) 71 0.0 72 73 >>> binary_distance(1,3) 74 1.0 75 """ 76 77 if label1 == label2: 78 return 0.0 79 else: 80 return 1.0
81 82
83 -def jaccard_distance(label1, label2):
84 """Distance metric comparing set-similarity. 85 86 """ 87 return (len(label1.union(label2)) - len(label1.intersection(label2)))/float(len(label1.union(label2)))
88 89
90 -def masi_distance(label1, label2):
91 """Distance metric that takes into account partial agreement when multiple 92 labels are assigned. 93 94 >>> masi_distance(set([1,2]),set([1,2,3,4])) 95 0.5 96 97 Passonneau 2005, Measuring Agreement on Set-Valued Items (MASI) for Semantic and Pragmatic Annotation. 98 """ 99 100 return 1 - float(len(label1.intersection(label2)))/float(max(len(label1),len(label2)))
101 102
103 -def interval_distance(label1,label2):
104 """Krippendorff'1 interval distance metric 105 106 >>> interval_distance(1,10) 107 81 108 109 Krippendorff 1980, Content Analysis: An Introduction to its Methodology 110 """ 111 try: 112 return pow(label1-label2,2) 113 # return pow(list(label1)[0]-list(label2)[0],2) 114 except: 115 print "non-numeric labels not supported with interval distance"
116 117
118 -def presence(label):
119 """Higher-order function to test presence of a given label 120 121 """ 122 return lambda x,y: 1.0*((label in x) == (label in y))
123 124
125 -def fractional_presence(label):
126 return lambda x,y:abs((float(1.0/len(x)) - float(1.0/len(y))))*(label in x and label in y) or 0.0*(label not in x and label not in y) or abs((float(1.0/len(x))))*(label in x and label not in y) or ((float(1.0/len(y))))*(label not in x and label in y)
127 128
129 -def custom_distance(file):
130 data = {} 131 for l in open(file): 132 labelA, labelB, dist = l.strip().split("\t") 133 labelA = frozenset([labelA]) 134 labelB = frozenset([labelB]) 135 data[frozenset([labelA,labelB])] = float(dist) 136 return lambda x,y:data[frozenset([x,y])]
137 138
139 -def demo():
140 s1 = "rain" 141 s2 = "shine" 142 print "Edit distance between '%s' and '%s':" % (s1,s2), edit_distance(s1, s2) 143 144 s1 = set([1,2,3,4]) 145 s2 = set([3,4,5]) 146 print "s1:", s1 147 print "s2:", s2 148 print "Binary distance:", binary_distance(s1, s2) 149 print "Jaccard distance:", jaccard_distance(s1, s2) 150 print "MASI distance:", masi_distance(s1, s2)
151 152 if __name__ == '__main__': 153 demo() 154