1
2
3
4
5
6
7
8
9
10
11 """
12 Distance Metrics.
13
14 Compute the distance between two items (usually strings).
15 As metrics, they must satisfy the following three requirements:
16
17 1. d(a, a) = 0
18 2. d(a, b) >= 0
19 3. d(a, c) <= d(a, b) + d(b, c)
20
21 """
22
24 lev = []
25 for i in range(len1):
26 lev.append([0] * len2)
27 for i in range(len1):
28 lev[i][0] = i
29 for j in range(len2):
30 lev[0][j] = j
31 return lev
32
34 a = lev[i-1][j ] + 1
35 b = lev[i-1][j-1] + (c1 != c2)
36 c = lev[i ][j-1] + 1
37 lev[i][j] = min(a,b,c)
38
40 """
41 Calculate the Levenshtein edit-distance between two strings.
42 The edit distance is the number of characters that need to be
43 substituted, inserted, or deleted, to transform s1 into s2. For
44 example, transforming "rain" to "shine" requires three steps,
45 consisting of two substitutions and one insertion:
46 "rain" -> "sain" -> "shin" -> "shine". These operations could have
47 been done in other orders, but at least three steps are needed.
48
49 @param s1, s2: The strings to be analysed
50 @type s1: C{string}
51 @type s2: C{string}
52 @rtype C{int}
53 """
54
55 len1 = len(s1); len2 = len(s2)
56 lev = _edit_dist_init(len1+1, len2+1)
57
58
59 for i in range(len1):
60 for j in range (len2):
61 _edit_dist_step(lev, i+1, j+1, s1[i], s2[j])
62 return lev[len1][len2]
63
64
66 """Simple equality test.
67
68 0.0 if the labels are identical, 1.0 if they are different.
69
70 >>> binary_distance(1,1)
71 0.0
72
73 >>> binary_distance(1,3)
74 1.0
75 """
76
77 if label1 == label2:
78 return 0.0
79 else:
80 return 1.0
81
82
84 """Distance metric comparing set-similarity.
85
86 """
87 return (len(label1.union(label2)) - len(label1.intersection(label2)))/float(len(label1.union(label2)))
88
89
91 """Distance metric that takes into account partial agreement when multiple
92 labels are assigned.
93
94 >>> masi_distance(set([1,2]),set([1,2,3,4]))
95 0.5
96
97 Passonneau 2005, Measuring Agreement on Set-Valued Items (MASI) for Semantic and Pragmatic Annotation.
98 """
99
100 return 1 - float(len(label1.intersection(label2)))/float(max(len(label1),len(label2)))
101
102
104 """Krippendorff'1 interval distance metric
105
106 >>> interval_distance(1,10)
107 81
108
109 Krippendorff 1980, Content Analysis: An Introduction to its Methodology
110 """
111 try:
112 return pow(label1-label2,2)
113
114 except:
115 print "non-numeric labels not supported with interval distance"
116
117
119 """Higher-order function to test presence of a given label
120
121 """
122 return lambda x,y: 1.0*((label in x) == (label in y))
123
124
126 return lambda x,y:abs((float(1.0/len(x)) - float(1.0/len(y))))*(label in x and label in y) or 0.0*(label not in x and label not in y) or abs((float(1.0/len(x))))*(label in x and label not in y) or ((float(1.0/len(y))))*(label not in x and label in y)
127
128
130 data = {}
131 for l in open(file):
132 labelA, labelB, dist = l.strip().split("\t")
133 labelA = frozenset([labelA])
134 labelB = frozenset([labelB])
135 data[frozenset([labelA,labelB])] = float(dist)
136 return lambda x,y:data[frozenset([x,y])]
137
138
140 s1 = "rain"
141 s2 = "shine"
142 print "Edit distance between '%s' and '%s':" % (s1,s2), edit_distance(s1, s2)
143
144 s1 = set([1,2,3,4])
145 s2 = set([3,4,5])
146 print "s1:", s1
147 print "s2:", s2
148 print "Binary distance:", binary_distance(s1, s2)
149 print "Jaccard distance:", jaccard_distance(s1, s2)
150 print "MASI distance:", masi_distance(s1, s2)
151
152 if __name__ == '__main__':
153 demo()
154