Package nltk :: Package metrics :: Module windowdiff
[hide private]
[frames] | [no frames]

Source Code for Module nltk.metrics.windowdiff

 1  # Natural Language Toolkit: Windowdiff 
 2  # 
 3  # Copyright (C) 2001-2011 NLTK Project 
 4  # Author: Edward Loper <edloper@gradient.cis.upenn.edu> 
 5  #         Steven Bird <sb@csse.unimelb.edu.au> 
 6  # URL: <http://www.nltk.org/> 
 7  # For license information, see LICENSE.TXT 
 8   
 9  ########################################################################## 
10  # Windowdiff 
11  # Pevzner, L., and Hearst, M., A Critique and Improvement of 
12  #   an Evaluation Metric for Text Segmentation, 
13  # Computational Linguistics, 28 (1), March 2002, pp. 19-36 
14  ########################################################################## 
15   
def windowdiff(seg1, seg2, k, boundary="1"):
    """
    Compute the windowdiff score for a pair of segmentations.  A
    segmentation is any sequence over a vocabulary of two items
    (e.g. "0", "1"), where the specified boundary value is used to
    mark the edge of a segmentation.

    A window spanning C{k + 1} consecutive items is slid over both
    segmentations in lockstep, starting at each of the first
    C{len(seg1) - k} positions.  For every position the absolute
    difference between the number of boundary items seen in the two
    windows is added to the score.  The result is the raw sum; it is
    not divided by the number of windows.  If C{k >= len(seg1)} there
    are no window positions and the score is 0.

        >>> s1 = "00000010000000001000000"
        >>> s2 = "00000001000000010000000"
        >>> s3 = "00010000000000000001000"
        >>> windowdiff(s1, s1, 3)
        0
        >>> windowdiff(s1, s2, 3)
        4
        >>> windowdiff(s2, s3, 3)
        16

    @param seg1: a segmentation
    @type seg1: C{string} or C{list}
    @param seg2: a segmentation
    @type seg2: C{string} or C{list}
    @param k: window width
    @type k: C{int}
    @param boundary: boundary value
    @type boundary: C{string} or C{int} or C{bool}
    @rtype: C{int}
    @raise ValueError: if the two segmentations differ in length
    """
    if len(seg1) != len(seg2):
        # Call form of raise is valid on both Python 2 and Python 3.
        raise ValueError("Segmentations have unequal length")

    # Each window covers items i .. i+k inclusive, i.e. k + 1 items.
    width = k + 1
    return sum(
        abs(seg1[i:i + width].count(boundary)
            - seg2[i:i + width].count(boundary))
        for i in range(len(seg1) - k)
    )
49
def demo():
    """
    Print three sample segmentations, then the windowdiff score
    (window width 3) for the pairs (s1, s1), (s1, s2) and (s2, s3).
    """
    s1 = "00000010000000001000000"
    s2 = "00000001000000010000000"
    s3 = "00010000000000000001000"

    # Single-argument print(...) with %-formatting emits the same text
    # under the Python 2 print statement and the Python 3 print function.
    print("%s %s" % ("s1:", s1))
    print("%s %s" % ("s2:", s2))
    print("%s %s" % ("s3:", s3))

    print("%s %s" % ("windowdiff(s1, s1, 3) = ", windowdiff(s1, s1, 3)))
    print("%s %s" % ("windowdiff(s1, s2, 3) = ", windowdiff(s1, s2, 3)))
    print("%s %s" % ("windowdiff(s2, s3, 3) = ", windowdiff(s2, s3, 3)))
61