Commit 9d0b56950245aee7a556ed37b37a827473fefeac
1 parent
96e36f1c
Exists in
master
and in
1 other branch
Considering all measures as dissimilarities.
Showing
1 changed file
with
90 additions
and
0 deletions
Show diff stats
... | ... | @@ -0,0 +1,90 @@ |
1 | +#!/usr/bin/python | |
2 | + | |
3 | +# similarity - python module for classes and methods related to similarity | |
4 | +# measuring between two sets of data. | |
5 | +# | |
6 | +# Copyright (C) 2010 Tassia Camoes <tassia@gmail.com> | |
7 | +# | |
8 | +# This program is free software: you can redistribute it and/or modify | |
9 | +# it under the terms of the GNU General Public License as published by | |
10 | +# the Free Software Foundation, either version 3 of the License, or | |
11 | +# (at your option) any later version. | |
12 | +# | |
13 | +# This program is distributed in the hope that it will be useful, | |
14 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | +# GNU General Public License for more details. | |
17 | +# | |
18 | +# You should have received a copy of the GNU General Public License | |
19 | +# along with this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | + | |
21 | +import math | |
22 | +import stats | |
23 | + | |
def norm(x):
    """
    Compute the Euclidean (L2) length of the numeric vector 'x'.

    The squares of all components are accumulated and the square root of
    that total is returned as a float. An empty vector has norm 0.0.
    """
    squares = 0
    for component in x:
        squares += component ** 2
    return math.sqrt(squares)
29 | + | |
def dot_product(x, y):
    """
    Compute the dot product of the numeric vectors 'x' and 'y'.

    Components are paired by index over the length of 'x'; any components
    of 'y' beyond that length are not read. An empty 'x' yields 0.
    """
    total = 0
    for idx in range(len(x)):
        total += x[idx] * y[idx]
    return total
35 | + | |
class Dissimilarity:
    """
    Common base class for the dissimilarity measures of this module.

    It defines no behaviour of its own. The concrete measures derived from
    it are callable objects that receive two sets or vectors and return a
    number expressing how different they are.
    """
41 | + | |
class EuclidianDistance(Dissimilarity):
    """
    Dissimilarity given by the Euclidean (straight-line) distance between
    two numeric vectors.
    """
    def __call__(self, x, y):
        """
        Compute the Euclidean distance between vectors 'x' and 'y'.

        Components are paired by index over the length of 'x'; the square
        root of the summed squared differences is returned as a float.
        """
        squared = 0
        for idx in range(len(x)):
            delta = x[idx] - y[idx]
            squared += delta ** 2
        return math.sqrt(squared)
52 | + | |
class CosineDissimilarity(Dissimilarity):
    """
    Dissimilarity complementary to the cosine similarity, i.e. one minus the
    cosine of the angle formed by two vectors.
    """
    def __call__(self, x, y):
        """
        Compute 1 - cos(angle) for the vectors 'x' and 'y'.

        The cosine is the dot product of the vectors divided by the product
        of their norms.
        """
        numerator = dot_product(x, y)
        denominator = norm(x) * norm(y)
        cosine = float(numerator / denominator)
        return 1 - cosine
63 | + | |
class JaccardDistance(Dissimilarity):
    """
    Dissimilarity measure complementary to the Jaccard index, which is
    defined as the number of common values divided by the size of the union
    of the two sets.
    """
    def __call__(self, x, y):
        """
        Return the Jaccard distance (1 - Jaccard index) between 'x' and 'y'.

        Both arguments are treated as sets, so an item repeated inside one of
        them is counted only once. The result is a float in [0.0, 1.0]:
        0.0 when both hold exactly the same items, 1.0 when they share none.
        Two empty collections are considered identical and yield 0.0.
        """
        # Coerce to sets so that duplicated items cannot inflate the
        # intersection and push the result below zero.
        x_items = set(x)
        y_items = set(y)
        union_size = len(x_items | y_items)
        if union_size == 0:
            # Both inputs are empty: the index is undefined (0/0), but two
            # empty sets are equal, hence no dissimilarity.
            return 0.0
        common_size = len(x_items & y_items)
        return 1 - (float(common_size) / union_size)
75 | + | |
class DiffCoefficient(Dissimilarity):
    """
    Measure the difference between two sets in terms of how many items
    should be added and removed from one set to transform it into the other
    set. Similar to edit distance, but the positions of the items are not
    relevant for sets.
    """
    def __call__(self, x, y):
        """
        Return the diff coefficient between sets 'x' and 'y'.

        The coefficient is the number of items that belong to only one of the
        two sets divided by the size of their union. The result is a float in
        [0.0, 1.0]. Both arguments are treated as sets, so repeated items are
        counted once. Two empty collections are identical and yield 0.0.
        """
        x_items = set(x)
        y_items = set(y)
        union_size = len(x_items | y_items)
        if union_size == 0:
            # Nothing to add or remove between two empty sets; also avoids
            # dividing by zero.
            return 0.0
        # Items to add plus items to delete: the symmetric difference.
        changes = len(x_items ^ y_items)
        # Convert before dividing: with two integers, Python 2 division
        # truncates and the coefficient would collapse to 0 or 1.
        return float(changes) / union_size