Commit 9d0b56950245aee7a556ed37b37a827473fefeac
1 parent
96e36f1c
Exists in
master
and in
1 other branch
Considering all measures as dissimilarities.
Showing
1 changed file
with
90 additions
and
0 deletions
Show diff stats
| @@ -0,0 +1,90 @@ | @@ -0,0 +1,90 @@ | ||
| 1 | +#!/usr/bin/python | ||
| 2 | + | ||
| 3 | +# similarity - python module for classes and methods related to similarity | ||
| 4 | +# measuring between two sets of data. | ||
| 5 | +# | ||
| 6 | +# Copyright (C) 2010 Tassia Camoes <tassia@gmail.com> | ||
| 7 | +# | ||
| 8 | +# This program is free software: you can redistribute it and/or modify | ||
| 9 | +# it under the terms of the GNU General Public License as published by | ||
| 10 | +# the Free Software Foundation, either version 3 of the License, or | ||
| 11 | +# (at your option) any later version. | ||
| 12 | +# | ||
| 13 | +# This program is distributed in the hope that it will be useful, | ||
| 14 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 15 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 16 | +# GNU General Public License for more details. | ||
| 17 | +# | ||
| 18 | +# You should have received a copy of the GNU General Public License | ||
| 19 | +# along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
| 20 | + | ||
| 21 | +import math | ||
| 22 | +import stats | ||
| 23 | + | ||
def norm(x):
    """
    Compute the Euclidean length of numeric vector 'x'.

    The length is the square root of the sum of the squared components;
    an empty vector yields 0.0.
    """
    squared_total = 0
    for component in x:
        squared_total = squared_total + component ** 2
    return math.sqrt(squared_total)
| 29 | + | ||
def dot_product(x, y):
    """
    Compute the inner product of numeric vectors 'x' and 'y'.

    Components are paired by position over the length of 'x': any
    components of 'y' beyond that length are not used, and a shorter 'y'
    fails on the first missing index.
    """
    total = 0
    for position in range(len(x)):
        total = total + x[position] * y[position]
    return total
| 35 | + | ||
class Dissimilarity:
    """
    Common base type for dissimilarity measures computed between two sets
    or two vectors.

    The class itself defines no behaviour; it only gives the concrete
    measures a shared parent.
    """
| 41 | + | ||
class EuclidianDistance(Dissimilarity):
    """
    Straight-line distance between two numeric vectors.
    """
    def __call__(self, x, y):
        """
        Compute the euclidian distance separating vectors 'x' and 'y'.

        Components are paired by position over the length of 'x'; the
        result is the square root of the summed squared differences.
        """
        squared_gap = 0
        for position in range(len(x)):
            squared_gap = squared_gap + (x[position] - y[position]) ** 2
        return math.sqrt(squared_gap)
| 52 | + | ||
class CosineDissimilarity(Dissimilarity):
    """
    Complement of the cosine similarity, where the cosine similarity is the
    cosine of the angle formed by two vectors.
    """
    def __call__(self, x, y):
        """
        Compute one minus the cosine of the angle between vectors 'x' and 'y'.

        The cosine is the dot product of the vectors divided by the product
        of their norms, so a vector whose norm is zero makes the division
        fail.
        """
        inner = dot_product(x, y)
        magnitude = norm(x) * norm(y)
        cosine = float(inner / magnitude)
        return 1 - cosine
| 63 | + | ||
class JaccardDistance(Dissimilarity):
    """
    Dissimilarity measure complementary to the Jaccard Index, which is
    defined by the quantity of common values divided by the size of the
    two sets union.
    """
    def __call__(self, x, y):
        """
        Return the Jaccard distance (1 - Jaccard Index) between sets 'x'
        and 'y'.

        For sets the result is 0.0 when both hold exactly the same items and
        1.0 when they share none. Two empty sets are treated as identical
        and give 0.0.
        """
        common = [v for v in x if v in y]
        union_size = len(x) + len(y) - len(common)
        if union_size == 0:
            # Only reachable when both 'x' and 'y' are empty; the index
            # would be 0/0, so report "no difference" instead of failing.
            return 0.0
        return 1 - (float(len(common)) / union_size)
| 75 | + | ||
class DiffCoefficient(Dissimilarity):
    """
    Measure the difference between the two sets in terms of how many items
    should be added and removed from one set to transform it into the other
    set. Similar to edit distance, but the items positions are not relevant
    for sets.
    """
    def __call__(self, x, y):
        """
        Return the diff coefficient between sets 'x' and 'y'.

        The coefficient is the number of items present in only one of the
        sets divided by the size of their union. For sets it is 0.0 when
        both hold the same items and 1.0 when they share none. Two empty
        sets need no changes and give 0.0.
        """
        add = [v for v in x if v not in y]
        delete = [v for v in y if v not in x]
        common = [v for v in x if v in y]

        union_size = len(x) + len(y) - len(common)
        if union_size == 0:
            # Only reachable when both 'x' and 'y' are empty: nothing has to
            # be added or removed, and the ratio would otherwise be 0/0.
            return 0.0
        # Convert to float *before* dividing: dividing two ints first would
        # floor the ratio to 0 or 1 on interpreters with integer division.
        return float(len(add) + len(delete)) / union_size