Commit 9d0b56950245aee7a556ed37b37a827473fefeac

Authored by Tássia Camões Araújo
1 parent 96e36f1c
Exists in master and in 1 other branch add_vagrant

Considering all measures as dissimilarities.

Showing 1 changed file with 90 additions and 0 deletions   Show diff stats
src/dissimilarity.py 0 → 100644
... ... @@ -0,0 +1,90 @@
  1 +#!/usr/bin/python
  2 +
  3 +# dissimilarity - python module for classes and functions related to
  4 +# dissimilarity measuring between two sets or vectors of data.
  5 +#
  6 +# Copyright (C) 2010 Tassia Camoes <tassia@gmail.com>
  7 +#
  8 +# This program is free software: you can redistribute it and/or modify
  9 +# it under the terms of the GNU General Public License as published by
  10 +# the Free Software Foundation, either version 3 of the License, or
  11 +# (at your option) any later version.
  12 +#
  13 +# This program is distributed in the hope that it will be useful,
  14 +# but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 +# GNU General Public License for more details.
  17 +#
  18 +# You should have received a copy of the GNU General Public License
  19 +# along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +
  21 +import math
  22 +import stats
  23 +
def norm(x):
    """
    Return the Euclidean (L2) norm of numeric vector 'x'.

    The squares of the components are accumulated one by one and the square
    root of the total is returned, so an empty vector yields 0.0.
    """
    squared_total = 0
    for component in x:
        squared_total = squared_total + component ** 2
    return math.sqrt(squared_total)
  29 +
def dot_product(x, y):
    """
    Return the dot product of numeric vectors 'x' and 'y'.

    Positions 0 .. len(x)-1 of both vectors are multiplied pairwise and the
    products are added up. Only len(x) drives the iteration: trailing items
    of a longer 'y' are ignored, and indexing a shorter 'y' fails.
    """
    total = 0
    for position in range(len(x)):
        total = total + x[position] * y[position]
    return total
  35 +
class Dissimilarity:
    """
    Common base type for the dissimilarity measures between two sets or two
    vectors. It defines no behaviour of its own; it only gives the concrete
    measures a shared ancestor.
    """
  41 +
class EuclidianDistance(Dissimilarity):
    """
    Euclidean distance between two numeric vectors.
    """
    def __call__(self, x, y):
        """
        Return the Euclidean distance between vectors 'x' and 'y'.

        The squared differences of positions 0 .. len(x)-1 are added up and
        the square root of the total is returned. Only len(x) drives the
        iteration, so trailing items of a longer 'y' are ignored.
        """
        squared_total = 0
        for position in range(len(x)):
            delta = x[position] - y[position]
            squared_total = squared_total + delta ** 2
        return math.sqrt(squared_total)
  52 +
class CosineDissimilarity(Dissimilarity):
    """
    Complement of the cosine similarity, where the cosine similarity is the
    cosine of the angle between two vectors.
    """
    def __call__(self, x, y):
        """
        Return 1 minus the cosine of the angle between vectors 'x' and 'y'.

        The cosine is the dot product divided by the product of both norms.
        With plain Python numbers, a vector whose norm is zero makes that
        division raise ZeroDivisionError.
        """
        cosine = dot_product(x, y) / (norm(x) * norm(y))
        return 1 - float(cosine)
  63 +
class JaccardDistance(Dissimilarity):
    """
    Dissimilarity measure complementary to the Jaccard index, which is
    defined as the number of common values divided by the size of the union
    of the two sets.
    """
    def __call__(self, x, y):
        """
        Return the Jaccard distance (1 - Jaccard index) between sets 'x'
        and 'y'.

        Both arguments are converted to sets first, so duplicated items in a
        list argument are counted once instead of distorting the result.
        Two empty sets are treated as identical and yield 0.0 rather than
        raising ZeroDivisionError.
        """
        set_x = set(x)
        set_y = set(y)
        union_size = len(set_x | set_y)
        if union_size == 0:
            # Both sets are empty: nothing distinguishes them.
            return 0.0
        # Convert before dividing so the ratio is not truncated to an int.
        return 1 - (float(len(set_x & set_y)) / union_size)
  75 +
class DiffCoefficient(Dissimilarity):
    """
    Measure the difference between two sets in terms of how many items
    should be added to and removed from one set to transform it into the
    other one. Similar to edit distance, but the positions of the items are
    not relevant for sets.
    """
    def __call__(self, x, y):
        """
        Return the diff coefficient between sets 'x' and 'y'.

        The coefficient is the number of items present in only one of the
        sets divided by the size of their union, so it ranges from 0.0
        (equal sets) to 1.0 (no common item). Both arguments are converted
        to sets first, so duplicated items are counted once. Two empty sets
        are treated as identical and yield 0.0 rather than raising
        ZeroDivisionError.
        """
        set_x = set(x)
        set_y = set(y)
        only_in_x = set_x - set_y
        only_in_y = set_y - set_x
        union_size = len(set_x | set_y)
        if union_size == 0:
            # Both sets are empty: there is nothing to add or remove.
            return 0.0
        # Convert the numerator BEFORE dividing: dividing two ints first
        # truncates the ratio to 0 or 1 under Python 2 integer division.
        return float(len(only_in_x) + len(only_in_y)) / union_size
... ...