Implementation of missing metrics and small fixes.

Tássia Camões Araújo
1 parent 2255aea0
Showing 9 changed files with 159 additions and 116 deletions Show diff stats
doc/doxy_config
src/app_recommender.py
src/cross_validation.py
src/data.py
src/evaluation.py
src/generate_doc.sh
src/recommender.py
src/similarity.py
src/similarity_measure.py
@@ -31,7 +31,7 @@ PROJECT_NAME           = AppRecommender
 # This could be handy for archiving the generated documentation or
 # if some version control system is used.
  
-PROJECT_NUMBER		= v0.1
+PROJECT_NUMBER		= v0.3
  
 # Using the PROJECT_BRIEF tag one can provide an optional one line description for a project that appears at the top of each page and should give viewer a quick idea about the purpose of the project. Keep the description short.
  
@@ -26,7 +26,7 @@ from datetime import timedelta
 from config import *
 from data import *
 from evaluation import *
-from similarity_measure import *
+from similarity import *
 from recommender import *
 from strategy import *
 from user import *
@@ -27,7 +27,7 @@ from datetime import timedelta
 from config import *
 from data import *
 from evaluation import *
-from similarity_measure import *
+from similarity import *
 from recommender import *
 from strategy import *
 from user import *
@@ -77,11 +77,15 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton):
         self.db_path = os.path.expanduser(cfg.tags_db)
         self.debtags_db = debtags.DB()
  
-        db = open(self.db_path)
+        try:
+            db_file = open(self.db_path)
+        except IOError:
+            logging.error("Could not load DebtagsDB from '%s'." % self.db_path)
+            raise Error
         md5 = hashlib.md5()
-        md5.update(db.read())
+        md5.update(db_file.read())
         self.db_md5 = md5.hexdigest()
-
+        db_file.close()
         self.load_index(cfg.reindex)
  
     def load_db(self):
@@ -92,8 +96,9 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton):
         try:
             db_file = open(self.db_path, "r")
             self.debtags_db.read(db_file,lambda x: not tag_filter.match(x))
-        except IOError:  #FIXME try is not catching this
-            logging.error("Could not load DebtagsDB from %s." % self.db_path)
+            db_file.close()
+        except:
+            logging.error("Could not load DebtagsDB from '%s'." % self.db_path)
             raise Error
  
     def relevant_tags_from_db(self,pkgs_list,qtd_of_tags):
@@ -33,7 +33,7 @@ class Metric:
  
 class Precision(Metric):
     """
-    Accuracy evaluation metric defined as the percentage of relevant itens
+    Classification accuracy metric defined as the percentage of relevant itens
     among the predicted ones.
     """
     def __init__(self):
@@ -50,7 +50,7 @@ class Precision(Metric):
  
 class Recall(Metric):
     """
-    Accuracy evaluation metric defined as the percentage of relevant itens
+    Classification accuracy metric defined as the percentage of relevant itens
     which were predicted as so.
     """
     def __init__(self):
@@ -66,7 +66,10 @@ class Recall(Metric):
         return float(len(evaluation.predicted_real))/len(evaluation.real_relevant)
  
 class F1(Metric):
-    """  """
+    """
+    Classification accuracy metric which correlates precision and recall into an
+    unique measure.
+    """
     def __init__(self):
         """
         Set metric description.
@@ -79,24 +82,45 @@ class F1(Metric):
         """
         p = Precision().run(evaluation)
         r = Recall().run(evaluation)
-        return float((2*p*r)/(p+r))
+        return float((2*p*r))/(p+r)
  
 class MAE(Metric):
-    """  """
+    """
+    Prediction accuracy metric defined as the mean absolute error.
+    """
     def __init__(self):
         """
         Set metric description.
         """
         self.desc = "    MAE    "
  
+    def get_errors(self,evaluation):
+        """
+        Return the signed prediction errors (predicted - real), one per item
+        scored in either table; a score missing from one table counts as 0.0.
+        """
+        predicted = evaluation.predicted_item_scores
+        real = evaluation.real_item_scores
+        # A set keeps items scored in both tables from being counted twice
+        # (list(...) also works where keys() returns a view, not a list).
+        keys = set(list(predicted.keys())+list(real.keys()))
+        errors = []
+        for k in keys:
+            # .get() supplies the default without writing into the tables.
+            errors.append(float(predicted.get(k,0.0)-real.get(k,0.0)))
+        return errors
+
     def run(self,evaluation):
         """
         Compute metric.
         """
-        print "---" #FIXME
+        errors = self.get_errors(evaluation)
+        return sum(abs(e) for e in errors)/len(errors)  # mean of |error|
  
-class MSE(Metric):
-    """  """
+class MSE(MAE):
+    """
+    Prediction accuracy metric defined as the mean square error. 
+    """
     def __init__(self):
         """
         Set metric description.
@@ -107,21 +131,34 @@ class MSE(Metric):
         """
         Compute metric.
         """
-        print "---" #FIXME
+        errors = self.get_errors(evaluation)
+        square_errors = [pow(x,2) for x in errors]
+        return sum(square_errors)/len(square_errors)
  
 class Coverage(Metric):
-    """  """
-    def __init__(self):
+    """
+    Evaluation metric defined as the percentage of itens covered by the
+    recommender (have been recommended at least once).
+    """
+    def __init__(self,repository_size):
         """
-        Set metric description.
+        Set initial parameters.
         """
         self.desc = "  Coverage "
+        self.repository_size = repository_size
+        self.covered = set()
+
+    def save_covered(self,recommended_list):
+        """
+        Register that a list of itens has been recommended.
+        """
+        self.covered.update(set(recommended_list))
  
     def run(self,evaluation):
         """
         Compute metric.
         """
-        print "---" #FIXME
+        return float(len(self.covered))/self.repository_size  # sets have no .size
  
 class Evaluation:
     """
@@ -158,8 +195,7 @@ class CrossValidation:
         if partition_proportion<1 and partition_proportion>0:
             self.partition_proportion = partition_proportion
         else:
-            logging.critical("Partition proportion must be a value in the
-                              interval [0,1].")
+            logging.critical("Partition proportion must be a value in the interval [0,1].")
             raise Error
         self.rounds = rounds
         self.recommender = rec
@@ -195,7 +231,6 @@ class CrossValidation:
         """
         cross_item_score = dict.fromkeys(user.pkg_profile,1)
         partition_size = int(len(cross_item_score)*self.partition_proportion)
-        #cross_item_score = user.item_score.copy()
         for r in range(self.rounds):
             round_partition = {}
             for j in range(partition_size):
@@ -19,8 +19,10 @@
  
 # Get project version from git repository
 TAG=$(git describe --tags --abbrev=0)
+echo "Generating documentation for git tag $TAG"
 sed -i "s/^PROJECT_NUMBER.*$/PROJECT_NUMBER\t\t= $TAG/" ../doc/doxy_config
 rm -Rf ../doc/html
-../doc/doxygen ../doc/doxy_config
-#scp -r html/* tassia@www.ime.usp.br:public_html/ 
+../doc/doxygen-1.7.3 ../doc/doxy_config
+scp -r html/ tassia@eclipse.ime.usp.br:
+echo "---> Remember to place doc in the right location on server side."
 mv html/ ../doc/
@@ -61,7 +61,8 @@ class Recommender:
         try:
             strategy = "self."+cfg.strategy+"(cfg)"
             exec(strategy)
-        except (NameError, AttributeError, SyntaxError):
+        except (NameError, AttributeError, SyntaxError) as err:
+            print err
             logging.critical("Could not perform recommendation strategy '%s'" %
                               cfg.strategy)
             raise Error
@@ -0,0 +1,89 @@
+#!/usr/bin/python
+
+#  similarity - python module for classes and methods related to similarity
+#               measuring between two sets of data.
+#
+#  Copyright (C) 2010  Tassia Camoes <tassia@gmail.com>
+#
+#  This program is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 3 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import math
+import stats
+
+def norm(x):
+    """
+    Return the Euclidean norm (square root of the sum of squares) of 'x'.
+    """
+    return math.sqrt(sum([x_i**2 for x_i in x]))
+
+def dot_product(x,y):
+    """
+    Return dot product of 'x' and 'y', indexing both over the length of 'x'.
+    """
+    return sum([(x[i] * y[i]) for i in range(len(x))])
+
+class SimilarityMeasure:
+    """
+    Abstraction for different similarity measure approaches.
+    """
+
+class Distance(SimilarityMeasure):
+    """
+    Euclidean distance measure.
+    """
+    def __call__(self,x,y):
+        """
+        Return Euclidean distance between 'x' and 'y' over the length of 'x'.
+        """
+        sum_pow = sum([((x[i] - y[i]) ** 2) for i in range(len(x))])
+        return math.sqrt(sum_pow)
+
+class Cosine(SimilarityMeasure):
+    """
+    Cosine similarity measure.
+    """
+    def __call__(self,x,y):
+        """
+        Return cosine of angle between 'x' and 'y'; fails if a norm is zero.
+        """
+        return float(dot_product(x,y)/(norm(x)*norm(y)))
+
+class Pearson(SimilarityMeasure):
+    """
+    Pearson coefficient measure.
+    """
+    def __call__(self,x,y):
+        """ Return Pearson coefficient between vectors 'x' and 'y'. """
+        return stats.pearsonr(x,y) # FIXME: ZeroDivisionError
+
+class Spearman(SimilarityMeasure):
+    """
+    Spearman correlation measure.
+    """
+    def __call__(self,x,y):
+        """
+        Return the result of stats.spearmanr for vectors 'x' and 'y'.
+        """
+        return stats.spearmanr(x,y) # FIXME: ZeroDivisionError
+
+class Tanimoto(SimilarityMeasure):
+    """
+    Tanimoto coefficient measure.
+    """
+    def __call__(self,x,y):
+        """
+        Return Tanimoto coefficient between vectors 'x' and 'y'.
+        """
+        z = [v for v in x if v in y]
+        return float(len(z))/(len(x)+len(y)-len(z))
@@ -1,89 +0,0 @@
-#!/usr/bin/python
-
-#  similarity-measure - python module for classes and methods related to
-#                       measuring similarity between two sets of data.
-#
-#  Copyright (C) 2010  Tassia Camoes <tassia@gmail.com>
-#
-#  This program is free software: you can redistribute it and/or modify
-#  it under the terms of the GNU General Public License as published by
-#  the Free Software Foundation, either version 3 of the License, or
-#  (at your option) any later version.
-#
-#  This program is distributed in the hope that it will be useful,
-#  but WITHOUT ANY WARRANTY; without even the implied warranty of
-#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#  GNU General Public License for more details.
-#
-#  You should have received a copy of the GNU General Public License
-#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-import math
-import stats
-
-def norm(x):
-    """
-    Return vector norm.
-    """
-    return math.sqrt(sum([x_i**2 for x_i in x]))
-
-def dot_product(x,y):
-    """
-    Return dot product of vectors 'x' and 'y'.
-    """
-    return sum([(x[i] * y[i]) for i in range(len(x))])
-
-class SimilarityMeasure:
-    """
-    Abstraction for diferent similarity measure approaches.
-    """
-
-class Distance(SimilarityMeasure):
-    """
-    Euclidian distance measure.
-    """
-    def __call__(self,x,y):
-        """
-        Return euclidian distance between vectors 'x' and 'y'.
-        """
-        sum_pow = sum([((x[i] - y[i]) ** 2) for i in range(len(x))])
-        return math.sqrt(sum_pow)
-
-class Cosine(SimilarityMeasure):
-    """
-    Cosine similarity measure.
-    """
-    def __call__(self,x,y):
-        """
-        Return cosine of angle between vectors 'x' and 'y'.
-        """
-        return float(dot_product(x,y)/(norm(x)*norm(y)))
-
-class Pearson(SimilarityMeasure):
-    """
-    Pearson coeficient measure.
-    """
-    def __call__(self,x,y):
-        """ Return Pearson coeficient between vectors 'x' and 'y'. """
-        return stats.pearsonr(x,y) # FIXME: ZeroDivisionError
-
-class Spearman(SimilarityMeasure):
-    """
-    Spearman correlation measure.
-    """
-    def __call__(self,x,y):
-        """
-        Return Spearman correlation between vectors 'x' and 'y'.
-        """
-        return stats.spearmanr(x,y) # FIXME: ZeroDivisionError
-
-class Tanimoto(SimilarityMeasure):
-    """
-    Tanimoto coeficient measure.
-    """
-    def __call__(self,x,y):
-        """
-        Return Tanimoto coeficient between vectors 'x' and 'y'.
-        """
-        z = [v for v in x if v in y]
-        return float(len(z))/(len(x)+len(y)-len(z))
...	...	@@ -31,7 +31,7 @@ PROJECT_NAME = AppRecommender
31	31	# This could be handy for archiving the generated documentation or
32	32	# if some version control system is used.
33	33
34		-PROJECT_NUMBER = v0.1
	34	+PROJECT_NUMBER = v0.3
35	35
36	36	# Using the PROJECT_BRIEF tag one can provide an optional one line description for a project that appears at the top of each page and should give viewer a quick idea about the purpose of the project. Keep the description short.
37	37
...	...
...	...	@@ -26,7 +26,7 @@ from datetime import timedelta
26	26	from config import *
27	27	from data import *
28	28	from evaluation import *
29		-from similarity_measure import *
	29	+from similarity import *
30	30	from recommender import *
31	31	from strategy import *
32	32	from user import *
...	...
...	...	@@ -27,7 +27,7 @@ from datetime import timedelta
27	27	from config import *
28	28	from data import *
29	29	from evaluation import *
30		-from similarity_measure import *
	30	+from similarity import *
31	31	from recommender import *
32	32	from strategy import *
33	33	from user import *
...	...
...	...	@@ -77,11 +77,15 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton):
77	77	self.db_path = os.path.expanduser(cfg.tags_db)
78	78	self.debtags_db = debtags.DB()
79	79
80		- db = open(self.db_path)
	80	+ try:
	81	+ db_file = open(self.db_path)
	82	+ except IOError:
	83	+ logging.error("Could not load DebtagsDB from '%s'." % self.db_path)
	84	+ raise Error
81	85	md5 = hashlib.md5()
82		- md5.update(db.read())
	86	+ md5.update(db_file.read())
83	87	self.db_md5 = md5.hexdigest()
84		-
	88	+ db_file.close()
85	89	self.load_index(cfg.reindex)
86	90
87	91	def load_db(self):
...	...	@@ -92,8 +96,9 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton):
92	96	try:
93	97	db_file = open(self.db_path, "r")
94	98	self.debtags_db.read(db_file,lambda x: not tag_filter.match(x))
95		- except IOError: #FIXME try is not catching this
96		- logging.error("Could not load DebtagsDB from %s." % self.db_path)
	99	+ db_file.close()
	100	+ except:
	101	+ logging.error("Could not load DebtagsDB from '%s'." % self.db_path)
97	102	raise Error
98	103
99	104	def relevant_tags_from_db(self,pkgs_list,qtd_of_tags):
...	...
...	...	@@ -33,7 +33,7 @@ class Metric:
33	33
34	34	class Precision(Metric):
35	35	"""
36		- Accuracy evaluation metric defined as the percentage of relevant itens
	36	+ Classification accuracy metric defined as the percentage of relevant itens
37	37	among the predicted ones.
38	38	"""
39	39	def __init__(self):
...	...	@@ -50,7 +50,7 @@ class Precision(Metric):
50	50
51	51	class Recall(Metric):
52	52	"""
53		- Accuracy evaluation metric defined as the percentage of relevant itens
	53	+ Classification accuracy metric defined as the percentage of relevant itens
54	54	which were predicted as so.
55	55	"""
56	56	def __init__(self):
...	...	@@ -66,7 +66,10 @@ class Recall(Metric):
66	66	return float(len(evaluation.predicted_real))/len(evaluation.real_relevant)
67	67
68	68	class F1(Metric):
69		- """ """
	69	+ """
	70	+ Classification accuracy metric which correlates precision and recall into an
	71	+ unique measure.
	72	+ """
70	73	def __init__(self):
71	74	"""
72	75	Set metric description.
...	...	@@ -79,24 +82,45 @@ class F1(Metric):
79	82	"""
80	83	p = Precision().run(evaluation)
81	84	r = Recall().run(evaluation)
82		- return float((2*p*r)/(p+r))
	85	+ return float((2*p*r))/(p+r)
83	86
84	87	class MAE(Metric):
85		- """ """
	88	+ """
	89	+ Prediction accuracy metric defined as the mean absolute error.
	90	+ """
86	91	def __init__(self):
87	92	"""
88	93	Set metric description.
89	94	"""
90	95	self.desc = " MAE "
91	96
	97	+ def get_errors(self,evaluation):
	98	+ """
	99	+ Compute prediction errors.
	100	+ """
	101	+ keys = evaluation.predicted_item_scores.keys()
	102	+ keys.extend(evaluation.real_item_scores.keys())
	103	+ errors = []
	104	+ for k in keys:
	105	+ if k not in evaluation.real_item_scores:
	106	+ evaluation.real_item_scores[k] = 0.0
	107	+ if k not in evaluation.predicted_item_scores:
	108	+ evaluation.predicted_item_scores[k] = 0.0
	109	+ errors.append(float(evaluation.predicted_item_scores[k]-
	110	+ evaluation.real_item_scores[k]))
	111	+ return errors
	112	+
92	113	def run(self,evaluation):
93	114	"""
94	115	Compute metric.
95	116	"""
96		- print "---" #FIXME
	117	+ errors = self.get_errors(evaluation)
	118	+ return sum(errors)/len(errors)
97	119
98		-class MSE(Metric):
99		- """ """
	120	+class MSE(MAE):
	121	+ """
	122	+ Prediction accuracy metric defined as the mean square error.
	123	+ """
100	124	def __init__(self):
101	125	"""
102	126	Set metric description.
...	...	@@ -107,21 +131,34 @@ class MSE(Metric):
107	131	"""
108	132	Compute metric.
109	133	"""
110		- print "---" #FIXME
	134	+ errors = self.get_errors(evaluation)
	135	+ square_errors = [pow(x,2) for x in errors]
	136	+ return sum(square_errors)/len(square_errors)
111	137
112	138	class Coverage(Metric):
113		- """ """
114		- def __init__(self):
	139	+ """
	140	+ Evaluation metric defined as the percentage of itens covered by the
	141	+ recommender (have been recommended at least once).
	142	+ """
	143	+ def __init__(self,repository_size):
115	144	"""
116		- Set metric description.
	145	+ Set initial parameters.
117	146	"""
118	147	self.desc = " Coverage "
	148	+ self.repository_size = repository_size
	149	+ self.covered = set()
	150	+
	151	+ def save_covered(self,recommended_list):
	152	+ """
	153	+ Register that a list of itens has been recommended.
	154	+ """
	155	+ self.covered.update(set(recommended_list))
119	156
120	157	def run(self,evaluation):
121	158	"""
122	159	Compute metric.
123	160	"""
124		- print "---" #FIXME
	161	+ return float(self.covered.size)/self.repository_size
125	162
126	163	class Evaluation:
127	164	"""
...	...	@@ -158,8 +195,7 @@ class CrossValidation:
158	195	if partition_proportion<1 and partition_proportion>0:
159	196	self.partition_proportion = partition_proportion
160	197	else:
161		- logging.critical("Partition proportion must be a value in the
162		- interval [0,1].")
	198	+ logging.critical("Partition proportion must be a value in the interval [0,1].")
163	199	raise Error
164	200	self.rounds = rounds
165	201	self.recommender = rec
...	...	@@ -195,7 +231,6 @@ class CrossValidation:
195	231	"""
196	232	cross_item_score = dict.fromkeys(user.pkg_profile,1)
197	233	partition_size = int(len(cross_item_score)*self.partition_proportion)
198		- #cross_item_score = user.item_score.copy()
199	234	for r in range(self.rounds):
200	235	round_partition = {}
201	236	for j in range(partition_size):
...	...
...	...	@@ -19,8 +19,10 @@
19	19
20	20	# Get project version from git repository
21	21	TAG=$(git describe --tags --abbrev=0)
	22	+echo "Generating documentation for git tag $TAG"
22	23	sed -i "s/^PROJECT_NUMBER.*$/PROJECT_NUMBER\t\t= $TAG/" ../doc/doxy_config
23	24	rm -Rf ../doc/html
24		-../doc/doxygen ../doc/doxy_config
25		-#scp -r html/* tassia@www.ime.usp.br:public_html/
	25	+../doc/doxygen-1.7.3 ../doc/doxy_config
	26	+scp -r html/ tassia@eclipse.ime.usp.br:
	27	+echo "---> Remember to place doc in the right location on server side."
26	28	mv html/ ../doc/
...	...
...	...	@@ -61,7 +61,8 @@ class Recommender:
61	61	try:
62	62	strategy = "self."+cfg.strategy+"(cfg)"
63	63	exec(strategy)
64		- except (NameError, AttributeError, SyntaxError):
	64	+ except (NameError, AttributeError, SyntaxError) as err:
	65	+ print err
65	66	logging.critical("Could not perform recommendation strategy '%s'" %
66	67	cfg.strategy)
67	68	raise Error
...	...
...	...	@@ -0,0 +1,89 @@
	1	+#!/usr/bin/python
	2	+
	3	+# similarity - python module for classes and methods related to similarity
	4	+# measuring between two sets of data.
	5	+#
	6	+# Copyright (C) 2010 Tassia Camoes <tassia@gmail.com>
	7	+#
	8	+# This program is free software: you can redistribute it and/or modify
	9	+# it under the terms of the GNU General Public License as published by
	10	+# the Free Software Foundation, either version 3 of the License, or
	11	+# (at your option) any later version.
	12	+#
	13	+# This program is distributed in the hope that it will be useful,
	14	+# but WITHOUT ANY WARRANTY; without even the implied warranty of
	15	+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	16	+# GNU General Public License for more details.
	17	+#
	18	+# You should have received a copy of the GNU General Public License
	19	+# along with this program. If not, see <http://www.gnu.org/licenses/>.
	20	+
	21	+import math
	22	+import stats
	23	+
	24	+def norm(x):
	25	+ """
	26	+ Return vector norm.
	27	+ """
	28	+ return math.sqrt(sum([x_i**2 for x_i in x]))
	29	+
	30	+def dot_product(x,y):
	31	+ """
	32	+ Return dot product of vectors 'x' and 'y'.
	33	+ """
	34	+ return sum([(x[i] * y[i]) for i in range(len(x))])
	35	+
	36	+class SimilarityMeasure:
	37	+ """
	38	+ Abstraction for diferent similarity measure approaches.
	39	+ """
	40	+
	41	+class Distance(SimilarityMeasure):
	42	+ """
	43	+ Euclidian distance measure.
	44	+ """
	45	+ def __call__(self,x,y):
	46	+ """
	47	+ Return euclidian distance between vectors 'x' and 'y'.
	48	+ """
	49	+ sum_pow = sum([((x[i] - y[i]) ** 2) for i in range(len(x))])
	50	+ return math.sqrt(sum_pow)
	51	+
	52	+class Cosine(SimilarityMeasure):
	53	+ """
	54	+ Cosine similarity measure.
	55	+ """
	56	+ def __call__(self,x,y):
	57	+ """
	58	+ Return cosine of angle between vectors 'x' and 'y'.
	59	+ """
	60	+ return float(dot_product(x,y)/(norm(x)*norm(y)))
	61	+
	62	+class Pearson(SimilarityMeasure):
	63	+ """
	64	+ Pearson coeficient measure.
	65	+ """
	66	+ def __call__(self,x,y):
	67	+ """ Return Pearson coeficient between vectors 'x' and 'y'. """
	68	+ return stats.pearsonr(x,y) # FIXME: ZeroDivisionError
	69	+
	70	+class Spearman(SimilarityMeasure):
	71	+ """
	72	+ Spearman correlation measure.
	73	+ """
	74	+ def __call__(self,x,y):
	75	+ """
	76	+ Return Spearman correlation between vectors 'x' and 'y'.
	77	+ """
	78	+ return stats.spearmanr(x,y) # FIXME: ZeroDivisionError
	79	+
	80	+class Tanimoto(SimilarityMeasure):
	81	+ """
	82	+ Tanimoto coeficient measure.
	83	+ """
	84	+ def __call__(self,x,y):
	85	+ """
	86	+ Return Tanimoto coeficient between vectors 'x' and 'y'.
	87	+ """
	88	+ z = [v for v in x if v in y]
	89	+ return float(len(z))/(len(x)+len(y)-len(z))
...	...
...	...	@@ -1,89 +0,0 @@
1		-#!/usr/bin/python
2		-
3		-# similarity-measure - python module for classes and methods related to
4		-# measuring similarity between two sets of data.
5		-#
6		-# Copyright (C) 2010 Tassia Camoes <tassia@gmail.com>
7		-#
8		-# This program is free software: you can redistribute it and/or modify
9		-# it under the terms of the GNU General Public License as published by
10		-# the Free Software Foundation, either version 3 of the License, or
11		-# (at your option) any later version.
12		-#
13		-# This program is distributed in the hope that it will be useful,
14		-# but WITHOUT ANY WARRANTY; without even the implied warranty of
15		-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16		-# GNU General Public License for more details.
17		-#
18		-# You should have received a copy of the GNU General Public License
19		-# along with this program. If not, see <http://www.gnu.org/licenses/>.
20		-
21		-import math
22		-import stats
23		-
24		-def norm(x):
25		- """
26		- Return vector norm.
27		- """
28		- return math.sqrt(sum([x_i**2 for x_i in x]))
29		-
30		-def dot_product(x,y):
31		- """
32		- Return dot product of vectors 'x' and 'y'.
33		- """
34		- return sum([(x[i] * y[i]) for i in range(len(x))])
35		-
36		-class SimilarityMeasure:
37		- """
38		- Abstraction for diferent similarity measure approaches.
39		- """
40		-
41		-class Distance(SimilarityMeasure):
42		- """
43		- Euclidian distance measure.
44		- """
45		- def __call__(self,x,y):
46		- """
47		- Return euclidian distance between vectors 'x' and 'y'.
48		- """
49		- sum_pow = sum([((x[i] - y[i]) ** 2) for i in range(len(x))])
50		- return math.sqrt(sum_pow)
51		-
52		-class Cosine(SimilarityMeasure):
53		- """
54		- Cosine similarity measure.
55		- """
56		- def __call__(self,x,y):
57		- """
58		- Return cosine of angle between vectors 'x' and 'y'.
59		- """
60		- return float(dot_product(x,y)/(norm(x)*norm(y)))
61		-
62		-class Pearson(SimilarityMeasure):
63		- """
64		- Pearson coeficient measure.
65		- """
66		- def __call__(self,x,y):
67		- """ Return Pearson coeficient between vectors 'x' and 'y'. """
68		- return stats.pearsonr(x,y) # FIXME: ZeroDivisionError
69		-
70		-class Spearman(SimilarityMeasure):
71		- """
72		- Spearman correlation measure.
73		- """
74		- def __call__(self,x,y):
75		- """
76		- Return Spearman correlation between vectors 'x' and 'y'.
77		- """
78		- return stats.spearmanr(x,y) # FIXME: ZeroDivisionError
79		-
80		-class Tanimoto(SimilarityMeasure):
81		- """
82		- Tanimoto coeficient measure.
83		- """
84		- def __call__(self,x,y):
85		- """
86		- Return Tanimoto coeficient between vectors 'x' and 'y'.
87		- """
88		- z = [v for v in x if v in y]
89		- return float(len(z))/(len(x)+len(y)-len(z))