Implementation of missing metrics and small fixes.

Tássia Camões Araújo
1 parent 2255aea0
Showing 9 changed files with 159 additions and 116 deletions Show diff stats
doc/doxy_config
src/app_recommender.py
src/cross_validation.py
src/data.py
src/evaluation.py
src/generate_doc.sh
src/recommender.py
src/similarity.py
src/similarity_measure.py
@@ -31,7 +31,7 @@ PROJECT_NAME           = AppRecommender
 # This could be handy for archiving the generated documentation or
 # if some version control system is used.
-PROJECT_NUMBER		= v0.1
+PROJECT_NUMBER		= v0.3
 # Using the PROJECT_BRIEF tag one can provide an optional one line description for a project that appears at the top of each page and should give viewer a quick idea about the purpose of the project. Keep the description short.
@@ -26,7 +26,7 @@ from datetime import timedelta
 from config import *
 from data import *
 from evaluation import *
-from similarity_measure import *
+from similarity import *
 from recommender import *
 from strategy import *
 from user import *
@@ -27,7 +27,7 @@ from datetime import timedelta
 from config import *
 from data import *
 from evaluation import *
-from similarity_measure import *
+from similarity import *
 from recommender import *
 from strategy import *
 from user import *
@@ -77,11 +77,15 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton):
         self.db_path = os.path.expanduser(cfg.tags_db)
         self.debtags_db = debtags.DB()
-        db = open(self.db_path)
+        try:
+            db_file = open(self.db_path)
+        except IOError:
+            logging.error("Could not load DebtagsDB from '%s'." % self.db_path)
+            raise Error
         md5 = hashlib.md5()
-        md5.update(db.read())
+        md5.update(db_file.read())
         self.db_md5 = md5.hexdigest()
-
+        db_file.close()
         self.load_index(cfg.reindex)
     def load_db(self):
@@ -92,8 +96,9 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton):
         try:
             db_file = open(self.db_path, "r")
             self.debtags_db.read(db_file,lambda x: not tag_filter.match(x))
-        except IOError:  #FIXME try is not catching this
-            logging.error("Could not load DebtagsDB from %s." % self.db_path)
+            db_file.close()
+        except:
+            logging.error("Could not load DebtagsDB from '%s'." % self.db_path)
             raise Error
     def relevant_tags_from_db(self,pkgs_list,qtd_of_tags):
@@ -33,7 +33,7 @@ class Metric:
 class Precision(Metric):
     """
-    Accuracy evaluation metric defined as the percentage of relevant itens
+    Classification accuracy metric defined as the percentage of relevant items
     among the predicted ones.
     """
     def __init__(self):
@@ -50,7 +50,7 @@ class Precision(Metric):
 class Recall(Metric):
     """
-    Accuracy evaluation metric defined as the percentage of relevant itens
+    Classification accuracy metric defined as the percentage of relevant items
     which were predicted as so.
     """
     def __init__(self):
@@ -66,7 +66,10 @@ class Recall(Metric):
         return float(len(evaluation.predicted_real))/len(evaluation.real_relevant)
 class F1(Metric):
-    """  """
+    """
+    Classification accuracy metric which combines precision and recall into a
+    single measure.
+    """
     def __init__(self):
         """
         Set metric description.
@@ -79,24 +82,45 @@ class F1(Metric):
         """
         p = Precision().run(evaluation)
         r = Recall().run(evaluation)
-        return float((2*p*r)/(p+r))
+        return float((2*p*r))/(p+r)
 class MAE(Metric):
-    """  """
+    """
+    Prediction accuracy metric defined as the mean absolute error.
+    """
     def __init__(self):
         """
         Set metric description.
         """
         self.desc = "    MAE    "
+    def get_errors(self,evaluation):
+        """
+        Compute prediction errors.
+        """
+        keys = evaluation.predicted_item_scores.keys()
+        keys.extend(evaluation.real_item_scores.keys())
+        errors = []
+        for k in keys:
+            if k not in evaluation.real_item_scores:
+                evaluation.real_item_scores[k] = 0.0
+            if k not in evaluation.predicted_item_scores:
+                evaluation.predicted_item_scores[k] = 0.0
+            errors.append(float(evaluation.predicted_item_scores[k]-
+                          evaluation.real_item_scores[k]))
+        return errors
+
     def run(self,evaluation):
         """
         Compute metric.
         """
-        print "---" #FIXME
+        errors = self.get_errors(evaluation)
+        return sum(errors)/len(errors)
-class MSE(Metric):
-    """  """
+class MSE(MAE):
+    """
+    Prediction accuracy metric defined as the mean squared error.
+    """
     def __init__(self):
         """
         Set metric description.
@@ -107,21 +131,34 @@ class MSE(Metric):
         """
         Compute metric.
         """
-        print "---" #FIXME
+        errors = self.get_errors(evaluation)
+        square_errors = [pow(x,2) for x in errors]
+        return sum(square_errors)/len(square_errors)
 class Coverage(Metric):
-    """  """
-    def __init__(self):
+    """
+    Evaluation metric defined as the percentage of items covered by the
+    recommender (i.e. items that have been recommended at least once).
+    """
+    def __init__(self,repository_size):
         """
-        Set metric description.
+        Set initial parameters.
         """
         self.desc = "  Coverage "
+        self.repository_size = repository_size
+        self.covered = set()
+
+    def save_covered(self,recommended_list):
+        """
+        Register that a list of items has been recommended.
+        """
+        self.covered.update(set(recommended_list))
     def run(self,evaluation):
         """
         Compute metric.
         """
-        print "---" #FIXME
+        return float(self.covered.size)/self.repository_size
 class Evaluation:
     """
@@ -158,8 +195,7 @@ class CrossValidation:
         if partition_proportion<1 and partition_proportion>0:
             self.partition_proportion = partition_proportion
         else:
-            logging.critical("Partition proportion must be a value in the
-                              interval [0,1].")
+            logging.critical("Partition proportion must be a value in the interval [0,1].")
             raise Error
         self.rounds = rounds
         self.recommender = rec
@@ -195,7 +231,6 @@ class CrossValidation:
         """
         cross_item_score = dict.fromkeys(user.pkg_profile,1)
         partition_size = int(len(cross_item_score)*self.partition_proportion)
-        #cross_item_score = user.item_score.copy()
         for r in range(self.rounds):
             round_partition = {}
             for j in range(partition_size):
@@ -19,8 +19,10 @@
 # Get project version from git repository
 TAG=$(git describe --tags --abbrev=0)
+echo "Generating documentation for git tag $TAG"
 sed -i "s/^PROJECT_NUMBER.*$/PROJECT_NUMBER\t\t= $TAG/" ../doc/doxy_config
 rm -Rf ../doc/html
-../doc/doxygen ../doc/doxy_config
-#scp -r html/* tassia@www.ime.usp.br:public_html/ 
+../doc/doxygen-1.7.3 ../doc/doxy_config
+scp -r html/ tassia@eclipse.ime.usp.br:
+echo "---> Remember to place doc in the right location on server side."
 mv html/ ../doc/
@@ -61,7 +61,8 @@ class Recommender:
         try:
             strategy = "self."+cfg.strategy+"(cfg)"
             exec(strategy)
-        except (NameError, AttributeError, SyntaxError):
+        except (NameError, AttributeError, SyntaxError) as err:
+            print err
             logging.critical("Could not perform recommendation strategy '%s'" %
                               cfg.strategy)
             raise Error
@@ -0,0 +1,89 @@
+#!/usr/bin/python
+
+#  similarity - python module for classes and methods related to similarity
+#               measuring between two sets of data.
+#
+#  Copyright (C) 2010  Tassia Camoes <tassia@gmail.com>
+#
+#  This program is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 3 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import math
+import stats
+
+def norm(x):
+    """
+    Return vector norm.
+    """
+    return math.sqrt(sum([x_i**2 for x_i in x]))
+
+def dot_product(x,y):
+    """
+    Return dot product of vectors 'x' and 'y'.
+    """
+    return sum([(x[i] * y[i]) for i in range(len(x))])
+
+class SimilarityMeasure:
+    """
+    Abstraction for different similarity measure approaches.
+    """
+
+class Distance(SimilarityMeasure):
+    """
+    Euclidean distance measure.
+    """
+    def __call__(self,x,y):
+        """
+        Return Euclidean distance between vectors 'x' and 'y'.
+        """
+        sum_pow = sum([((x[i] - y[i]) ** 2) for i in range(len(x))])
+        return math.sqrt(sum_pow)
+
+class Cosine(SimilarityMeasure):
+    """
+    Cosine similarity measure.
+    """
+    def __call__(self,x,y):
+        """
+        Return cosine of angle between vectors 'x' and 'y'.
+        """
+        return float(dot_product(x,y)/(norm(x)*norm(y)))
+
+class Pearson(SimilarityMeasure):
+    """
+    Pearson coefficient measure.
+    """
+    def __call__(self,x,y):
+        """ Return Pearson coefficient between vectors 'x' and 'y'. """
+        return stats.pearsonr(x,y) # FIXME: ZeroDivisionError
+
+class Spearman(SimilarityMeasure):
+    """
+    Spearman correlation measure.
+    """
+    def __call__(self,x,y):
+        """
+        Return Spearman correlation between vectors 'x' and 'y'.
+        """
+        return stats.spearmanr(x,y) # FIXME: ZeroDivisionError
+
+class Tanimoto(SimilarityMeasure):
+    """
+    Tanimoto coefficient measure.
+    """
+    def __call__(self,x,y):
+        """
+        Return Tanimoto coefficient between vectors 'x' and 'y'.
+        """
+        z = [v for v in x if v in y]
+        return float(len(z))/(len(x)+len(y)-len(z))
@@ -1,89 +0,0 @@
-#!/usr/bin/python
-
-#  similarity-measure - python module for classes and methods related to
-#                       measuring similarity between two sets of data.
-#
-#  Copyright (C) 2010  Tassia Camoes <tassia@gmail.com>
-#
-#  This program is free software: you can redistribute it and/or modify
-#  it under the terms of the GNU General Public License as published by
-#  the Free Software Foundation, either version 3 of the License, or
-#  (at your option) any later version.
-#
-#  This program is distributed in the hope that it will be useful,
-#  but WITHOUT ANY WARRANTY; without even the implied warranty of
-#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#  GNU General Public License for more details.
-#
-#  You should have received a copy of the GNU General Public License
-#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-import math
-import stats
-
-def norm(x):
-    """
-    Return vector norm.
-    """
-    return math.sqrt(sum([x_i**2 for x_i in x]))
-
-def dot_product(x,y):
-    """
-    Return dot product of vectors 'x' and 'y'.
-    """
-    return sum([(x[i] * y[i]) for i in range(len(x))])
-
-class SimilarityMeasure:
-    """
-    Abstraction for diferent similarity measure approaches.
-    """
-
-class Distance(SimilarityMeasure):
-    """
-    Euclidian distance measure.
-    """
-    def __call__(self,x,y):
-        """
-        Return euclidian distance between vectors 'x' and 'y'.
-        """
-        sum_pow = sum([((x[i] - y[i]) ** 2) for i in range(len(x))])
-        return math.sqrt(sum_pow)
-
-class Cosine(SimilarityMeasure):
-    """
-    Cosine similarity measure.
-    """
-    def __call__(self,x,y):
-        """
-        Return cosine of angle between vectors 'x' and 'y'.
-        """
-        return float(dot_product(x,y)/(norm(x)*norm(y)))
-
-class Pearson(SimilarityMeasure):
-    """
-    Pearson coeficient measure.
-    """
-    def __call__(self,x,y):
-        """ Return Pearson coeficient between vectors 'x' and 'y'. """
-        return stats.pearsonr(x,y) # FIXME: ZeroDivisionError
-
-class Spearman(SimilarityMeasure):
-    """
-    Spearman correlation measure.
-    """
-    def __call__(self,x,y):
-        """
-        Return Spearman correlation between vectors 'x' and 'y'.
-        """
-        return stats.spearmanr(x,y) # FIXME: ZeroDivisionError
-
-class Tanimoto(SimilarityMeasure):
-    """
-    Tanimoto coeficient measure.
-    """
-    def __call__(self,x,y):
-        """
-        Return Tanimoto coeficient between vectors 'x' and 'y'.
-        """
-        z = [v for v in x if v in y]
-        return float(len(z))/(len(x)+len(y)-len(z))
	@@ -31,7 +31,7 @@ PROJECT_NAME = AppRecommender		@@ -31,7 +31,7 @@ PROJECT_NAME = AppRecommender
31	# This could be handy for archiving the generated documentation or	31	# This could be handy for archiving the generated documentation or
32	# if some version control system is used.	32	# if some version control system is used.
33		33
34	-PROJECT_NUMBER = v0.1	34	+PROJECT_NUMBER = v0.3
35		35
36	# Using the PROJECT_BRIEF tag one can provide an optional one line description for a project that appears at the top of each page and should give viewer a quick idea about the purpose of the project. Keep the description short.	36	# Using the PROJECT_BRIEF tag one can provide an optional one line description for a project that appears at the top of each page and should give viewer a quick idea about the purpose of the project. Keep the description short.
37		37
	@@ -26,7 +26,7 @@ from datetime import timedelta		@@ -26,7 +26,7 @@ from datetime import timedelta
26	from config import *	26	from config import *
27	from data import *	27	from data import *
28	from evaluation import *	28	from evaluation import *
29	-from similarity_measure import *	29	+from similarity import *
30	from recommender import *	30	from recommender import *
31	from strategy import *	31	from strategy import *
32	from user import *	32	from user import *
	@@ -27,7 +27,7 @@ from datetime import timedelta		@@ -27,7 +27,7 @@ from datetime import timedelta
27	from config import *	27	from config import *
28	from data import *	28	from data import *
29	from evaluation import *	29	from evaluation import *
30	-from similarity_measure import *	30	+from similarity import *
31	from recommender import *	31	from recommender import *
32	from strategy import *	32	from strategy import *
33	from user import *	33	from user import *
@@ -0,0 +1,89 @@		@@ -0,0 +1,89 @@
	1	+#!/usr/bin/python
	2	+
	3	+# similarity - python module for classes and methods related to similarity
	4	+# measuring between two sets of data.
	5	+#
	6	+# Copyright (C) 2010 Tassia Camoes <tassia@gmail.com>
	7	+#
	8	+# This program is free software: you can redistribute it and/or modify
	9	+# it under the terms of the GNU General Public License as published by
	10	+# the Free Software Foundation, either version 3 of the License, or
	11	+# (at your option) any later version.
	12	+#
	13	+# This program is distributed in the hope that it will be useful,
	14	+# but WITHOUT ANY WARRANTY; without even the implied warranty of
	15	+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	16	+# GNU General Public License for more details.
	17	+#
	18	+# You should have received a copy of the GNU General Public License
	19	+# along with this program. If not, see <http://www.gnu.org/licenses/>.
	20	+
	21	+import math
	22	+import stats
	23	+
	24	+def norm(x):
	25	+ """
	26	+ Return vector norm.
	27	+ """
	28	+ return math.sqrt(sum([x_i**2 for x_i in x]))
	29	+
	30	+def dot_product(x,y):
	31	+ """
	32	+ Return dot product of vectors 'x' and 'y'.
	33	+ """
	34	+ return sum([(x[i] * y[i]) for i in range(len(x))])
	35	+
	36	+class SimilarityMeasure:
	37	+ """
	38	+ Abstraction for different similarity measure approaches.
	39	+ """
	40	+
	41	+class Distance(SimilarityMeasure):
	42	+ """
	43	+ Euclidean distance measure.
	44	+ """
	45	+ def __call__(self,x,y):
	46	+ """
	47	+ Return Euclidean distance between vectors 'x' and 'y'.
	48	+ """
	49	+ sum_pow = sum([((x[i] - y[i]) ** 2) for i in range(len(x))])
	50	+ return math.sqrt(sum_pow)
	51	+
	52	+class Cosine(SimilarityMeasure):
	53	+ """
	54	+ Cosine similarity measure.
	55	+ """
	56	+ def __call__(self,x,y):
	57	+ """
	58	+ Return cosine of angle between vectors 'x' and 'y'.
	59	+ """
	60	+ return float(dot_product(x,y)/(norm(x)*norm(y)))
	61	+
	62	+class Pearson(SimilarityMeasure):
	63	+ """
	64	+ Pearson coefficient measure.
	65	+ """
	66	+ def __call__(self,x,y):
	67	+ """ Return Pearson coefficient between vectors 'x' and 'y'. """
	68	+ return stats.pearsonr(x,y) # FIXME: ZeroDivisionError
	69	+
	70	+class Spearman(SimilarityMeasure):
	71	+ """
	72	+ Spearman correlation measure.
	73	+ """
	74	+ def __call__(self,x,y):
	75	+ """
	76	+ Return Spearman correlation between vectors 'x' and 'y'.
	77	+ """
	78	+ return stats.spearmanr(x,y) # FIXME: ZeroDivisionError
	79	+
	80	+class Tanimoto(SimilarityMeasure):
	81	+ """
	82	+ Tanimoto coefficient measure.
	83	+ """
	84	+ def __call__(self,x,y):
	85	+ """
	86	+ Return Tanimoto coefficient between vectors 'x' and 'y'.
	87	+ """
	88	+ z = [v for v in x if v in y]
	89	+ return float(len(z))/(len(x)+len(y)-len(z))
	@@ -1,89 +0,0 @@	@@ -1,89 +0,0 @@
1	-#!/usr/bin/python
2	-
3	-# similarity-measure - python module for classes and methods related to
4	-# measuring similarity between two sets of data.
5	-#
6	-# Copyright (C) 2010 Tassia Camoes <tassia@gmail.com>
7	-#
8	-# This program is free software: you can redistribute it and/or modify
9	-# it under the terms of the GNU General Public License as published by
10	-# the Free Software Foundation, either version 3 of the License, or
11	-# (at your option) any later version.
12	-#
13	-# This program is distributed in the hope that it will be useful,
14	-# but WITHOUT ANY WARRANTY; without even the implied warranty of
15	-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16	-# GNU General Public License for more details.
17	-#
18	-# You should have received a copy of the GNU General Public License
19	-# along with this program. If not, see <http://www.gnu.org/licenses/>.
20	-
21	-import math
22	-import stats
23	-
24	-def norm(x):
25	- """
26	- Return vector norm.
27	- """
28	- return math.sqrt(sum([x_i**2 for x_i in x]))
29	-
30	-def dot_product(x,y):
31	- """
32	- Return dot product of vectors 'x' and 'y'.
33	- """
34	- return sum([(x[i] * y[i]) for i in range(len(x))])
35	-
36	-class SimilarityMeasure:
37	- """
38	- Abstraction for diferent similarity measure approaches.
39	- """
40	-
41	-class Distance(SimilarityMeasure):
42	- """
43	- Euclidian distance measure.
44	- """
45	- def __call__(self,x,y):
46	- """
47	- Return euclidian distance between vectors 'x' and 'y'.
48	- """
49	- sum_pow = sum([((x[i] - y[i]) ** 2) for i in range(len(x))])
50	- return math.sqrt(sum_pow)
51	-
52	-class Cosine(SimilarityMeasure):
53	- """
54	- Cosine similarity measure.
55	- """
56	- def __call__(self,x,y):
57	- """
58	- Return cosine of angle between vectors 'x' and 'y'.
59	- """
60	- return float(dot_product(x,y)/(norm(x)*norm(y)))
61	-
62	-class Pearson(SimilarityMeasure):
63	- """
64	- Pearson coeficient measure.
65	- """
66	- def __call__(self,x,y):
67	- """ Return Pearson coeficient between vectors 'x' and 'y'. """
68	- return stats.pearsonr(x,y) # FIXME: ZeroDivisionError
69	-
70	-class Spearman(SimilarityMeasure):
71	- """
72	- Spearman correlation measure.
73	- """
74	- def __call__(self,x,y):
75	- """
76	- Return Spearman correlation between vectors 'x' and 'y'.
77	- """
78	- return stats.spearmanr(x,y) # FIXME: ZeroDivisionError
79	-
80	-class Tanimoto(SimilarityMeasure):
81	- """
82	- Tanimoto coeficient measure.
83	- """
84	- def __call__(self,x,y):
85	- """
86	- Return Tanimoto coeficient between vectors 'x' and 'y'.
87	- """
88	- z = [v for v in x if v in y]
89	- return float(len(z))/(len(x)+len(y)-len(z))