Commit 88ca87f69c91537a217a06644f6bd354f6ed79ef

Authored by Tássia Camões Araújo
1 parent 2255aea0
Exists in master and in 1 other branch add_vagrant

Implementation of missing metrics and small fixes.

doc/doxy_config
@@ -31,7 +31,7 @@ PROJECT_NAME = AppRecommender @@ -31,7 +31,7 @@ PROJECT_NAME = AppRecommender
31 # This could be handy for archiving the generated documentation or 31 # This could be handy for archiving the generated documentation or
32 # if some version control system is used. 32 # if some version control system is used.
33 33
34 -PROJECT_NUMBER = v0.1 34 +PROJECT_NUMBER = v0.3
35 35
36 # Using the PROJECT_BRIEF tag one can provide an optional one line description for a project that appears at the top of each page and should give viewer a quick idea about the purpose of the project. Keep the description short. 36 # Using the PROJECT_BRIEF tag one can provide an optional one line description for a project that appears at the top of each page and should give viewer a quick idea about the purpose of the project. Keep the description short.
37 37
src/app_recommender.py
@@ -26,7 +26,7 @@ from datetime import timedelta @@ -26,7 +26,7 @@ from datetime import timedelta
26 from config import * 26 from config import *
27 from data import * 27 from data import *
28 from evaluation import * 28 from evaluation import *
29 -from similarity_measure import * 29 +from similarity import *
30 from recommender import * 30 from recommender import *
31 from strategy import * 31 from strategy import *
32 from user import * 32 from user import *
src/cross_validation.py
@@ -27,7 +27,7 @@ from datetime import timedelta @@ -27,7 +27,7 @@ from datetime import timedelta
27 from config import * 27 from config import *
28 from data import * 28 from data import *
29 from evaluation import * 29 from evaluation import *
30 -from similarity_measure import * 30 +from similarity import *
31 from recommender import * 31 from recommender import *
32 from strategy import * 32 from strategy import *
33 from user import * 33 from user import *
@@ -77,11 +77,15 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton): @@ -77,11 +77,15 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton):
77 self.db_path = os.path.expanduser(cfg.tags_db) 77 self.db_path = os.path.expanduser(cfg.tags_db)
78 self.debtags_db = debtags.DB() 78 self.debtags_db = debtags.DB()
79 79
80 - db = open(self.db_path) 80 + try:
  81 + db_file = open(self.db_path)
  82 + except IOError:
  83 + logging.error("Could not load DebtagsDB from '%s'." % self.db_path)
  84 + raise Error
81 md5 = hashlib.md5() 85 md5 = hashlib.md5()
82 - md5.update(db.read()) 86 + md5.update(db_file.read())
83 self.db_md5 = md5.hexdigest() 87 self.db_md5 = md5.hexdigest()
84 - 88 + db_file.close()
85 self.load_index(cfg.reindex) 89 self.load_index(cfg.reindex)
86 90
87 def load_db(self): 91 def load_db(self):
@@ -92,8 +96,9 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton): @@ -92,8 +96,9 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton):
92 try: 96 try:
93 db_file = open(self.db_path, "r") 97 db_file = open(self.db_path, "r")
94 self.debtags_db.read(db_file,lambda x: not tag_filter.match(x)) 98 self.debtags_db.read(db_file,lambda x: not tag_filter.match(x))
95 - except IOError: #FIXME try is not catching this  
96 - logging.error("Could not load DebtagsDB from %s." % self.db_path) 99 + db_file.close()
  100 + except:
  101 + logging.error("Could not load DebtagsDB from '%s'." % self.db_path)
97 raise Error 102 raise Error
98 103
99 def relevant_tags_from_db(self,pkgs_list,qtd_of_tags): 104 def relevant_tags_from_db(self,pkgs_list,qtd_of_tags):
src/evaluation.py
@@ -33,7 +33,7 @@ class Metric: @@ -33,7 +33,7 @@ class Metric:
33 33
34 class Precision(Metric): 34 class Precision(Metric):
35 """ 35 """
36 - Accuracy evaluation metric defined as the percentage of relevant itens 36 + Classification accuracy metric defined as the percentage of relevant itens
37 among the predicted ones. 37 among the predicted ones.
38 """ 38 """
39 def __init__(self): 39 def __init__(self):
@@ -50,7 +50,7 @@ class Precision(Metric): @@ -50,7 +50,7 @@ class Precision(Metric):
50 50
51 class Recall(Metric): 51 class Recall(Metric):
52 """ 52 """
53 - Accuracy evaluation metric defined as the percentage of relevant itens 53 + Classification accuracy metric defined as the percentage of relevant itens
54 which were predicted as so. 54 which were predicted as so.
55 """ 55 """
56 def __init__(self): 56 def __init__(self):
@@ -66,7 +66,10 @@ class Recall(Metric): @@ -66,7 +66,10 @@ class Recall(Metric):
66 return float(len(evaluation.predicted_real))/len(evaluation.real_relevant) 66 return float(len(evaluation.predicted_real))/len(evaluation.real_relevant)
67 67
68 class F1(Metric): 68 class F1(Metric):
69 - """ """ 69 + """
  70 + Classification accuracy metric which combines precision and recall into a
  71 + single measure.
  72 + """
70 def __init__(self): 73 def __init__(self):
71 """ 74 """
72 Set metric description. 75 Set metric description.
@@ -79,24 +82,45 @@ class F1(Metric): @@ -79,24 +82,45 @@ class F1(Metric):
79 """ 82 """
80 p = Precision().run(evaluation) 83 p = Precision().run(evaluation)
81 r = Recall().run(evaluation) 84 r = Recall().run(evaluation)
82 - return float((2*p*r)/(p+r)) 85 + return float((2*p*r))/(p+r)
83 86
84 class MAE(Metric): 87 class MAE(Metric):
85 - """ """ 88 + """
  89 + Prediction accuracy metric defined as the mean absolute error.
  90 + """
86 def __init__(self): 91 def __init__(self):
87 """ 92 """
88 Set metric description. 93 Set metric description.
89 """ 94 """
90 self.desc = " MAE " 95 self.desc = " MAE "
91 96
  97 + def get_errors(self,evaluation):
  98 + """
  99 + Compute prediction errors.
  100 + """
  101 + keys = evaluation.predicted_item_scores.keys()
  102 + keys.extend(evaluation.real_item_scores.keys())
  103 + errors = []
  104 + for k in keys:
  105 + if k not in evaluation.real_item_scores:
  106 + evaluation.real_item_scores[k] = 0.0
  107 + if k not in evaluation.predicted_item_scores:
  108 + evaluation.predicted_item_scores[k] = 0.0
  109 + errors.append(float(evaluation.predicted_item_scores[k]-
  110 + evaluation.real_item_scores[k]))
  111 + return errors
  112 +
92 def run(self,evaluation): 113 def run(self,evaluation):
93 """ 114 """
94 Compute metric. 115 Compute metric.
95 """ 116 """
96 - print "---" #FIXME 117 + errors = self.get_errors(evaluation)
  118 + return sum(errors)/len(errors)
97 119
98 -class MSE(Metric):  
99 - """ """ 120 +class MSE(MAE):
  121 + """
  122 + Prediction accuracy metric defined as the mean square error.
  123 + """
100 def __init__(self): 124 def __init__(self):
101 """ 125 """
102 Set metric description. 126 Set metric description.
@@ -107,21 +131,34 @@ class MSE(Metric): @@ -107,21 +131,34 @@ class MSE(Metric):
107 """ 131 """
108 Compute metric. 132 Compute metric.
109 """ 133 """
110 - print "---" #FIXME 134 + errors = self.get_errors(evaluation)
  135 + square_errors = [pow(x,2) for x in errors]
  136 + return sum(square_errors)/len(square_errors)
111 137
112 class Coverage(Metric): 138 class Coverage(Metric):
113 - """ """  
114 - def __init__(self): 139 + """
  140 + Evaluation metric defined as the percentage of itens covered by the
  141 + recommender (have been recommended at least once).
  142 + """
  143 + def __init__(self,repository_size):
115 """ 144 """
116 - Set metric description. 145 + Set initial parameters.
117 """ 146 """
118 self.desc = " Coverage " 147 self.desc = " Coverage "
  148 + self.repository_size = repository_size
  149 + self.covered = set()
  150 +
  151 + def save_covered(self,recommended_list):
  152 + """
  153 + Register that a list of itens has been recommended.
  154 + """
  155 + self.covered.update(set(recommended_list))
119 156
120 def run(self,evaluation): 157 def run(self,evaluation):
121 """ 158 """
122 Compute metric. 159 Compute metric.
123 """ 160 """
124 - print "---" #FIXME 161 + return float(self.covered.size)/self.repository_size
125 162
126 class Evaluation: 163 class Evaluation:
127 """ 164 """
@@ -158,8 +195,7 @@ class CrossValidation: @@ -158,8 +195,7 @@ class CrossValidation:
158 if partition_proportion<1 and partition_proportion>0: 195 if partition_proportion<1 and partition_proportion>0:
159 self.partition_proportion = partition_proportion 196 self.partition_proportion = partition_proportion
160 else: 197 else:
161 - logging.critical("Partition proportion must be a value in the  
162 - interval [0,1].") 198 + logging.critical("Partition proportion must be a value in the interval [0,1].")
163 raise Error 199 raise Error
164 self.rounds = rounds 200 self.rounds = rounds
165 self.recommender = rec 201 self.recommender = rec
@@ -195,7 +231,6 @@ class CrossValidation: @@ -195,7 +231,6 @@ class CrossValidation:
195 """ 231 """
196 cross_item_score = dict.fromkeys(user.pkg_profile,1) 232 cross_item_score = dict.fromkeys(user.pkg_profile,1)
197 partition_size = int(len(cross_item_score)*self.partition_proportion) 233 partition_size = int(len(cross_item_score)*self.partition_proportion)
198 - #cross_item_score = user.item_score.copy()  
199 for r in range(self.rounds): 234 for r in range(self.rounds):
200 round_partition = {} 235 round_partition = {}
201 for j in range(partition_size): 236 for j in range(partition_size):
src/generate_doc.sh
@@ -19,8 +19,10 @@ @@ -19,8 +19,10 @@
19 19
20 # Get project version from git repository 20 # Get project version from git repository
21 TAG=$(git describe --tags --abbrev=0) 21 TAG=$(git describe --tags --abbrev=0)
  22 +echo "Generating documentation for git tag $TAG"
22 sed -i "s/^PROJECT_NUMBER.*$/PROJECT_NUMBER\t\t= $TAG/" ../doc/doxy_config 23 sed -i "s/^PROJECT_NUMBER.*$/PROJECT_NUMBER\t\t= $TAG/" ../doc/doxy_config
23 rm -Rf ../doc/html 24 rm -Rf ../doc/html
24 -../doc/doxygen ../doc/doxy_config  
25 -#scp -r html/* tassia@www.ime.usp.br:public_html/ 25 +../doc/doxygen-1.7.3 ../doc/doxy_config
  26 +scp -r html/ tassia@eclipse.ime.usp.br:
  27 +echo "---> Remember to place doc in the right location on server side."
26 mv html/ ../doc/ 28 mv html/ ../doc/
src/recommender.py
@@ -61,7 +61,8 @@ class Recommender: @@ -61,7 +61,8 @@ class Recommender:
61 try: 61 try:
62 strategy = "self."+cfg.strategy+"(cfg)" 62 strategy = "self."+cfg.strategy+"(cfg)"
63 exec(strategy) 63 exec(strategy)
64 - except (NameError, AttributeError, SyntaxError): 64 + except (NameError, AttributeError, SyntaxError) as err:
  65 + print err
65 logging.critical("Could not perform recommendation strategy '%s'" % 66 logging.critical("Could not perform recommendation strategy '%s'" %
66 cfg.strategy) 67 cfg.strategy)
67 raise Error 68 raise Error
src/similarity.py 0 → 100644
@@ -0,0 +1,89 @@ @@ -0,0 +1,89 @@
  1 +#!/usr/bin/python
  2 +
  3 +# similarity - python module for classes and methods related to similarity
  4 +# measuring between two sets of data.
  5 +#
  6 +# Copyright (C) 2010 Tassia Camoes <tassia@gmail.com>
  7 +#
  8 +# This program is free software: you can redistribute it and/or modify
  9 +# it under the terms of the GNU General Public License as published by
  10 +# the Free Software Foundation, either version 3 of the License, or
  11 +# (at your option) any later version.
  12 +#
  13 +# This program is distributed in the hope that it will be useful,
  14 +# but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 +# GNU General Public License for more details.
  17 +#
  18 +# You should have received a copy of the GNU General Public License
  19 +# along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +
  21 +import math
  22 +import stats
  23 +
  24 +def norm(x):
  25 + """
  26 + Return vector norm.
  27 + """
  28 + return math.sqrt(sum([x_i**2 for x_i in x]))
  29 +
  30 +def dot_product(x,y):
  31 + """
  32 + Return dot product of vectors 'x' and 'y'.
  33 + """
  34 + return sum([(x[i] * y[i]) for i in range(len(x))])
  35 +
  36 +class SimilarityMeasure:
  37 + """
  38 + Abstraction for diferent similarity measure approaches.
  39 + """
  40 +
  41 +class Distance(SimilarityMeasure):
  42 + """
  43 + Euclidian distance measure.
  44 + """
  45 + def __call__(self,x,y):
  46 + """
  47 + Return euclidian distance between vectors 'x' and 'y'.
  48 + """
  49 + sum_pow = sum([((x[i] - y[i]) ** 2) for i in range(len(x))])
  50 + return math.sqrt(sum_pow)
  51 +
  52 +class Cosine(SimilarityMeasure):
  53 + """
  54 + Cosine similarity measure.
  55 + """
  56 + def __call__(self,x,y):
  57 + """
  58 + Return cosine of angle between vectors 'x' and 'y'.
  59 + """
  60 + return float(dot_product(x,y)/(norm(x)*norm(y)))
  61 +
  62 +class Pearson(SimilarityMeasure):
  63 + """
  64 + Pearson coeficient measure.
  65 + """
  66 + def __call__(self,x,y):
  67 + """ Return Pearson coeficient between vectors 'x' and 'y'. """
  68 + return stats.pearsonr(x,y) # FIXME: ZeroDivisionError
  69 +
  70 +class Spearman(SimilarityMeasure):
  71 + """
  72 + Spearman correlation measure.
  73 + """
  74 + def __call__(self,x,y):
  75 + """
  76 + Return Spearman correlation between vectors 'x' and 'y'.
  77 + """
  78 + return stats.spearmanr(x,y) # FIXME: ZeroDivisionError
  79 +
  80 +class Tanimoto(SimilarityMeasure):
  81 + """
  82 + Tanimoto coeficient measure.
  83 + """
  84 + def __call__(self,x,y):
  85 + """
  86 + Return Tanimoto coeficient between vectors 'x' and 'y'.
  87 + """
  88 + z = [v for v in x if v in y]
  89 + return float(len(z))/(len(x)+len(y)-len(z))
src/similarity_measure.py
@@ -1,89 +0,0 @@ @@ -1,89 +0,0 @@
1 -#!/usr/bin/python  
2 -  
3 -# similarity-measure - python module for classes and methods related to  
4 -# measuring similarity between two sets of data.  
5 -#  
6 -# Copyright (C) 2010 Tassia Camoes <tassia@gmail.com>  
7 -#  
8 -# This program is free software: you can redistribute it and/or modify  
9 -# it under the terms of the GNU General Public License as published by  
10 -# the Free Software Foundation, either version 3 of the License, or  
11 -# (at your option) any later version.  
12 -#  
13 -# This program is distributed in the hope that it will be useful,  
14 -# but WITHOUT ANY WARRANTY; without even the implied warranty of  
15 -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the  
16 -# GNU General Public License for more details.  
17 -#  
18 -# You should have received a copy of the GNU General Public License  
19 -# along with this program. If not, see <http://www.gnu.org/licenses/>.  
20 -  
21 -import math  
22 -import stats  
23 -  
24 -def norm(x):  
25 - """  
26 - Return vector norm.  
27 - """  
28 - return math.sqrt(sum([x_i**2 for x_i in x]))  
29 -  
30 -def dot_product(x,y):  
31 - """  
32 - Return dot product of vectors 'x' and 'y'.  
33 - """  
34 - return sum([(x[i] * y[i]) for i in range(len(x))])  
35 -  
36 -class SimilarityMeasure:  
37 - """  
38 - Abstraction for diferent similarity measure approaches.  
39 - """  
40 -  
41 -class Distance(SimilarityMeasure):  
42 - """  
43 - Euclidian distance measure.  
44 - """  
45 - def __call__(self,x,y):  
46 - """  
47 - Return euclidian distance between vectors 'x' and 'y'.  
48 - """  
49 - sum_pow = sum([((x[i] - y[i]) ** 2) for i in range(len(x))])  
50 - return math.sqrt(sum_pow)  
51 -  
52 -class Cosine(SimilarityMeasure):  
53 - """  
54 - Cosine similarity measure.  
55 - """  
56 - def __call__(self,x,y):  
57 - """  
58 - Return cosine of angle between vectors 'x' and 'y'.  
59 - """  
60 - return float(dot_product(x,y)/(norm(x)*norm(y)))  
61 -  
62 -class Pearson(SimilarityMeasure):  
63 - """  
64 - Pearson coeficient measure.  
65 - """  
66 - def __call__(self,x,y):  
67 - """ Return Pearson coeficient between vectors 'x' and 'y'. """  
68 - return stats.pearsonr(x,y) # FIXME: ZeroDivisionError  
69 -  
70 -class Spearman(SimilarityMeasure):  
71 - """  
72 - Spearman correlation measure.  
73 - """  
74 - def __call__(self,x,y):  
75 - """  
76 - Return Spearman correlation between vectors 'x' and 'y'.  
77 - """  
78 - return stats.spearmanr(x,y) # FIXME: ZeroDivisionError  
79 -  
80 -class Tanimoto(SimilarityMeasure):  
81 - """  
82 - Tanimoto coeficient measure.  
83 - """  
84 - def __call__(self,x,y):  
85 - """  
86 - Return Tanimoto coeficient between vectors 'x' and 'y'.  
87 - """  
88 - z = [v for v in x if v in y]  
89 - return float(len(z))/(len(x)+len(y)-len(z))