Commit 88ca87f69c91537a217a06644f6bd354f6ed79ef

Authored by Tássia Camões Araújo
1 parent 2255aea0
Exists in master and in 1 other branch add_vagrant

Implementation of missing metrics and small fixes.

doc/doxy_config
... ... @@ -31,7 +31,7 @@ PROJECT_NAME = AppRecommender
31 31 # This could be handy for archiving the generated documentation or
32 32 # if some version control system is used.
33 33  
34   -PROJECT_NUMBER = v0.1
  34 +PROJECT_NUMBER = v0.3
35 35  
36 36 # Using the PROJECT_BRIEF tag one can provide an optional one line description for a project that appears at the top of each page and should give viewer a quick idea about the purpose of the project. Keep the description short.
37 37  
... ...
src/app_recommender.py
... ... @@ -26,7 +26,7 @@ from datetime import timedelta
26 26 from config import *
27 27 from data import *
28 28 from evaluation import *
29   -from similarity_measure import *
  29 +from similarity import *
30 30 from recommender import *
31 31 from strategy import *
32 32 from user import *
... ...
src/cross_validation.py
... ... @@ -27,7 +27,7 @@ from datetime import timedelta
27 27 from config import *
28 28 from data import *
29 29 from evaluation import *
30   -from similarity_measure import *
  30 +from similarity import *
31 31 from recommender import *
32 32 from strategy import *
33 33 from user import *
... ...
src/data.py
... ... @@ -77,11 +77,15 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton):
77 77 self.db_path = os.path.expanduser(cfg.tags_db)
78 78 self.debtags_db = debtags.DB()
79 79  
80   - db = open(self.db_path)
  80 + try:
  81 + db_file = open(self.db_path)
  82 + except IOError:
  83 + logging.error("Could not load DebtagsDB from '%s'." % self.db_path)
  84 + raise Error
81 85 md5 = hashlib.md5()
82   - md5.update(db.read())
  86 + md5.update(db_file.read())
83 87 self.db_md5 = md5.hexdigest()
84   -
  88 + db_file.close()
85 89 self.load_index(cfg.reindex)
86 90  
87 91 def load_db(self):
... ... @@ -92,8 +96,9 @@ class TagsXapianIndex(xapian.WritableDatabase,Singleton):
92 96 try:
93 97 db_file = open(self.db_path, "r")
94 98 self.debtags_db.read(db_file,lambda x: not tag_filter.match(x))
95   - except IOError: #FIXME try is not catching this
96   - logging.error("Could not load DebtagsDB from %s." % self.db_path)
  99 + db_file.close()
  100 + except:
  101 + logging.error("Could not load DebtagsDB from '%s'." % self.db_path)
97 102 raise Error
98 103  
99 104 def relevant_tags_from_db(self,pkgs_list,qtd_of_tags):
... ...
src/evaluation.py
... ... @@ -33,7 +33,7 @@ class Metric:
33 33  
34 34 class Precision(Metric):
35 35 """
36   - Accuracy evaluation metric defined as the percentage of relevant itens
  36 + Classification accuracy metric defined as the percentage of relevant items
37 37 among the predicted ones.
38 38 """
39 39 def __init__(self):
... ... @@ -50,7 +50,7 @@ class Precision(Metric):
50 50  
51 51 class Recall(Metric):
52 52 """
53   - Accuracy evaluation metric defined as the percentage of relevant itens
  53 + Classification accuracy metric defined as the percentage of relevant items
54 54 which were predicted as so.
55 55 """
56 56 def __init__(self):
... ... @@ -66,7 +66,10 @@ class Recall(Metric):
66 66 return float(len(evaluation.predicted_real))/len(evaluation.real_relevant)
67 67  
68 68 class F1(Metric):
69   - """ """
  69 + """
  70 + Classification accuracy metric which combines precision and recall into a
  71 + single measure (their harmonic mean).
  72 + """
70 73 def __init__(self):
71 74 """
72 75 Set metric description.
... ... @@ -79,24 +82,45 @@ class F1(Metric):
79 82 """
80 83 p = Precision().run(evaluation)
81 84 r = Recall().run(evaluation)
82   - return float((2*p*r)/(p+r))
  85 + return float((2*p*r))/(p+r)
83 86  
84 87 class MAE(Metric):
85   - """ """
  88 + """
  89 + Prediction accuracy metric defined as the mean absolute error.
  90 + """
86 91 def __init__(self):
87 92 """
88 93 Set metric description.
89 94 """
90 95 self.desc = " MAE "
91 96  
  97 + def get_errors(self,evaluation):
  98 + """
  99 + Compute prediction errors.
  100 + """
  101 + keys = evaluation.predicted_item_scores.keys()
  102 + keys.extend(evaluation.real_item_scores.keys())
  103 + errors = []
  104 + for k in keys:
  105 + if k not in evaluation.real_item_scores:
  106 + evaluation.real_item_scores[k] = 0.0
  107 + if k not in evaluation.predicted_item_scores:
  108 + evaluation.predicted_item_scores[k] = 0.0
  109 + errors.append(float(evaluation.predicted_item_scores[k]-
  110 + evaluation.real_item_scores[k]))
  111 + return errors
  112 +
92 113 def run(self,evaluation):
93 114 """
94 115 Compute metric.
95 116 """
96   - print "---" #FIXME
  117 + errors = self.get_errors(evaluation)
  118 + return sum(errors)/len(errors)
97 119  
98   -class MSE(Metric):
99   - """ """
  120 +class MSE(MAE):
  121 + """
  122 + Prediction accuracy metric defined as the mean square error.
  123 + """
100 124 def __init__(self):
101 125 """
102 126 Set metric description.
... ... @@ -107,21 +131,34 @@ class MSE(Metric):
107 131 """
108 132 Compute metric.
109 133 """
110   - print "---" #FIXME
  134 + errors = self.get_errors(evaluation)
  135 + square_errors = [pow(x,2) for x in errors]
  136 + return sum(square_errors)/len(square_errors)
111 137  
112 138 class Coverage(Metric):
113   - """ """
114   - def __init__(self):
  139 + """
  140 + Evaluation metric defined as the percentage of items covered by the
  141 + recommender (i.e. items that have been recommended at least once).
  142 + """
  143 + def __init__(self,repository_size):
115 144 """
116   - Set metric description.
  145 + Set initial parameters.
117 146 """
118 147 self.desc = " Coverage "
  148 + self.repository_size = repository_size
  149 + self.covered = set()
  150 +
  151 + def save_covered(self,recommended_list):
  152 + """
  153 + Register that a list of items has been recommended.
  154 + """
  155 + self.covered.update(set(recommended_list))
119 156  
120 157 def run(self,evaluation):
121 158 """
122 159 Compute metric.
123 160 """
124   - print "---" #FIXME
  161 + return float(self.covered.size)/self.repository_size
125 162  
126 163 class Evaluation:
127 164 """
... ... @@ -158,8 +195,7 @@ class CrossValidation:
158 195 if partition_proportion<1 and partition_proportion>0:
159 196 self.partition_proportion = partition_proportion
160 197 else:
161   - logging.critical("Partition proportion must be a value in the
162   - interval [0,1].")
  198 + logging.critical("Partition proportion must be a value in the interval [0,1].")
163 199 raise Error
164 200 self.rounds = rounds
165 201 self.recommender = rec
... ... @@ -195,7 +231,6 @@ class CrossValidation:
195 231 """
196 232 cross_item_score = dict.fromkeys(user.pkg_profile,1)
197 233 partition_size = int(len(cross_item_score)*self.partition_proportion)
198   - #cross_item_score = user.item_score.copy()
199 234 for r in range(self.rounds):
200 235 round_partition = {}
201 236 for j in range(partition_size):
... ...
src/generate_doc.sh
... ... @@ -19,8 +19,10 @@
19 19  
20 20 # Get project version from git repository
21 21 TAG=$(git describe --tags --abbrev=0)
  22 +echo "Generating documentation for git tag $TAG"
22 23 sed -i "s/^PROJECT_NUMBER.*$/PROJECT_NUMBER\t\t= $TAG/" ../doc/doxy_config
23 24 rm -Rf ../doc/html
24   -../doc/doxygen ../doc/doxy_config
25   -#scp -r html/* tassia@www.ime.usp.br:public_html/
  25 +../doc/doxygen-1.7.3 ../doc/doxy_config
  26 +scp -r html/ tassia@eclipse.ime.usp.br:
  27 +echo "---> Remember to place doc in the right location on server side."
26 28 mv html/ ../doc/
... ...
src/recommender.py
... ... @@ -61,7 +61,8 @@ class Recommender:
61 61 try:
62 62 strategy = "self."+cfg.strategy+"(cfg)"
63 63 exec(strategy)
64   - except (NameError, AttributeError, SyntaxError):
  64 + except (NameError, AttributeError, SyntaxError) as err:
  65 + print err
65 66 logging.critical("Could not perform recommendation strategy '%s'" %
66 67 cfg.strategy)
67 68 raise Error
... ...
src/similarity.py 0 → 100644
... ... @@ -0,0 +1,89 @@
  1 +#!/usr/bin/python
  2 +
  3 +# similarity - python module for classes and methods related to similarity
  4 +# measuring between two sets of data.
  5 +#
  6 +# Copyright (C) 2010 Tassia Camoes <tassia@gmail.com>
  7 +#
  8 +# This program is free software: you can redistribute it and/or modify
  9 +# it under the terms of the GNU General Public License as published by
  10 +# the Free Software Foundation, either version 3 of the License, or
  11 +# (at your option) any later version.
  12 +#
  13 +# This program is distributed in the hope that it will be useful,
  14 +# but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 +# GNU General Public License for more details.
  17 +#
  18 +# You should have received a copy of the GNU General Public License
  19 +# along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +
  21 +import math
  22 +import stats
  23 +
  24 +def norm(x):
  25 + """
  26 + Return vector norm.
  27 + """
  28 + return math.sqrt(sum([x_i**2 for x_i in x]))
  29 +
  30 +def dot_product(x,y):
  31 + """
  32 + Return dot product of vectors 'x' and 'y'.
  33 + """
  34 + return sum([(x[i] * y[i]) for i in range(len(x))])
  35 +
  36 +class SimilarityMeasure:
  37 + """
  38 + Abstraction for different similarity measure approaches.
  39 + """
  40 +
  41 +class Distance(SimilarityMeasure):
  42 + """
  43 + Euclidean distance measure.
  44 + """
  45 + def __call__(self,x,y):
  46 + """
  47 + Return Euclidean distance between vectors 'x' and 'y'.
  48 + """
  49 + sum_pow = sum([((x[i] - y[i]) ** 2) for i in range(len(x))])
  50 + return math.sqrt(sum_pow)
  51 +
  52 +class Cosine(SimilarityMeasure):
  53 + """
  54 + Cosine similarity measure.
  55 + """
  56 + def __call__(self,x,y):
  57 + """
  58 + Return cosine of angle between vectors 'x' and 'y'.
  59 + """
  60 + return float(dot_product(x,y)/(norm(x)*norm(y)))
  61 +
  62 +class Pearson(SimilarityMeasure):
  63 + """
  64 + Pearson coefficient measure.
  65 + """
  66 + def __call__(self,x,y):
  67 + """ Return Pearson coefficient between vectors 'x' and 'y'. """
  68 + return stats.pearsonr(x,y) # FIXME: ZeroDivisionError
  69 +
  70 +class Spearman(SimilarityMeasure):
  71 + """
  72 + Spearman correlation measure.
  73 + """
  74 + def __call__(self,x,y):
  75 + """
  76 + Return Spearman correlation between vectors 'x' and 'y'.
  77 + """
  78 + return stats.spearmanr(x,y) # FIXME: ZeroDivisionError
  79 +
  80 +class Tanimoto(SimilarityMeasure):
  81 + """
  83 + Tanimoto coefficient measure.
  83 + """
  84 + def __call__(self,x,y):
  85 + """
  86 + Return Tanimoto coefficient between vectors 'x' and 'y'.
  87 + """
  88 + z = [v for v in x if v in y]
  89 + return float(len(z))/(len(x)+len(y)-len(z))
... ...
src/similarity_measure.py
... ... @@ -1,89 +0,0 @@
1   -#!/usr/bin/python
2   -
3   -# similarity-measure - python module for classes and methods related to
4   -# measuring similarity between two sets of data.
5   -#
6   -# Copyright (C) 2010 Tassia Camoes <tassia@gmail.com>
7   -#
8   -# This program is free software: you can redistribute it and/or modify
9   -# it under the terms of the GNU General Public License as published by
10   -# the Free Software Foundation, either version 3 of the License, or
11   -# (at your option) any later version.
12   -#
13   -# This program is distributed in the hope that it will be useful,
14   -# but WITHOUT ANY WARRANTY; without even the implied warranty of
15   -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16   -# GNU General Public License for more details.
17   -#
18   -# You should have received a copy of the GNU General Public License
19   -# along with this program. If not, see <http://www.gnu.org/licenses/>.
20   -
21   -import math
22   -import stats
23   -
24   -def norm(x):
25   - """
26   - Return vector norm.
27   - """
28   - return math.sqrt(sum([x_i**2 for x_i in x]))
29   -
30   -def dot_product(x,y):
31   - """
32   - Return dot product of vectors 'x' and 'y'.
33   - """
34   - return sum([(x[i] * y[i]) for i in range(len(x))])
35   -
36   -class SimilarityMeasure:
37   - """
38   - Abstraction for diferent similarity measure approaches.
39   - """
40   -
41   -class Distance(SimilarityMeasure):
42   - """
43   - Euclidian distance measure.
44   - """
45   - def __call__(self,x,y):
46   - """
47   - Return euclidian distance between vectors 'x' and 'y'.
48   - """
49   - sum_pow = sum([((x[i] - y[i]) ** 2) for i in range(len(x))])
50   - return math.sqrt(sum_pow)
51   -
52   -class Cosine(SimilarityMeasure):
53   - """
54   - Cosine similarity measure.
55   - """
56   - def __call__(self,x,y):
57   - """
58   - Return cosine of angle between vectors 'x' and 'y'.
59   - """
60   - return float(dot_product(x,y)/(norm(x)*norm(y)))
61   -
62   -class Pearson(SimilarityMeasure):
63   - """
64   - Pearson coeficient measure.
65   - """
66   - def __call__(self,x,y):
67   - """ Return Pearson coeficient between vectors 'x' and 'y'. """
68   - return stats.pearsonr(x,y) # FIXME: ZeroDivisionError
69   -
70   -class Spearman(SimilarityMeasure):
71   - """
72   - Spearman correlation measure.
73   - """
74   - def __call__(self,x,y):
75   - """
76   - Return Spearman correlation between vectors 'x' and 'y'.
77   - """
78   - return stats.spearmanr(x,y) # FIXME: ZeroDivisionError
79   -
80   -class Tanimoto(SimilarityMeasure):
81   - """
82   - Tanimoto coeficient measure.
83   - """
84   - def __call__(self,x,y):
85   - """
86   - Return Tanimoto coeficient between vectors 'x' and 'y'.
87   - """
88   - z = [v for v in x if v in y]
89   - return float(len(z))/(len(x)+len(y)-len(z))