strategies-suite.py 11.1 KB
Edit Raw Blame History Permalink



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274


#!/usr/bin/env python
"""
    recommender suite - recommender experiments suite 
"""
__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
__license__ = """
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import sys
sys.path.insert(0,'../')
from config import Config
from data import PopconXapianIndex, PopconSubmission, AppAptXapianIndex
from recommender import Recommender
from user import LocalSystem, User
from evaluation import *
import logging
import random
import Gnuplot

# Smaller alternative configuration, kept commented out
# (presumably for quick trial runs -- confirm before re-enabling).
#iterations = 3
#sample_proportions = [0.9]
#weighting = [('bm25',1.2)]
#collaborative = ['knn']
#content_based = []
#hybrid = ['knnco']
#profile_size = [50,100]
#popcon_size = ["1000"]
#neighbors = [50]

# Number of rounds run_strategy executes for every configuration.
iterations = 10
# Fractions of the user profile that run_strategy moves into the sample
# (the remaining packages form the profile given to the recommender).
sample_proportions = [0.5, 0.6, 0.7, 0.8, 0.9]
# (weight, k1) pairs; run_strategy stores them in cfg.weight and cfg.bm25_k1.
weighting = [('bm25',1.2), ('bm25',1.6), ('bm25',2.0), ('trad',0)]
# Strategy names per family. The run_* functions assign them to cfg.strategy
# and get_label uses the three lists to choose the label layout.
content_based = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
collaborative = ['knn_eset','knn','knn_plus']
hybrid = ['knnco','knnco_eset']

# Values assigned to cfg.profile_size by run_content and run_hybrid.
profile_size = range(20,100,20)
#popcon_size = [1000,10000,50000,'full']
# Values assigned to cfg.k_neighbors by run_collaborative and run_hybrid.
neighbors = range(10,510,50)

def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
    """Write the recall log of one experiment round to "<log_file>-<n>".

    The file starts with two comment lines built from label["description"]
    and label["values"], followed by a line holding repo_size, profile_size
    and len(sample).  When the recommendation has a "ranking" attribute, the
    zero-based position of every sample package found in the ranking is
    written in ascending order, one per line; sample packages missing from
    the ranking are listed after an "Out of recommendation:" line.

    Arguments:
    label -- dict with "description" and "values" entries
    n -- round number, used as suffix of the file name
    sample -- dict keyed by package name
    recommendation -- object that may expose a "ranking" sequence
    profile_size -- number written as the profile size
    repo_size -- number written as the repository size
    log_file -- path prefix of the log file
    """
    # The context manager closes the file even when a write or lookup fails.
    with open("%s-%d" % (log_file,n),'w') as output:
        output.write("# %s-n\n" % label["description"])
        output.write("# %s-%d\n" % (label["values"],n))
        output.write("\n%d %d %d\n" % (repo_size,profile_size,len(sample)))
        if hasattr(recommendation,"ranking"):
            # Index the ranking once instead of scanning the list twice per
            # sample package; setdefault keeps the first occurrence, which
            # is what list.index() would report.
            position = {}
            for index,pkg in enumerate(recommendation.ranking):
                position.setdefault(pkg,index)
            ranks = []
            notfound = []
            for pkg in sample:
                if pkg in position:
                    ranks.append(position[pkg])
                else:
                    notfound.append(pkg)
            for rank in sorted(ranks):
                output.write(str(rank)+"\n")
            if notfound:
                output.write("Out of recommendation:\n")
                for pkg in notfound:
                    output.write(pkg+"\n")

def plot_summary(precision,recall,f1,f05,accuracy,log_file):
    """Plot the five metric series with Gnuplot and save the charts.

    The chart is titled after the last path component of log_file and is
    saved as "<log_file>.png" and "<log_file>.ps"; it is then replotted
    with a logarithmic x axis and saved again with a "-logscale" suffix.
    """
    plotter = Gnuplot.Gnuplot()
    plotter('set style data lines')
    plotter.xlabel('Recommendation size')
    setup_name = log_file.split("/")[-1]
    plotter.title("Setup: %s" % setup_name)

    curves = [
        Gnuplot.Data(accuracy,title="Accuracy"),
        Gnuplot.Data(precision,title="Precision"),
        Gnuplot.Data(recall,title="Recall"),
        Gnuplot.Data(f1,title="F_1"),
        Gnuplot.Data(f05,title="F_0.5"),
    ]
    plotter.plot(*curves)

    def save_hardcopies(stem):
        # One PNG and one colour PostScript copy of the current plot
        plotter.hardcopy(stem+".png",terminal="png")
        plotter.hardcopy(stem+".ps",terminal="postscript",enhanced=1,color=1)

    save_hardcopies(log_file)
    plotter('set logscale x')
    plotter('replot')
    save_hardcopies(log_file+"-logscale")

def get_label(cfg,sample_proportion):
    """Build the label that identifies one experiment setup.

    Returns a dict with two entries: "values", the setup parameters joined
    by dashes (run_strategy uses it as the log file name), and
    "description", which names the fields of "values" in the same order.
    The layout depends on which of the module-level lists content_based,
    collaborative or hybrid contains cfg.strategy.  For any other strategy
    "Unknown strategy" is printed and an empty dict is returned.

    Arguments:
    cfg -- configuration providing strategy, pkgs_filter, bm25_k1 and,
           depending on the strategy family, profile_size and k_neighbors
    sample_proportion -- proportion value included in the label
    """
    strategy = cfg.strategy
    # The descriptions list the fields in the order they appear in "values"
    # (the profile size precedes the filter name).
    if strategy in content_based:
        description = "strategy-profile-filter-k1_bm25-sample"
        prefix = "%s-profile%d" % (strategy,cfg.profile_size)
    elif strategy in collaborative:
        description = "strategy-knn-filter-k1_bm25-sample"
        prefix = "%s-k%d" % (strategy,cfg.k_neighbors)
    elif strategy in hybrid:
        description = "strategy-knn-profile-filter-k1_bm25-sample"
        prefix = "%s-k%d-profile%d" % (strategy,cfg.k_neighbors,
                                       cfg.profile_size)
    else:
        # Parenthesised form prints the same text whether print is a
        # statement or a function.
        print("Unknown strategy")
        return {}
    # Filter name, BM25 k1 and sample proportion close every layout.
    suffix = "%s-kbm%.1f-sample%.1f" % (cfg.pkgs_filter.split("/")[-1],
                                        cfg.bm25_k1,sample_proportion)
    return {"description": description, "values": prefix+"-"+suffix}

class ExperimentResults:
    """Collect evaluation metrics of experiment rounds per recommendation size.

    For every evaluated recommendation size the instance keeps one list per
    metric (accuracy, precision, recall, f1, f05); add_result appends one
    value to each list.  The get_*_summary methods return the mean of each
    list and the best_* methods the highest single value recorded.
    """
    def __init__(self,repo_size):
        """Create empty metric lists for every evaluated recommendation size.

        The sizes are 1, then 10 to 190 in steps of 10, then 200 up to (but
        excluding) repo_size in steps of 100.
        """
        self.repository_size = repo_size
        self.accuracy = {}
        self.precision = {}
        self.recall = {}
        self.f1 = {}
        self.f05 = {}
        # list() keeps the concatenation valid when range() does not return
        # a list.
        points = ([1]+list(range(10,200,10))+
                  list(range(200,self.repository_size,100)))
        for size in points:
            self.accuracy[size] = []
            self.precision[size] = []
            self.recall[size] = []
            self.f1[size] = []
            self.f05[size] = []

    def add_result(self,ranking,sample):
        """Evaluate the first `size` ranked items against sample, per size.

        Arguments:
        ranking -- sequence of recommended items, best first
        sample -- item collection passed to RecommendationResult as the
                  real result
        """
        for size in self.accuracy.keys():
            # Slicing beyond the end of the ranking simply yields all of it
            predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
            real = RecommendationResult(sample)
            evaluation = Evaluation(predicted,real,self.repository_size)
            self.accuracy[size].append(evaluation.run(Accuracy()))
            self.precision[size].append(evaluation.run(Precision()))
            self.recall[size].append(evaluation.run(Recall()))
            self.f1[size].append(evaluation.run(F_score(1)))
            self.f05[size].append(evaluation.run(F_score(0.5)))

    def _summary(self,metric):
        """Return [size, mean value] pairs of a metric dict, sorted by size.

        Raises ZeroDivisionError when a size has no recorded value.
        """
        # float() forces true division even if every recorded value is an
        # integer.
        return sorted([size,float(sum(values))/len(values)]
                      for size,values in metric.items())

    def _best(self,metric):
        """Return (size, value) for the highest value recorded in metric.

        Raises ValueError when a size has no recorded value.
        """
        size = max(metric, key = lambda x: max(metric[x]))
        return (size,max(metric[size]))

    def get_precision_summary(self):
        """Return the sorted [size, mean precision] pairs."""
        return self._summary(self.precision)

    def get_recall_summary(self):
        """Return the sorted [size, mean recall] pairs."""
        return self._summary(self.recall)

    def get_f1_summary(self):
        """Return the sorted [size, mean f1] pairs."""
        return self._summary(self.f1)

    def get_f05_summary(self):
        """Return the sorted [size, mean f05] pairs."""
        return self._summary(self.f05)

    def get_accuracy_summary(self):
        """Return the sorted [size, mean accuracy] pairs."""
        return self._summary(self.accuracy)

    def best_precision(self):
        """Return (size, value) of the highest precision recorded."""
        return self._best(self.precision)

    def best_f1(self):
        """Return (size, value) of the highest f1 recorded."""
        return self._best(self.f1)

    def best_f05(self):
        """Return (size, value) of the highest f05 recorded."""
        return self._best(self.f05)

def run_strategy(cfg,user):
    """Run the experiment rounds for the strategy currently set in cfg.

    For every entry of the module-level weighting list a Recommender is
    built, and for every sample proportion `iterations` rounds are run.
    Each round moves a random part of the user's profile into a sample,
    asks for a recommendation based on the remaining packages, writes a
    recall log and, when the recommendation has a ranking, records its
    metrics.  After the rounds a summary file is written under
    "results/strategies/" and the metric curves are plotted.  When no round
    produced a ranking a warning is logged and summary and plots are
    skipped.

    Arguments:
    cfg -- configuration object; weight and bm25_k1 are overwritten here
    user -- object providing pkg_profile and item_score
    """
    for weight in weighting:
        cfg.weight = weight[0]
        cfg.bm25_k1 = weight[1]
        rec = Recommender(cfg)
        repo_size = rec.items_repository.get_doccount()
        for proportion in sample_proportions:
            results = ExperimentResults(repo_size)
            label = get_label(cfg,proportion)
            log_file = "results/strategies/"+label["values"]
            for n in range(iterations):
                # Fill sample profile
                full_profile_size = len(user.pkg_profile)
                item_score = {}
                for pkg in user.pkg_profile:
                    item_score[pkg] = user.item_score[pkg]
                sample_size = int(full_profile_size*proportion)
                # random.sample draws distinct keys in one call; list()
                # provides the sequence it needs even where dict.keys() is
                # not indexable.
                sample = {}
                for key in random.sample(list(item_score),sample_size):
                    sample[key] = item_score.pop(key)
                # What is left in item_score is the profile handed to the
                # recommender.
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,repo_size)
                write_recall_log(label,n,sample,recommendation,
                                 full_profile_size,repo_size,log_file)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,sample)
            if not results.precision[10]:
                # Nothing was recorded, so there is nothing to average or
                # plot; averaging would divide by zero.
                logging.warning("No ranking produced for %s; skipping summary",
                                label["values"])
                continue
            # Compute every figure before opening the summary file so that a
            # failure does not leave a truncated file behind.
            precision_10 = sum(results.precision[10])/len(results.precision[10])
            f1_10 = sum(results.f1[10])/len(results.f1[10])
            f05_10 = sum(results.f05[10])/len(results.f05[10])
            best_precision = results.best_precision()
            best_f1 = results.best_f1()
            best_f05 = results.best_f05()
            with open(log_file,'w') as f:
                # The coverage value comes from the last round's recommendation
                f.write("# %s\n# %s\n\ncoverage %d\n\n" %
                        (label["description"],label["values"],recommendation.size))
                f.write("# best results (recommendation size; metric)\n")
                f.write("precision (%d; %.2f)\nf1 (%d; %.2f)\nf05 (%d; %.2f)\n\n" %
                        (best_precision[0],best_precision[1],
                         best_f1[0],best_f1[1],
                         best_f05[0],best_f05[1]))
                f.write("# recommendation size 10\nprecision (10; %.2f)\nf1 (10; %.2f)\nf05 (10; %.2f)" %
                        (precision_10,f1_10,f05_10))
            precision = results.get_precision_summary()
            recall = results.get_recall_summary()
            f1 = results.get_f1_summary()
            f05 = results.get_f05_summary()
            accuracy = results.get_accuracy_summary()
            plot_summary(precision,recall,f1,f05,accuracy,log_file)

def run_content(user,cfg):
    """Run the experiment for every content-based strategy and profile size.

    Each name of the module-level content_based list is stored in
    cfg.strategy and, for each value of the module-level profile_size list,
    cfg.profile_size is set before calling run_strategy.
    """
    for strategy_name in content_based:
        cfg.strategy = strategy_name
        for size_limit in profile_size:
            cfg.profile_size = size_limit
            run_strategy(cfg,user)

def run_collaborative(user,cfg):
    """Run the experiment for every collaborative strategy and neighbourhood size.

    Each name of the module-level collaborative list is stored in
    cfg.strategy and, for each value of the module-level neighbors list,
    cfg.k_neighbors is set before calling run_strategy.
    """
    for strategy in collaborative:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            # A sweep over popcon index sizes is disabled, like the
            # popcon_size list at module level.  Re-enabling it requires
            # saving cfg.popcon_desktopapps and cfg.popcon_programs before
            # the loops and setting both to "<saved value>_<size>" for
            # each size.
            run_strategy(cfg,user)

def run_hybrid(user,cfg):
    """Run the experiment for every hybrid strategy, neighbourhood and profile size.

    Each name of the module-level hybrid list is stored in cfg.strategy;
    for each value of the neighbors list cfg.k_neighbors is set, and for
    each value of the profile_size list cfg.profile_size is set before
    calling run_strategy.
    """
    for strategy in hybrid:
        cfg.strategy = strategy
        for k in neighbors:
            cfg.k_neighbors = k
            # A sweep over popcon index sizes is disabled, like the
            # popcon_size list at module level.  Re-enabling it requires
            # saving cfg.popcon_desktopapps and cfg.popcon_programs before
            # the loops and setting both to "<saved value>_<size>" for
            # each size.
            for size in profile_size:
                cfg.profile_size = size
                run_strategy(cfg,user)

if __name__ == '__main__':
    # Alternative user sources, currently disabled:
    #user = LocalSystem()
    #user = RandomPopcon(cfg.popcon_dir,os.path.join(cfg.filters_dir,"desktopapps"))

    cfg = Config()
    # NOTE(review): PopconSystem is not imported by name in this file;
    # presumably it arrives through "from evaluation import *" -- confirm.
    user = PopconSystem("/root/.app-recommender/popcon-entries/8b/8b44fcdbcf676e711a153d5db09979d7")
    #user = PopconSystem("/root/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623")
    user.filter_pkg_profile(cfg.pkgs_filter)
    user.maximal_pkg_profile()

    # A family runs when its keyword is among the command line arguments,
    # or when no argument was given at all.
    families = (("content",run_content),
                ("collaborative",run_collaborative),
                ("hybrid",run_hybrid))
    for keyword,runner in families:
        if keyword in sys.argv or len(sys.argv)<2:
            runner(user,cfg)