# k-suite.py 7.32 KB
# Edit Raw Blame History Permalink



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186


#!/usr/bin/env python
"""
    k-suite - experiment different neighborhood sizes
"""
__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
__license__ = """
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import sys
sys.path.insert(0,'../')
from config import Config
from data import PopconXapianIndex, PopconSubmission
from recommender import Recommender
from user import LocalSystem, User
from evaluation import *
import logging
import random
import Gnuplot
import numpy

def plot_roc(k,roc_points,log_file):
    """
    Plot the ROC points gathered for neighborhood size k.

    k          -- neighborhood size, used in the chart title and in the
                  output file names
    roc_points -- points handed to Gnuplot.Data as-is (the caller in this
                  file passes a list of [fpr, tpr] pairs)
    log_file   -- path prefix of the output files; its last '/'-separated
                  component is used in the chart title

    Two files are produced: '<log_file>-kNNN.png' and '<log_file>-kNNN.ps'.
    """
    setup_name = log_file.split("/")[-1]
    png_path = log_file+("-k%.3d.png"%k)
    ps_path = log_file+("-k%.3d.ps"%k)

    chart = Gnuplot.Gnuplot()
    # Both axes are rates, so they are pinned to the [0,1] interval.
    for command in ('set style data points',):
        chart(command)
    chart.xlabel('False Positive Rate')
    chart.ylabel('True Positive Rate')
    for command in ('set xrange [0:1.0]','set yrange [0:1.0]'):
        chart(command)
    chart.title("Setup: %s-k%d" % (setup_name,k))

    # Diagonal from (0,0) to (1,1), drawn as a line next to the points.
    diagonal = Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7")
    measured = Gnuplot.Data(roc_points)
    chart.plot(diagonal,measured)

    chart.hardcopy(png_path,terminal="png")
    chart.hardcopy(ps_path,terminal="postscript",enhanced=1,color=1)

def _mean_points_by_k(metric):
    """
    Return [[k, mean of metric[k]], ...] ordered by increasing k.

    Neighborhood sizes with no collected values are left out, since they
    have no mean to plot.
    """
    # sorted(): the summary is drawn with the 'lines' style, which joins the
    # points in the order they are given; dictionary order is arbitrary, so
    # unsorted keys would make the curve jump back and forth along the x axis.
    # float(): keeps the mean exact even if every collected value is an int.
    return [[k,float(sum(metric[k]))/len(metric[k])]
            for k in sorted(metric) if metric[k]]

def plot_summary(precision,f05,mcc,log_file):
    """
    Plot the mean precision, F0.5 and MCC against the neighborhood size.

    precision, f05, mcc -- dictionaries mapping a neighborhood size k to
                           the list of values collected for that k
    log_file            -- path prefix of the output files; its last
                           '/'-separated component is used in the title

    Two files are produced: '<log_file>.png' and '<log_file>.ps'.
    """
    g = Gnuplot.Gnuplot()
    g('set style data lines')
    g.xlabel('Neighborhood (k)')
    g.title("Setup: %s-size20" % (log_file.split("/")[-1]))
    g.plot(Gnuplot.Data(_mean_points_by_k(precision),title="P"),
           Gnuplot.Data(_mean_points_by_k(f05),title="F05"),
           Gnuplot.Data(_mean_points_by_k(mcc),title="MCC"))
    g.hardcopy(log_file+(".png"),terminal="png")
    g.hardcopy(log_file+(".ps"),terminal="postscript",enhanced=1,color=1)

class ExperimentResults:
    """
    Accumulates evaluation metrics over repeated recommendation rounds and
    reports their averages.
    """
    def __init__(self,repo_size):
        """
        repo_size -- size of the item repository; it is passed to every
                     Evaluation created by add_result()
        """
        self.repository_size = repo_size
        # One entry per add_result() call in each of the lists below.
        self.precision = []
        self.recall = []
        self.fpr = []
        self.f05 = []
        self.mcc = []

    def _mean(self,values):
        """
        Return the arithmetic mean of values, or 0 when values is empty.
        """
        if not values:
            return 0
        # float() avoids truncating integer division when every value is an int.
        return float(sum(values))/len(values)

    def add_result(self,ranking,sample):
        """
        Evaluate one recommendation round and store its metrics.

        ranking -- recommended items; each one is given the score 1
        sample  -- dictionary used to build the reference ("real") result
        """
        predicted = RecommendationResult(dict.fromkeys(ranking,1))
        real = RecommendationResult(sample)
        evaluation = Evaluation(predicted,real,self.repository_size)
        self.precision.append(evaluation.run(Precision()))
        self.recall.append(evaluation.run(Recall()))
        self.fpr.append(evaluation.run(FPR()))
        self.f05.append(evaluation.run(F_score(0.5)))
        self.mcc.append(evaluation.run(MCC()))

    def get_roc_point(self):
        """
        Return [mean FPR, mean recall], or [0,0] if either list is empty.
        """
        tpr = self.recall
        fpr = self.fpr
        if not tpr or not fpr:
            return [0,0]
        return [self._mean(fpr),self._mean(tpr)]

    def get_precision_summary(self):
        """Return the mean precision over all stored rounds (0 if none)."""
        return self._mean(self.precision)

    def get_f05_summary(self):
        """Return the mean F0.5 score over all stored rounds (0 if none)."""
        return self._mean(self.f05)

    def get_mcc_summary(self):
        """Return the mean MCC over all stored rounds (0 if none)."""
        return self._mean(self.mcc)

if __name__ == '__main__':
    # Experiment driver: for every user listed in the sample file and for
    # every neighborhood size k, run the recommender several times on a
    # reduced profile, evaluate the result against the held-out part of the
    # profile, and write per-k logs, a summary log and the plots.

    # os.path / os.makedirs are used below, but this file has no top-level
    # "import os"; import it here so the script does not depend on the name
    # leaking in through "from evaluation import *".
    import os

    if len(sys.argv)<3:
        print("Usage: k-suite strategy_str sample_file")
        sys.exit(1)
    threshold = 20
    iterations = 30
    neighbors = [3,5,10,50,100,150,200,300,400,500]
    cfg = Config()
    cfg.strategy = sys.argv[1]
    sample_file = sys.argv[2]
    sample_name = sample_file.split('/')[-1]

    # One user id per line; the submission path is
    # <popcon_dir>/<first two characters of the id>/<id>.
    population_sample = []
    with open(sample_file,'r') as f:
        for line in f:
            user_id = line.strip()
            if not user_id:
                # A blank line would otherwise yield a bogus submission path.
                continue
            population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
    if not population_sample:
        # Without users there is nothing to average or plot.
        print("No user ids found in %s" % sample_file)
        sys.exit(1)

    # setup dictionaries and files
    roc_summary = {}
    recommended = {}
    precision_summary = {}
    f05_summary = {}
    mcc_summary = {}
    sample_dir = ("results/k-suite/%s" % sample_name)
    if not os.path.exists(sample_dir):
        os.makedirs(sample_dir)
    log_file = os.path.join(sample_dir,cfg.strategy)
    with open(log_file,'w') as f:
        f.write("# %s\n\n" % sample_name)
        f.write("# strategy %s recommendation_size %d iterations %d\n\n" %
                (cfg.strategy,threshold,iterations))
        f.write("# k coverage \tprecision \tf05 \tmcc\n\n")

    for k in neighbors:
        roc_summary[k] = []
        recommended[k] = set()
        precision_summary[k] = []
        f05_summary[k] = []
        mcc_summary[k] = []
        with open(log_file+"-k%.3d"%k,'w') as f:
            f.write("# %s\n\n" % sample_name)
            f.write("# strategy-k %s-k%.3d\n\n" % (cfg.strategy,k))
            f.write("# roc_point \tprecision \tf05 \tmcc\n\n")

    # main loop per user
    for submission_file in population_sample:
        # NOTE(review): PopconSystem is not imported explicitly in this file;
        # presumably it arrives through "from evaluation import *" - confirm.
        user = PopconSystem(submission_file)
        user.filter_pkg_profile(cfg.pkgs_filter)
        user.maximal_pkg_profile()
        # The profile does not change between iterations, so its size and
        # the number of held-out packages (90%) are computed once per user.
        profile_len = len(user.pkg_profile)
        sample_size = int(profile_len*0.9)
        for k in neighbors:
            cfg.k_neighbors = k
            rec = Recommender(cfg)
            repo_size = rec.items_repository.get_doccount()
            results = ExperimentResults(repo_size)
            # n iterations for same recommender and user
            for n in range(iterations):
                # Start from a fresh copy of the scores of the profile
                # packages; the loop below removes entries from it.
                item_score = {}
                for pkg in user.pkg_profile:
                    item_score[pkg] = user.item_score[pkg]
                # Move randomly chosen packages into the held-out sample;
                # what is left in item_score becomes the recommender input.
                sample = {}
                for i in range(sample_size):
                    key = random.choice(list(item_score))
                    sample[key] = item_score.pop(key)
                iteration_user = User(item_score)
                recommendation = rec.get_recommendation(iteration_user,threshold)
                if hasattr(recommendation,"ranking"):
                    results.add_result(recommendation.ranking,sample)
                    recommended[k].update(recommendation.ranking)
            # save summary
            roc_point = results.get_roc_point()
            roc_summary[k].append(roc_point)
            precision = results.get_precision_summary()
            precision_summary[k].append(precision)
            f05 = results.get_f05_summary()
            f05_summary[k].append(f05)
            mcc = results.get_mcc_summary()
            mcc_summary[k].append(mcc)
            with open(log_file+"-k%.3d"%k,'a') as f:
                f.write("[%.2f,%.2f] \t%.4f \t%.4f \t%.4f\n" %
                        (roc_point[0],roc_point[1],precision,f05,mcc))

    # back to main flow
    plot_summary(precision_summary,f05_summary,mcc_summary,log_file)
    with open(log_file,'a') as f:
        for k in neighbors:
            # Coverage: distinct items recommended for this k over the
            # repository size (repo_size holds the value read in the last
            # pass of the loop above).
            coverage = len(recommended[k])/float(repo_size)
            f.write("%3d \t%.2f \t%.4f \t%.4f \t%.4f\n" %
                    (k,coverage,float(sum(precision_summary[k]))/len(precision_summary[k]),
                     float(sum(f05_summary[k]))/len(f05_summary[k]),
                     float(sum(mcc_summary[k]))/len(mcc_summary[k])))
            plot_roc(k,roc_summary[k],log_file)