Commit c673b9b2395d8d6128af4791057e5845517b20d1

Authored by Tássia Camões Araújo
1 parent 4d01144b
Exists in master and in 1 other branch add_vagrant

Updated experiments scripts

src/experiments/extract-sample-db.py 0 → 100755
... ... @@ -0,0 +1,49 @@
  1 +#! /usr/bin/env python
  2 +"""
  3 + sample-popcon - extract a sample from popcon population
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import xapian
  23 +import os
  24 +import random
  25 +import sys
  26 +
if __name__ == '__main__':
    # Move the popcon submissions listed in sample_file out of the popcon
    # index and into a new index stored next to it, named
    # "<popcon_index>-<sample file name>".
    if len(sys.argv) < 3:
        print("Usage: extract-sample-db sample_file popcon_index")
        sys.exit(1)
    sample_file = sys.argv[1]
    popcon_path = sys.argv[2]
    sample_name = sample_file.split("/")[-1]

    try:
        popcon = xapian.WritableDatabase(popcon_path, xapian.DB_OPEN)
    except xapian.DatabaseError as error:
        # Report the real cause (missing index, lock held, ...) instead of
        # hiding every failure behind the usage text.
        print("Could not open popcon index %s: %s" % (popcon_path, error))
        print("Usage: extract-sample-db sample_file popcon_index")
        sys.exit(1)

    enquire = xapian.Enquire(popcon)
    new_popcon = xapian.WritableDatabase(popcon_path + "-" + sample_name,
                                         xapian.DB_CREATE_OR_OVERWRITE)
    print("Popcon repository size: %d" % popcon.get_doccount())

    # The context manager guarantees the sample file is closed even if a
    # xapian call fails half way through.
    with open(sample_file) as submissions:
        for submission in submissions:
            submission_id = submission.strip()
            if not submission_id:
                # A blank line would otherwise produce the bare term "ID".
                continue
            term = "ID" + submission_id
            print(term)
            enquire.set_query(xapian.Query(term))
            for match in enquire.get_mset(0, 20):
                print("Adding doc %s" % match.docid)
                new_popcon.add_document(popcon.get_document(match.docid))
                print("Removing doc %s" % match.docid)
                popcon.delete_document(match.docid)

    print("Popcon repository size: %d" % popcon.get_doccount())
    # Labelled separately so the two final counts can be told apart.
    print("Sample repository size: %d" % new_popcon.get_doccount())
... ...
src/experiments/hybrid.py 0 → 100755
... ... @@ -0,0 +1,197 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + hybrid-suite
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import sys
  23 +sys.path.insert(0,'../')
  24 +from config import Config
  25 +from data import PopconXapianIndex, PopconSubmission
  26 +from recommender import Recommender
  27 +from user import LocalSystem, User
  28 +from evaluation import *
  29 +import logging
  30 +import random
  31 +import Gnuplot
  32 +import numpy
  33 +
def _mean(values):
    """Return the arithmetic mean of `values` as a float, or 0.0 if empty."""
    if not values:
        return 0.0
    return float(sum(values)) / len(values)


def _plot_metric(title, metric_label, metric_summary, coverage, output_png):
    """Plot the mean of one metric and the coverage against profile size.

    metric_summary maps profile size -> list of per-user values and
    coverage maps profile size -> coverage ratio.  The plot is saved as a
    PNG at output_png.
    """
    g = Gnuplot.Gnuplot()
    g('set style data lines')
    g('set yrange [0:1.0]')
    g.xlabel('Profile size')
    g.title(title)
    g.plot(Gnuplot.Data(sorted([size, _mean(values)]
                               for size, values in metric_summary.items()),
                        title=metric_label),
           Gnuplot.Data(sorted([size, ratio]
                               for size, ratio in coverage.items()),
                        title="Coverage"))
    g.hardcopy(output_png, terminal="png")


if __name__ == '__main__':
    # Names used below that the script's import block does not provide.
    import os
    # NOTE(review): PopconSystem is used below but was never imported by this
    # script; it is presumed to live in the `user` module next to
    # LocalSystem/User -- confirm.
    from user import PopconSystem

    # Both the strategy and the sample file are required (argv[2] is read).
    if len(sys.argv) < 3:
        print("Usage: hybrid strategy sample_file")
        sys.exit(1)

    iterations = 20
    profile_size = [10, 40, 70, 100, 170, 240]
    neighbor_size = [3, 10, 50, 100, 200, 400]

    cfg = Config()
    strategy = sys.argv[1]
    sample_file = sys.argv[2]
    sample_str = sample_file.split('/')[-1]

    # One popcon submission path per non-empty line of the sample file.
    population_sample = []
    with open(sample_file, 'r') as f:
        for line in f:
            user_id = line.strip('\n')
            if not user_id:
                continue
            population_sample.append(
                os.path.join(cfg.popcon_dir, user_id[:2], user_id))
    if not population_sample:
        # Without users repo_size is never set and nothing can be summarised.
        print("No submissions listed in %s" % sample_file)
        sys.exit(1)

    sample_dir = "results/hybrid/%s" % sample_str
    if not os.path.exists(sample_dir):
        os.makedirs(sample_dir)

    cfg.strategy = strategy
    log_file = os.path.join(sample_dir, sample_str + "-" + cfg.strategy)

    # All result containers are indexed as [neighborhood size][profile size].
    p_20_summary = {}       # per-user mean precision of the top 20
    f05_100_summary = {}    # per-user mean F0.5 of the top 100
    c_20 = {}               # every package ever recommended in a top 20
    c_100 = {}              # every package ever recommended in a top 100
    graph_20 = {}
    graph_100 = {}
    comment_20 = {}
    comment_100 = {}

    for k in neighbor_size:
        # File names are built from a common stem rather than with
        # str.strip(".png"), which strips a *set of characters* from both
        # ends and only yields the intended name by coincidence.
        # ("neighboorhod" is kept as is so existing result paths still match.)
        stem_20 = log_file + ("-neighboorhod%.3d-020" % k)
        stem_100 = log_file + ("-neighboorhod%.3d-100" % k)
        graph_20[k] = stem_20 + ".png"
        graph_100[k] = stem_100 + ".png"
        comment_20[k] = stem_20 + ".jpg.comment"
        comment_100[k] = stem_100 + ".jpg.comment"

        with open(comment_20[k], 'w') as f:
            f.write("# %s\n" % sample_str)
            f.write("# strategy %s\n# threshold 20\n# iterations %d\n\n" %
                    (cfg.strategy, iterations))
            f.write("# neighboorhood\tprofile\tp_20\tc_20\n\n")
        with open(comment_100[k], 'w') as f:
            f.write("# %s\n" % sample_str)
            f.write("# strategy %s\n# threshold 100\n# iterations %d\n\n" %
                    (cfg.strategy, iterations))
            f.write("# neighboorhood\tprofile\tf05_100\tc_100\n\n")

        c_20[k] = {}
        c_100[k] = {}
        p_20_summary[k] = {}
        f05_100_summary[k] = {}
        for size in profile_size:
            c_20[k][size] = set()
            c_100[k][size] = set()
            p_20_summary[k][size] = []
            f05_100_summary[k][size] = []
            with open(log_file + "-neighboorhood%.3d-profile%.3d" % (k, size),
                      'w') as f:
                f.write("# %s\n" % sample_str)
                f.write("# strategy %s-neighboorhood%.3d-profile%.3d\n\n" %
                        (cfg.strategy, k, size))
                f.write("# p_20\t\tf05_100\n\n")

    # main loop per user
    for submission_file in population_sample:
        user = PopconSystem(submission_file)
        user.filter_pkg_profile(cfg.pkgs_filter)
        user.maximal_pkg_profile()
        for k in neighbor_size:
            cfg.k_neighbors = k
            for size in profile_size:
                cfg.profile_size = size
                rec = Recommender(cfg)
                repo_size = rec.items_repository.get_doccount()
                p_20 = []
                f05_100 = []
                for n in range(iterations):
                    # Hold out 90% of the profile as the "real" answer and
                    # recommend from the remaining 10%.
                    item_score = dict((pkg, user.item_score[pkg])
                                      for pkg in user.pkg_profile)
                    sample_size = int(len(user.pkg_profile) * 0.9)
                    # random.sample draws without replacement in one pass,
                    # replacing a choice()-and-pop loop over keys().
                    held_out = random.sample(list(item_score), sample_size)
                    sample = dict((pkg, item_score.pop(pkg))
                                  for pkg in held_out)
                    iteration_user = User(item_score)
                    recommendation = rec.get_recommendation(iteration_user,
                                                            repo_size)
                    if hasattr(recommendation, "ranking"):
                        ranking = recommendation.ranking
                        real = RecommendationResult(sample)
                        predicted_20 = RecommendationResult(
                            dict.fromkeys(ranking[:20], 1))
                        evaluation = Evaluation(predicted_20, real, repo_size)
                        p_20.append(evaluation.run(Precision()))
                        predicted_100 = RecommendationResult(
                            dict.fromkeys(ranking[:100], 1))
                        evaluation = Evaluation(predicted_100, real, repo_size)
                        f05_100.append(evaluation.run(F_score(0.5)))
                        c_20[k][size].update(ranking[:20])
                        c_100[k][size].update(ranking[:100])
                # save summary -- p_20 and f05_100 are filled together, so a
                # single check covers both.  Users with no recommendation get
                # no row instead of crashing on a division by zero.
                if p_20:
                    p_20_summary[k][size].append(_mean(p_20))
                    f05_100_summary[k][size].append(_mean(f05_100))
                    with open(log_file + "-neighboorhood%.3d-profile%.3d" %
                              (k, size), 'a') as f:
                        f.write("%.4f\t\t%.4f\n" %
                                (_mean(p_20), _mean(f05_100)))

    # back to main flow
    coverage_20 = {}
    coverage_100 = {}
    for k in neighbor_size:
        coverage_20[k] = {}
        coverage_100[k] = {}
        with open(comment_20[k], 'a') as f:
            for size in profile_size:
                coverage_20[k][size] = len(c_20[k][size]) / float(repo_size)
                f.write("%3d\t\t%3d\t\t%.4f\t%.4f\n" %
                        (k, size, _mean(p_20_summary[k][size]),
                         coverage_20[k][size]))
        with open(comment_100[k], 'a') as f:
            for size in profile_size:
                coverage_100[k][size] = len(c_100[k][size]) / float(repo_size)
                f.write("%3d\t\t%3d\t\t%.4f\t%.4f\n" %
                        (k, size, _mean(f05_100_summary[k][size]),
                         coverage_100[k][size]))

    # plot results summary, one pair of graphs per neighborhood size
    for k in neighbor_size:
        _plot_metric("Setup: %s-neighboorhood%3d (threshold 20)" %
                     (cfg.strategy, k),
                     "Precision", p_20_summary[k], coverage_20[k],
                     graph_20[k])
        _plot_metric("Setup: %s-neighboorhood%3d (threshold 100)" %
                     (cfg.strategy, k),
                     "F05", f05_100_summary[k], coverage_100[k],
                     graph_100[k])
... ...
src/experiments/k-suite.py
1 1 #!/usr/bin/env python
2 2 """
3   - recommender suite - recommender experiments suite
  3 + k-suite - experiment different neighborhood sizes
4 4 """
5 5 __author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
6 6 __copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
... ... @@ -31,25 +31,38 @@ import random
31 31 import Gnuplot
32 32 import numpy
33 33  
def plot_roc(k,roc_points,log_file):
    """Draw the ROC points collected for neighborhood size k.

    The chart shows the diagonal from (0,0) to (1,1) as a line plus the
    given roc_points, and is saved next to log_file as
    "<log_file>-k<k>.png" and "<log_file>-k<k>.ps".
    """
    setup_name = log_file.split("/")[-1]
    output_base = log_file+("-k%.3d"%k)

    chart = Gnuplot.Gnuplot()
    chart('set style data points')
    chart.xlabel('False Positive Rate')
    chart.ylabel('True Positive Rate')
    # Both rates are ratios, so both axes are pinned to [0,1].
    for axis_range in ('set xrange [0:1.0]', 'set yrange [0:1.0]'):
        chart(axis_range)
    chart.title("Setup: %s-k%d" % (setup_name,k))

    diagonal = Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7")
    measured = Gnuplot.Data(roc_points)
    chart.plot(diagonal,measured)

    chart.hardcopy(output_base+".png",terminal="png")
    chart.hardcopy(output_base+".ps",terminal="postscript",enhanced=1,color=1)
46 46  
def _mean_by_key(summary):
    """Return [[key, mean(values)], ...] sorted by key.

    summary maps a key (here the neighborhood size k) to a list of numbers.
    An empty list yields a mean of 0.0 instead of a division by zero.
    """
    points = []
    for key in sorted(summary):
        values = summary[key]
        mean = float(sum(values))/len(values) if values else 0.0
        points.append([key,mean])
    return points


def plot_summary(precision,f05,mcc,log_file):
    """Plot mean precision, F0.5 and MCC against the neighborhood size.

    precision, f05 and mcc each map a neighborhood size k to the list of
    values collected for that k.  The plot is saved as "<log_file>.png"
    and "<log_file>.ps".
    """
    g = Gnuplot.Gnuplot()
    g('set style data lines')
    g.xlabel('Neighborhood (k)')
    g.title("Setup: %s-size20" % (log_file.split("/")[-1]))
    # With the "lines" style, points must be ordered by k; dict iteration
    # order is arbitrary and would draw a zigzag.
    g.plot(Gnuplot.Data(_mean_by_key(precision),title="P"),
           Gnuplot.Data(_mean_by_key(f05),title="F05"),
           Gnuplot.Data(_mean_by_key(mcc),title="MCC"))
    g.hardcopy(log_file+(".png"),terminal="png")
    g.hardcopy(log_file+(".ps"),terminal="postscript",enhanced=1,color=1)
  57 +
47 58 class ExperimentResults:
48 59 def __init__(self,repo_size):
49 60 self.repository_size = repo_size
50 61 self.precision = []
51 62 self.recall = []
52 63 self.fpr = []
  64 + self.f05 = []
  65 + self.mcc = []
53 66  
54 67 def add_result(self,ranking,sample):
55 68 predicted = RecommendationResult(dict.fromkeys(ranking,1))
... ... @@ -58,49 +71,72 @@ class ExperimentResults:
58 71 self.precision.append(evaluation.run(Precision()))
59 72 self.recall.append(evaluation.run(Recall()))
60 73 self.fpr.append(evaluation.run(FPR()))
  74 + self.f05.append(evaluation.run(F_score(0.5)))
  75 + self.mcc.append(evaluation.run(MCC()))
61 76  
62   - # Average ROC by threshold (whici is the size)
63 77 def get_roc_point(self):
64 78 tpr = self.recall
65 79 fpr = self.fpr
  80 + if not tpr or not fpr:
  81 + return [0,0]
66 82 return [sum(fpr)/len(fpr),sum(tpr)/len(tpr)]
67 83  
68 84 def get_precision_summary(self):
  85 + if not self.precision: return 0
69 86 return sum(self.precision)/len(self.precision)
70 87  
71   - def get_recall_summary(self):
72   - return sum(self.recall)/len(self.recall)
  88 + def get_f05_summary(self):
  89 + if not self.f05: return 0
  90 + return sum(self.f05)/len(self.f05)
  91 +
  92 + def get_mcc_summary(self):
  93 + if not self.mcc: return 0
  94 + return sum(self.mcc)/len(self.mcc)
73 95  
74 96 if __name__ == '__main__':
75   - # experiment parameters
  97 + if len(sys.argv)<3:
  98 + print "Usage: k-suite strategy_str sample_file"
  99 + exit(1)
76 100 threshold = 20
77 101 iterations = 30
78   - sample_file = "results/misc-popcon/sample-050-100"
79 102 neighbors = [3,5,10,50,100,150,200,300,400,500]
80 103 cfg = Config()
81   - cfg.strategy = "knn"
82   - print cfg.popcon_index
83   - sample = []
  104 + cfg.strategy = sys.argv[1]
  105 + sample_file = sys.argv[2]
  106 + population_sample = []
84 107 with open(sample_file,'r') as f:
85 108 for line in f.readlines():
86 109 user_id = line.strip('\n')
87   - sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
  110 + population_sample.append(os.path.join(cfg.popcon_dir,user_id[:2],user_id))
88 111 # setup dictionaries and files
89   - roc_points = {}
  112 + roc_summary = {}
90 113 recommended = {}
91   - precisions = {}
92   - aucs = {}
93   - log_file = "results/k-suite/sample-050-100/%s" % (cfg.strategy)
  114 + precision_summary = {}
  115 + f05_summary = {}
  116 + mcc_summary = {}
  117 + sample_dir = ("results/k-suite/%s" % sample_file.split('/')[-1])
  118 + if not os.path.exists(sample_dir):
  119 + os.makedirs(sample_dir)
  120 + log_file = os.path.join(sample_dir,cfg.strategy)
  121 + with open(log_file,'w') as f:
  122 + f.write("# %s\n\n" % sample_file.split('/')[-1])
  123 + f.write("# strategy %s recommendation_size %d iterations %d\n\n" %
  124 + (cfg.strategy,threshold,iterations))
  125 + f.write("# k coverage \tprecision \tf05 \tmcc\n\n")
  126 +
94 127 for k in neighbors:
95   - roc_points[k] = []
  128 + roc_summary[k] = []
96 129 recommended[k] = set()
97   - precisions[k] = []
98   - aucs[k] = []
  130 + precision_summary[k] = []
  131 + f05_summary[k] = []
  132 + mcc_summary[k] = []
99 133 with open(log_file+"-k%.3d"%k,'w') as f:
  134 + f.write("# %s\n\n" % sample_file.split('/')[-1])
100 135 f.write("# strategy-k %s-k%.3d\n\n" % (cfg.strategy,k))
101   - f.write("# roc_point \tp(20) \tauc\n\n")
  136 + f.write("# roc_point \tprecision \tf05 \tmcc\n\n")
  137 +
102 138 # main loop per user
103   - for submission_file in sample:
  139 + for submission_file in population_sample:
104 140 user = PopconSystem(submission_file)
105 141 user.filter_pkg_profile(cfg.pkgs_filter)
106 142 user.maximal_pkg_profile()
... ... @@ -112,12 +148,12 @@ if __name__ == &#39;__main__&#39;:
112 148 # n iterations for same recommender and user
113 149 for n in range(iterations):
114 150 # Fill sample profile
115   - profile_size = len(user.pkg_profile)
  151 + profile_len = len(user.pkg_profile)
116 152 item_score = {}
117 153 for pkg in user.pkg_profile:
118 154 item_score[pkg] = user.item_score[pkg]
119 155 sample = {}
120   - sample_size = int(profile_size*0.9)
  156 + sample_size = int(profile_len*0.9)
121 157 for i in range(sample_size):
122 158 key = random.choice(item_score.keys())
123 159 sample[key] = item_score.pop(key)
... ... @@ -125,28 +161,26 @@ if __name__ == &#39;__main__&#39;:
125 161 recommendation = rec.get_recommendation(iteration_user,threshold)
126 162 if hasattr(recommendation,"ranking"):
127 163 results.add_result(recommendation.ranking,sample)
128   - print "ranking",recommendation.ranking
129   - print "recommended_%d"%k,recommended[k]
130 164 recommended[k] = recommended[k].union(recommendation.ranking)
131   - print recommended[k]
132 165 # save summary
133 166 roc_point = results.get_roc_point()
134   - auc = numpy.trapz(y=[0,roc_point[1],1],x=[0,roc_point[0],1])
135   - p_20 = results.get_precision_summary()
136   - roc_points[k].append(roc_point)
137   - aucs[k].append(auc)
138   - precisions[k].append(p_20)
  167 + roc_summary[k].append(roc_point)
  168 + precision = results.get_precision_summary()
  169 + precision_summary[k].append(precision)
  170 + f05 = results.get_f05_summary()
  171 + f05_summary[k].append(f05)
  172 + mcc = results.get_mcc_summary()
  173 + mcc_summary[k].append(mcc)
139 174 with open(log_file+"-k%.3d"%k,'a') as f:
140   - f.write("%s \t%.2f \t%.4f\n" % (str(roc_point),p_20,auc))
  175 + f.write("[%.2f,%.2f] \t%.4f \t%.4f \t%.4f\n" %
  176 + (roc_point[0],roc_point[1],precision,f05,mcc))
141 177 # back to main flow
142   - with open(log_file,'w') as f:
143   - f.write("# k coverage \tp(20) \tauc\n\n")
  178 + with open(log_file,'a') as f:
  179 + plot_summary(precision_summary,f05_summary,mcc_summary,log_file)
144 180 for k in neighbors:
145   - print "len_recommended_%d"%k,len(recommended[k])
146   - print "repo_size",repo_size
147   - coverage = len(recommended[k])/float(repo_size)
148   - print coverage
149   - f.write("%d \t%.2f \t%.2f \t%.2fi\n" %
150   - (k,coverage,float(sum(precisions[k]))/len(precisions[k]),
151   - float(sum(aucs[k]))/len(aucs[k])))
152   - plot_roc(k,roc_points[k],log_file)
  181 + coverage = len(recommended[size])/float(repo_size)
  182 + f.write("%3d \t%.2f \t%.4f \t%.4f \t%.4f\n" %
  183 + (k,coverage,float(sum(precision_summary[k]))/len(precision_summary[k]),
  184 + float(sum(f05_summary[k]))/len(f05_summary[k]),
  185 + float(sum(mcc_summary[k]))/len(mcc_summary[k])))
  186 + plot_roc(k,roc_summary[k],log_file)
... ...
src/experiments/pure.py 0 → 100755
... ... @@ -0,0 +1,199 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 + profile-suite - experiment different profile sizes
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +
  22 +import sys
  23 +sys.path.insert(0,'../')
  24 +from config import Config
  25 +from data import PopconXapianIndex, PopconSubmission
  26 +from recommender import Recommender
  27 +from user import LocalSystem, User
  28 +from evaluation import *
  29 +import logging
  30 +import random
  31 +import Gnuplot
  32 +import numpy
  33 +
def _mean(values):
    """Return the arithmetic mean of `values` as a float, or 0.0 if empty."""
    if not values:
        return 0.0
    return float(sum(values)) / len(values)


def _plot_metric(title, x_label, metric_label, metric_summary, coverage,
                 output_png):
    """Plot the mean of one metric and the coverage against the tested size.

    metric_summary maps size -> list of per-user values and coverage maps
    size -> coverage ratio.  The plot is saved as a PNG at output_png.
    """
    g = Gnuplot.Gnuplot()
    g('set style data lines')
    g('set yrange [0:1.0]')
    g.xlabel(x_label)
    g.title(title)
    g.plot(Gnuplot.Data(sorted([size, _mean(values)]
                               for size, values in metric_summary.items()),
                        title=metric_label),
           Gnuplot.Data(sorted([size, ratio]
                               for size, ratio in coverage.items()),
                        title="Coverage"))
    g.hardcopy(output_png, terminal="png")


def _convert_to_jpg(png_path, jpg_path):
    """Run `convert -quality 100 png_path jpg_path`; report if it cannot run."""
    import subprocess
    try:
        # Argument list, no shell: file names are passed through verbatim.
        subprocess.call(["convert", "-quality", "100", png_path, jpg_path])
    except OSError as error:
        print("Could not run convert on %s: %s" % (png_path, error))


if __name__ == '__main__':
    # Names used below that the script's import block does not provide.
    import os
    # NOTE(review): PopconSystem is used below but was never imported by this
    # script; it is presumed to live in the `user` module next to
    # LocalSystem/User -- confirm.
    from user import PopconSystem

    usage = "Usage: profile-suite strategy_category sample_file"
    # Both the category and the sample file are required (argv[2] is read).
    if len(sys.argv) < 3:
        print(usage)
        sys.exit(1)

    iterations = 20
    profile_size = [10, 20, 40, 70, 100, 140, 170, 200, 240]
    neighbor_size = [3, 5, 10, 50, 100, 150, 200, 300, 400, 500]

    content_strategies = ['cb', 'cbt', 'cbd', 'cbh',
                          'cb_eset', 'cbt_eset', 'cbd_eset', 'cbh_eset']
    collaborative_strategies = ['knn_eset']

    # The category selects both the strategies to run and which parameter
    # (profile size or neighborhood size) is swept.
    strategy_category = sys.argv[1]
    if strategy_category == "content":
        strategies = content_strategies
        sizes = profile_size
        option_str = "profile"
    elif strategy_category == "collaborative":
        strategies = collaborative_strategies
        sizes = neighbor_size
        option_str = "neighborhood"
    else:
        print(usage)
        sys.exit(1)

    cfg = Config()
    sample_file = sys.argv[2]
    sample_str = sample_file.split('/')[-1]

    # One popcon submission path per non-empty line of the sample file.
    population_sample = []
    with open(sample_file, 'r') as f:
        for line in f:
            user_id = line.strip('\n')
            if not user_id:
                continue
            population_sample.append(
                os.path.join(cfg.popcon_dir, user_id[:2], user_id))
    if not population_sample:
        # Without users repo_size is never set and nothing can be summarised.
        print("No submissions listed in %s" % sample_file)
        sys.exit(1)

    sample_dir = "results/%s/%s" % (strategy_category, sample_str)
    if not os.path.exists(sample_dir):
        os.makedirs(sample_dir)

    for strategy in strategies:
        cfg.strategy = strategy
        p_20_summary = {}       # size -> per-user mean precision of the top 20
        f05_100_summary = {}    # size -> per-user mean F0.5 of the top 100
        c_20 = {}               # size -> packages ever recommended in a top 20
        c_100 = {}              # size -> packages ever recommended in a top 100

        log_file = os.path.join(sample_dir, sample_str + "-" + cfg.strategy)
        # File names are built from a common stem rather than with
        # str.strip(".png"), which strips a *set of characters* from both
        # ends and only yields the intended name by coincidence.
        graph_20 = log_file + "-20.png"
        graph_100 = log_file + "-100.png"
        graph_20_jpg = log_file + "-20.jpg"
        graph_100_jpg = log_file + "-100.jpg"
        comment_20 = graph_20_jpg + ".comment"
        comment_100 = graph_100_jpg + ".comment"

        with open(comment_20, 'w') as f:
            f.write("# sample %s\n" % sample_str)
            f.write("# strategy %s\n# threshold 20\n# iterations %d\n\n" %
                    (cfg.strategy, iterations))
            f.write("# %s\tp_20\tc_20\n\n" % option_str)
        with open(comment_100, 'w') as f:
            f.write("# sample %s\n" % sample_str)
            f.write("# strategy %s\n# threshold 100\n# iterations %d\n\n" %
                    (cfg.strategy, iterations))
            f.write("# %s\t\tf05_100\t\tc_100\n\n" % option_str)

        for size in sizes:
            c_20[size] = set()
            c_100[size] = set()
            p_20_summary[size] = []
            f05_100_summary[size] = []
            with open(log_file + "-%s%.3d" % (option_str, size), 'w') as f:
                f.write("# sample %s\n" % sample_str)
                f.write("# strategy %s-%s%.3d\n\n" %
                        (cfg.strategy, option_str, size))
                f.write("# p_20\tf05_100\n\n")

        # main loop per user
        for submission_file in population_sample:
            user = PopconSystem(submission_file)
            user.filter_pkg_profile(cfg.pkgs_filter)
            user.maximal_pkg_profile()
            for size in sizes:
                # Both knobs are set; which one matters depends on the
                # strategy category being run.
                cfg.profile_size = size
                cfg.k_neighbors = size
                rec = Recommender(cfg)
                repo_size = rec.items_repository.get_doccount()
                p_20 = []
                f05_100 = []
                for n in range(iterations):
                    # Hold out 90% of the profile as the "real" answer and
                    # recommend from the remaining 10%.
                    item_score = dict((pkg, user.item_score[pkg])
                                      for pkg in user.pkg_profile)
                    sample_size = int(len(user.pkg_profile) * 0.9)
                    # random.sample draws without replacement in one pass,
                    # replacing a choice()-and-pop loop over keys().
                    held_out = random.sample(list(item_score), sample_size)
                    sample = dict((pkg, item_score.pop(pkg))
                                  for pkg in held_out)
                    iteration_user = User(item_score)
                    recommendation = rec.get_recommendation(iteration_user,
                                                            repo_size)
                    if hasattr(recommendation, "ranking"):
                        ranking = recommendation.ranking
                        real = RecommendationResult(sample)
                        predicted_20 = RecommendationResult(
                            dict.fromkeys(ranking[:20], 1))
                        evaluation = Evaluation(predicted_20, real, repo_size)
                        p_20.append(evaluation.run(Precision()))
                        predicted_100 = RecommendationResult(
                            dict.fromkeys(ranking[:100], 1))
                        evaluation = Evaluation(predicted_100, real, repo_size)
                        f05_100.append(evaluation.run(F_score(0.5)))
                        c_20[size].update(ranking[:20])
                        c_100[size].update(ranking[:100])
                # save summary -- p_20 and f05_100 are filled together, so a
                # single check covers both.  Users with no recommendation get
                # no row instead of crashing on a division by zero.
                if p_20:
                    p_20_summary[size].append(_mean(p_20))
                    f05_100_summary[size].append(_mean(f05_100))
                    with open(log_file + "-%s%.3d" % (option_str, size),
                              'a') as f:
                        f.write("%.4f \t%.4f\n" %
                                (_mean(p_20), _mean(f05_100)))

        # back to main flow
        coverage_20 = {}
        coverage_100 = {}
        with open(comment_20, 'a') as f:
            for size in sizes:
                coverage_20[size] = len(c_20[size]) / float(repo_size)
                f.write("%3d\t\t%.4f\t\t%.4f\n" %
                        (size, _mean(p_20_summary[size]), coverage_20[size]))
        with open(comment_100, 'a') as f:
            for size in sizes:
                coverage_100[size] = len(c_100[size]) / float(repo_size)
                f.write("%3d\t\t%.4f\t\t%.4f\n" %
                        (size, _mean(f05_100_summary[size]),
                         coverage_100[size]))

        # plot results summary
        x_label = '%s size' % option_str.capitalize()
        _plot_metric("Setup: %s (threshold 20)" % cfg.strategy, x_label,
                     "Precision", p_20_summary, coverage_20, graph_20)
        # The threshold-20 jpg comes from the threshold-20 png, at the same
        # quality as the threshold-100 one.
        _convert_to_jpg(graph_20, graph_20_jpg)
        _plot_metric("Setup: %s (threshold 100)" % cfg.strategy, x_label,
                     "F05", f05_100_summary, coverage_100, graph_100)
        _convert_to_jpg(graph_100, graph_100_jpg)
... ...
src/experiments/roc-suite.py
... ... @@ -43,11 +43,11 @@ import numpy
43 43  
44 44 iterations = 30
45 45 sample_proportions = [0.9]
46   -weighting = [('bm25',1.0),('bm25',1.2),('bm25',2.0),('trad',0)]
  46 +weighting = [('bm25',1.0)]
47 47 content_based = ['cb','cbt','cbd','cbh','cb_eset','cbt_eset','cbd_eset','cbh_eset']
48 48 collaborative = ['knn_eset','knn','knn_plus']
49 49 hybrid = ['knnco','knnco_eset']
50   -profile_size = range(20,200,20)
  50 +profile_size = range(20,200,40)
51 51 neighbors = range(10,510,50)
52 52  
53 53 def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file):
... ... @@ -73,7 +73,7 @@ def write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_fi
73 73 output.write(pkg+"\n")
74 74 output.close()
75 75  
76   -def plot_roc(roc_points,auc,eauc,c,p,log_file):
  76 +def plot_roc(roc_points,eauc,c,p,log_file):
77 77 g = Gnuplot.Gnuplot()
78 78 g('set style data lines')
79 79 g.xlabel('False Positive Rate')
... ... @@ -83,52 +83,27 @@ def plot_roc(roc_points,auc,eauc,c,p,log_file):
83 83 g.title("Setup: %s" % log_file.split("/")[-1])
84 84 g('set label "C %.2f" at 0.8,0.25' % c)
85 85 g('set label "P(20) %.2f" at 0.8,0.2' % p)
86   - g('set label "AUC %.4f" at 0.8,0.15' % auc)
87   - g('set label "EAUC %.4f" at 0.8,0.1' % eauc)
  86 + g('set label "AUC %.4f" at 0.8,0.15' % eauc)
88 87 g.plot(Gnuplot.Data(roc_points,title="ROC"),
89   - Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7"),
90   - Gnuplot.Data([roc_points[-1],[1,1]],with_="lines lt 6"))
  88 + Gnuplot.Data([[0,0],[1,1]],with_="lines lt 7"))
  89 + #Gnuplot.Data([roc_points[-1],[1,1]],with_="lines lt 6"))
91 90 g.hardcopy(log_file+"-roc.png",terminal="png")
92 91 g.hardcopy(log_file+"-roc.ps",terminal="postscript",enhanced=1,color=1)
93 92  
94   -def plot_summary(precision,recall,f1,f05,accuracy,log_file):
95   - # Plot metrics summary
96   - g = Gnuplot.Gnuplot()
97   - g('set style data lines')
98   - g.xlabel('Recommendation size')
99   - g.title("Setup: %s" % log_file.split("/")[-1])
100   - g.plot(Gnuplot.Data(accuracy,title="Accuracy"),
101   - Gnuplot.Data(precision,title="Precision"),
102   - Gnuplot.Data(recall,title="Recall"),
103   - Gnuplot.Data(f1,title="F_1"),
104   - Gnuplot.Data(f05,title="F_0.5"))
105   - g.hardcopy(log_file+".png",terminal="png")
106   - g.hardcopy(log_file+".ps",terminal="postscript",enhanced=1,color=1)
107   - g('set logscale x')
108   - g('replot')
109   - g.hardcopy(log_file+"-logscale.png",terminal="png")
110   - g.hardcopy(log_file+"-logscale.ps",terminal="postscript",enhanced=1,color=1)
111   -
def get_label(cfg,sample_proportion):
    """Build the label dictionary describing the setup held in cfg.

    For a known strategy the result has a "description" entry naming the
    label fields and a "values" entry with those fields filled in from cfg.
    For an unknown strategy a message is printed and an empty dictionary
    is returned.  sample_proportion is accepted but not used here.
    """
    strategy = cfg.strategy
    if strategy in content_based:
        return {"description": "strategy-profile",
                "values": "%s-profile%.3d" % (strategy,cfg.profile_size)}
    if strategy in collaborative:
        return {"description": "strategy-knn",
                "values": "%s-k%.3d" % (strategy,cfg.k_neighbors)}
    if strategy in hybrid:
        return {"description": "strategy-knn-profile",
                "values": "%s-k%.3d-profile%.3d" %
                          (strategy,cfg.k_neighbors,cfg.profile_size)}
    print("Unknown strategy")
    return {}
... ... @@ -136,41 +111,28 @@ def get_label(cfg,sample_proportion):
136 111 class ExperimentResults:
137 112 def __init__(self,repo_size):
138 113 self.repository_size = repo_size
139   - self.accuracy = {}
140 114 self.precision = {}
141 115 self.recall = {}
142   - self.f1 = {}
143   - self.f05 = {}
144 116 self.fpr = {}
145   - #points = [1]+range(10,200,10)+range(200,self.repository_size,100)
146 117 points = [1]+range(10,self.repository_size,10)
147 118 self.recommended = set()
148 119 for size in points:
149   - self.accuracy[size] = []
150 120 self.precision[size] = []
151 121 self.recall[size] = []
152   - self.f1[size] = []
153   - self.f05[size] = []
154 122 self.fpr[size] = []
155 123  
156 124 def add_result(self,ranking,sample):
157   - print "len_recommended", len(self.recommended)
158   - print "len_rank", len(ranking)
159 125 self.recommended = self.recommended.union(ranking)
160   - print "len_recommended", len(self.recommended)
161 126 # get data only for point
162   - for size in self.accuracy.keys():
  127 + for size in self.precision.keys():
163 128 predicted = RecommendationResult(dict.fromkeys(ranking[:size],1))
164 129 real = RecommendationResult(sample)
165 130 evaluation = Evaluation(predicted,real,self.repository_size)
166   - #self.accuracy[size].append(evaluation.run(Accuracy()))
167 131 self.precision[size].append(evaluation.run(Precision()))
168 132 self.recall[size].append(evaluation.run(Recall()))
169   - #self.f1[size].append(evaluation.run(F_score(1)))
170   - #self.f05[size].append(evaluation.run(F_score(0.5)))
171 133 self.fpr[size].append(evaluation.run(FPR()))
172 134  
173   - # Average ROC by threshold (whici is the size)
  135 + # Average ROC by threshold (= size of recommendation)
174 136 def get_roc_points(self):
175 137 points = []
176 138 for size in self.recall.keys():
... ... @@ -179,38 +141,6 @@ class ExperimentResults:
179 141 points.append([sum(fpr)/len(fpr),sum(tpr)/len(tpr)])
180 142 return sorted(points)
181 143  
182   - def get_precision_summary(self):
183   - summary = [[size,sum(values)/len(values)] for size,values in self.precision.items()]
184   - return sorted(summary)
185   -
186   - def get_recall_summary(self):
187   - summary = [[size,sum(values)/len(values)] for size,values in self.recall.items()]
188   - return sorted(summary)
189   -
190   - def get_f1_summary(self):
191   - summary = [[size,sum(values)/len(values)] for size,values in self.f1.items()]
192   - return sorted(summary)
193   -
194   - def get_f05_summary(self):
195   - summary = [[size,sum(values)/len(values)] for size,values in self.f05.items()]
196   - return sorted(summary)
197   -
198   - def get_accuracy_summary(self):
199   - summary = [[size,sum(values)/len(values)] for size,values in self.accuracy.items()]
200   - return sorted(summary)
201   -
202   - def best_precision(self):
203   - size = max(self.precision, key = lambda x: max(self.precision[x]) and x>10)
204   - return (size,max(self.precision[size]))
205   -
206   - def best_f1(self):
207   - size = max(self.f1, key = lambda x: max(self.f1[x]))
208   - return (size,max(self.f1[size]))
209   -
210   - def best_f05(self):
211   - size = max(self.f05, key = lambda x: max(self.f05[x]))
212   - return (size,max(self.f05[size]))
213   -
214 144 def run_strategy(cfg,user):
215 145 for weight in weighting:
216 146 cfg.weight = weight[0]
... ... @@ -220,22 +150,24 @@ def run_strategy(cfg,user):
220 150 for proportion in sample_proportions:
221 151 results = ExperimentResults(repo_size)
222 152 label = get_label(cfg,proportion)
223   - #log_file = "results/20110906/4a67a295/"+label["values"]
224   - log_file = "results/"+label["values"]
  153 + user_dir = ("results/roc-suite/%s" % user.user_id[:8])
  154 + if not os.path.exists(user_dir):
  155 + os.mkdir(user_dir)
  156 + log_file = os.path.join(user_dir,label["values"])
225 157 for n in range(iterations):
226 158 # Fill sample profile
227   - profile_size = len(user.pkg_profile)
  159 + profile_len = len(user.pkg_profile)
228 160 item_score = {}
229 161 for pkg in user.pkg_profile:
230 162 item_score[pkg] = user.item_score[pkg]
231 163 sample = {}
232   - sample_size = int(profile_size*proportion)
  164 + sample_size = int(profile_len*proportion)
233 165 for i in range(sample_size):
234 166 key = random.choice(item_score.keys())
235 167 sample[key] = item_score.pop(key)
236 168 iteration_user = User(item_score)
237 169 recommendation = rec.get_recommendation(iteration_user,repo_size)
238   - #write_recall_log(label,n,sample,recommendation,profile_size,repo_size,log_file)
  170 + write_recall_log(label,n,sample,recommendation,profile_len,repo_size,log_file)
239 171 if hasattr(recommendation,"ranking"):
240 172 results.add_result(recommendation.ranking,sample)
241 173 with open(log_file,'w') as f:
... ... @@ -247,32 +179,12 @@ def run_strategy(cfg,user):
247 179 numpy.trapz(y=[0,roc_points[0][1]],x=[0,roc_points[0][0]])+
248 180 numpy.trapz(y=[roc_points[-1][1],1],x=[roc_points[-1][0],1]))
249 181 precision_20 = sum(results.precision[10])/len(results.precision[10])
250   - print results.recommended
251   - print "len",len(results.recommended)
252 182 coverage = len(results.recommended)/float(repo_size)
253   - print "repo_size: ", float(repo_size)
254   - print coverage
255   - exit(1)
256   - #f1_10 = sum(results.f1[10])/len(results.f1[10])
257   - #f05_10 = sum(results.f05[10])/len(results.f05[10])
258 183 f.write("# %s\n# %s\n\n" %
259 184 (label["description"],label["values"]))
260 185 f.write("# coverage \tp(20) \tauc \teauc\n\t%.2f \t%.2f \t%.4f \t%.4f\n\n" %
261 186 (coverage,precision_20,auc,eauc))
262   - #f.write("# best results (recommendation size; metric)\n")
263   - #f.write("precision (%d; %.2f)\nf1 (%d; %.2f)\nf05 (%d; %.2f)\n\n" %
264   - # (results.best_precision()[0],results.best_precision()[1],
265   - # results.best_f1()[0],results.best_f1()[1],
266   - # results.best_f05()[0],results.best_f05()[1]))
267   - #f.write("# recommendation size 10\nprecision (10; %.2f)\nf1 (10; %.2f)\nf05 (10; %.2f)" %
268   - # (precision_10,f1_10,f05_10))
269   - #precision = results.get_precision_summary()
270   - #recall = results.get_recall_summary()
271   - #f1 = results.get_f1_summary()
272   - #f05 = results.get_f05_summary()
273   - #accuracy = results.get_accuracy_summary()
274   - #plot_summary(precision,recall,f1,f05,accuracy,log_file)
275   - plot_roc(roc_points,auc,eauc,coverage,precision_20,log_file)
  187 + plot_roc(roc_points,eauc,coverage,precision_20,log_file)
276 188  
277 189 def run_content(user,cfg):
278 190 for strategy in content_based:
... ... @@ -288,10 +200,6 @@ def run_collaborative(user,cfg):
288 200 cfg.strategy = strategy
289 201 for k in neighbors:
290 202 cfg.k_neighbors = k
291   - #for size in popcon_size:
292   - # if size:
293   - # cfg.popcon_desktopapps = popcon_desktopapps+"_"+size
294   - # cfg.popcon_programs = popcon_programs+"_"+size
295 203 run_strategy(cfg,user)
296 204  
297 205 def run_hybrid(user,cfg):
... ... @@ -301,28 +209,23 @@ def run_hybrid(user,cfg):
301 209 cfg.strategy = strategy
302 210 for k in neighbors:
303 211 cfg.k_neighbors = k
304   - #for size in popcon_size:
305   - # if size:
306   - # cfg.popcon_desktopapps = popcon_desktopapps+"_"+size
307   - # cfg.popcon_programs = popcon_programs+"_"+size
308 212 for size in profile_size:
309 213 cfg.profile_size = size
310 214 run_strategy(cfg,user)
311 215  
if __name__ == '__main__':
    # Command-line entry point:
    #   roc-suite popcon_submission_path [content|collaborative|hybrid]
    # Loads the given popcon submission as the user and runs the selected
    # strategy families; with no selector, all three are run.
    usage = "Usage: roc-suite popcon_submission_path [content|collaborative|hybrid]"
    runners = (("content", run_content),
               ("collaborative", run_collaborative),
               ("hybrid", run_hybrid))
    known = [entry[0] for entry in runners]

    # Only the arguments after the submission path select strategies, so
    # neither the script name nor the path can be mistaken for a selector.
    selected = sys.argv[2:]
    unknown = [arg for arg in selected if arg not in known]
    if len(sys.argv) < 2 or unknown:
        # An unrecognised selector would otherwise load the profile and
        # then silently run nothing.
        print(usage)
        sys.exit(1)

    cfg = Config()
    user = PopconSystem(sys.argv[1])
    user.filter_pkg_profile(cfg.pkgs_filter)
    user.maximal_pkg_profile()

    for name, runner in runners:
        if not selected or name in selected:
            runner(user, cfg)
... ...
src/experiments/sample-popcon-arch.py 0 → 100755
... ... @@ -0,0 +1,44 @@
  1 +#! /usr/bin/env python
  2 +"""
  3 + sample-popcon-arch - extract a sample of a specific arch
  4 +"""
  5 +__author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6 +__copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7 +__license__ = """
  8 + This program is free software: you can redistribute it and/or modify
  9 + it under the terms of the GNU General Public License as published by
  10 + the Free Software Foundation, either version 3 of the License, or
  11 + (at your option) any later version.
  12 +
  13 + This program is distributed in the hope that it will be useful,
  14 + but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 + GNU General Public License for more details.
  17 +
  18 + You should have received a copy of the GNU General Public License
  19 + along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +"""
  21 +import sys
  22 +sys.path.insert(0,'../')
  23 +import xapian
  24 +import os
  25 +import random
  26 +import sys
  27 +from user import RandomPopcon
  28 +
if __name__ == '__main__':
    # Command-line entry point:
    #   sample-popcon-arch size arch popcon_dir pkgs_filter
    # Builds `size` RandomPopcon users for the given arch and writes one
    # user id per line to results/misc-popcon/sample-<arch>-<size>.
    try:
        size = int(sys.argv[1])
        arch = sys.argv[2]
        popcon_dir = sys.argv[3]
        pkgs_filter = sys.argv[4]
    except (IndexError, ValueError):
        # Too few arguments, or a size that is not an integer. Anything
        # else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        print("Usage: sample-popcon-arch size arch popcon_dir pkgs_filter")
        sys.exit(1)

    sample_dir = "results/misc-popcon"
    # Create the output directory on first use instead of failing in open().
    if not os.path.isdir(sample_dir):
        os.makedirs(sample_dir)
    sample_file = os.path.join(sample_dir, "sample-%s-%d" % (arch, size))
    with open(sample_file, 'w') as f:
        for n in range(1, size + 1):
            user = RandomPopcon(popcon_dir, arch, pkgs_filter)
            f.write(user.user_id + '\n')
            # Progress indicator: index of the sample just written.
            print("sample %d" % n)
... ...