import gc
import math
import random
import time
import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 only ships the old module
    from sklearn.cross_validation import train_test_split
gc.collect()
def loadData(fp):
    """Read the CSV at ``fp`` and split its rows into a train and a test frame.

    A new seed is drawn with ``random.randint(1, 2019)`` on every call, so
    repeated calls produce different splits. ``test_size=0.13`` is passed to
    ``train_test_split``.

    Returns:
        The pair ``(traindf, testdf)`` produced by ``train_test_split``.
    """
    frame = pd.read_csv(fp)
    split_seed = random.randint(1, 2019)
    parts = train_test_split(frame, test_size=0.13, random_state=split_seed)
    traindf, testdf = parts
    return traindf, testdf
def get_dict(df):
    """Group the rows of ``df`` into per-user purchase and quote lists.

    Reads the ``user``, ``item`` and ``quote_item`` columns.

    Returns:
        ``{user: {'buy': [...], 'quote': [...]}}`` where each list holds the
        distinct values seen for that user, in order of first appearance.
    """
    grouped = {}
    for user, item, quote in df[['user', 'item', 'quote_item']].values:
        entry = grouped.setdefault(user, {})

        bought = entry.setdefault('buy', [])
        if item not in bought:
            bought.append(item)

        quoted = entry.setdefault('quote', [])
        if quote not in quoted:
            quoted.append(quote)
    return grouped
def get_user_industry(all_data):
    """Map every user in ``all_data`` to an industry.

    Reads the ``user`` and ``user_industry`` columns. When a user appears on
    several rows, the value from the last such row wins.

    Returns:
        ``{user: user_industry}``.
    """
    pairs = all_data[['user', 'user_industry']].values
    return {user: industry for user, industry in pairs}
class MostPopular:
    """Recommend to each test user the items of that user's industry, ranked by popularity.

    Args:
        traindf: training rows with ``user``, ``user_industry``, ``item`` and
            ``last_order_quantity`` columns.
        user_industry: ``{user: industry}`` lookup; must contain every test user.
        test_dict: mapping whose keys are the users to recommend for.
    """

    def __init__(self, traindf, user_industry, test_dict):
        self.traindf = traindf
        self.user_industry = user_industry
        self.test_dict = test_dict

    def get_all(self):
        """Build the ranked recommendation list of every test user.

        An item's popularity inside an industry is the sum of
        ``log(last_order_quantity)`` over that industry's rows, divided by
        ``0.000001 +`` the number of training rows of the item (counted over
        all industries).

        Returns:
            ``{user: [(item, popularity), ...]}`` sorted by descending
            popularity. A user whose industry has no training rows gets ``[]``.
        """
        columns = ['user', 'user_industry', 'item', 'last_order_quantity']
        log_sales = {}     # industry -> {item: summed log(quantity)}
        occurrences = {}   # item -> number of training rows, all industries
        # Each row is a single training record, so `item` is one value here.
        for _, industry, item, quantity in self.traindf[columns].values:
            per_item = log_sales.setdefault(industry, {})
            per_item[item] = per_item.get(item, 0) + math.log(quantity)
            occurrences[item] = occurrences.get(item, 0) + 1

        # pop = sum(log(quantity)) / count
        for per_item in log_sales.values():
            for item in per_item:
                per_item[item] /= (0.000001 + occurrences[item])

        ranked = {
            industry: sorted(per_item.items(), key=lambda kv: kv[1], reverse=True)
            for industry, per_item in log_sales.items()
        }

        recs = {}
        for user in self.test_dict:
            scores = recs.setdefault(user, {})
            industry = self.user_industry[user]
            if industry in ranked:
                for item_id, popularity in ranked[industry]:
                    scores[item_id] = popularity
            # Otherwise the industry never occurs in training: nothing to suggest.

        return {
            user: sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
            for user, scores in recs.items()
        }
class UserCF:
    """User-based recommender: suggest what the other training users bought.

    Args:
        traindf: training rows with ``user``, ``user_label`` (labels joined by
            ``':'``), ``item`` and ``last_order_quantity`` columns.
        train_dict: ``{user: {'buy': [...], ...}}`` built from the same rows.
        test_dict: mapping whose keys are the users to recommend for.
    """

    def __init__(self, traindf, train_dict, test_dict):
        self.traindf = traindf
        self.train = train_dict
        self.test = test_dict

    def get_all(self):
        """Build the ranked recommendation list of every test user.

        Candidates are the items bought by each other training user that the
        target user has not bought. The score of a candidate is the item's mean
        ``log(last_order_quantity)`` over the training rows. The label
        similarity between users only decides the order in which neighbours are
        visited, and therefore the order of equally scored items.

        Returns:
            ``{user: [(item, score), ...]}`` sorted by descending score. Users
            missing from ``train_dict`` get ``[]``.
        """
        # Distinct labels of every user, in order of first appearance.
        user_label = {}
        for user, labels in self.traindf[['user', 'user_label']].values:
            known = user_label.setdefault(user, [])
            for label in labels.split(':'):
                if label not in known:
                    known.append(label)

        # similarity(u, v) = |labels(u) & labels(v)| / sqrt(|labels(u)| * |labels(v)|)
        user_user_sim = {}
        for u, u_label in user_label.items():
            sims = {}
            for v, v_label in user_label.items():
                if u == v:
                    continue
                same = sum(1 for label in u_label if label in v_label)
                sims[v] = same / math.sqrt(len(u_label) * len(v_label))
            user_user_sim[u] = sorted(sims.items(), key=lambda x: x[1], reverse=True)

        # Mean log order quantity per item.
        item_sale = {}
        item_count = {}
        for item, sales in self.traindf[['item', 'last_order_quantity']].values:
            item_sale[item] = item_sale.get(item, 0) + math.log(sales)
            item_count[item] = item_count.get(item, 0) + 1
        for item in item_sale:
            item_sale[item] /= item_count[item]

        recs = {}
        for u in self.test:
            recs[u] = {}
            if u not in self.train:
                # New user: no purchase history to compare against.
                continue
            seen_item = set(self.train[u]['buy'])
            # Every neighbour is used rather than a top-k cut, because the
            # data set is small.
            for v_user, _ in user_user_sim[u]:
                # Read the neighbour's purchases. The previous code indexed
                # with a stale loop variable and so always read the same user.
                for item in self.train[v_user]['buy']:
                    if item not in seen_item:
                        recs[u][item] = item_sale.get(item, 0)

        return {
            u: sorted(scores.items(), key=lambda x: x[1], reverse=True)
            for u, scores in recs.items()
        }
class ItemCF:
    """Item-based recommender driven by the items a user asked a quote for.

    Args:
        traindf: training rows with ``item``, ``item_label`` (labels joined by
            ``':'``) and ``last_order_quantity`` columns.
        train_dict: ``{user: {'buy': [...], 'quote': [...]}}`` from the same rows.
        test_dict: mapping whose keys are the users to recommend for.
    """

    def __init__(self, traindf, train_dict, test_dict):
        self.traindf = traindf
        self.train = train_dict
        self.test = test_dict

    def get_all(self):
        """Build the ranked recommendation list of every test user.

        For each quoted item of a user, every other training item that the user
        has not bought becomes a candidate. The candidate's score is its mean
        ``log(last_order_quantity)``; the label similarity between items only
        fixes the order in which candidates are visited.

        Returns:
            ``{user: [(item, score), ...]}`` sorted by descending score. Users
            missing from ``train_dict`` get ``[]``.
        """
        # Distinct labels of every item, in order of first appearance.
        item_label = {}
        for item, labels in self.traindf[['item', 'item_label']].values:
            known = item_label.setdefault(item, [])
            for label in labels.split(':'):
                if label not in known:
                    known.append(label)
        label_total = {item: len(labels) for item, labels in item_label.items()}

        # similarity = shared labels / sqrt(product of the two label counts)
        item_sim = {}
        for left, left_labels in item_label.items():
            neighbours = item_sim.setdefault(left, {})
            for right, right_labels in item_label.items():
                if left == right:
                    continue
                shared = sum(1 for label in left_labels if label in right_labels)
                weight = shared / math.sqrt(label_total[left] * label_total[right])
                neighbours[right] = neighbours.get(right, 0) + weight
        item_sim = {
            item: sorted(neighbours.items(), key=lambda kv: kv[1], reverse=True)
            for item, neighbours in item_sim.items()
        }

        # Mean log order quantity per item.
        item_sale = {}
        item_count = {}
        for item, sales in self.traindf[['item', 'last_order_quantity']].values:
            item_sale[item] = item_sale.get(item, 0) + math.log(sales)
            item_count[item] = item_count.get(item, 0) + 1
        for item in item_sale:
            item_sale[item] /= item_count[item]

        recs = {}
        for user in self.test:
            scores = recs.setdefault(user, {})
            if user not in self.train:
                # New user: nothing known about them in the training split.
                continue
            profile = self.train[user]
            buy_item = profile['buy']
            for quoted in profile['quote']:
                # A quoted item unseen in training gets an empty neighbour
                # collection, so the loop below simply does nothing for it.
                ranked_neighbours = item_sim.setdefault(quoted, {})
                for candidate, _ in ranked_neighbours:
                    if candidate not in buy_item:
                        scores[candidate] = item_sale[candidate]

        return {
            user: sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
            for user, scores in recs.items()
        }
class Metric:
    """Offline evaluation of one recommender's output against the test split.

    Args:
        all_test_pred: ``{user: [(item, score), ...]}`` as returned by the
            recommenders' ``get_all``.
        train_dict: ``{user: {'buy': [...], ...}}`` built from the training rows.
        test_dict: same layout, built from the held-out rows.
    """

    def __init__(self, all_test_pred, train_dict, test_dict):
        self.rec = all_test_pred
        self.train = train_dict
        self.test = test_dict

    @staticmethod
    def _percent(numerator, denominator):
        """Return ``numerator / denominator`` as a percentage rounded to 2 places.

        An empty denominator yields ``0.0`` instead of raising, so a recommender
        that produced nothing can still be evaluated.
        """
        if not denominator:
            return 0.0
        return round(numerator / denominator * 100, 2)

    def precision_recall(self):
        """Return ``(precision, recall)`` in percent over all test users.

        A hit is a held-out purchase that appears among the user's recommended
        items. Precision divides the hits by the number of recommendations,
        recall by the number of held-out purchases.
        """
        hit = 0
        all_precision = 0
        all_recall = 0
        for user, truth in self.test.items():
            true_rec = truth['buy']
            pred_rec = self.rec.get(user, [])
            # Recommendations are (item, score) pairs; compare on the item only.
            pred_items = {item for item, _ in pred_rec}
            hit += sum(1 for item in true_rec if item in pred_items)
            all_precision += len(pred_rec)  # precision = hit / len(pred_rec)
            all_recall += len(true_rec)     # recall = hit / len(true_rec)
        return self._percent(hit, all_precision), self._percent(hit, all_recall)

    def coverage(self):
        """Return the percentage of training items that get recommended at least once."""
        all_item = set()
        for profile in self.train.values():
            all_item.update(profile['buy'])
        pred_item = set()
        for user in self.test:
            pred_item.update(item for item, _ in self.rec.get(user, []))
        return self._percent(len(pred_item), len(all_item))

    def popularity(self):
        """Return the mean ``log(1 + popularity)`` of the recommended items, times 100.

        An item's popularity is the number of training users who bought it;
        items never bought in training count as 0.
        """
        item_pop = {}
        for profile in self.train.values():
            for item in profile['buy']:
                item_pop[item] = item_pop.get(item, 0) + 1
        pop = 0
        count = 0
        for user in self.test:
            for item_id, _ in self.rec.get(user, []):
                pop += math.log(1 + item_pop.get(item_id, 0))
                count += 1
        return self._percent(pop, count)

    def eval(self):
        """Return all four metrics in a dict keyed by metric name."""
        precision, recall = self.precision_recall()
        return {
            'precision': precision,
            'recall': recall,
            'coverage': self.coverage(),
            'popularity': self.popularity(),
        }
class experiment():
    """Run the three recommenders on repeated random splits and average their metrics.

    Args:
        m: number of repetitions (random train/test splits).
        fp: path of the CSV file handed to ``loadData``.
    """

    def __init__(self, m, fp='./data/yufang0510.csv'):
        self.m = m  # number of repetitions
        self.fp = fp

    def run(self):
        """Evaluate MostPopular, UserCF and ItemCF ``m`` times and print the averages.

        Returns:
            A 3-tuple of ``{'precision', 'recall', 'coverage', 'popularity'}``
            dicts, in the order MostPopular, UserCF, ItemCF.
        """
        began = time.time()
        metric_names = ('precision', 'recall', 'coverage', 'popularity')
        totals = [dict.fromkeys(metric_names, 0.0) for _ in range(3)]

        for _ in range(self.m):
            traindf, testdf = loadData(self.fp)
            data = pd.concat([traindf, testdf], axis=0)
            train_dict = get_dict(traindf)
            test_dict = get_dict(testdf)
            user_industry = get_user_industry(data)

            # (recommender class, its second constructor argument)
            recipes = (
                (MostPopular, user_industry),
                (UserCF, train_dict),
                (ItemCF, train_dict),
            )
            for slot, (model_cls, side_info) in enumerate(recipes):
                predictions = model_cls(traindf, side_info, test_dict).get_all()
                scores = Metric(predictions, train_dict, test_dict).eval()
                totals[slot] = {k: scores[k] + totals[slot][k] for k in totals[slot]}

        metrics_1, metrics_2, metrics_3 = (
            {k: summed[k] / self.m for k in summed} for summed in totals
        )
        print("AVERAGE METRIC @ MostPopular :", metrics_1)
        print("AVERAGE METRIC @ UserCF :", metrics_2)
        print("AVERAGE METRIC @ ItemCF :", metrics_3)
        print("all run time is :", time.time() - began)
        return metrics_1, metrics_2, metrics_3
# Number of random train/test splits to average over.
m=5
# CSV file with the order/quote records.
fp='./data/yufang0510.csv'
# Pass `m` itself so that editing the constant above actually changes the run.
exp=experiment(m,fp)
metric=exp.run()
gc.collect()