# Small recommender system (小型推荐系统)

import pandas as pd

import math

import numpy as np

import time

import gc

import random

import warnings

warnings.filterwarnings('ignore')

from sklearn.cross_validation import train_test_split

gc.collect()

 

def loadData(fp, test_size=0.13, seed=None):
    """Read a CSV file and split it randomly into train and test frames.

    Args:
        fp: Path (or anything ``pandas.read_csv`` accepts) of the input CSV.
        test_size: Fraction of rows assigned to the test frame
            (default 0.13, the value the script has always used).
        seed: Random state for the split. When ``None`` a fresh seed in
            [1, 2019] is drawn on every call, so repeated calls give
            different splits.

    Returns:
        A ``(traindf, testdf)`` tuple as produced by ``train_test_split``.
    """
    data = pd.read_csv(fp)
    if seed is None:
        seed = random.randint(1, 2019)
    traindf, testdf = train_test_split(data, test_size=test_size, random_state=seed)
    # The return statement must live inside the function body; it had been
    # dedented to column 0, which is a syntax error.
    return traindf, testdf

 

def get_dict(df):
    """Collect, per user, the distinct bought and quoted items.

    Args:
        df: DataFrame with at least the columns ``user``, ``item`` and
            ``quote_item``.

    Returns:
        A dict ``{user: {'buy': [...], 'quote': [...]}}``. Each list holds
        the distinct values seen for that user, in first-seen order.
    """
    user_items = {}
    for user, item, quote in df[['user', 'item', 'quote_item']].values:
        entry = user_items.setdefault(user, {'buy': [], 'quote': []})
        if item not in entry['buy']:
            entry['buy'].append(item)
        if quote not in entry['quote']:
            entry['quote'].append(quote)
    # The return statement had been dedented to column 0 (syntax error).
    return user_items

 

def get_user_industry(all_data):
    """Map every user to an industry.

    Args:
        all_data: DataFrame with at least the columns ``user`` and
            ``user_industry``.

    Returns:
        A dict ``{user: user_industry}``. When a user appears on several
        rows, the value from the last row wins.
    """
    pairs = all_data[['user', 'user_industry']].values
    # The return statement had been dedented to column 0 (syntax error).
    return {user: industry for user, industry in pairs}

 

class MostPopular:
    """Recommend, to each test user, the items ranked for that user's industry."""

    def __init__(self, traindf, user_industry, test_dict):
        self.traindf = traindf
        self.user_industry = user_industry
        self.test_dict = test_dict

    def get_all(self):
        """Build the per-user ranked recommendation lists.

        For every (industry, item) pair the score is the sum of
        ``log(last_order_quantity)`` over that industry's training rows,
        divided by ``0.000001 +`` the number of training rows (across all
        industries) that mention the item.

        Returns:
            ``{user: [(item, score), ...]}`` for every key of ``test_dict``,
            sorted by descending score. Users whose industry has no training
            rows get an empty list.
        """
        columns = ['user', 'user_industry', 'item', 'last_order_quantity']
        rows = self.traindf[columns].values

        log_sales = {}    # industry -> {item: summed log(quantity)}
        occurrences = {}  # item -> number of training rows mentioning it
        # Each row comes straight from traindf, so `item` is a single value.
        for _user, industry, item, quantity in rows:
            per_item = log_sales.setdefault(industry, {})
            per_item[item] = per_item.get(item, 0) + math.log(quantity)
            occurrences[item] = occurrences.get(item, 0) + 1

        def by_score(pair):
            return pair[1]

        ranked = {}
        for industry, per_item in log_sales.items():
            scored = [
                (item, total / (0.000001 + occurrences[item]))
                for item, total in per_item.items()
            ]
            ranked[industry] = sorted(scored, key=by_score, reverse=True)

        recs = {}
        for user in self.test_dict:
            industry = self.user_industry[user]
            # Unknown industry: nothing to recommend for this user.
            scores = dict(ranked.get(industry, ()))
            recs[user] = sorted(scores.items(), key=by_score, reverse=True)
        return recs

 

class UserCF:
    """Label-based user collaborative filtering.

    Two users are similar when their ``user_label`` sets overlap. A test
    user is recommended the items bought by similar training users that the
    user has not bought in training, scored by the item's average
    ``log(last_order_quantity)``.
    """

    def __init__(self, traindf, train_dict, test_dict):
        self.traindf = traindf
        self.train = train_dict
        self.test = test_dict

    def _user_labels(self):
        """Return ``{user: [distinct labels]}`` from the ':'-separated ``user_label`` column."""
        user_label = {}
        for user, labels in self.traindf[['user', 'user_label']].values:
            seen = user_label.setdefault(user, [])
            for label in labels.split(':'):
                if label not in seen:
                    seen.append(label)
        return user_label

    @staticmethod
    def _ranked_neighbours(user_label):
        """Return ``{user: [(other_user, similarity), ...]}`` sorted by descending similarity.

        Similarity is ``|labels_u & labels_v| / sqrt(|labels_u| * |labels_v|)``.
        """
        label_sets = {user: set(labels) for user, labels in user_label.items()}
        ranked = {}
        for u, u_set in label_sets.items():
            sims = [
                (v, len(u_set & v_set) / math.sqrt(len(u_set) * len(v_set)))
                for v, v_set in label_sets.items()
                if v != u
            ]
            ranked[u] = sorted(sims, key=lambda pair: pair[1], reverse=True)
        return ranked

    def _item_scores(self):
        """Return ``{item: mean of log(last_order_quantity)}`` over the training rows."""
        totals = {}
        counts = {}
        for item, quantity in self.traindf[['item', 'last_order_quantity']].values:
            totals[item] = totals.get(item, 0) + math.log(quantity)
            counts[item] = counts.get(item, 0) + 1
        return {item: total / counts[item] for item, total in totals.items()}

    def get_all(self, top_k=None):
        """Build the per-user ranked recommendation lists.

        Args:
            top_k: If given, only the ``top_k`` most similar users contribute
                candidate items. ``None`` (the default) uses every other
                training user, which is what the script has always done
                because the data set is small.

        Returns:
            ``{user: [(item, score), ...]}`` for every key of ``test_dict``,
            sorted by descending score. Users that do not occur in the
            training dict get an empty list.
        """
        neighbours_of = self._ranked_neighbours(self._user_labels())
        item_score = self._item_scores()

        recs = {}
        for u in self.test:
            scores = {}
            if u in self.train:
                seen_items = self.train[u]['buy']
                neighbours = neighbours_of.get(u, [])
                if top_k is not None:
                    neighbours = neighbours[:top_k]
                for v_user, _sim in neighbours:
                    # Use the neighbour being iterated. The previous code read
                    # `self.train[v]`, a stale variable left over from the
                    # similarity loop, so every user got the same candidates.
                    for item in self.train.get(v_user, {}).get('buy', []):
                        if item not in seen_items:
                            scores[item] = item_score.get(item, 0)
            recs[u] = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
        return recs

 

class ItemCF:
    """Label-based item collaborative filtering driven by the items a user quoted."""

    def __init__(self, traindf, train_dict, test_dict):
        self.traindf = traindf
        self.train = train_dict
        self.test = test_dict

    def get_all(self):
        """Build the per-user ranked recommendation lists.

        For each item a user quoted in training, every other training item
        (visited in descending label-similarity order) that the user has not
        bought becomes a candidate. Candidates are scored by the item's mean
        ``log(last_order_quantity)`` over the training rows.

        Returns:
            ``{user: [(item, score), ...]}`` for every key of ``test_dict``,
            sorted by descending score. Users missing from the training dict
            get an empty list.
        """
        def by_score(pair):
            return pair[1]

        # item -> distinct labels (first-seen order) from the ':'-separated column
        labels_of = {}
        for item, raw_labels in self.traindf[['item', 'item_label']].values:
            bucket = labels_of.setdefault(item, [])
            for label in raw_labels.split(':'):
                if label not in bucket:
                    bucket.append(label)

        label_total = {item: len(labels) for item, labels in labels_of.items()}

        # item -> [(other_item, similarity), ...] sorted by descending similarity
        neighbours = {}
        for left, left_labels in labels_of.items():
            scored = []
            for right, right_labels in labels_of.items():
                if left == right:
                    continue
                shared = sum(1 for label in left_labels if label in right_labels)
                norm = math.sqrt(label_total[left] * label_total[right])
                scored.append((right, shared / norm))
            neighbours[left] = sorted(scored, key=by_score, reverse=True)

        # item -> mean log(quantity) over the training rows
        log_total = {}
        row_count = {}
        for item, quantity in self.traindf[['item', 'last_order_quantity']].values:
            log_total[item] = log_total.get(item, 0) + math.log(quantity)
            row_count[item] = row_count.get(item, 0) + 1
        avg_log_sale = {item: total / row_count[item] for item, total in log_total.items()}

        recs = {}
        for user in self.test:
            picked = {}
            if user in self.train:
                bought = self.train[user]['buy']
                quoted = self.train[user]['quote']
                for source in quoted:
                    # Quoted items unknown to the training frame have no neighbours.
                    for candidate, _sim in neighbours.get(source, ()):
                        if candidate not in bought:
                            picked[candidate] = avg_log_sale[candidate]
            recs[user] = sorted(picked.items(), key=by_score, reverse=True)
        return recs

 

class Metric:
    """Offline evaluation of ranked recommendations against held-out purchases.

    Args:
        all_test_pred: ``{user: [(item, score), ...]}`` recommendations.
        train_dict: ``{user: {'buy': [...], ...}}`` built from the training rows.
        test_dict: ``{user: {'buy': [...], ...}}`` built from the test rows.
    """

    def __init__(self, all_test_pred, train_dict, test_dict):
        self.rec = all_test_pred
        self.train = train_dict
        self.test = test_dict

    @staticmethod
    def _percent(numerator, denominator):
        """Return ``numerator / denominator * 100`` rounded to 2 places, or 0.0 when the denominator is 0."""
        if not denominator:
            return 0.0
        return round(numerator / denominator * 100, 2)

    def precision_recall(self):
        """Return ``(precision, recall)`` in percent, pooled over all test users.

        A hit is a test purchase whose item id appears among that user's
        recommended items.
        """
        hit = 0
        n_pred = 0
        n_true = 0
        for user, truth in self.test.items():
            true_items = truth['buy']
            ranked = self.rec.get(user, [])
            # Recommendations are (item, score) pairs, so compare on item id;
            # testing `item in ranked` directly can never match.
            predicted = {item for item, _score in ranked}
            hit += sum(1 for item in true_items if item in predicted)
            n_pred += len(ranked)  # precision = hit / number of recommendations
            n_true += len(true_items)  # recall = hit / number of true purchases
        return self._percent(hit, n_pred), self._percent(hit, n_true)

    def coverage(self):
        """Return the number of distinct recommended items as a percentage of distinct training items."""
        all_items = {
            item
            for basket in self.train.values()
            for item in basket['buy']
        }
        # Measure what was recommended, not what test users bought: the
        # latter does not depend on the recommender at all.
        recommended = {
            item
            for user in self.test
            for item, _score in self.rec.get(user, [])
        }
        return self._percent(len(recommended), len(all_items))

    def popularity(self):
        """Return the mean of ``log(1 + score)`` over all recommended items, times 100."""
        # NOTE(review): the score attached to each recommendation is used as
        # the popularity signal. A per-item purchase count used to be computed
        # here but was never read; confirm which one is intended.
        total = 0
        count = 0
        for user in self.test:
            for _item, score in self.rec.get(user, []):
                total += math.log(1 + score)
                count += 1
        return self._percent(total, count)

    def eval(self):
        """Return all four metrics as a dict keyed by metric name."""
        precision, recall = self.precision_recall()
        return {
            'precision': precision,
            'recall': recall,
            'coverage': self.coverage(),
            'popularity': self.popularity(),
        }

 

class experiment:
    """Run the three recommenders on repeated random splits and average their metrics."""

    def __init__(self, m, fp='./data/yufang0510.csv'):
        self.m = m  # number of repetitions
        self.fp = fp

    def run(self):
        """Evaluate MostPopular, UserCF and ItemCF ``m`` times and print the averages.

        Each repetition loads a fresh train/test split from ``fp``.

        Returns:
            A 3-tuple of metric dicts (MostPopular, UserCF, ItemCF), each
            holding the mean 'precision', 'recall', 'coverage' and
            'popularity' over the repetitions.
        """
        started = time.time()
        metric_names = ('precision', 'recall', 'coverage', 'popularity')
        # One running-sum dict per model, in the order MostPopular, UserCF, ItemCF.
        totals = [dict.fromkeys(metric_names, 0.0) for _ in range(3)]

        for _ in range(self.m):
            traindf, testdf = loadData(self.fp)
            data = pd.concat([traindf, testdf], axis=0)
            train_dict = get_dict(traindf)
            test_dict = get_dict(testdf)
            user_industry = get_user_industry(data)

            models = (
                MostPopular(traindf, user_industry, test_dict),
                UserCF(traindf, train_dict, test_dict),
                ItemCF(traindf, train_dict, test_dict),
            )
            for running, model in zip(totals, models):
                scores = Metric(model.get_all(), train_dict, test_dict).eval()
                for name in running:
                    running[name] = scores[name] + running[name]

        averaged = tuple(
            {name: value / self.m for name, value in running.items()}
            for running in totals
        )
        metrics_1, metrics_2, metrics_3 = averaged

        print("AVERAGE METRIC @ MostPopular :", metrics_1)
        print("AVERAGE METRIC @ UserCF :", metrics_2)
        print("AVERAGE METRIC @ ItemCF :", metrics_3)
        print("all run time is :", time.time() - started)
        return metrics_1, metrics_2, metrics_3

 

# Number of random train/test splits to average over.
m = 5
fp = './data/yufang0510.csv'

# Pass `m` through instead of repeating the literal 5, so the repetition
# count is configured in exactly one place.
exp = experiment(m, fp)
metric = exp.run()

gc.collect()

# (Blog page residue removed: like/favourite/comment counters and payment-widget text that were not part of the program.)