根据数据分析那篇文章可知,若实现强化推荐,必须模拟真实数据流。
首先明确,买方和卖方的数据流应包括哪些特征:
买方数据流:买方ID、物品ID、操作(浏览、加购、购买)、操作时间
卖方数据流(只针对物品属性的改变):物品ID、属性(为了简单,只选择价格、类别,商品信息暂不考虑)、属性值、操作时间
用户信息和商品信息的维护使用redis数据库。
按照流程,首先下载Retailrocket数据集,通过python 实现分行读取:
import redis
import pandas as pd
def buy_stream():
    """Yield the 10 earliest buyer events from ``data/events.csv``.

    The whole file is loaded, ordered by its ``timestamp`` column, and the
    first ten rows are emitted one at a time as ``itertuples`` namedtuples.
    """
    events = pd.read_csv("data/events.csv")
    ordered = events.sort_values(by="timestamp")
    yield from ordered[:10].itertuples()
def sell_stream():
    """Yield the 10 earliest item-property rows across both dataset parts.

    Both ``item_properties`` CSV parts are loaded, concatenated, ordered by
    their ``timestamp`` column, and the first ten rows are emitted one at a
    time as ``itertuples`` namedtuples.
    """
    part_paths = (
        "data/item_properties_part1.csv",
        "data/item_properties_part2.csv",
    )
    parts = [pd.read_csv(path) for path in part_paths]
    ordered = pd.concat(parts).sort_values(by="timestamp")
    yield from ordered[:10].itertuples()
# Replay the buyer and seller streams as a single flow ordered by timestamp.
# The replay stops when the buyer stream runs out; seller rows left over after
# that point are not printed.
buy = buy_stream()
sell = sell_stream()
# next(..., None) returns None instead of raising once a stream is exhausted,
# so an empty or finished stream cannot leave a stale row behind.
buy_row = next(buy, None)
sell_row = next(sell, None)
while buy_row is not None:
    # With no seller rows left, every remaining buyer row is emitted in order.
    if sell_row is None or int(buy_row.timestamp) < int(sell_row.timestamp):
        print(buy_row)  # run step 3
        buy_row = next(buy, None)
    else:
        print(sell_row)  # run step 4
        # Advance the seller generator itself; failing to advance here would
        # compare the same seller row forever.
        sell_row = next(sell, None)
使用redis作为中间件存储流:
import redis
import pandas as pd
# Connection settings shared by the pool and by every client created below.
_redis_settings = dict(
    host='......',
    port=6380,
    decode_responses=True,
    password='123456',
)
pool = redis.ConnectionPool(**_redis_settings)

# One logical database per client. The view_* clients hold per-visitor lists
# of viewed items, categories and prices; the add_* and buy_* clients are
# presumably the add-to-cart and purchase equivalents (not written to in the
# visible code -- confirm). item_db holds one hash of attributes per item.
view_item = redis.Redis(db=0, **_redis_settings)
view_cate = redis.Redis(db=1, **_redis_settings)
view_price = redis.Redis(db=2, **_redis_settings)
add_item = redis.Redis(db=3, **_redis_settings)
add_cate = redis.Redis(db=4, **_redis_settings)
add_price = redis.Redis(db=5, **_redis_settings)
buy_item = redis.Redis(db=6, **_redis_settings)
buy_cate = redis.Redis(db=7, **_redis_settings)
buy_price = redis.Redis(db=8, **_redis_settings)
item_db = redis.Redis(db=9, **_redis_settings)

# Start from a clean state. FLUSHALL clears every database on the server,
# not only the one item_db is bound to.
item_db.flushall()
# view_db.set('name','test')
# # print(view_db.get('name'))
# Buyer events, ordered by time.
buy_data = pd.read_csv("data/events.csv").sort_values(by="timestamp")

# Seller-side property changes from both dataset parts, ordered by time.
sell_data_1 = pd.read_csv("data/item_properties_part1.csv")
sell_data_2 = pd.read_csv("data/item_properties_part2.csv")
sell_data = pd.concat([sell_data_1, sell_data_2]).sort_values(by="timestamp")


def _initial_attribute(frame, prop, column):
    """Return an (itemid, column) frame holding each item's earliest *prop* value."""
    rows = frame[frame.property == prop]
    earliest = rows.drop_duplicates(subset=['itemid'], keep='first')
    renamed = earliest.rename(columns={'value': column})
    return renamed[['itemid', column]]


# sell_data is time-ordered, so keeping the first row per item gives the
# item's initial value for each tracked property. Property "790" is the one
# this script treats as the price.
item_cate = _initial_attribute(sell_data, "categoryid", 'categoryid')
item_price = _initial_attribute(sell_data, "790", 'price')
item_available = _initial_attribute(sell_data, "available", 'available')

# Attach the initial attributes to every buyer event. The default inner join
# drops events whose item lacks any of the three attributes.
con_data = buy_data
for attribute_frame in (item_cate, item_price, item_available):
    con_data = pd.merge(con_data, attribute_frame, on="itemid")
con_data = con_data.sort_values(by="timestamp")
# print(con_data)
def buy_stream(con_data):
    """Yield the first 10000 rows of *con_data*, in frame order.

    Each row is emitted as the namedtuple produced by ``itertuples``, so
    columns are available as attributes.
    """
    head = con_data[:10000]
    yield from head.itertuples()
def sell_stream(sell_data):
    """Yield the first 10000 rows of *sell_data*, in frame order.

    Each row is emitted as the namedtuple produced by ``itertuples``, so
    columns are available as attributes.
    """
    head = sell_data[:10000]
    yield from head.itertuples()
# Replay buyer events and seller property changes as one time-ordered flow.
# Buyer 'view' events are appended to per-visitor history lists in Redis;
# seller changes update the item's attribute hash in item_db. The replay ends
# when the buyer stream is exhausted.
buy = buy_stream(con_data)
sell = sell_stream(sell_data)
# next(..., None) returns None instead of raising once a stream is exhausted,
# so an empty or finished stream cannot leave a stale row behind.
buy_row = next(buy, None)
sell_row = next(sell, None)
i = 0
E = 0  # becomes 1 once any tracked item attribute has been written to item_db
while buy_row is not None:
    i += 1
    if i % 10 == 0:
        # Periodic debug output: the current visitor's viewed-item history.
        print(buy_row.visitorid, view_item.lrange(buy_row.visitorid, 0, -1))
    # With no seller rows left, every remaining buyer row is processed in order.
    if sell_row is None or int(buy_row.timestamp) < int(sell_row.timestamp):
        # Step 3: handle the buyer event.
        if buy_row.event == 'view':
            category = None
            price = None
            if E == 1 and view_item.exists(buy_row.visitorid):
                # Placeholder for the recommendation step: feed the user
                # features and candidate items to the recommender, collect
                # feedback, store it in the action pool and update the policy.
                # Prefer the item's latest attribute values from item_db.
                category = item_db.hget(buy_row.itemid, "categoryid")
                price = item_db.hget(buy_row.itemid, "price")
            # hget returns None for an item or field that was never stored,
            # and redis-py rejects None as a list value, so fall back to the
            # initial attributes merged into the event row.
            if category is None:
                category = buy_row.categoryid
            if price is None:
                # Drops the first character of the raw price before converting
                # (presumably a one-character prefix in the dataset -- confirm).
                price = float(buy_row.price[1:])
            view_item.rpush(buy_row.visitorid, buy_row.itemid)
            view_cate.rpush(buy_row.visitorid, category)
            view_price.rpush(buy_row.visitorid, price)
        buy_row = next(buy, None)
    else:
        # Step 4: apply the seller's attribute change to the item store.
        if sell_row.property in ("categoryid", "available", "790"):
            E = 1
            if sell_row.property == "790":
                # Property "790" is stored under the "price" field, with the
                # same first-character strip as the buyer-side price.
                item_db.hset(sell_row.itemid, "price", float(sell_row.value[1:]))
            else:
                item_db.hset(sell_row.itemid, sell_row.property, sell_row.value)
        # Advance the seller generator itself; failing to advance here would
        # compare the same seller row forever.
        sell_row = next(sell, None)
测试完毕,接下来设计可交互的推荐模型。