Use MF (matrix factorization) to encode users and items as 8-dimensional vectors; these embeddings can then be used for similarity computation to produce recommendation results.
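To make the idea concrete before walking through the pipeline: each user and each item gets an 8-dimensional embedding, and the predicted preference is the sigmoid of the inner product of the two vectors. A minimal NumPy sketch, purely illustrative (the vectors here are random stand-ins for learned embeddings):

import numpy as np

embedding_dim = 8
user_vec = np.random.rand(embedding_dim)   # stand-in for a learned user embedding
item_vec = np.random.rand(embedding_dim)   # stand-in for a learned item embedding

# MF score: inner product of the two embeddings, squashed to (0, 1) with a sigmoid
score = 1.0 / (1.0 + np.exp(-np.dot(user_vec, item_vec)))
print(score)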
step1: Data preprocessing (read the data from ratings.dat, split it into training and test sets, and convert it into a format that TensorFlow can consume directly)
# Read the data
data_file = './ratings.dat'
lines = open(data_file)
# Print the first few lines
for i, line in enumerate(lines):
    if i > 10:
        break
    print(line)
# Count the total number of records (re-open the file so the count starts from the beginning)
count = 0
for line in open(data_file):
    count += 1
print(count)
# Group ratings by user: {user_id: [(movie_id, rating, timestamp), ...]}
def read_raw_data(file_path):
    user_info = dict()
    lines = open(file_path)
    for line in lines:
        tmp = line.strip().split("::")
        if len(tmp) < 4:
            continue
        ui = user_info.get(tmp[0], None)
        if ui is None:
            user_info[tmp[0]] = [(tmp[1], tmp[2], tmp[3])]
        else:
            user_info[tmp[0]].append((tmp[1], tmp[2], tmp[3]))
    return user_info

user_info = read_raw_data('./ratings.dat')
# Count each user's number of actions, then filter out abnormal users based on it
user_action_num = {}
for k, v in user_info.items():
    user_action_num[k] = len(v)
import numpy as np
user_stat = np.asanyarray(list(user_action_num.values()))
print("用户总量:",len(user_stat))
max_num = np.max(user_stat)
min_num = np.min(user_stat)
median_num = np.median(user_stat)
avg_num = np.average(user_stat)
print("max:", max_num)
print("min:", min_num)
print("median:", median_num)
print("mean:", avg_num)
filter_user_nums = 0
for n in user_stat:
    if n > 2000:
        filter_user_nums += 1
print(filter_user_nums)
# Filter out users with more than 2000 actions
def extract_valid_user(user_info):
    user_info_filter_result = {}
    for k, v in user_info.items():
        if len(v) > 2000:
            continue
        user_info_filter_result[k] = v
    return user_info_filter_result

user_info = extract_valid_user(user_info)
print(len(user_info))
# Sort each user's ratings by timestamp and split into training and test sets:
# the last two interactions of each user go to the test set, the rest to the training set
def split_train_test(user_info):
    train_set = []
    test_set = []
    for k, v in user_info.items():
        tmp = sorted(v, key=lambda _: _[2])
        for i in range(len(tmp)):
            if i < len(tmp) - 2:
                train_set.append(str(k) + "," + tmp[i][0] + "," + tmp[i][1])
            else:
                test_set.append(str(k) + "," + tmp[i][0] + "," + tmp[i][1])
    return train_set, test_set

train_set, test_set = split_train_test(user_info)
# Save the data to disk:
def save_data(test_data, train_data, save_path_dir):
    import random
    random.shuffle(train_data)
    random.shuffle(test_data)
    with open(save_path_dir + "train_set", "w") as f:
        for line in train_data:
            f.write(line + "\n")
    with open(save_path_dir + "test_set", "w") as f:
        for line in test_data:
            f.write(line + "\n")

save_path = "./"
save_data(test_set, train_set, save_path)
# Define a hash function (BKDR-style string hash, truncated to 60 bits)
def data2hash(data):
    mask60 = 0x0fffffffffffffff
    seed = 131
    hash = 0
    for s in data:
        hash = hash * seed + ord(s)
    return hash & mask60

print(data2hash("UserId=1"))
print(data2hash("MovieId=1"))
# Why hash the ids: it maps raw ids into a fixed integer space (a form of dimensionality reduction). Some information may be lost to collisions, but that is an acceptable trade-off for training speed.
def tohash(file, save_path):
    wfile = open(save_path, "w")
    with open(file) as f:
        for line in f:
            tmp = line.strip().split(",")
            user_id = data2hash("UserId=" + tmp[0])
            item_id = data2hash("MovieId=" + tmp[1])
            wfile.write(str(user_id) + "," + str(item_id) + "," + tmp[2] + "\n")
    wfile.close()

train_file_path = "./train_set"
train_file_save_path = "./train_set_hash"
test_file_path = "./test_set"
test_file_save_path = "./test_set_hash"
tohash(train_file_path, train_file_save_path)
tohash(test_file_path, test_file_save_path)
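Since different raw ids can hash into the same bucket, it can be worth sanity-checking the collision rate after hashing. A minimal sketch, assuming the train_set / train_set_hash files written above (if the hashed counts are smaller than the raw counts, some ids collided):

def count_distinct_ids(raw_file, hashed_file):
    raw_users, raw_items = set(), set()
    hashed_users, hashed_items = set(), set()
    for line in open(raw_file):
        tmp = line.strip().split(",")
        raw_users.add(tmp[0])
        raw_items.add(tmp[1])
    for line in open(hashed_file):
        tmp = line.strip().split(",")
        hashed_users.add(tmp[0])
        hashed_items.add(tmp[1])
    print("raw users:", len(raw_users), "hashed users:", len(hashed_users))
    print("raw items:", len(raw_items), "hashed items:", len(hashed_items))

count_distinct_ids("./train_set", "./train_set_hash")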
import tensorflow as tf

# Define the tf.train.Example format used for the TFRecord files
def get_tfrecords_example(feature, label):
    tfrecords_features = {
        "feature": tf.train.Feature(int64_list=tf.train.Int64List(value=feature)),
        "label": tf.train.Feature(float_list=tf.train.FloatList(value=label)),
    }
    return tf.train.Example(
        features=tf.train.Features(feature=tfrecords_features)
    )
def totfrecords(file, save_dir):
    print("Process to tfrecord File: %s ..." % file)
    os.makedirs(save_dir, exist_ok=True)  # make sure the output directory exists
    num = 0
    writer = tf.io.TFRecordWriter(save_dir + "/" + "part-0000" + str(num) + ".tfrecords")
    lines = open(file)
    for i, line in enumerate(lines):
        tmp = line.strip().split(",")
        feature = [int(tmp[0]), int(tmp[1])]
        # Binarize the rating: >= 3 is a positive label, otherwise negative
        label = [float(1) if float(tmp[2]) >= 3 else float(0)]
        example = get_tfrecords_example(feature, label)
        writer.write(example.SerializeToString())
        # Roll over to a new shard every 200,000 examples
        if (i + 1) % 200000 == 0:
            writer.close()
            num += 1
            writer = tf.io.TFRecordWriter(save_dir + "/" + "part-0000" + str(num) + ".tfrecords")
    print("Process To tfrecord File: %s End" % file)
    writer.close()
import os
train_file_path ="./train_set_hash"
train_totfrecord ="./train_tf"
test_file_path = "./test_set_hash"
test_totfrecord = "./test_tf"
totfrecords(train_file_path,train_totfrecord)
totfrecords(test_file_path,test_totfrecord)
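To sanity-check the conversion, one way is to parse a single record back from the first shard. A minimal sketch, assuming eager execution is still enabled at this point (it is disabled later in step3) and that ./train_tf/part-00000.tfrecords is the first shard written above:

def peek_one_record(tf_file):
    feature_spec = {
        "feature": tf.io.FixedLenFeature(2, tf.int64),
        "label": tf.io.FixedLenFeature(1, tf.float32),
    }
    dataset = tf.data.TFRecordDataset(tf_file)
    for raw in dataset.take(1):
        parsed = tf.io.parse_single_example(raw, feature_spec)
        print(parsed["feature"].numpy(), parsed["label"].numpy())

peek_one_record("./train_tf/part-00000.tfrecords")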
step2: Write an embedding (vector) service used to initialize, update, and delete feature vectors, and to save them to a file
import numpy as np

# Singleton metaclass: ensures only one parameter server instance exists
class Singleton(type):
    _instance = {}

    def __call__(cls, *args, **kwargs):
        if cls not in Singleton._instance:
            Singleton._instance[cls] = type.__call__(cls, *args, **kwargs)
        return Singleton._instance[cls]


# A simple in-memory parameter server holding the embedding vectors
class PS(metaclass=Singleton):
    def __init__(self, embedding_dim):
        # Set the random seed so embedding initialization is reproducible
        np.random.seed(2021)
        self.params_server = dict()
        self.dim = embedding_dim
        print("ps inited....")

    # Pull embeddings for a batch of keys
    def pull(self, keys):
        values = []
        for k in keys:
            tmp = []
            for arr in k:
                value = self.params_server.get(arr, None)
                if value is None:
                    # Unseen key: initialize it with a random vector
                    value = np.random.rand(self.dim)
                    self.params_server[arr] = value
                tmp.append(value)
            values.append(tmp)
        return np.asarray(values, dtype='float32')

    # Push updated embeddings back to the server
    def push(self, keys, values):
        for i in range(len(keys)):
            for j in range(len(keys[i])):
                self.params_server[keys[i][j]] = values[i][j]

    def delete(self, keys):
        for k in keys:
            self.params_server.pop(k)

    # Save all embeddings to a file for later use
    def save(self, path):
        print("Total number of keys: ", len(self.params_server))
        writer = open(path, "w")
        for k, v in self.params_server.items():
            writer.write(str(k) + "\t" + ",".join(['%.8f' % _ for _ in v]) + "\n")
        writer.close()
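A quick usage example of the parameter server (illustrative only; the keys 101 and 202 are made up, and because PS is a singleton these demo keys would stay in the shared instance if you actually run this):

ps_demo = PS(8)
keys = np.asarray([[101, 202]])   # one "batch" containing a user key and an item key
values = ps_demo.pull(keys)       # unseen keys are initialized with random 8-dim vectors
print(values.shape)               # (1, 2, 8)
ps_demo.push(keys, values * 0.5)  # overwrite the stored vectors with updated ones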
step3: Read the data (build the input pipeline)
import tensorflow as tf
import os

tf.compat.v1.disable_eager_execution()


class InputFn:
    def __init__(self, local_ps):
        self.feature_len = 2
        self.label_len = 1
        self.n_parse_threads = 4
        self.shuffle_buffer_size = 1024
        self.prefetch_buffer_size = 1
        self.batch = 8
        self.local_ps = local_ps

    def input_fn(self, data_dir, is_test=False):

        def _parse_example(example):
            features = {
                "feature": tf.io.FixedLenFeature(self.feature_len, tf.int64),
                "label": tf.io.FixedLenFeature(self.label_len, tf.float32),
            }
            return tf.io.parse_single_example(example, features)

        def _get_embedding(parsed):
            keys = parsed["feature"]
            # Look up the current embeddings from the parameter server
            keys_array = tf.compat.v1.py_func(self.local_ps.pull, [keys], tf.float32)
            result = {
                "feature": parsed["feature"],
                "label": parsed["label"],
                "feature_embedding": keys_array,
            }
            return result

        file_list = os.listdir(data_dir)
        files = []
        for i in range(len(file_list)):
            files.append(os.path.join(data_dir, file_list[i]))
        dataset = tf.compat.v1.data.Dataset.list_files(files)
        # How many times to repeat the data (test data is read only once)
        if is_test:
            dataset = dataset.repeat(1)
        else:
            dataset = dataset.repeat()
        # Read the TFRecord files
        dataset = dataset.interleave(
            lambda _: tf.compat.v1.data.TFRecordDataset(_),
            cycle_length=1
        )
        # Parse the TFRecord examples
        dataset = dataset.map(
            _parse_example,
            num_parallel_calls=self.n_parse_threads
        )
        # Batch the data
        dataset = dataset.batch(
            self.batch, drop_remainder=True
        )
        dataset = dataset.map(
            _get_embedding,
            num_parallel_calls=self.n_parse_threads
        )
        # Shuffle the training data
        if not is_test:
            dataset = dataset.shuffle(self.shuffle_buffer_size)
        # Prefetch upcoming batches
        dataset = dataset.prefetch(
            buffer_size=self.prefetch_buffer_size
        )
        # Iterator
        iterator = tf.compat.v1.data.make_initializable_iterator(dataset)
        return iterator, iterator.get_next()
step4: Define the training model and specify how gradients update the parameters
# Define the training model
import tensorflow as tf

batch = 32
embedding_dim = 8
learning_rate = 0.001


def mf_fn(inputs, is_test):
    # Get the features and the label; the feature holds the user_id and movie_id embeddings
    embed_layer = inputs["feature_embedding"]
    embed_layer = tf.reshape(embed_layer, shape=[-1, 2, embedding_dim])
    label = inputs["label"]
    # Split into the user_id embedding and the movie_id embedding
    embed_layer = tf.split(embed_layer, num_or_size_splits=2, axis=1)
    user_id_embedding = tf.reshape(embed_layer[0], shape=[-1, embedding_dim])
    movie_id_embedding = tf.reshape(embed_layer[1], shape=[-1, embedding_dim])
    # MF score: element-wise product of the two embeddings, averaged over the embedding dimension
    out_ = tf.reduce_mean(
        user_id_embedding * movie_id_embedding, axis=1
    )
    # Prediction: squash the score to (0, 1)
    out_tmp = tf.sigmoid(out_)
    if is_test:
        tf.compat.v1.add_to_collections("input_tensor", embed_layer)
        tf.compat.v1.add_to_collections("output_tensor", out_tmp)
    # Squared-error loss
    label_ = tf.reshape(label, [-1])
    loss_ = tf.reduce_sum(tf.square(label_ - out_))
    out_dic = {
        "loss": loss_,
        "ground_truth": label_,
        "prediction": out_
    }
    return out_dic
# Define the full graph and the gradient update rule
def setup_graph(inputs, is_test=False):
    result = {}
    with tf.compat.v1.variable_scope("net_graph", reuse=is_test):
        # Build the model graph
        net_out_dic = mf_fn(inputs, is_test)
        loss = net_out_dic["loss"]
        result["out"] = net_out_dic
        if is_test:
            return result
        # Plain SGD on the embeddings: new = old - learning_rate * gradient
        emb_grad = tf.gradients(
            loss, [inputs["feature_embedding"]], name="feature_embedding"
        )[0]
        result["feature_new_embedding"] = \
            inputs["feature_embedding"] - learning_rate * emb_grad
        result["feature_embedding"] = inputs["feature_embedding"]
        result["feature"] = inputs["feature"]
        return result
step5: An AUC utility class used to monitor training quality
# Evaluation utilities
import numpy as np
from sklearn.metrics import roc_auc_score


# A small helper that accumulates losses, labels, and predictions and computes AUC
class AUCUtils(object):
    def __init__(self):
        self.reset()

    def add(self, loss, g=np.array([]), p=np.array([])):
        self.loss.append(loss)
        self.ground_truth += g.flatten().tolist()
        self.prediction += p.flatten().tolist()

    def calc(self):
        return {
            "loss_num": len(self.loss),
            "loss": np.array(self.loss).mean(),
            "auc_num": len(self.ground_truth),
            "auc": roc_auc_score(self.ground_truth, self.prediction) if len(self.ground_truth) > 0 else 0,
            # PCOC: sum of predictions over sum of positive labels (a calibration metric)
            "pcoc": sum(self.prediction) / sum(self.ground_truth)
        }

    def calc_str(self):
        res = self.calc()
        return "loss: %f(%d),auc:%f(%d),pcoc:%f" % (
            res["loss"], res["loss_num"],
            res["auc"], res["auc_num"],
            res["pcoc"]
        )

    def reset(self):
        self.loss = []
        self.ground_truth = []
        self.prediction = []
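A tiny usage example of the metric helper, with made-up numbers:

m = AUCUtils()
m.add(0.25, g=np.array([1.0, 0.0, 1.0, 0.0]), p=np.array([0.9, 0.2, 0.6, 0.4]))
print(m.calc_str())   # loss: 0.250000(1),auc:1.000000(4),pcoc:1.050000
m.reset()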
step6: Training module
import tensorflow as tf
tf.compat.v1.disable_eager_execution()
batch = 32
embedding_dim = 8
local_ps = PS(embedding_dim)
n_parse_threads = 4
shuffle_buffer_size = 1024
prefetch_buffer_size = 16
max_steps = 100000
test_show_step = 1000
# Input pipeline
inputs = InputFn(local_ps)
last_test_auc = 0
train_metric = AUCUtils()
test_metric = AUCUtils()
train_file = './train_tf'
test_file = './test_tf'
save_embedding = "./saved_embedding"
train_itor,train_inputs = inputs.input_fn(train_file,is_test=False)
train_dic = setup_graph(train_inputs,is_test=False)
test_itor,test_inputs = inputs.input_fn(test_file,is_test=True)
test_dic = setup_graph(test_inputs,is_test=True)
train_log_iter = 1000
last_test_auc = 0.5
def train():
    _step = 0
    print("#" * 80)
    # Create a session and run the training loop
    with tf.compat.v1.Session() as sess:
        sess.run([tf.compat.v1.global_variables_initializer(),
                  tf.compat.v1.local_variables_initializer()])
        # Start training
        sess.run(train_itor.initializer)
        while _step < max_steps:
            feature_old_embedding, feature_new_embedding, keys, out = sess.run(
                [train_dic["feature_embedding"],
                 train_dic["feature_new_embedding"],
                 train_dic["feature"],
                 train_dic["out"]]
            )
            train_metric.add(
                out["loss"],
                out["ground_truth"],
                out["prediction"]
            )
            # Write the updated embeddings back to the parameter server
            local_ps.push(keys, feature_new_embedding)
            _step += 1
            # Every train_log_iter batches, print the metrics accumulated over those batches
            if _step % train_log_iter == 0:
                print("Train at step %d: %s" % (_step, train_metric.calc_str()))
                train_metric.reset()
            if _step % test_show_step == 0:
                valid_step(sess, test_itor, test_dic)
def valid_step(sess, test_itor, test_dic):
    test_metric.reset()
    sess.run(test_itor.initializer)
    global last_test_auc
    while True:
        try:
            out = sess.run(test_dic["out"])
            test_metric.add(
                out["loss"],
                out["ground_truth"],
                out["prediction"]
            )
        except tf.errors.OutOfRangeError:
            print("Test: %s" % test_metric.calc_str())
            res = test_metric.calc()
            print(res["auc"])
            # Save the embeddings whenever the test AUC improves
            if res["auc"] > last_test_auc:
                last_test_auc = res["auc"]
                local_ps.save(save_embedding)
            break
step7: Start training
train()
Note: if you need the dataset or run into any issues, feel free to message me privately.