KDD 17日汇报
理解赛题及任务流程
赛题理解:
采集的数据超过10天,期间包含一个商业售卖活动。发生的点击超过100万,一共有10万商品、3万用户
。数据集总大小在500MB左右
提交的数据要根据phase,每到一个phase,就要提交之前和现在所有的qtime中的用户推荐的50item
用户有三万个,这里只收集了6789个用户特征
item也不全
# Bar + pie charts of the user age-level distribution.
fig,axes = plt.subplots(1,2,figsize=(10,5))
ax1=underexpose_user_feat.user_age_level.value_counts().plot(kind="bar",ax=axes[0])
ax2=underexpose_user_feat.user_age_level.value_counts().plot(kind="pie",ax=axes[1],autopct="%.2f%%")
分析:
4、5年龄段的用户最多
其次是7 、2、 6 、3
最少的是1 、8
# Gender counts including NaN.
# NOTE(review): ascending=False has no effect while sort=False — confirm intent.
underexpose_user_feat.user_gender.value_counts(dropna=False,ascending=False,sort=False)
# Bar + pie charts of the gender distribution.
fig,axes = plt.subplots(1,2,figsize=(10,5))
ax1=underexpose_user_feat.user_gender.value_counts().plot(kind="bar",ax=axes[0])
ax2=underexpose_user_feat.user_gender.value_counts().plot(kind="pie",ax=axes[1],autopct="%.2f%%")
分析:
男女比例约为 22:77
可见女性占主要成分
# City-level counts including NaN.
# NOTE(review): ascending=False has no effect while sort=False — confirm intent.
underexpose_user_feat.user_city_level.value_counts(dropna=False,ascending=False,sort=False)
# Bar + pie charts of the city-level distribution.
fig,axes = plt.subplots(1,2,figsize=(10,5))
ax1=underexpose_user_feat.user_city_level.value_counts().plot(kind="bar",ax=axes[0])
ax2=underexpose_user_feat.user_city_level.value_counts().plot(kind="pie",ax=axes[1],autopct="%.2f%%")
分析:
2、6城市中的用户最多
其次是3、5、1
最少的7
建议:可以根据这些信息,改进热度召回等等
下面就是分别求出了不同性别,年龄段、城市的用户id
# Female users: extract their ids and dump to CSV for later per-segment analysis.
female_user_id=underexpose_user_feat[underexpose_user_feat['user_gender']=='F']
female_user_id=female_user_id.user_id
female_user_id.to_csv('female_user_id.csv')
# Male users.
male_user_id=underexpose_user_feat[underexpose_user_feat['user_gender']=='M']
male_user_id=male_user_id.user_id
male_user_id.to_csv('male_user_id.csv')
# City level 1 users (there are 6 city levels in total).
city1_user_id=underexpose_user_feat[underexpose_user_feat['user_city_level']==1.0]
city1_user_id=city1_user_id.user_id
city1_user_id.to_csv('city1_user_id.csv')
# Age level 1 users (there are 8 age levels in total).
age1_user_id=underexpose_user_feat[underexpose_user_feat['user_age_level']==1.0]
age1_user_id=age1_user_id.user_id
age1_user_id.to_csv('age1_user_id.csv')
整理往年Top方案
《Factorization Meets the Neighborhood: a Multifaceted Collaborative Filtering Model》
–Yehuda Koren
额,应该是kdd 2008时候,前几名用的一个思想,解决的也是bias问题,数据集和我们类似,不过他们的有评分,我们的只有click
主要的思想就是,对评分求均值,之后低于均值的,在输出评分上-1,高于均值的+0.5
我们可不可以,按用户的性别,城市,年龄的重要性进行一种加减评分
emmmm,新手,随便说说
他的数据集
和这次比赛,个人感觉挺像的
我这里也有代码复现:
# Imports for data io operations
from collections import deque
from six import next
import readers
# Main imports for training: TF1-style graph mode running on TF2.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import numpy as np
# Evaluate train times per epoch
import time
# Constant seed for replicating training results
np.random.seed(42)
u_num = 6040 # Number of users in the dataset (MovieLens-1M)
i_num = 3952 # Number of movies in the dataset
batch_size = 1000 # Number of samples per batch
dims = 5 # Latent-factor dimensionality (the original notebook also tried 15)
max_epochs = 50 # Number of times the network sees all the training data
# Device used for all computations
place_device = "/cpu:0"
device="/cpu:0"
def get_data():
    """Load the MovieLens ratings file and split it into train/test sets.

    Reads ``./ml-1m/ratings.dat`` (``::``-separated rows of user ID,
    item ID, rating, timestamp — e.g. ``3::1196::4::978297539``),
    shuffles the rows, and returns a 90% / 10% (train, test) split as
    two DataFrames with fresh integer indices.
    """
    df = readers.read_file("./ml-1m/ratings.dat", sep="::")
    rows = len(df)
    # Shuffle rows by pure positional indexing so the split is random.
    df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)
    # Separate data into train and test, 90% for train and 10% for test.
    split_index = int(rows * 0.9)
    df_train = df[0:split_index]
    df_test = df[split_index:].reset_index(drop=True)
    return df_train, df_test
def clip(x):
    """Clamp prediction(s) to the valid MovieLens rating range [1.0, 5.0]."""
    return np.clip(x, 1.0, 5.0)
def loss(infer, regularizer, rate_batch, learning_rate=0.001, reg=0.1, device="/cpu:0"):
    """Build the training cost and optimizer op for the SVD model.

    Args:
        infer: predicted-rating tensor from model().
        regularizer: L2 penalty tensor built from the embeddings.
        rate_batch: ground-truth rating placeholder.
        learning_rate: SGD step size.
        reg: weight of the regularization penalty.
        device: TF device string the ops are pinned to.

    Returns:
        (cost, train_op) tuple.
    """
    with tf.device(device):
        # Squared-error data term (tf.nn.l2_loss is half the squared L2 norm).
        cost_l2 = tf.nn.l2_loss(tf.subtract(infer, rate_batch))
        penalty = tf.constant(reg, dtype=tf.float32, shape=[], name="l2")
        cost = tf.add(cost_l2, tf.multiply(regularizer, penalty))
        # NOTE(review): the original comment called this 'Follow the
        # Regularized Leader', but it is plain gradient descent.
        train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
    return cost, train_op
# Read data from ratings file to build a TF model
df_train, df_test = get_data()
# Whole batches per epoch (any remainder rows are dropped).
samples_per_batch = len(df_train) // batch_size
print("Number of train samples %d, test samples %d, samples per batch %d" %
(len(df_train), len(df_test), samples_per_batch))
# Peeking at the top 5 user values
print(df_train["user"].head())
print(df_test["user"].head())
# Peeking at the top 5 item values
print(df_train["item"].head())
print(df_test["item"].head())
# Peeking at the top 5 rate values
print(df_train["rate"].head())
print(df_test["rate"].head())
# Model parameters: a global bias scalar, per-user / per-item bias vectors,
# and the latent-factor embedding tables read by model() below.
bias_global = tf.get_variable("bias_global", shape=[])
w_bias_user = tf.get_variable("embd_bias_user", shape=[u_num])
w_bias_item = tf.get_variable("embd_bias_item", shape=[i_num])
w_user = tf.get_variable("embd_user", shape=[u_num, dims],
initializer=tf.truncated_normal_initializer(stddev=0.02))
w_item = tf.get_variable("embd_item", shape=[i_num, dims],
initializer=tf.truncated_normal_initializer(stddev=0.02))
def model(user_batch, item_batch, user_num, item_num, dim=5, device="/cpu:0"):
    """Build the SVD inference graph: <u, v> + global, user and item biases.

    Args:
        user_batch, item_batch: int32 id tensors of shape [batch].
        user_num, item_num: total user / item counts (unused here; the
            embedding tables are module-level globals — kept for
            interface compatibility).
        dim: embedding dimensionality (unused for the same reason).
        device: TF device for the arithmetic ops below.

    Returns:
        (infer, regularizer): predicted ratings and the L2 penalty term.
    """
    # Embedding lookups are pinned to the CPU; NOTE(review): the `device`
    # argument only affects the arithmetic block below — confirm intended.
    with tf.device("/cpu:0"):
        with tf.variable_scope('lsi', reuse=True):
            # Per-example bias terms for this batch.
            bias_user = tf.nn.embedding_lookup(w_bias_user, user_batch, name="bias_user")
            bias_item = tf.nn.embedding_lookup(w_bias_item, item_batch, name="bias_item")
            # Latent-factor rows for this batch.
            embd_user = tf.nn.embedding_lookup(w_user, user_batch, name="embedding_user")
            embd_item = tf.nn.embedding_lookup(w_item, item_batch, name="embedding_item")
    with tf.device(device):
        # Predicted rating = dot(u, v) + b_global + b_user + b_item.
        infer = tf.reduce_sum(tf.multiply(embd_user, embd_item), 1)
        infer = tf.add(infer, bias_global)
        infer = tf.add(infer, bias_user)
        infer = tf.add(infer, bias_item, name="svd_inference")
        # Half-L2 norm of both embedding batches, used as the reg penalty.
        regularizer = tf.add(tf.nn.l2_loss(embd_user), tf.nn.l2_loss(embd_item),
                             name="svd_regularizer")
    return infer, regularizer
# Using a shuffle iterator to generate random batches, for training
iter_train = readers.ShuffleIterator([df_train["user"],
df_train["item"],
df_train["rate"]],
batch_size=batch_size)
# Sequentially generate one-epoch batches, for testing
# (batch_size=-1 yields the whole test set as a single batch).
iter_test = readers.OneEpochIterator([df_test["user"],
df_test["item"],
df_test["rate"]],
batch_size=-1)
# Graph-mode feed placeholders for one batch of (user, item, rating).
user_batch = tf.placeholder(tf.int32, shape=[None], name="id_user")
item_batch = tf.placeholder(tf.int32, shape=[None], name="id_item")
rate_batch = tf.placeholder(tf.float32, shape=[None])
infer, regularizer = model(user_batch, item_batch, user_num=u_num, item_num=i_num, dim=dims, device=place_device)
_, train_op = loss(infer, regularizer, rate_batch, learning_rate=0.0010, reg=0.05, device=place_device)
#print(user_batch)
saver = tf.train.Saver()
init_op = tf.global_variables_initializer()
# Train for max_epochs passes, reporting train/validation RMSE once per epoch.
with tf.Session() as sess:
    sess.run(init_op)
    print("%s\t%s\t%s\t%s" % ("Epoch", "Train Error", "Val Error", "Elapsed Time"))
    # Rolling window of per-batch squared errors (~one epoch worth).
    errors = deque(maxlen=samples_per_batch)
    start = time.time()
    for i in range(max_epochs * samples_per_batch):
        users, items, rates = next(iter_train)
        _, pred_batch = sess.run([train_op, infer], feed_dict={user_batch: users,
                                                               item_batch: items,
                                                               rate_batch: rates})
        pred_batch = clip(pred_batch)
        errors.append(np.power(pred_batch - rates, 2))
        if i % samples_per_batch == 0:
            # Epoch boundary: report train RMSE and evaluate on the test split.
            train_err = np.sqrt(np.mean(errors))
            test_err2 = np.array([])
            for users, items, rates in iter_test:
                pred_batch = sess.run(infer, feed_dict={user_batch: users,
                                                        item_batch: items})
                pred_batch = clip(pred_batch)
                test_err2 = np.append(test_err2, np.power(pred_batch - rates, 2))
            end = time.time()
            print("%02d\t%.3f\t\t%.3f\t\t%.3f secs" % (i // samples_per_batch, train_err, np.sqrt(np.mean(test_err2)), end - start))
            start = end
    saver.save(sess, './save/')
readers.py
from __future__ import absolute_import, division, print_function
import numpy as np
import pandas as pd
def read_file(filname, sep="\t"):
    """Read a delimited ratings file into a DataFrame with 0-based ids.

    Args:
        filname: path to the ratings file; rows are
            (user, item, rate, timestamp) with no header.
        sep: field delimiter, e.g. "::" for MovieLens.

    Returns:
        DataFrame with int32 ``user``/``item`` columns (shifted from
        1-based file ids to 0-based embedding indices), float32 ``rate``,
        and the raw timestamp in ``st``.
    """
    col_names = ["user", "item", "rate", "st"]
    df = pd.read_csv(filname, sep=sep, header=None, names=col_names, engine='python')
    # File ids are 1-based; embedding tables index from 0.
    df["user"] -= 1
    df["item"] -= 1
    for col in ("user", "item"):
        df[col] = df[col].astype(np.int32)
    df["rate"] = df["rate"].astype(np.float32)
    return df
class ShuffleIterator(object):
    """Endless iterator yielding uniformly random batches of rows.

    ``inputs`` is a list of equal-length columns; each ``next()`` returns
    a list of column arrays for ``batch_size`` rows sampled uniformly
    with replacement.
    """

    def __init__(self, inputs, batch_size=10):
        self.batch_size = batch_size
        self.num_cols = len(inputs)
        self.len = len(inputs[0])
        # Stack the columns into one (len, num_cols) matrix for fast row picks.
        self.inputs = np.transpose(np.vstack([np.array(inputs[i]) for i in range(self.num_cols)]))

    def __len__(self):
        return self.len

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        ids = np.random.randint(0, self.len, (self.batch_size,))
        out = self.inputs[ids, :]
        return [out[:, i] for i in range(self.num_cols)]


class OneEpochIterator(ShuffleIterator):
    """Sequential batch iterator that stops after one full pass.

    Typically used for test data; ``batch_size <= 0`` yields the whole
    dataset as a single batch.
    """

    def __init__(self, inputs, batch_size=10):
        super(OneEpochIterator, self).__init__(inputs, batch_size=batch_size)
        if batch_size > 0:
            # Cast to int explicitly: np.ceil returns a float.
            self.idx_group = np.array_split(np.arange(self.len), int(np.ceil(self.len / batch_size)))
        else:
            self.idx_group = [np.arange(self.len)]
        self.group_id = 0

    def next(self):
        if self.group_id >= len(self.idx_group):
            # Reset so the iterator can be reused for another epoch.
            self.group_id = 0
            raise StopIteration
        out = self.inputs[self.idx_group[self.group_id], :]
        self.group_id += 1
        return [out[:, i] for i in range(self.num_cols)]
制定个人参赛计划
我的任务是召回,我的计划就是再研究研究召回。
- 18号主要还是再研究item_cf 和SVD,这两个基础模型 可以话,再用用suprise这个库,里面有常用的一些模型算法可以调用
- 19号开始研究DeepMatch
这几天,个人主要自己写了一个基于SVD分解的50item推荐
我数据处理的可能有问题,有些结果不是很对
import pandas as pd
import numpy as np
import time
import sqlite3


def _load_clicks(path_template):
    """Concatenate the 7 per-phase click files named by ``path_template``.

    Each file holds (user_id, item_id, time) rows; the timestamp is
    dropped and replaced by a constant binary ``click`` flag.
    Phase order (1..6 then 0) matches the original notebook's appends.
    """
    frames = [pd.read_csv(path_template.format(phase),
                          names=['user_id', 'item_id', 'time'])
              for phase in (1, 2, 3, 4, 5, 6, 0)]
    df = pd.concat(frames, ignore_index=True)
    df['click'] = 1  # every logged row is exactly one click
    return df.drop(columns=['time'])


# DataFrame.append is deprecated — build each frame with a single concat.
train_dataset_click = _load_clicks('./data/underexpose_train/underexpose_train_click-{}.csv')
train_dataset_click.info()
test_dataset_click = _load_clicks('./data/underexpose_test/underexpose_test_click-{}.csv')
test_dataset_click.info()
# All clicks, train + test combined.
dataset_click = pd.concat([train_dataset_click, test_dataset_click], ignore_index=True)
dataset_click.info()
# Load the 7 phases of qtime files (the users we must recommend for)
# and stack them, keeping the original phase order (1..6 then 0).
qtime_frames = [
    pd.read_csv('./data/underexpose_test/underexpose_test_qtime-{}.csv'.format(phase),
                names=['user_id', 'time'])
    for phase in (1, 2, 3, 4, 5, 6, 0)]
all_user_id = pd.concat(qtime_frames, ignore_index=True)
# Flat, re-indexed Series of the query user ids (the original code
# called reset_index without assigning the result — fixed here).
user_id = all_user_id.user_id.reset_index(drop=True)
user_id.head()
len(user_id)
len(all_user_id)
# Submission skeleton: one row per query user, 50 recommendation slots.
# Each 'itemN' column starts as placeholder ids 0..n-1 and is overwritten
# by the recommender below.  The 50 duplicated literal columns of the
# original are collapsed into a comprehension.
n_users = len(all_user_id)
placeholder = pd.Series(np.arange(0, n_users)).astype('int32')
submit = pd.DataFrame({'user_id': user_id.astype('int32'),
                       **{'item%d' % i: placeholder.copy() for i in range(1, 51)}})
submit
dataset_click.shape
dataset_click.info()
# Dump the combined clicks as a headerless CSV (user_id,item_id,click)
# for the line-by-line counting passes below.
dataset_click.to_csv('train_click-1.txt',index=False,header=False)
对每一个用户,分别统计他的点击总量
# Total clicks per user, in one pass over the headerless dump
# (columns: user_id,item_id,click).  The original accumulated via two
# duplicated dict.update calls; dict.get does the same in one line.
user_clicks = {}
with open('train_click-1.txt') as f:  # file must be purely numeric CSV
    for line in f:
        fields = line.split(',')
        # fields[0] = user_id, fields[2] = click flag (always 1 here).
        user_clicks[fields[0]] = user_clicks.get(fields[0], 0) + int(fields[2])
click_count_df = pd.DataFrame([{'user_id': k, 'click': v} for k, v in user_clicks.items()])
click_count_df = click_count_df.sort_values(by='click', ascending=False)
click_count_df.to_csv(path_or_buf='user_clickcount_df.csv', index=False)
click_count_df.head()
click_count_df.info()
对每一个item,分别统计他的点击总量
# Total clicks per item, same single pass over the dump.
item_clicks = {}
with open('train_click-1.txt') as f:  # file must be purely numeric CSV
    for line in f:
        fields = line.split(',')
        # fields[1] = item_id, fields[2] = click flag.
        item_clicks[fields[1]] = item_clicks.get(fields[1], 0) + int(fields[2])
item_click_count_df = pd.DataFrame([{'item_id': k, 'click': v} for k, v in item_clicks.items()])
item_click_count_df = item_click_count_df.sort_values(by='click', ascending=False)
item_click_count_df.to_csv(path_or_buf='item_clickcount_df.csv', index=False)
item_click_count_df.head()
item_click_count_df.info()
看看目前的排行情况
# Reload the per-user / per-item click-count rankings from disk.
user_clickcount_df = pd.read_csv(filepath_or_buffer='user_clickcount_df.csv')
user_clickcount_df.head(n =10)
user_clickcount_df.info()
item_clickcount_df = pd.read_csv(filepath_or_buffer='item_clickcount_df.csv')
item_clickcount_df.head(10)
item_clickcount_df.info()
取其中一部分数(按大小排好序的了,这些应该是比较重要的数据),作为我们的实验数据,这里数据可能有问题
前18866个用户的点击量占到了约92%
# Share of total clicks covered by the top-18866 users (~92%).
# NOTE(review): the numerator sums user clicks but the denominator sums
# item clicks — both should total the same log; verify they agree.
total_click_count = sum(item_clickcount_df.click)
print((float(user_clickcount_df.head(n=18866).click.sum()) / total_click_count) * 100)
user_click_count_subset = user_clickcount_df.head(n=18866)  # keep the top-18866 users as the user subset
前37000item的点击量占到了92%
# Clicks covered by the top-67000 items (~92% of the total).
print ((float(item_clickcount_df.head(n=67000).click.sum())/total_click_count)*100)
item_click_count_subset = item_clickcount_df.head(n=67000) # keep the top-67000 items as the item subset
取出用户、item 子集的 id 列表
# Plain Python lists of the retained ids, used for isin() filtering below.
user_subset = list(user_click_count_subset.user_id)
item_subset = list(item_click_count_subset.item_id)
过滤掉其他用户数据
# Keep only rows whose user_id is in the retained user subset;
# intermediates are deleted right away to limit memory use.
dataset_click_sub = dataset_click[dataset_click.user_id.isin(user_subset) ] # rows whose user_id is in user_subset
del(dataset_click)
# Within those rows, keep only items from the retained item subset.
dataset_click_sub_item = dataset_click_sub[dataset_click_sub.item_id.isin(item_subset)]
del(dataset_click_sub)
# Final data restricted to user_subset x item_subset.
dataset_click_sub_item.to_csv(path_or_buf='dataset_click_sub_item.csv', index=False)
我们的数据量原来是(242132, 3)
# Peek at the filtered data; the pasted notebook output tuple that
# followed .shape in the original has been removed from the code.
dataset_click_sub_item.shape
dataset_click_sub_item.head()
## pd.merge()可以试试 可以加上这个用户和item的特征,再进行分析
## 可以分析出那些特征比较重要,有空可以试试
推荐系统
import Recommenders as Recommenders
from sklearn.model_selection import train_test_split
简单暴力,排行榜单推荐
# 60/40 train/test split of the filtered click log (fixed seed for reproducibility).
train_data, test_data = train_test_split(dataset_click_sub_item, test_size = 0.40, random_state=0)
train_data.head()
def create_popularity_recommendation(train_data, user_id, item_id, k=50):
    """Rank items by popularity (interaction count) and return the top-k.

    Args:
        train_data: DataFrame of interactions.
        user_id: name of the user-id column; its per-item count is the score.
        item_id: name of the item-id column to group by.
        k: number of items to return (default 50 — one per submission slot;
            generalized from the original hard-coded 50).

    Returns:
        DataFrame with columns [item_id, 'score', 'Rank'], best item first.
    """
    # Popularity score = number of interaction rows per item.
    grouped = train_data.groupby([item_id]).agg({user_id: 'count'}).reset_index()
    grouped.rename(columns={user_id: 'score'}, inplace=True)
    # Highest score first; ties broken by ascending item id.
    ranked = grouped.sort_values(['score', item_id], ascending=[False, True])
    # Dense 1..n rank in score order ('first' keeps the sort's tie order).
    ranked['Rank'] = ranked['score'].rank(ascending=False, method='first')
    return ranked.head(k)
# Fill every user's 50 slots with the same global top-50 popular items.
recommendations = create_popularity_recommendation(train_data, 'user_id', 'item_id')
recommendations
recommendations.item_id.values[0]
for i in range(1, 51):
    item_id = 'item' + str(i)
    # .ix was removed from pandas — use .loc for boolean-mask assignment.
    submit.loc[submit.user_id > 0, item_id] = recommendations.item_id.values[i - 1]
submit.to_csv('submission.csv', index=False, header=False)
基于歌曲相似度的推荐
# item_count_subset=item_clickcount_df.head(n=5000)
# user_subset=list(user_click_subset.user_id)
# Rebuild the id lists (same as above) and fit the item-similarity
# recommender from the external Recommenders helper module.
user_subset = list(user_click_count_subset.user_id)
item_subset = list(item_click_count_subset.item_id)
is_model = Recommenders.item_similarity_recommender_py()
is_model.create(train_data, 'user_id', 'item_id')
# Sample user: list their items and show the model's recommendations.
# NOTE(review): this rebinds `user_id` (previously the qtime Series) —
# confirm later cells do not rely on the old value.
user_id = list(train_data.user_id)[7]
user_items = is_model.get_user_items(user_id)
is_model.recommend(user_id)
基于矩阵分解(SVD)的推荐
先计算item被当前用户点击量 / 用户点击总量 当做分值
# Per-user total click counts, used to normalize per-item clicks into scores.
user_click_item_sum_df = train_data[['user_id','click']].groupby('user_id').sum().reset_index()
user_click_item_sum_df.rename(columns={'click':'total_click_count'},inplace=True)
user_click_item_sum_df.head()
# Attach each user's total to every one of their click rows (joins on user_id).
item_user_merged = pd.merge(train_data,user_click_item_sum_df)
item_user_merged.head()
#item_user_merged.info()
fractional_click_count 即得分:item 被当前用户点击量 / 该用户点击总量
# Implicit "rating": fraction of the user's total clicks spent on this item.
item_user_merged['fractional_click_count'] = item_user_merged['click']/item_user_merged['total_click_count']
item_user_merged[item_user_merged.user_id ==1][['user_id','item_id','click','fractional_click_count']].head()
from scipy.sparse import coo_matrix
small_set = item_user_merged
# Commented-out alternative from the tutorial: build dense 0..n-1 index
# codes for users/items instead of using the raw ids directly.
# user_codes = small_set.user_id.drop_duplicates().reset_index()
# item_codes = small_set.item_id.drop_duplicates().reset_index()
# user_codes.rename(columns={'index':'user_index'}, inplace=True)
# item_codes.rename(columns={'index':'item_index'}, inplace=True)
# item_codes['it_index_value'] = list(item_codes.index)
# user_codes['us_index_value'] = list(user_codes.index)
# small_set = pd.merge(small_set,item_codes,how='left')
# small_set = pd.merge(small_set,user_codes,how='left')
# Build the sparse user-item rating matrix straight from the raw ids.
# NOTE(review): raw user_id/item_id are used as row/col indices, so the
# matrix is as large as the max id and duplicate (user,item) pairs are
# summed by COO semantics — confirm both are intended.
mat_candidate = small_set[['user_id','item_id','fractional_click_count']]
data_array = mat_candidate.fractional_click_count.values
row_array = mat_candidate.user_id.values
col_array = mat_candidate.item_id.values
data_sparse = coo_matrix((data_array, (row_array, col_array)),dtype=float)
data_sparse # the assembled sparse user-item matrix
import math as mt
from scipy.sparse.linalg import * #used for matrix multiplication
from scipy.sparse.linalg import svds
from scipy.sparse import csc_matrix
def compute_svd(urm, K):
    """Truncated SVD of the user-rating matrix with a symmetric split.

    Factorizes ``urm`` into rank-K factors and stores sqrt of each
    singular value on the diagonal of S, so that (U*S) and (S*Vt) are
    the two half-factors and U*S*S*Vt approximates ``urm``.

    Args:
        urm: scipy sparse matrix (users x items).
        K: number of singular values/vectors to keep (K < min(urm.shape)).

    Returns:
        Tuple of float32 CSC matrices (U, S, Vt).
    """
    U, s, Vt = svds(urm, K)
    # Diagonal matrix of sqrt singular values.
    S = np.zeros((len(s), len(s)), dtype=np.float32)
    for i in range(len(s)):
        S[i, i] = mt.sqrt(s[i])
    return (csc_matrix(U, dtype=np.float32),
            csc_matrix(S, dtype=np.float32),
            csc_matrix(Vt, dtype=np.float32))
def compute_estimated_matrix(urm, U, S, Vt, uTest, K, test):
    """Score the given users and return each one's top-250 item indices.

    Relies on module-level MAX_UID / MAX_PID for the output shapes.

    Args:
        urm: unused here (kept for interface compatibility).
        U, S, Vt: half-factors from compute_svd.
        uTest: iterable of user row indices to score.
        K, test: unused (kept for interface compatibility).

    Returns:
        int32 array (MAX_UID, 250); row u holds user u's item indices,
        best first.
    """
    rightTerm = S * Vt  # shared right factor, computed once
    max_recommendation = 250
    # BUG FIX: the original used float16 for both arrays; scores and item
    # indices above 65504 overflowed to inf — the `inf` values seen in the
    # original output.  Scores use float32, indices use int32.
    estimatedRatings = np.zeros(shape=(MAX_UID, MAX_PID), dtype=np.float32)
    recomendRatings = np.zeros(shape=(MAX_UID, max_recommendation), dtype=np.int32)
    for userTest in uTest:
        prod = U[userTest, :] * rightTerm
        estimatedRatings[userTest, :] = prod.todense()
        # Item indices sorted by descending estimated rating.
        recomendRatings[userTest, :] = (-estimatedRatings[userTest, :]).argsort()[:max_recommendation]
    return recomendRatings
# Factorize the click matrix and print top-50 recommendations for two
# sample users (the original's flattened loop body and unused rank_value
# variable are cleaned up).
K = 100
urm = data_sparse
MAX_PID = urm.shape[1]
MAX_UID = urm.shape[0]
U, S, Vt = compute_svd(urm, K)
uTest = [1, 3]
uTest_recommended_items = compute_estimated_matrix(urm, U, S, Vt, uTest, K, True)
for user in uTest:
    print("Recommendation for user with user id {}".format(user))
    for i in uTest_recommended_items[user, 0:50]:
        print(i)
结果问题感觉挺大的,竟然还有inf,数据的处理感觉有点问题
Recommendation for user with user id 1
inf
29810.0
1471.0
21420.0
27890.0
inf
21280.0
4484.0
inf
6864.0
62080.0
6570.0
8148.0
52770.0
8820.0
1591.0
15630.0
58340.0
inf
58620.0
60350.0
13850.0
5524.0
8150.0
56540.0
10936.0
2420.0
8180.0
32690.0
35740.0
13180.0
inf
inf
15976.0
2954.0
11256.0
22260.0
inf
8210.0
12900.0
27220.0
40420.0
35460.0
37000.0
1609.0
110.0
4210.0
42880.0
20340.0
63330.0
Recommendation for user with user id 3
45440.0
inf
inf
inf
36200.0
24590.0
15900.0
31820.0
5410.0
30320.0
4780.0
33800.0
inf
inf
18930.0
55940.0
4268.0
44860.0
27070.0
inf
30140.0
43940.0
9710.0
9370.0
40770.0
23730.0
841.0
20350.0
27490.0
23070.0
29620.0
64770.0
12240.0
36030.0
inf
17010.0
35140.0
inf
inf
inf
inf
35300.0
54600.0
inf
40930.0
inf
48830.0
inf
inf
inf