原题地址:天池新人实战赛之[离线赛]
题目理解:
- 数据分为用户操作集D(tianchi_fresh_comp_train_user_2w.csv)和 商品子集P(tianchi_fresh_comp_train_item_2w),我们的任务即是利用D中数据训练得到模型,从P中商品选择出D中用户最可能购买的商品
- 利用 python 的 pandas 第三方包进行数据处理
- 删去无法利用的地理位置列
- 假定操作行为的时间段对购买没有影响(能力有限没有分析)
- D中商品含有不在P中的,会对学习器的预测产生干扰,故删去这些数据
- 双十二用户会大量购买,但并不能反映用户在一周后的购买倾向,甚至会产生误导,故删去这些数据
- 将D中用户的操作行为进行哑编码,从{1, 2, 3, 4},编码为四个列向量['look'], ['like'], ['putin'], ['buy'],这样可以增强学习器的非线性拟合能力
- 将12-19号前两天的操作数据标记出来(是-1,否-0)
- 将12-19号前两天的购买数据标记出来(是-1,否-0)
- 对不同的操作数据进行加权,基于宏观的对比,计算出浏览,收藏,加购物车,购买的数量及比例
代码展示:
'''
Data filtering: drop unusable columns, remove the 2014-12-12 (Double-12
sale) rows, and keep only interactions on items inside the item subset P.
Reads the raw user log D and item subset P, writes result.csv.
'''
import pandas as pd
import numpy as np
print("reading csv file")  # load the user log D and the item subset P
df = pd.read_csv('tianchi_fresh_comp_train_user.csv')
itemP = pd.read_csv('tianchi_fresh_comp_train_item.csv')
print("read csv file DONE!")
# The geohash column is largely missing and unused downstream.
del df["user_geohash"]
print("processing time column")
# Keep only the date part of the timestamp ('YYYY-MM-DD hh' -> 'YYYY-MM-DD').
df['time'] = df['time'].astype(str).str.slice(0, 10)
# Double-12 sale rows distort normal purchase behaviour; drop them.
# (~mask instead of the original `== False` comparison.)
df = df.loc[~df['time'].str.contains('2014-12-12')]
print("process time column DONE!")
print("processing item columns")
del itemP['item_geohash']  # delete geohash column
# Compare ids as strings on both sides; a set gives O(1) membership tests.
itemP['item_id'] = itemP['item_id'].astype(str)
itemsub = set(itemP['item_id'])
df['item_id'] = df['item_id'].astype(str)
# Vectorised membership test replaces the original per-row Python loop
# (and the bool -> str -> str.contains('True') round-trip).
df = df.loc[df['item_id'].isin(itemsub)]
# Keep the marker column (all 'True' after filtering) so result.csv has
# the same columns the later pipeline steps expect.
df['item_mark'] = 'True'
print("process item columns DONE!")
df.to_csv("result.csv", index=False)  # save as new file.csv
'''
One-hot encode behavior_type (1=view, 2=favourite, 3=add-to-cart, 4=buy)
into four indicator columns look/like/putin/buy.
Reads result.csv, writes one_hot.csv.
'''
import pandas as pd
import numpy as np
df = pd.read_csv('result.csv')  # load file
# BUG FIX: the original called pd.getdummies (no such function — it is
# pd.get_dummies) and passed the whole frame; the encoding must target
# behavior_type only, producing string columns '1'..'4'.
one_hot = pd.get_dummies(df['behavior_type'].astype(str))
one_hot.rename(columns={'1': 'look', '2': 'like', '3': 'putin', '4': 'buy'},
               inplace=True)
# Rows are in the same order, so a positional concat replaces the
# original merge on a hard-coded 1970968-row id column.
df = pd.concat([df, one_hot.astype(int)], axis=1)
del df['behavior_type']  # superseded by the four indicator columns
del df['item_mark']      # constant 'True' after the filtering step
df.to_csv('one_hot.csv', index=False)  # save as new file.csv
'''
Mark the two days (2014-12-17 and 2014-12-18) immediately before the
2014-12-19 target day, and flag purchases made on those days.
Reads one_hot.csv, writes tmmk_vsr.csv (intermediate) and 2days.csv.
'''
import pandas as pd
import numpy as np
df = pd.read_csv('one_hot.csv')
# Parse dates so we can test month/day components.
df['time'] = pd.to_datetime(df['time'])
# Vectorised date test replaces the original .apply that appended to a
# side-effect list (which silently depended on apply's call order).
df['time_mark'] = ((df['time'].dt.month == 12)
                   & df['time'].dt.day.isin([17, 18])).astype(int)
# Round-trip through CSV with the index kept, so the auto index becomes
# the 'id' column and 'time' reverts to a string — matching the
# intermediate file the original pipeline produced.
df.to_csv('tmmk_vsr.csv')
df = pd.read_csv('tmmk_vsr.csv')
df.rename(columns={'Unnamed: 0': 'id'}, inplace=True)
# 1 only for purchases that happened on 17/18 December.
df['2days'] = df['buy'] * df['time_mark']
df.to_csv('2days.csv', index=False)  # save as new file.csv
'''
Weighting: fold the four behaviour indicator columns into one score,
scaled so each behaviour contributes in proportion to how often it
converts relative to purchases. Reads 2days.csv, writes relation.csv.
'''
import pandas as pd
import numpy as np
df = pd.read_csv('2days.csv')
# Behaviour totals measured once over the full data set.
TOTAL_LOOK = 1863827   # df['look'].sum()
TOTAL_LIKE = 32506     # df['like'].sum()
TOTAL_PUTIN = 53646    # df['putin'].sum()
TOTAL_BUY = 20989      # df['buy'].sum()
# NOTE(review): the column name 'wight' (sic) is deliberately kept —
# later scripts in this pipeline read it under that spelling.
df['wight'] = ((df['look'] * (TOTAL_BUY / TOTAL_LOOK)
                + df['like'] * (TOTAL_BUY / TOTAL_LIKE)
                + df['putin'] * (TOTAL_BUY / TOTAL_PUTIN)
                + df['buy'] + df['time_mark'])
               * ((2 - df['2days']) / 2))
df.to_csv('relation.csv', index=False)  # save as new file.csv
至此,对2千万条数据做了初步的预处理,目前共有1970970条数据
考虑到浏览操作的样本太多且对购买影响很小,并且数据集存在着样本的极度不平衡问题——反例过多,正例过少,导致学到的模型即使判断所有的样本为反例也能取得不错的评分,所以我筛除了浏览操作的样本,并且把18号加入购物车操作的样本筛选出来作为预测集,再把预测为正例的样本筛选出来作为最后的结果。
import pandas as pd
import numpy as np
# Views vastly outnumber the other behaviours and carry little purchase
# signal, so keep only favourite / add-to-cart / buy rows.
data = pd.read_csv("data.csv")
non_view = data['look'] != 1
miniDataSet = data.loc[non_view].drop(columns=['id', 'look'])
# Persist the reduced training set for the modelling step.
miniDataSet.to_csv("dataSet.csv", index=False)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split  # data split
from sklearn.metrics import classification_report  # model evaluation
from sklearn import svm  # support vector machine
from sklearn import metrics
# Train an SVM on the reduced data set and write the submission file.
data = pd.read_csv("dataSet.csv", index_col="time")
data['label_y'] = data['buy']  # purchases are the positive label
# Prediction set: rows from 2014-12-18 with an add-to-cart action.
# BUG FIX: .ix was removed from pandas — use .loc throughout.
outputSet = data.loc['2014-12-18']
outputSet = outputSet.loc[outputSet['putin'] == 1]
# Feature columns shared by training and prediction ('wight' is the
# weighted-score column produced upstream, spelling kept as-is).
FEATURES = ['user_id', 'item_id', 'item_category', 'putin', 'buy',
            'time_mark', 'wight']
X = data.loc[:, FEATURES]
y = data['label_y']
# Split into train / held-out test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
clf = svm.SVC(C=100, class_weight='balanced')
# Train the model.
clf.fit(X_train, y_train)
# Predict on the held-out split.
predict = clf.predict(X_test)
# Evaluate.
print(clf.score(X_test, y_test))
print(classification_report(y_test, predict))
print(metrics.f1_score(y_test, predict, average='weighted'))
X = outputSet.loc[:, FEATURES]
# Predict on the submission candidates and keep the positives.
output = clf.predict(X)
X['output'] = output
X = X.loc[X['output'] > 0.0]
X = X.loc[:, ['user_id', 'item_id']]
# BUG FIX: the original `del X['time']` raised KeyError — 'time' is the
# index (set via index_col), not a column; index=False already drops it.
# Save the final result.
X.to_csv('tianchi_mobile_recommendation_predict.csv', index=False)
上传结果后,成绩如下
总结:
- 做题应该耐心读题,先把题目搞懂再动手做
- TianChi官网也有很多免费的学习资源,要善于利用互联网上的资源