1、比赛内容:
https://tianchi.aliyun.com/getStart/introduction.htm?spm=5176.100066.0.0.62a0c916DZRdDr&raceId=231574
2、解题思路:将转发、评论、点赞次数作为物品,用基于物品的协同过滤(ItemCF)推荐给博文(预先用结巴分词抽取关键词作为博文标签)
3、参考代码:
1)主程序
# -*- coding: utf-8 -*-
'''
Created on 2017年10月31日
@author: Jason.F
'''
import time
import pandas as pd
from sklearn import cross_validation
import jieba
import jieba.analyse
from ItemCF import ItemCF
class cWeibo:
def __init__(self,path):
self.path=path
def importData(self):
    """Load the labelled and prediction files into DataFrames on self.

    Reads ``weibo_train_data.txt`` and ``weibo_predict_data.txt`` from the
    directory ``self.path`` (tab separated, UTF-8, column names supplied
    here) and stores:

    * ``self.traindata`` -- every labelled row (full-data training).
    * ``self.testdata``  -- a random 20% sample of the labelled rows.
    * ``self.predata``   -- the rows to predict (no count columns).

    Prints the row and column count of each of the three sets.
    """
    # Imported locally so this method does not depend on a file-level
    # import. os.path.join replaces the hard-coded '\\' separator, which
    # only produced a valid path on Windows.
    import os

    base_dir = self.path

    # Labelled sample set.
    labelled = pd.read_csv(
        os.path.join(base_dir, 'weibo_train_data.txt'),
        encoding='utf8',
        sep='\t',
        names=['luid', 'mid', 'time', 'fcs', 'ccs', 'lcs', 'cont'],
    )  # add nrows=1000 for a quick trial run

    # Counts one week after posting. Per the original notes their weights
    # are: fcs (forwards) 0.5, ccs (comments) 0.25, lcs (likes) 0.25.
    for count_col in ('fcs', 'ccs', 'lcs'):
        labelled[count_col] = labelled[count_col].astype('int')

    # Only the held-out part of the split is used; training deliberately
    # keeps every labelled row.
    # NOTE(review): testdata is therefore a subset of traindata, so any
    # score measured on it will be optimistic -- confirm this is intended.
    _, held_out = cross_validation.train_test_split(labelled, test_size=0.2)
    self.traindata = pd.DataFrame(labelled)
    self.testdata = pd.DataFrame(held_out)

    # Single-argument print(...) emits the same text as the former print
    # statements and is valid on both Python 2 and Python 3.
    print('训练集,有: %d 行 %d 列'
          % (self.traindata.shape[0], self.traindata.shape[1]))
    print('测试集,有: %d 行 %d 列'
          % (self.testdata.shape[0], self.testdata.shape[1]))

    # Prediction set: same layout minus the three count columns.
    self.predata = pd.read_csv(
        os.path.join(base_dir, 'weibo_predict_data.txt'),
        encoding='utf8',
        sep='\t',
        names=['luid', 'mid', 'time', 'cont'],
    )  # add nrows=100 for a quick trial run
    print('预测集,有: %d 行 %d 列'
          % (self.predata.shape[0], self.predata.shape[1]))
def ETL(self):
'''
#uid映射为数字编号
ut_train=set(self.traindata.ix[:,0])
ut_pred=set(self.predata.ix[:,0])
ut=list(ut_train.symmetric_difference(ut_pred))#取并集并去重
df_ut=pd.DataFrame(ut,columns=['luid'])
df_ut['uid']=df_ut.index
self.traindata=pd.merge(self.traindata,df_ut, on=['luid'], how='left')
self.traindata=self.traindata[['uid','time','fcs','ccs','lcs','cont']]
self.testdata=pd.merge(self.testdata,df_ut, on=['luid'], how='left')
self.testdata=self.testdata[['uid','time','fcs','ccs','lcs','cont']]
self.predata=pd.merge(self.predata,df_ut, on=['luid'], how='left')
self.predata=self.predata[['luid','mid'