导包
import pandas as pd
import numpy as np
from datetime import date #处理时间数据
import warnings
warnings.filterwarnings('ignore') #忽略警告
数据读入
# Load the three raw tables. header=None makes pandas treat every line as data
# (column names are assigned explicitly below), and keep_default_na=False stops
# pandas from converting 'null' / empty fields into NaN, so they stay strings.

# Offline training records.
off_train = pd.read_csv(
    'data/ccf_offline_stage1_train.csv',
    header=None,
    keep_default_na=False,
)
off_train.columns = [
    'user_id', 'merchant_id', 'coupon_id', 'discount_rate',
    'distance', 'date_received', 'date',
]

# Offline test records (no 'date' column).
off_test = pd.read_csv(
    'data/ccf_offline_stage1_test_revised.csv',
    header=None,
    keep_default_na=False,
)
off_test.columns = [
    'user_id', 'merchant_id', 'coupon_id', 'discount_rate',
    'distance', 'date_received',
]

# Online training records (has an 'action' column and no 'distance' column).
on_train = pd.read_csv(
    'data/ccf_online_stage1_train.csv',
    header=None,
    keep_default_na=False,
)
on_train.columns = [
    'user_id', 'merchant_id', 'action', 'coupon_id',
    'discount_rate', 'date_received', 'date',
]
keep_default_na=False:不让 pandas 把文件中的 'null' 字符串(以及空字段)自动解析为 NaN,而是原样保留为字符串 'null',方便后续直接用 == 'null' 判断和处理。
header=None,重新定义columns,主要把大写字母改成小写。
简单查看数据
# Quick interactive look at the offline training table. These are bare
# expressions: they only display something when run cell-by-cell.
off_train.head()
off_train.info()
off_train.describe()
off_train
off_train.shape[0]  # 1754884 record
# Rows that carry a coupon: missing coupons are stored as the string 'null'.
# (The original used typographic quotes here, which is a SyntaxError.)
off_train[off_train['coupon_id'] != 'null'].shape[0]  # 1053282 with coupon_id
# Alternative filter that drops the coupon-less rows in place:
# off_train = off_train[~off_train['coupon_id'].isin(['null'])]
off_train['coupon_id'].unique().shape[0]  # 9738 coupon
off_train['user_id'].unique().shape[0]  # 539438 users
off_train['merchant_id'].unique().shape[0]  # 8415 merchants
# Per-column minimum over rows that have a consumption date; swap .min() for
# .max() to get the upper end of each range.
off_train[off_train['date'] != 'null'].min()  # max()
# Figures noted by the original author:
#1754884 record
#1053282 with coupon_id
#9738 coupon
#date_received:20160101~20160615
#date:20160101~20160630
#539438 users
#8415 merchants
# Display the offline test table and count its distinct users.
off_test
off_test['user_id'].unique().shape[0]
# Number of users that appear in both the online and the offline training
# tables, computed as a set intersection.
len(set(on_train['user_id']).intersection(off_train['user_id']))
# Figures noted by the original author:
#2050 coupon_id.
#date_received:20160701~20160731,
#76309 users(76307 in trainset, 35965 in online_trainset)
#1559 merchants(1558 in trainset)
大致看一下,上面的数据不一定对,懒得run了。
# Display the online training table (interactive inspection only; as a bare
# expression it has no effect when the file is run as a script).
on_train
# Figures noted by the original author:
#11429826 record
#872357 coupon_id
#762858 user(267448 in off_train)
数据集划分
简单来讲就是你需要对你的数据按照一定的时间划分。比如比赛用前五天的数据预测后一天的,但是总的数据集是两个月的,这时候你需要划分你的数据集分好训练集和验证集。
上面的1,2,3是三个训练集,后面带的小框是验证集,等于你需要训练好几个模型。同时,更早之前的数据肯定对现在的预测影响会更小,所以会有个权重的问题,也就是你得到了十个模型,最接近预测日的算model_0,最远的算model_9,那么给model_0权重0.7,给model_9的权重0.05。
dataset split:
dataset3: 20160701~20160731 (113640),features3 from 20160315~20160630 (off_test)
dataset2: 20160515~20160615 (258446),features2 from 20160201~20160514
dataset1: 20160414~20160514 (138303),features1 from 20160101~20160413
# Time-based split into three (label set, feature set) pairs.
#
# Each datasetN holds the coupon receipts to be labelled/predicted, selected by
# date_received. Each featureN holds the earlier history used to build
# features: rows consumed inside the feature window, plus rows that were never
# consumed (date == 'null') but whose coupon was received inside the window.
# Dates are compared as 'YYYYMMDD' strings, which order lexicographically.

# dataset3: the offline test set; features from 20160315~20160630.
dataset3 = off_test
feature3 = off_train[
    ((off_train.date >= '20160315') & (off_train.date <= '20160630'))
    | (
        (off_train.date == 'null')
        & (off_train.date_received >= '20160315')
        & (off_train.date_received <= '20160630')
    )
]

# dataset2: coupons received 20160515~20160615; features from 20160201~20160514.
dataset2 = off_train[
    (off_train.date_received >= '20160515')
    & (off_train.date_received <= '20160615')
]
feature2 = off_train[
    ((off_train.date >= '20160201') & (off_train.date <= '20160514'))
    | (
        (off_train.date == 'null')
        & (off_train.date_received >= '20160201')
        & (off_train.date_received <= '20160514')
    )
]

# dataset1: coupons received 20160414~20160514; features from 20160101~20160413.
dataset1 = off_train[
    (off_train.date_received >= '20160414')
    & (off_train.date_received <= '20160514')
]
feature1 = off_train[
    ((off_train.date >= '20160101') & (off_train.date <= '20160413'))
    | (
        (off_train.date == 'null')
        & (off_train.date_received >= '20160101')
        & (off_train.date_received <= '20160413')
    )
]
# feature sets: coupon received but not consumed + consumed records
# label sets: records where a coupon was received