import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
%matplotlib inline
import datetime
import os
import seaborn as sns  # data visualization
from datetime import date
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
import pickle  # for saving trained models
from sklearn.metrics import *
from sklearn.model_selection import *
train = pd.read_csv("airbnb/train_users_2.csv")
test = pd.read_csv("airbnb/test_users.csv")
print('the column names of the training dataset:\n', train.columns)
print('the column names of the test dataset:\n', test.columns)
the column names of the training dataset:
Index(['id', 'date_account_created', 'timestamp_first_active',
'date_first_booking', 'gender', 'age', 'signup_method', 'signup_flow',
'language', 'affiliate_channel', 'affiliate_provider',
'first_affiliate_tracked', 'signup_app', 'first_device_type',
'first_browser', 'country_destination'],
dtype='object')
the column names of the test dataset:
Index(['id', 'date_account_created', 'timestamp_first_active',
'date_first_booking', 'gender', 'age', 'signup_method', 'signup_flow',
'language', 'affiliate_channel', 'affiliate_provider',
'first_affiliate_tracked', 'signup_app', 'first_device_type',
'first_browser'],
dtype='object')
train.date_first_booking.describe():

count          88908
unique          1976
top       2014-05-22
freq             248
Name: date_first_booking, dtype: object

test.date_first_booking.describe():

count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: date_first_booking, dtype: float64
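Analysis: date_first_booking is populated for only 88,908 training rows and is entirely empty in the test set, so it carries no information at prediction time. The next cells switch to the sessions file, whose load step is not shown above; a minimal sketch, assuming it lives in the same airbnb/ directory and keys users by user_id:

# hedged sketch: load the sessions log and key it by the same id column
# used in train/test (sessions.csv stores the user id as user_id)
df_sessions = pd.read_csv('airbnb/sessions.csv')
df_sessions['id'] = df_sessions['user_id']
df_sessions = df_sessions.drop(['user_id'], axis=1)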
# Replace action values that occur fewer than act_freq times with 'OTHER'
act_freq = 100  # frequency threshold
act = dict(zip(*np.unique(df_sessions.action, return_counts=True)))
df_sessions.action = df_sessions.action.apply(lambda x: 'OTHER' if act[x] < act_freq else x)
# np.unique(df_sessions.action, return_counts=True) returns the distinct
# action values and their counts as a pair of arrays; zip(*(a, b)) pairs the
# elements of a and b one-to-one, and dict() turns the pairs into a
# value -> count map.
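To see what the dict(zip(*np.unique(..., return_counts=True))) idiom produces, here is a toy run on made-up values (not from the real sessions file):

# np.unique returns the distinct values and their counts as two arrays;
# zip pairs them up and dict turns the pairs into a value -> count map
toy = np.array(['show', 'show', 'click', 'show'])
vals, counts = np.unique(toy, return_counts=True)  # ['click', 'show'], [1, 3]
freq = dict(zip(vals, counts))                     # {'click': 1, 'show': 3}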
# Refine the features action, action_detail, action_type, device_type and
# secs_elapsed. First group the session rows by user id, then compute:
# - action: total number of actions per user, counts per action value, and
#   the number of unique values with the mean and std of their counts
# - action_detail: the same statistics as for action
# - action_type: the same statistics as for action, plus the total dwell
#   time per type (log-transformed)
# - device_type: the same statistics as for action
# - secs_elapsed: fill missing values with 0, then per user the
#   (log-transformed) sum, mean, std and median, the ratio sum/action-count,
#   and a histogram of the log-transformed values
# Rank each categorical value by frequency; f_*[v] gives a stable column
# index for value v
f_act = df_sessions.action.value_counts().argsort()
f_act_detail = df_sessions.action_detail.value_counts().argsort()
f_act_type = df_sessions.action_type.value_counts().argsort()
f_dev_type = df_sessions.device_type.value_counts().argsort()

# group the session rows by user id
dgr_sess = df_sessions.groupby(['id'])

# loop over dgr_sess to create all the features
samples = []        # list of feature rows, one per user
ln = len(dgr_sess)  # number of groups after the groupby

for g in dgr_sess:
    gr = g[1]  # DataFrame with all the rows for one id, e.g. 'zzywmcn0jv'
    l = []     # temporary list holding this user's features

    l.append(g[0])     # the user id
    l.append(len(gr))  # number of total actions

    # secs_elapsed: fill missing values with 0; these values are used later
    sev = gr.secs_elapsed.fillna(0).values

    # action features: how many times each action occurs, plus the number
    # of unique actions and the mean and std of their counts
    c_act = [0] * len(f_act)
    for i, v in enumerate(gr.action.values):
        c_act[f_act[v]] += 1
    _, c_act_uqc = np.unique(gr.action.values, return_counts=True)
    c_act += [len(c_act_uqc), np.mean(c_act_uqc), np.std(c_act_uqc)]
    l = l + c_act

    # action_detail features
    # (how many times each value occurs, number of unique values, mean and std)
    c_act_detail = [0] * len(f_act_detail)
    for i, v in enumerate(gr.action_detail.values):
        c_act_detail[f_act_detail[v]] += 1
    _, c_act_det_uqc = np.unique(gr.action_detail.values, return_counts=True)
    c_act_detail += [len(c_act_det_uqc), np.mean(c_act_det_uqc), np.std(c_act_det_uqc)]
    l = l + c_act_detail

    # action_type features (click etc.)
    # (how many times each value occurs, number of unique values, mean and
    # std, plus the log of the summed secs_elapsed per value)
    l_act_type = [0] * len(f_act_type)
    c_act_type = [0] * len(f_act_type)
    for i, v in enumerate(gr.action_type.values):
        l_act_type[f_act_type[v]] += sev[i]  # total dwell time per action type
        c_act_type[f_act_type[v]] += 1
    # the totals differ by orders of magnitude, so log-transform them
    l_act_type = np.log(1 + np.array(l_act_type)).tolist()
    _, c_act_type_uqc = np.unique(gr.action_type.values, return_counts=True)
    c_act_type += [len(c_act_type_uqc), np.mean(c_act_type_uqc), np.std(c_act_type_uqc)]
    l = l + c_act_type + l_act_type

    # device_type features
    # (how many times each value occurs, number of unique values, mean and std)
    c_dev_type = [0] * len(f_dev_type)
    for i, v in enumerate(gr.device_type.values):
        c_dev_type[f_dev_type[v]] += 1
    c_dev_type.append(len(np.unique(gr.device_type.values)))
    _, c_dev_type_uqc = np.unique(gr.device_type.values, return_counts=True)
    c_dev_type += [len(c_dev_type_uqc), np.mean(c_dev_type_uqc), np.std(c_dev_type_uqc)]
    l = l + c_dev_type

    # secs_elapsed features
    l_secs = [0] * 5
    l_log = [0] * 15
    if len(sev) > 0:
        # simple statistics about the secs_elapsed values (log-transformed)
        l_secs[0] = np.log(1 + np.sum(sev))
        l_secs[1] = np.log(1 + np.mean(sev))
        l_secs[2] = np.log(1 + np.std(sev))
        l_secs[3] = np.log(1 + np.median(sev))
        l_secs[4] = l_secs[0] / float(l[1])  # (log) total divided by the number of actions
        # group the log values into 15 intervals and count the values in
        # each; np.bincount counts occurrences of each value in an array of
        # non-negative ints
        log_sev = np.log(1 + sev).astype(int)
        l_log = np.bincount(log_sev, minlength=15).tolist()
    l = l + l_secs + l_log

    # the list l now holds the feature values of one sample
    samples.append(l)

# prepare the final objects
samples = np.array(samples)
samp_ar = samples[:, 1:].astype(np.float16)  # every feature except the id
samp_id = samples[:, 0]                      # the id, stored in the first column

# build a DataFrame for the extracted features
col_names = []  # names of the columns
for i in range(len(samples[0]) - 1):  # -1 because the first entry is the id
    col_names.append('c_' + str(i))
df_agg_sess = pd.DataFrame(samp_ar, columns=col_names)
df_agg_sess['id'] = samp_id
df_agg_sess.index = df_agg_sess.id  # use the id as index
df_agg_sess.head()
            c_0    c_1  c_2  c_3  c_4  c_5  c_6  c_7  c_8  c_9  ...  c_448  c_449  c_450  c_451  c_452  c_453  c_454  c_455  c_456          id
id
00023iyk9l   40.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   12.0    6.0    2.0    3.0    3.0    1.0    0.0    1.0    0.0  00023iyk9l
0010k6l0om   63.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    8.0   12.0    2.0    8.0    4.0    3.0    0.0    0.0    0.0  0010k6l0om
001wyh0pz8   90.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   27.0   30.0    9.0    8.0    1.0    0.0    0.0    0.0    0.0  001wyh0pz8
0028jgx1x1   31.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    1.0    2.0    3.0    5.0    4.0    1.0    0.0    0.0    0.0  0028jgx1x1
002qnbzfs5  789.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  111.0  102.0  104.0   57.0   28.0    9.0    4.0    1.0    1.0  002qnbzfs5

5 rows × 458 columns
# Analysis: after this feature extraction the sessions file grows from 6 features to 458 features.

# Feature extraction for the train and test files

# Record the number of rows in train and store the target variable;
# labels holds the prediction target country_destination.
train = pd.read_csv("airbnb/train_users_2.csv")
test = pd.read_csv("airbnb/test_users.csv")
# number of rows in train, so train and test can be separated again later
train_row = train.shape[0]
# the label we need to predict
labels = train['country_destination'].values
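The feature-engineering cells below operate on a single frame df that holds both train and test users; its construction is not shown above, so here is a minimal sketch of the usual pattern (the exact cell in the original notebook may differ):

# stack train (minus the label) on top of test so every transformation below
# is applied to both at once; train_row lets us split them apart again
df = pd.concat((train.drop(['country_destination'], axis=1), test),
               axis=0, ignore_index=True)
# later: xtrain = df.iloc[:train_row, :], xtest = df.iloc[train_row:, :]
# and ytrain = train['country_destination']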
# timestamp_first_active: convert to datetime
tfa = df.timestamp_first_active.astype(str).apply(
    lambda x: datetime.datetime(int(x[:4]), int(x[4:6]), int(x[6:8]),
                                int(x[8:10]), int(x[10:12]), int(x[12:])))

# extract features: year, month, day
df['tfa_year'] = np.array([x.year for x in tfa])
df['tfa_month'] = np.array([x.month for x in tfa])
df['tfa_day'] = np.array([x.day for x in tfa])

# extract feature: weekday, then one-hot encode the result
# isoweekday() returns the day of the week: Monday is 1, ..., Sunday is 7
df['tfa_wd'] = np.array([x.isoweekday() for x in tfa])
df_tfa_wd = pd.get_dummies(df.tfa_wd, prefix='tfa_wd')  # one-hot encoding
df = pd.concat((df, df_tfa_wd), axis=1)    # append the encoded features
df.drop(['tfa_wd'], axis=1, inplace=True)  # drop the original unencoded feature

# extract feature: season
# the season only depends on month and day, so map every date to one year
Y = 2000
seasons = [(0, (date(Y, 1, 1), date(Y, 3, 20))),     # winter
           (1, (date(Y, 3, 21), date(Y, 6, 20))),    # spring
           (2, (date(Y, 6, 21), date(Y, 9, 22))),    # summer
           (3, (date(Y, 9, 23), date(Y, 12, 20))),   # autumn
           (0, (date(Y, 12, 21), date(Y, 12, 31)))]  # winter

def get_season(dt):
    dt = dt.date()           # take the date part
    dt = dt.replace(year=Y)  # map the year to 2000
    return next(season for season, (start, end) in seasons if start <= dt <= end)

df['tfa_season'] = np.array([get_season(x) for x in tfa])
df_tfa_season = pd.get_dummies(df.tfa_season, prefix='tfa_season')  # one-hot encoding
df = pd.concat((df, df_tfa_season), axis=1)
df.drop(['tfa_season'], axis=1, inplace=True)
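A quick sanity check of get_season on two hand-picked dates (illustrative only):

get_season(datetime.datetime(2014, 5, 22))   # 1: spring (Mar 21 - Jun 20)
get_season(datetime.datetime(2010, 12, 25))  # 0: winter wraps around the year end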
# date_account_created: convert to datetime
dac = pd.to_datetime(df.date_account_created)

# extract features: year, month, day
df['dac_year'] = np.array([x.year for x in dac])
df['dac_month'] = np.array([x.month for x in dac])
df['dac_day'] = np.array([x.day for x in dac])

# extract feature: weekday
df['dac_wd'] = np.array([x.isoweekday() for x in dac])
df_dac_wd = pd.get_dummies(df.dac_wd, prefix='dac_wd')
df = pd.concat((df, df_dac_wd), axis=1)
df.drop(['dac_wd'], axis=1, inplace=True)

# extract feature: season
df['dac_season'] = np.array([get_season(x) for x in dac])
df_dac_season = pd.get_dummies(df.dac_season, prefix='dac_season')
df = pd.concat((df, df_dac_season), axis=1)
df.drop(['dac_season'], axis=1, inplace=True)

# extract feature: the difference between date_account_created and
# timestamp_first_active, i.e. how long a user was active on the Airbnb
# platform before formally registering
dt_span = dac.subtract(tfa).dt.days
# the ten most common dt_span values
dt_span.value_counts().head(10)

# Analysis: the values concentrate at -1; presumably users who register on
# the same day they first become active get a dt_span of -1.
# Bucket the difference into one day, one month, one year and other, i.e.
# whether the gap between first activity and registration is within a day,
# a month, a year, or longer.
def get_span(dt):
    # dt is an integer
    if dt == -1:
        return 'OneDay'
    elif (dt < 30) & (dt > -1):
        return 'OneMonth'
    elif (dt >= 30) & (dt <= 365):
        return 'OneYear'
    else:
        return 'other'

df['dt_span'] = np.array([get_span(x) for x in dt_span])
df_dt_span = pd.get_dummies(df.dt_span, prefix='dt_span')
df = pd.concat((df, df_dt_span), axis=1)
df.drop(['dt_span'], axis=1, inplace=True)  # drop the original feature

# timestamp_first_active and date_account_created are now fully processed,
# so drop the raw columns from the feature list
df.drop(['date_account_created', 'timestamp_first_active'], axis=1, inplace=True)
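The bucketing behaves as follows on a few sample gaps (illustrative values):

[get_span(d) for d in (-1, 5, 200, 400)]
# ['OneDay', 'OneMonth', 'OneYear', 'other']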
# age
av = df.age.values
# During data exploration we found that most values fall in (15, 90), but
# some lie in (1900, 2000); we suspect those users entered their birth year
# as their age. The data is from 2014, so replace such values with 2014 - value.
av = np.where(np.logical_and(av < 2000, av > 1900), 2014 - av, av)
df['age'] = av

# bucket the ages
age = df.age
age.fillna(-1, inplace=True)  # fill missing values with -1
div = 15

def get_age(age):
    # age is a float; turn the continuous value into a discrete bucket
    if age < 0:
        return 'NA'          # missing value
    elif age < div:
        return div           # younger than 15 -> 15
    elif age <= div * 2:
        return div * 2       # older than 15, at most 30 -> 30
    elif age <= div * 3:
        return div * 3
    elif age <= div * 4:
        return div * 4
    elif age <= div * 5:
        return div * 5
    elif age <= 110:
        return div * 6
    else:
        return 'Unphysical'  # implausible age

# add the bucketed age to the feature list as a new (one-hot encoded) feature
df['age'] = np.array([get_age(x) for x in age])
df_age = pd.get_dummies(df.age, prefix='age')
df = pd.concat((df, df_age), axis=1)
df.drop(['age'], axis=1, inplace=True)
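For example, the birth-year correction and the bucketing combine like this (made-up values):

ages = np.array([25., 1987.])
np.where(np.logical_and(ages < 2000, ages > 1900), 2014 - ages, ages)  # [25., 27.]
get_age(27)   # 30 (the '15 < age <= 30' bucket)
get_age(-1)   # 'NA' (missing value)
get_age(150)  # 'Unphysical'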
# label-encode the target variable
le = LabelEncoder()
ytrain_le = le.fit_transform(ytrain.values)
# before label encoding:
# ['AU', 'CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NDF', 'NL', 'PT', 'US', 'other']
# after label encoding:
# [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

# take 10% of the data for model training to cut down training time
n = int(xtrain.shape[0] * 0.1)
xtrain_new = xtrain.iloc[:n, :]  # training data
ytrain_new = ytrain_le[:n]       # training labels

# StandardScaling the dataset
# Standardization is a common requirement for many machine learning
# estimators: they may behave badly if the individual features do not look
# more or less like standard normally distributed data (e.g. Gaussian with
# zero mean and unit variance).
X_scaler = StandardScaler()
xtrain_new = X_scaler.fit_transform(xtrain_new)
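Note that the scaler is fit on the training subset only; held-out or test features should be transformed with the same fitted scaler rather than re-fit, e.g. (a sketch, assuming the test feature matrix is named xtest):

# reuse the mean/variance statistics learned from the training data
xtest_scaled = X_scaler.transform(xtest)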
# Scoring metric: NDCG
# NDCG measures the quality of a ranking and takes the relevance of every
# element into account. Because the target variable is not binary, we use
# NDCG instead of the usual binary-classification scores (f1, precision,
# recall, AUC) to judge model quality.
from sklearn.metrics import make_scorer

def dcg_score(y_true, y_score, k=5):
    """
    y_true : array, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array, shape = [n_samples, n_classes]
        Predicted scores.
    k : int
    """
    order = np.argsort(y_score)[::-1]    # sort scores from high to low
    y_true = np.take(y_true, order[:k])  # keep the top k entries, i.e. [0, k)
    # DCG = sum over ranks i of (2**relevance - 1) / log2(i + 2)
    gain = 2 ** y_true - 1
    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gain / discounts)

def ndcg_score(ground_truth, predictions, k=5):
    """
    Parameters
    ----------
    ground_truth : array, shape = [n_samples]
        Ground truth (true labels represented as integers).
    predictions : array, shape = [n_samples, n_classes]
        Predicted probabilities.
    k : int
        Rank.
    """
    lb = LabelBinarizer()
    lb.fit(range(len(predictions) + 1))
    T = lb.transform(ground_truth)
    scores = []
    # iterate over each y_true and compute the DCG score
    for y_true, y_score in zip(T, predictions):
        actual = dcg_score(y_true, y_score, k)
        best = dcg_score(y_true, y_true, k)
        score = float(actual) / float(best)
        scores.append(score)
    return np.mean(scores)
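A tiny worked example with hypothetical probabilities: with three classes and k=3, a sample whose true class is ranked first scores 1, while one whose true class is ranked second scores 1/log2(3) ≈ 0.631:

ground_truth = np.array([0, 1])
predictions = np.array([[0.8, 0.1, 0.1],   # true class 0 ranked first  -> 1.0
                        [0.6, 0.3, 0.1]])  # true class 1 ranked second -> ~0.631
ndcg_score(ground_truth, predictions, k=3)  # ~0.815

The make_scorer imported above can wrap this metric for use with cross_val_score, e.g. make_scorer(ndcg_score, needs_proba=True, k=5).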
Building the models
############################# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

lr = LogisticRegression(C=1.0, penalty='l2', multi_class='ovr')
RANDOM_STATE = 2017  # random seed

# k-fold cross validation
kf = KFold(n_splits=5, random_state=RANDOM_STATE)  # split into 5 folds

train_score = []
cv_score = []

# rank cutoff k used for the NDCG score
k_ndcg = 3

# kf.split generates indices that split the data into training and test
# sets; each pass splits the training data into a train fold and a test
# fold, y being the target variable
for train_index, test_index in kf.split(xtrain_new, ytrain_new):
    X_train, X_test = xtrain_new[train_index, :], xtrain_new[test_index, :]
    y_train, y_test = ytrain_new[train_index], ytrain_new[test_index]

    lr.fit(X_train, y_train)
    y_pred = lr.predict_proba(X_test)

    train_ndcg_score = ndcg_score(y_train, lr.predict_proba(X_train), k=k_ndcg)
    cv_ndcg_score = ndcg_score(y_test, y_pred, k=k_ndcg)

    train_score.append(train_ndcg_score)
    cv_score.append(cv_ndcg_score)

print("\nThe training score is: {}".format(np.mean(train_score)))
print("\nThe cv score is: {}".format(np.mean(cv_score)))
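One portability note: recent scikit-learn releases raise an error when random_state is passed to KFold without shuffling, so the equivalent call there is:

kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)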