In [29]:
# Read the data in first
import pandas as pd
data = pd.read_csv('kaggle_bike_competition_train.csv', header = 0, error_bad_lines=False)
In [30]:
# Take a quick look at what the data looks like
data.head()
Out[30]:
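A quick sanity check right after loading can save debugging later. A minimal sketch (assuming the standard Kaggle column names such as datetime, season, count) that inspects shape, dtypes, and missing values:

# Sketch: confirm the expected columns arrived and nothing is missing
print(data.shape)           # number of rows and columns
print(data.dtypes)          # 'datetime' loads as object/string until parsed
print(data.isnull().sum())  # missing-value count per column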
In [32]:
# Process the datetime field: split it into date and time
temp = pd.DatetimeIndex(data['datetime'])
data['date'] = temp.date
data['time'] = temp.time
data.head()
Out[32]:
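As an aside, the same split can be done through the .dt accessor on a parsed Series (available in reasonably recent pandas); an alternative sketch:

# Alternative sketch: parse once and use the .dt accessor
parsed = pd.to_datetime(data['datetime'])
data['date'] = parsed.dt.date
data['time'] = parsed.dt.time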
In [33]:
# Derive an 'hour' field from the time
data['hour'] = pd.to_datetime(data.time, format="%H:%M:%S")
data['hour'] = pd.Index(data['hour']).hour
data
Out[33]:
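Since temp is already a DatetimeIndex, the hour can be read off directly; the two-step conversion above is only needed when starting from the string 'time' column. An equivalent sketch:

# Equivalent sketch: the DatetimeIndex already exposes the hour
data['hour'] = temp.hour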
In [35]:
# From the date, derive a day-of-week categorical variable
data['dayofweek'] = pd.DatetimeIndex(data.date).dayofweek
# Also derive an elapsed-time feature: number of days since the first record
data['dateDays'] = (data.date - data.date[0]).astype('timedelta64[D]')
data
Out[35]:
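A quick check that the derived columns look sensible (a sketch; column names as above):

# Sketch: dayofweek should run 0-6 (Monday = 0), dateDays should start at 0 and be non-decreasing
print(data['dayofweek'].unique())
print(data['dateDays'].min(), data['dateDays'].max())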
In [36]:
byday = data.groupby('dayofweek')
# Total rentals by casual (unregistered) users for each day of the week
byday['casual'].sum().reset_index()
Out[36]:
In [37]:
# Total rentals by registered users for each day of the week
byday['registered'].sum().reset_index()
Out[37]:
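Casual and registered riders tend to follow different weekly patterns, which is presumably the motivation for the weekend dummies created below. A sketch of a side-by-side bar plot, assuming matplotlib is available:

# Sketch: compare casual vs. registered totals per day of week
import matplotlib.pyplot as plt
byday[['casual', 'registered']].sum().plot(kind='bar')
plt.xlabel('day of week (0 = Monday)')
plt.ylabel('total rentals')
plt.show()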
In [38]:
# Add 0/1 weekend indicator columns (use .loc to avoid chained-assignment pitfalls)
data['Saturday'] = 0
data.loc[data.dayofweek == 5, 'Saturday'] = 1
data['Sunday'] = 0
data.loc[data.dayofweek == 6, 'Sunday'] = 1
data
Out[38]:
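The same two indicator columns can also be produced without explicit assignment, by casting a boolean comparison to int; an alternative sketch:

# Alternative sketch: boolean comparison cast to int gives the same 0/1 dummies
data['Saturday'] = (data.dayofweek == 5).astype(int)
data['Sunday'] = (data.dayofweek == 6).astype(int)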
In [39]:
# Drop the raw columns that the engineered features replace, plus the total 'count' (casual and registered are modeled separately)
dataRel = data.drop(['datetime', 'count','date','time','dayofweek'], axis=1)
dataRel.head()
Out[39]:
In [40]:
from sklearn.feature_extraction import DictVectorizer
# Collect the continuous-valued features as one dict per row
featureConCols = ['temp','atemp','humidity','windspeed','dateDays','hour']
dataFeatureCon = dataRel[featureConCols]
dataFeatureCon = dataFeatureCon.fillna( 'NA' ) #in case I missed any
X_dictCon = dataFeatureCon.T.to_dict().values()
# Collect the categorical features into another set of per-row dicts
featureCatCols = ['season','holiday','workingday','weather','Saturday', 'Sunday']
dataFeatureCat = dataRel[featureCatCols]
dataFeatureCat = dataFeatureCat.fillna( 'NA' ) #in case I missed any
X_dictCat = dataFeatureCat.T.to_dict().values()
# Vectorize each feature group with its own DictVectorizer,
# so each keeps its own fitted feature names
vecCat = DictVectorizer(sparse=False)
vecCon = DictVectorizer(sparse=False)
X_vec_cat = vecCat.fit_transform(X_dictCat)
X_vec_con = vecCon.fit_transform(X_dictCon)
In [41]:
dataFeatureCon.head()
Out[41]:
In [42]:
X_vec_con
Out[42]:
In [43]:
dataFeatureCat.head()
Out[43]:
In [44]:
X_vec_cat
Out[44]:
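To see which column of each matrix corresponds to which input feature, the fitted vectorizers can report their feature names. A sketch, hedged on a recent scikit-learn (1.0+ exposes get_feature_names_out; older releases use get_feature_names):

# Sketch: map matrix columns back to feature names
print(vecCon.get_feature_names_out())  # column order of X_vec_con
print(vecCat.get_feature_names_out())  # column order of X_vec_cat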
In [18]:
from sklearn import preprocessing
# Standardize the continuous-valued features (zero mean, unit variance)
scaler = preprocessing.StandardScaler().fit(X_vec_con)
X_vec_con = scaler.transform(X_vec_con)
X_vec_con
Out[18]:
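A quick way to confirm the scaling did what it should (a sketch; numpy is imported here explicitly since the np import only appears later in the notebook):

# Sketch: each standardized column should have mean ~0 and std ~1
import numpy as np
print(np.round(X_vec_con.mean(axis=0), 6))
print(np.round(X_vec_con.std(axis=0), 6))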
In [20]:
from sklearn import preprocessing
# One-hot encode the categorical features
enc = preprocessing.OneHotEncoder()
enc.fit(X_vec_cat)
X_vec_cat = enc.transform(X_vec_cat).toarray()
X_vec_cat
Out[20]:
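One-hot encoding expands each categorical column into one column per distinct value, so the matrix gets wider. A sketch of how to inspect the expansion (attribute names differ across scikit-learn versions, so this assumes a reasonably recent release):

# Sketch: inspect the encoded matrix and the categories found per original column
print(X_vec_cat.shape)   # rows x total number of one-hot columns
print(enc.categories_)   # distinct values per original column (sklearn >= 0.20)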
In [22]:
import numpy as np
# combine cat & con features
X_vec = np.concatenate((X_vec_con,X_vec_cat), axis=1)
X_vec
Out[22]:
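A shape check confirms the concatenation lined the two blocks up row-wise (a minimal sketch):

# Sketch: the combined matrix keeps the row count and sums the column counts of both blocks
print(X_vec_con.shape, X_vec_cat.shape, X_vec.shape)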
In [23]:
# Build the target vectors as floats
Y_vec_reg = dataRel['registered'].values.astype(float)
Y_vec_cas = dataRel['casual'].values.astype(float)
In [24]:
# Take a look at the processed target values
Y_vec_reg
Out[24]:
In [25]:
Y_vec_cas
Out[25]:
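Before moving on to modelling, it is worth confirming that the feature matrix and both target vectors are aligned row-for-row; a minimal sketch:

# Sketch: X and both target vectors must have the same number of rows
assert X_vec.shape[0] == Y_vec_reg.shape[0] == Y_vec_cas.shape[0]
print(X_vec.shape, Y_vec_reg.shape, Y_vec_cas.shape)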