盐城汽车上牌量预测
import所需要的包进来
#coding:utf-8
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
第一部分:查看数据train和test_A的数据样式
能够看到train中有4个属性特征,date day_of_week brand cnt
但是test中的属性特征是date day_of_week
预测的目标是某天的上牌量的数目cnt
很明显train中的数据属性特征多于test。
# Load the competition data (tab-separated text files).
data_dir = './data/'  # renamed: `dir` shadows the builtin
train = pd.read_table(data_dir + 'train_20171215.txt', engine='python')
test_A = pd.read_table(data_dir + 'test_A_20171225.txt', engine='python')
sample_A = pd.read_table(data_dir + 'sample_A_20171225.txt', engine='python', header=None)
# info() prints directly and returns None; wrapping it in print() emits a spurious "None".
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4773 entries, 0 to 4772
Data columns (total 4 columns):
date 4773 non-null int64
day_of_week 4773 non-null int64
brand 4773 non-null int64
cnt 4773 non-null int64
dtypes: int64(4)
memory usage: 149.2 KB
None
# info() prints directly and returns None; print() around it would emit a spurious "None".
test_A.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 276 entries, 0 to 275
Data columns (total 2 columns):
date 276 non-null int64
day_of_week 276 non-null int64
dtypes: int64(2)
memory usage: 4.4 KB
None
再看一下每天对应的周数,从周一到周日,正常的数据
# Sanity-check the weekday codes: both frames should only contain 1..7.
print(train.day_of_week.unique())
print(test_A.day_of_week.unique())
[3 4 5 6 7 1 2]
[4 5 6 1 2 3 7]
缺失值的处理
有一些品牌可能在一些天没有上牌的数量,所以直接使用0进行填充
# Some brands may have no registrations on certain days; treat missing counts as zero.
train['cnt'] = train['cnt'].fillna(value=0)
第二部分 Data Processing
预测得目标:预测的变量是cnt也就是每天的上牌量的数目,所以需要先看一下,这个变量的分布情况。
这里,我们需要去观察目标值的范围变化,看一下目标值的大体趋势如何,首先以箱形图去观察一下目标值的变化
箱型图
箱型图有个功能就是可以检测这组数据是否存在异常值。
由图中可以看出cnt数据的分布处于右偏,以正态分布的角度,异常值处于>1000的地方
# Box plot of the target: shows the right skew and the outliers above ~1000.
fig, ax = plt.subplots()
ax.boxplot(train['cnt'])
plt.show()
cnt的数据分布情况
接下来直接绘制分布图,看一看具体的分布情况
通过绘制分布图,可以看出来数据分布确实符合右偏分布,这里大概初步了解数据的分布尺度在 0 到 2000 左右,
且在0-500/1000的数量最密集
一般模型(线性)都比较喜欢线性的数据分布,所以我们需要归一化特征属性
import seaborn as sns
from scipy import stats
from scipy.stats import norm, skew

color = sns.color_palette()
sns.set_style('darkgrid')

# Histogram of cnt with a fitted normal curve overlaid.
sns.distplot(train['cnt'], fit=norm)
# Parameters of the fitted normal distribution.
mu, sigma = norm.fit(train['cnt'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
# Annotate the plot with the fitted parameters.
plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
plt.title('cnt distribution')
# QQ-plot against the normal distribution.
fig = plt.figure()
res = stats.probplot(train['cnt'], plot=plt)
plt.show()
mu = 380.57 and sigma = 252.69
找出与预测目标cnt最相关的变量X(即非目标值中找到与目标最相关的值)
因为数据本身与时间相关,所以我们可以绘制一下随脱敏时间和星期的变化
看一下散点图
# Scatter of cnt against the anonymised date index.
fig, ax = plt.subplots()
ax.scatter(x=train['date'], y=train['cnt'])
plt.xlabel('date', fontsize=13)
plt.ylabel('cnt', fontsize=13)
plt.show()
cnt与date的分布关系
# Line plot: cnt over the anonymised date axis.
plt.plot(train['date'], train['cnt'])
plt.show()
看一下cnt的统计的info
# Summary statistics of the target variable.
print(train.cnt.describe())
count 4773.000000
mean 380.567358
std 252.720918
min 12.000000
25% 221.000000
50% 351.000000
75% 496.000000
max 2102.000000
Name: cnt, dtype: float64
预测结果以mean square error作为评判标准
from sklearn.metrics import mean_squared_error
# Constant-prediction baselines: quantiles, median, mean of cnt.
# Compute the quantiles from the data instead of hard-coding the values
# (221 / 351 / 496) copied from the describe() printout above.
train['25%'] = train['cnt'].quantile(0.25)
train['50%'] = train['cnt'].quantile(0.50)
train['75%'] = train['cnt'].quantile(0.75)
train['median'] = train['cnt'].median()
train['mean'] = train['cnt'].mean()
# MSE of each constant baseline against the true counts (mean is optimal for MSE).
for baseline in ['25%', '50%', '75%', 'median', 'mean']:
    print(mean_squared_error(train['cnt'], train[baseline]))
89316.2231301
64728.7100356
77179.1761995
64728.7100356
63854.4813732
以周一到周日进行分组统计info, 分析周一周日的分布情况
从数据的分布图中可以很明显的看出来,数据具有节假日的特性,而且周六和周日明显和其他的数据差别很大
# One subplot per weekday (1=Mon ... 7=Sun); the weekend pattern stands out clearly.
n_weekdays = 7
for day in range(1, n_weekdays + 1):
    daily = train[train['day_of_week'] == day]
    plt.subplot(n_weekdays, 1, day)
    plt.plot(range(len(daily)), daily['cnt'])
plt.show()
可以尝试着用以day_of_week进行分组,然后计算周一到周天的平均值。并且计算MSE,可以看到MSE下降很多。
# Per-weekday mean of cnt as a simple predictor; the MSE drops markedly.
MeanMerge = train.groupby(['day_of_week'], as_index=False).cnt.mean()
print(MeanMerge)
# Merge suffixes the overlapping column: cnt_x = truth, cnt_y = weekday mean.
MeanMergeTrain = train.merge(MeanMerge, on=['day_of_week'])
print(mean_squared_error(MeanMergeTrain['cnt_x'], MeanMergeTrain['cnt_y']))
day_of_week cnt
0 1 481.854777
1 2 504.112763
2 3 425.141451
3 4 336.032379
4 5 391.222222
5 6 100.203647
6 7 348.843478
48035.0908982
先划分验证集和测试集
# Time-based hold-out: the tail of the series (date > 756) is the validation set.
xx_train = train[train['date'] <= 756]
xx_test = train[train['date'] > 756]
print('test shape', xx_test.shape)
print('train shape', xx_train.shape)
test shape (1316, 9)
train shape (3457, 9)
方案零:均值(原始数据验证)
from sklearn.metrics import mean_squared_error
# Scheme 0: unweighted per-weekday mean computed on the training slice.
xx_train = xx_train.groupby(['day_of_week'], as_index=False).cnt.mean()
# Left-join onto the hold-out; cnt_x = truth, cnt_y = weekday mean prediction.
xx_result = pd.merge(xx_test, xx_train, on=['day_of_week'], how='left')
print('xx_result shape', xx_result.shape)
print(xx_result.head(10))
print(mean_squared_error(xx_result['cnt_x'], xx_result['cnt_y']))
xx_result shape (1316, 10)
date day_of_week brand cnt_x 25% 50% 75% median mean \
0 757 6 1 92 221 351 496 351.0 380.567358
1 757 6 2 50 221 351 496 351.0 380.567358
2 757 6 3 113 221 351 496 351.0 380.567358
3 757 6 4 32 221 351 496 351.0 380.567358
4 757 6 5 27 221 351 496 351.0 380.567358
5 758 1 1 610 221 351 496 351.0 380.567358
6 758 1 2 511 221 351 496 351.0 380.567358
7 758 1 3 930 221 351 496 351.0 380.567358
8 758 1 4 384 221 351 496 351.0 380.567358
9 758 1 5 874 221 351 496 351.0 380.567358
cnt_y
0 97.771739
1 97.771739
2 97.771739
3 97.771739
4 97.771739
5 459.680702
6 459.680702
7 459.680702
8 459.680702
9 459.680702
54757.4682534
由这个可以看出来周日的数据缺失,导致周日的MSE,偏差特别大
# MSE broken down by weekday: Sunday (7) is worst — holiday data is missing.
for day in range(1, 8):
    subset = xx_result[xx_result['day_of_week'] == day]
    print('Week_%d:' % day, mean_squared_error(subset['cnt_x'], subset['cnt_y']))
Week_1: 73634.7716535
Week_2: 92667.0107518
Week_3: 52607.3416139
Week_4: 36924.4757934
Week_5: 51974.9535275
Week_6: 9304.12731163
Week_7: 144515.121097
方案一:加权平均
这个方案主要是采取历史纪录*一个权值(可选函数为反比例函数,指数函数和简单的递减函数)
def xx(df):
    """Attach the weight-averaged cnt of this group as a constant column w_cnt."""
    total_weight = df['weight'].sum()
    df['w_cnt'] = (df['cnt'] * df['weight']).sum() / total_weight
    return df
# Scheme 1: weighted average — later dates get (polynomially) larger weights.
# .copy() prevents the SettingWithCopyWarning that the original assignment on a
# slice produced (see the warning captured in the cell output).
xx_train = train[train['date'] <= 756].copy()
xx_train['weight'] = ((xx_train['date'] + 1) / len(xx_train)) ** 6
xx_train = xx_train.groupby(['day_of_week'], as_index=False).apply(xx).reset_index()
xx_test = train[train['date'] > 756]
print('test shape', xx_test.shape)
print('train shape', xx_train.shape)
#
from sklearn.metrics import mean_squared_error
# Collapse to one weighted mean per weekday and score against the hold-out.
xx_train = xx_train.groupby(['day_of_week'], as_index=False).w_cnt.mean()
xx_result = pd.merge(xx_test, xx_train, on=['day_of_week'], how='left')
print('xx_result shape', xx_result.shape)
print(xx_result.head(10))
print(mean_squared_error(xx_result['cnt'], xx_result['w_cnt']))
test shape (1316, 9)
train shape (3457, 12)
xx_result shape (1316, 10)
date day_of_week brand cnt 25% 50% 75% median mean \
0 757 6 1 92 221 351 496 351.0 380.567358
1 757 6 2 50 221 351 496 351.0 380.567358
2 757 6 3 113 221 351 496 351.0 380.567358
3 757 6 4 32 221 351 496 351.0 380.567358
4 757 6 5 27 221 351 496 351.0 380.567358
5 758 1 1 610 221 351 496 351.0 380.567358
6 758 1 2 511 221 351 496 351.0 380.567358
7 758 1 3 930 221 351 496 351.0 380.567358
8 758 1 4 384 221 351 496 351.0 380.567358
9 758 1 5 874 221 351 496 351.0 380.567358
w_cnt
0 96.282652
1 96.282652
2 96.282652
3 96.282652
4 96.282652
5 541.488347
6 541.488347
7 541.488347
8 541.488347
9 541.488347
51427.2948061
c:\python36-64\lib\site-packages\ipykernel_launcher.py:6: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
方案二:时序转回归
将回归问题转化为有监督问题
from pandas import DataFrame
from pandas import concat
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.
    Arguments:
        data: Sequence of observations as a list or NumPy array.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
        Pandas DataFrame of series framed for supervised learning.
    """
    # A plain list is treated as a single variable; otherwise one per column.
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    frames = []
    headers = []
    # Lag columns: t-n_in, ..., t-1.
    for lag in range(n_in, 0, -1):
        frames.append(df.shift(lag))
        headers.extend('var%d(t-%d)' % (v + 1, lag) for v in range(n_vars))
    # Forecast columns: t, t+1, ..., t+n_out-1.
    for step in range(n_out):
        frames.append(df.shift(-step))
        if step == 0:
            headers.extend('var%d(t)' % (v + 1) for v in range(n_vars))
        else:
            headers.extend('var%d(t+%d)' % (v + 1, step) for v in range(n_vars))
    # Assemble side by side and name the columns.
    result = concat(frames, axis=1)
    result.columns = headers
    # Shifting introduces NaNs at the edges; optionally drop those rows.
    if dropnan:
        result.dropna(inplace=True)
    return result
因为test最后预测的是276天的上牌量,所以我们需要将时间往前推移276天
# The test set covers 276 days, so frame the series with 276 lag features.
time_cnt = list(train['cnt'].values)
# n_in controls how far back we look; this problem needs the 276 preceding observations
timeSupervised = series_to_supervised(data=time_cnt, n_in=276, dropnan=True)
print(timeSupervised.head(5))
var1(t-276) var1(t-275) var1(t-274) var1(t-273) var1(t-272) \
276 20.0 48.0 16.0 20.0 1411.0
277 48.0 16.0 20.0 1411.0 811.0
278 16.0 20.0 1411.0 811.0 1005.0
279 20.0 1411.0 811.0 1005.0 773.0
280 1411.0 811.0 1005.0 773.0 1565.0
var1(t-271) var1(t-270) var1(t-269) var1(t-268) var1(t-267) ... \
276 811.0 1005.0 773.0 1565.0 1176.0 ...
277 1005.0 773.0 1565.0 1176.0 824.0 ...
278 773.0 1565.0 1176.0 824.0 802.0 ...
279 1565.0 1176.0 824.0 802.0 1057.0 ...
280 1176.0 824.0 802.0 1057.0 1107.0 ...
var1(t-9) var1(t-8) var1(t-7) var1(t-6) var1(t-5) var1(t-4) \
276 436.0 253.0 241.0 393.0 489.0 407.0
277 253.0 241.0 393.0 489.0 407.0 237.0
278 241.0 393.0 489.0 407.0 237.0 200.0
279 393.0 489.0 407.0 237.0 200.0 535.0
280 489.0 407.0 237.0 200.0 535.0 384.0
var1(t-3) var1(t-2) var1(t-1) var1(t)
276 237.0 200.0 535.0 384
277 200.0 535.0 384.0 303
278 535.0 384.0 303.0 314
279 384.0 303.0 314.0 176
280 303.0 314.0 176.0 310
[5 rows x 277 columns]
第三部分:model的训练
1,lightgbm
import lightgbm as lgb
# LightGBM regressor; n_estimators is large because early stopping trims it.
gbm0 = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=64,
    learning_rate=0.05,
    n_estimators=10000)
print(timeSupervised.shape)
# Time-based split. NOTE: the original used `< 755` / `> 755`, which silently
# dropped the row with index 755 from both sets (4017 + 479 = 4496 of 4497).
# .copy() so that pop() below does not mutate a view of timeSupervised.
x_train = timeSupervised[timeSupervised.index <= 755].copy()
x_test = timeSupervised[timeSupervised.index > 755].copy()
# 这个方式其实是最简单的,后面还可以很多改善,比如滚动预测一类
#print(x_train.head(10))
#print(x_test.head(10))
print(x_train.shape)
print(x_test.shape)
# Separate the target column var1(t) from the lag features.
y_train = x_train.pop('var1(t)')
y_test = x_test.pop('var1(t)')
#print(x_train.head(10))
#print(x_train.head(10))
(4497, 277)
(479, 277)
(4017, 277)
# Fit with MSE as the validation metric and early stopping on the hold-out.
gbm0.fit(x_train.values, y_train,
         eval_set=[(x_test.values, y_test)],
         eval_metric='mse',
         early_stopping_rounds=15)
print(gbm0.predict(x_test.values))
from sklearn.metrics import mean_squared_error
# Predicted vs. true curves on the validation slice.
predicted = gbm0.predict(x_test.values)
line1 = plt.plot(range(len(x_test)), predicted, label=u'predict')
line2 = plt.plot(range(len(y_test)), y_test.values, label=u'true')
plt.legend()
plt.show()
[1] valid_0's l2: 67836.4
Training until validation scores don't improve for 15 rounds.
[2] valid_0's l2: 66320.7
[3] valid_0's l2: 64863.8
[4] valid_0's l2: 63734.3
[5] valid_0's l2: 62556.8
[6] valid_0's l2: 61509.4
[7] valid_0's l2: 60420.1
[8] valid_0's l2: 59594.9
[9] valid_0's l2: 58844.2
[10] valid_0's l2: 57950.3
[11] valid_0's l2: 57312.4
[12] valid_0's l2: 56989.2
[13] valid_0's l2: 56624.4
[14] valid_0's l2: 56398
[15] valid_0's l2: 56090.3
[16] valid_0's l2: 55514.4
[17] valid_0's l2: 55406.4
[18] valid_0's l2: 54975.9
[19] valid_0's l2: 54544.7
[20] valid_0's l2: 54191.8
[21] valid_0's l2: 53927.2
[22] valid_0's l2: 53950.3
[23] valid_0's l2: 53797
[24] valid_0's l2: 53406.1
[25] valid_0's l2: 53213.7
[26] valid_0's l2: 53156.5
[27] valid_0's l2: 52918.6
[28] valid_0's l2: 52402.2
[29] valid_0's l2: 52331.2
[30] valid_0's l2: 52239.8
[31] valid_0's l2: 52131.2
[32] valid_0's l2: 51781.4
[33] valid_0's l2: 51753.8
[34] valid_0's l2: 51556.9
[35] valid_0's l2: 51198.7
[36] valid_0's l2: 51221.8
[37] valid_0's l2: 51236.8
[38] valid_0's l2: 51062.5
[39] valid_0's l2: 51035.2
[40] valid_0's l2: 50947.1
[41] valid_0's l2: 51038.1
[42] valid_0's l2: 50941.6
[43] valid_0's l2: 50686.8
[44] valid_0's l2: 50630.7
[45] valid_0's l2: 50562.1
[46] valid_0's l2: 50477.7
[47] valid_0's l2: 50402.4
[48] valid_0's l2: 50335
[49] valid_0's l2: 50118.8
[50] valid_0's l2: 50109.9
[51] valid_0's l2: 50053.5
[52] valid_0's l2: 50009.4
[53] valid_0's l2: 49963
[54] valid_0's l2: 49915.5
[55] valid_0's l2: 49964.3
[56] valid_0's l2: 50014
[57] valid_0's l2: 49838.6
[58] valid_0's l2: 49774.9
[59] valid_0's l2: 49823.2
[60] valid_0's l2: 49858.3
[61] valid_0's l2: 49837
[62] valid_0's l2: 49878.7
[63] valid_0's l2: 49713.3
[64] valid_0's l2: 49705.8
[65] valid_0's l2: 49686.8
[66] valid_0's l2: 49727.7
[67] valid_0's l2: 49789
[68] valid_0's l2: 49832.4
[69] valid_0's l2: 49897.2
[70] valid_0's l2: 49759.3
[71] valid_0's l2: 49813.7
[72] valid_0's l2: 49748.3
[73] valid_0's l2: 49811.2
[74] valid_0's l2: 49865.8
[75] valid_0's l2: 49746.9
[76] valid_0's l2: 49713.3
[77] valid_0's l2: 49748.4
[78] valid_0's l2: 49729.7
[79] valid_0's l2: 49690.2
[80] valid_0's l2: 49697.9
Early stopping, best iteration is:
[65] valid_0's l2: 49686.8
[ 234.5294567 276.33655027 235.64508312 ..., 477.43510641 357.96633192
363.71724954]
可以看出来,由于缺少节假日的数据的影响还是很大的
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
2,GradientBoostingRegressor
# Gradient boosting with Huber loss (robust against the outlier days).
GBoost = GradientBoostingRegressor(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=5)
GBoost.fit(x_train, y_train)
res = GBoost.predict(x_test)
print('Local cv:')
print(mean_squared_error(res, y_test))
# Predicted vs. true curves on the validation slice.
line1 = plt.plot(range(len(x_test)), res, label=u'predict')
line2 = plt.plot(range(len(y_test)), y_test.values, label=u'true')
plt.legend()
plt.show()
Local cv:
58745.5841332
3, XGBoost
# XGBoost with mildly regularised hyper-parameters.
model_xgb = xgb.XGBRegressor(
    colsample_bytree=0.4603, gamma=0.0468,
    learning_rate=0.05, max_depth=3,
    min_child_weight=1.7817, n_estimators=2200,
    reg_alpha=0.4640, reg_lambda=0.8571,
    subsample=0.5213, silent=1,
    random_state=7, nthread=-1)
model_xgb.fit(x_train, y_train)
res = model_xgb.predict(x_test)
print('Local cv:')
print(mean_squared_error(res, y_test))
# Predicted vs. true curves on the validation slice.
line1 = plt.plot(range(len(x_test)), res, label=u'predict')
line2 = plt.plot(range(len(y_test)), y_test.values, label=u'true')
plt.legend()
plt.show()
Local cv:
51862.416136