数据集网址:https://www.kaggle.com/competitions/rossmann-store-sales
读取数据
#读取数据
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import time
# Load the three competition tables.
# StateHoliday is forced to text because the mapping applied later in this
# notebook uses string keys ('0', 'a', 'b', ...). The builtin `str` is used
# instead of `np.string_`, which no longer exists in NumPy 2.0.
train = pd.read_csv('/kaggle/input/rossmann-store-sales/train.csv', dtype={'StateHoliday': str})
test = pd.read_csv('/kaggle/input/rossmann-store-sales/test.csv', dtype={'StateHoliday': str})
store = pd.read_csv('/kaggle/input/rossmann-store-sales/store.csv')  # per-store details
display(train.head(), test.head(), store.head())
print(train.shape, test.shape, store.shape)
数据预处理
处理缺失值
# Count the missing values in every column of the training set.
train.isna().sum()
# Same check for the test set.
test.isna().sum()
test文件里面的open特征有11个空数据,则查看哪些数据的open为空
# Boolean mask selecting the test rows whose 'Open' flag is missing.
cond = test['Open'].isna()
test.loc[cond]
可得缺失的数据都属于store为622,查看store为622的数据的open数据
# Show the training rows of store 622.
cond = train['Store'].eq(622)
train.loc[cond]
上图可得,Open的数据为0或1,则统计store为622的情况下,Open里面1和0的个数
# Count how often store 622 was open (1) versus closed (0) in the training set.
cond = train['Store'].eq(622)
value_counts = train.loc[cond, 'Open'].value_counts()
value_counts
Open
1 784
0 158
Name: count, dtype: int64
可得Open中1远多于0,则将test中缺失的Open数据填充为1
# Store 622 is open on most training days, so its missing 'Open' flags are
# treated as open (1). Only the 'Open' column is filled: a blanket
# test.fillna(1) would also write 1 into any other column that had gaps.
test['Open'] = test['Open'].fillna(1)
test.isnull().sum()
查看store的缺失值
# Count the missing values in every column of the store table.
store.isna().sum()
因存在缺失值的特征名称相似,因此检查这些特征是否在同一批数据中同时缺失
# Names of the store columns being inspected for missing values.
v1, v2, v3 = 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear'
v4, v5, v6 = 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval'

# Rows in which both competition-opening fields are missing at the same time.
competition_missing = store[v2].isna() & store[v3].isna()
print(store.loc[competition_missing].shape)

# Rows in which all three Promo2-related fields are missing at the same time.
promo2_missing = store[v4].isna() & store[v5].isna() & store[v6].isna()
print(store.loc[promo2_missing].shape)
(354, 10)
(544, 10)
则v2,v3同时缺失;v4,v5,v6同时缺失
# Too many values are missing to impute them sensibly, so every gap in the
# store table is filled with 0 (read as: newly opened, no competitor yet).
store.fillna(value=0, inplace=True)
store.isna().sum()
# Relationship between sales and time.
cond = train['Sales'].gt(0)
sales_data = train.loc[cond]  # rows with positive sales only
sales_data
sales_data.loc[sales_data['Store'] == 1].plot(
    x='Date',
    y='Sales',
    title='Store_1',
    figsize=(16, 4),
    color='red',
)
# Dates that have to be predicted in the test set.
test['Date'].unique()
# Judging by June-September 2014, sales in June/July follow a trend similar
# to August/September. The six weeks to predict fall in August/September
# 2015, so the most recent six weeks (June/July 2015) of the 1115 stores can
# be held out to tune and validate the model.
display(train.shape, test.shape)
# Merge the per-store details into both data sets.
cond = train['Sales'].gt(0)
train = train.loc[cond]  # keep only rows with strictly positive sales
train = train.merge(store, on='Store', how='left')
test = test.merge(store, on='Store', how='left')
display(train.shape, test.shape)
train.info()
test.info()
特征工程
# Preview the first rows of both merged tables.
train.head(n=5)
test.head(n=5)  # the test set contains string columns as well
有特征为字符串,字符串不能用于建模,则将字符串转为数字
train['StateHoliday'].unique()  # string codes; they must become numbers before modelling

# Lookup tables are constant, so they are built once rather than per iteration.
month2str = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
             7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
mappings = {'0': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4}
# 1 when the row's month abbreviation occurs inside the store's PromoInterval
# string, 0 otherwise. PromoInterval == 0 marks a value filled in for a
# missing interval.
convert = lambda x: 0 if x['PromoInterval'] == 0 else 1 if x['monthstr'] in x['PromoInterval'] else 0

# Apply the same feature engineering to the training and the test frame.
for data in [train, test]:
    # Split the 'YYYY-MM-DD' date string into numeric year / month / day.
    data['year'] = data['Date'].apply(lambda x: int(x.split('-')[0]))
    data['month'] = data['Date'].apply(lambda x: int(x.split('-')[1]))
    data['day'] = data['Date'].apply(lambda x: int(x.split('-')[2]))
    data['monthstr'] = data['month'].map(month2str)
    data['IsPromoMonth'] = data.apply(convert, axis=1)
    # Encode the categorical string columns as integers. astype(int) makes the
    # numeric dtype explicit instead of relying on replace() to downcast an
    # object column, and fails loudly if an unmapped code is present.
    data['StoreType'] = data['StoreType'].replace(mappings).astype(int)
    data['StateHoliday'] = data['StateHoliday'].replace(mappings).astype(int)
    data['Assortment'] = data['Assortment'].replace(mappings).astype(int)
构建训练数据和测试数据
display(train.shape, test.shape)
# Drop the columns that are not used as model features.
# 'Customers' does not exist in the test set, so it is removed from training.
df_train = train.drop(columns=['Date', 'monthstr', 'PromoInterval', 'Customers', 'Open'])
df_test = test.drop(columns=['Date', 'monthstr', 'PromoInterval', 'Id', 'Open'])
display(df_train.shape, df_test.shape)
# df_train is the historical data: a model is fitted on it and then used to
# predict df_test. It is split by position into a fitting part and a held-out
# validation part of 6 weeks * 7 days * 1115 stores rows.
# NOTE(review): this assumes the rows are ordered newest first - confirm.
X_train = df_train.iloc[6 * 7 * 1115:]  # rows used to fit the model
X_test = df_train.iloc[:6 * 7 * 1115]  # validation rows (June-July 2015 sales)
数据属性间的相关性系数
# Heatmap of the pairwise correlation coefficients between all df_train columns.
plt.figure(figsize=(24, 20))
plt.rcParams.update({'font.size': 12})
sns.heatmap(data=df_train.corr(), vmin=-1, vmax=1, annot=True, cmap='RdYlGn_r')
提取训练模型的数据
# Distribution of the raw sales values.
_ = plt.hist(X_train['Sales'], bins=100)
# Targets: log1p-transform the sales so the distribution is closer to normal.
# (Fixes the `np.loglp` typo; the NumPy function is `np.log1p`.)
y_train = np.log1p(X_train['Sales'])
y_test = np.log1p(X_test['Sales'])
# Features: everything except the target column.
x_train = X_train.drop('Sales', axis=1)  # used to fit the model
x_test = X_test.drop('Sales', axis=1)  # used for validation and evaluation
# Distribution of the transformed target.
_ = plt.hist(y_train, bins=100)
构建模型
定义评价函数
# Root mean square percentage error.
def rmspe(y, yhat):
    """Return the root mean square percentage error of `yhat` relative to `y`.

    Computes sqrt(mean((1 - yhat / y) ** 2)). The square is applied to each
    relative error before averaging, so errors of opposite sign do not cancel.

    Args:
        y: true values (array-like supporting element-wise arithmetic); must
            not contain zeros, because each error is divided by `y`.
        yhat: predicted values, same shape as `y`.

    Returns:
        The RMSPE as a float.
    """
    return np.sqrt(np.mean((1 - yhat / y) ** 2))
def rmspe_xg(y, yhat):
    """Evaluation callback for `xgb.train` that reports RMSPE on the sales scale.

    XGBoost calls the callback as (predictions, DMatrix), so despite the
    parameter names the first argument holds the model's predictions and the
    second one the DMatrix that carries the true labels.

    Args:
        y: predictions on the log1p scale used for the training target.
        yhat: DMatrix whose labels are the true log1p-transformed values.

    Returns:
        The tuple ('rmspe', value).
    """
    preds = np.expm1(y)  # undo the log1p transform of the predictions
    labels = np.expm1(yhat.get_label())  # true values stored in the DMatrix
    # rmspe expects (true values, predictions), in that order.
    return 'rmspe', rmspe(labels, preds)
模型训练
# Booster configuration.
params = {
    'objective': 'reg:squarederror',  # current name of the former 'reg:linear'
    'booster': 'gbtree',
    'eta': 0.03,  # plays a role similar to a learning rate
    'max_depth': 10,
    'subsample': 0.9,
    'colsample_bytree': 0.7,
    'verbosity': 0,  # replaces the removed 'silent': 1
    'seed': 10,
}
num_boost_round = 6000
dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test, y_test)
print('模型训练开始....')
evals = [(dtrain, 'train'), (dtest, 'validation')]
gbm = xgb.train(
    params,  # model parameters
    dtrain,  # training data
    num_boost_round,  # maximum number of boosting rounds (trees)
    evals=evals,  # data sets evaluated after every round
    # stop once the validation score has not improved for 100 rounds
    early_stopping_rounds=100,
    # custom_metric is the non-deprecated successor of feval (XGBoost >= 1.6)
    custom_metric=rmspe_xg,
    verbose_eval=True,  # log the evaluation result of every round
)
gbm.save_model('train_model.json')
模型评估
print('验证数据表现:')
# Put features and targets in the same index order before predicting.
x_test = x_test.sort_index()
y_test = y_test.sort_index()
# Predict on the validation features and score on the original sales scale.
yhat = gbm.predict(xgb.DMatrix(x_test))
error = rmspe(np.expm1(y_test), np.expm1(yhat))
print('RMSPE:', error)
# Collect targets and predictions next to the features. Both the 'Sales' and
# the 'prediction' column hold log1p-scale values here.
res = y_test.to_frame()
res['prediction'] = yhat
res = x_test.merge(res, left_index=True, right_index=True)
res['Ratio'] = res['prediction'] / res['Sales']  # prediction relative to the true value
res['Error'] = (1 - res['Ratio']).abs()  # absolute relative error
res['weight'] = res['Sales'] / res['prediction']  # true value relative to the prediction
display(res.head())
from matplotlib import font_manager

# List the fonts matplotlib can see, then select one for the plot labels.
fm = font_manager.FontManager()
for font in fm.ttflist:
    print(font.name)
plt.rcParams['font.family'] = 'STKaiti'

col_1 = ['Sales', 'prediction']
col_2 = ['Ratio']
# Pick three random shops and visualise them.
shops = np.random.randint(1, 1116, size=3)
print(f"预测值和真实销量的比率是{res['Ratio'].mean():0.3f}")
for shop in shops:
    cond = res['Store'] == shop
    df1 = res.loc[cond, col_1]  # true value and prediction for this shop
    df2 = res.loc[cond, col_2]  # prediction / true value for this shop
    df1.plot(title=f'{shop:d}商店的预测数据和真实销量的对比', figsize=(12, 4))
    df2.plot(title=f'{shop:d}商店的预测数据和真实销量的比率', figsize=(12, 4))
# Rows with the largest deviation first.
res.sort_values(by='Error', ascending=False)
模型优化
整体模型优化
# Global bias correction: scale every log-scale prediction by one common
# factor and keep the factor with the lowest validation RMSPE.
weights = [(0.99 + (i / 1000)) for i in range(20)]
errors = []
for w in weights:
    # Multiply the prediction by the candidate weight (a small adjustment).
    error = rmspe(np.expm1(y_test), np.expm1(yhat * w))
    errors.append(error)
errors = pd.Series(errors, index=weights)
plt.figure(figsize=(9, 6))
errors.plot()
plt.xlabel('权重系数', fontsize=18)
plt.ylabel('均方根百分比误差', fontsize=18)
index = errors.argmin()  # position of the smallest error
# Report from the `errors` Series; `error` is only the last loop value (a float).
print('最佳的偏差校正系数:', index, errors.iloc[index], weights[index])
更加细致的优化(考虑不同店铺)
# Per-shop bias correction: find the best weight for each shop on the
# validation data and repeat it once per row of that shop.
shops = np.arange(1, 1116)
weights1 = []  # one weight per validation row
weights2 = []  # one weight per test row (used for the Kaggle submission)
# The candidate grid is the same for every shop, so build it once.
weights = [(0.98 + (i / 1000)) for i in range(40)]
for shop in shops:
    cond = res['Store'] == shop
    df1 = pd.DataFrame(res[cond], columns=col_1)  # validation targets and predictions
    cond2 = df_test['Store'] == shop
    df2 = pd.DataFrame(df_test[cond2])
    if df1.empty:
        # No validation rows for this shop: every candidate error would be
        # NaN, so leave the shop's predictions unscaled.
        best_weight = np.array(1.0)
    else:
        errors = []
        for w in weights:
            error = rmspe(np.expm1(df1['Sales']), np.expm1(df1['prediction'] * w))
            errors.append(error)
        errors = pd.Series(errors, index=weights)
        index = errors.argmin()  # position of the smallest error
        best_weight = np.array(weights[index])  # a single scalar value
    weights1.extend(best_weight.repeat(len(df1)).tolist())
    weights2.extend(best_weight.repeat(len(df2)).tolist())
# The loop above produced one weight per row, grouped shop by shop.
# Validation data: line the rows up by shop (1, 2, ..., 1115) so the weights
# match row for row, then restore the original index order.
X_test = X_test.sort_values(by='Store').assign(weights1=weights1).sort_index()
weights1 = X_test['weights1']
X_test = X_test.drop(columns='weights1')
# Test data: same alignment for the submission weights.
df_test = df_test.sort_values(by='Store').assign(weights2=weights2).sort_index()
weights2 = df_test['weights2']
df_test = df_test.drop(columns='weights2')
set(weights1)
yhat_new = yhat * weights1  # corrected predictions
rmspe(np.expm1(y_test), np.expm1(yhat_new))
模型预测
# Predict the test data with the trained model; the results go to Kaggle.
test = xgb.DMatrix(df_test)
y_pred = gbm.predict(test)
# y_pred is on the log1p scale, so expm1 converts it back to sales.
# Submission ids are derived from the number of predictions instead of a
# hard-coded row count.
# NOTE(review): assumes the test Ids are 1..n in row order - confirm.
ids = np.arange(1, len(y_pred) + 1)
# Variant 1: raw predictions without any correction. The header is 'Id'
# (not 'ID') to match the competition's submission format.
result = pd.DataFrame({'Id': ids, 'Sales': np.expm1(y_pred)})
result.to_csv('result1.csv', index=False)
# Variant 2: one global correction factor.
w = 0.997
result = pd.DataFrame({'Id': ids, 'Sales': np.expm1(y_pred * w)})
result.to_csv('result2.csv', index=False)
# Variant 3: a separate correction factor for every shop.
weights2
result = pd.DataFrame({'Id': ids, 'Sales': np.expm1(y_pred * weights2)})
result.to_csv('result3.csv', index=False)