导入库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
导入数据
# Read the daily sales records (train.csv) and the per-store attributes
# (store.csv). low_memory=False makes pandas parse each file in one pass
# instead of in chunks, which avoids mixed-dtype column inference.
data_train, data_store = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        'Rossmann Store Sales/train.csv',
        'Rossmann Store Sales/store.csv',
    )
)
查看数据
# Print column dtypes and non-null counts for both tables.
data_train.info()
data_store.info()

# Preview the first rows (only rendered when run as a notebook cell).
data_train.head()

# Distribution of Sales split by the Open flag.
sns.boxplot(x=data_train['Open'], y=data_train['Sales'])
数据处理
# Rows with Open == 0 belong to closed stores, whose sales are 0, so they are
# dropped. .copy() makes the result an independent frame so the column
# assignments below do not write into a slice of the original frame.
data_train = data_train[data_train['Open'] != 0].copy()

# 'Date' is loaded as object (string) dtype; convert it to datetime.
data_train['Date'] = pd.to_datetime(data_train['Date'])

# Inspect missing values (only rendered when run as notebook cells).
data_train.isna().sum()
data_store.isna().sum()
# Missing-value counts for data_store recorded by the original author:
#   Store                          0
#   StoreType                      0
#   Assortment                     0
#   CompetitionDistance            3
#   CompetitionOpenSinceMonth    354
#   CompetitionOpenSinceYear     354
#   Promo2                         0
#   Promo2SinceWeek              544
#   Promo2SinceYear              544
#   PromoInterval                544
data_store[data_store['CompetitionDistance'].isna()]

# Impute every missing store attribute with 0. This must happen BEFORE the
# merge: the previous version merged first and then merged a second time on
# all shared columns, which added nothing and left the NaNs in data_train.
data_store.fillna(0, inplace=True)

# Attach the (imputed) store attributes to every sales row, exactly once.
data_train = pd.merge(data_train, data_store, how='left', on='Store')
特征处理
# Categorical columns encoded below:
#   'StoreType'    - store type
#   'Assortment'   - type of product assortment sold by the store
#   'StateHoliday' - public-holiday marker (the original note lists the
#                    four values a, b, c and 0)

# Map the letter codes to integers.
mappings = {'0': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4}
data_train['StoreType'] = data_train['StoreType'].map(mappings)
data_train['Assortment'] = data_train['Assortment'].map(mappings)
# Cast to str first so a numeric 0 is encoded the same way as the string '0'
# instead of silently becoming NaN.
data_train['StateHoliday'] = data_train['StateHoliday'].astype(str).map(mappings)

# Split the date into year, month, day, weekday and ISO week number.
date_parts = data_train['Date'].dt
data_train['Year'] = date_parts.year
data_train['Month'] = date_parts.month
data_train['Day'] = date_parts.day
# pandas numbers weekdays Monday=0 ... Sunday=6.
# NOTE(review): if train.csv already ships a DayOfWeek column this replaces
# it with the pandas numbering - confirm that is intended.
data_train['DayOfWeek'] = date_parts.dayofweek
# dt.weekofyear is no longer available in recent pandas, so isocalendar() is
# used. Its 'week' column has the nullable UInt32 dtype; cast it to a plain
# int64 so the feature matrix taken with .values later stays numeric instead
# of falling back to object dtype.
data_train['WeekOfYear'] = date_parts.isocalendar().week.astype('int64')
# Columns used below:
#   CompetitionOpenSinceYear  - year the competitor opened
#   CompetitionOpenSinceMonth - month the competitor opened
#   Promo2SinceYear           - year the long-running promotion started
#   Promo2SinceWeek           - week of the year in which it started
#   Promo2                    - whether the store runs the long-running promotion
# Derived here:
#   CompetitionOpen - months elapsed since the competitor opened
#   PromoOpen       - months elapsed since the long-running promotion started

competition_months = (
    12 * (data_train['Year'] - data_train['CompetitionOpenSinceYear'])
    + (data_train['Month'] - data_train['CompetitionOpenSinceMonth'])
)

# Work in plain float64 so a nullable-integer WeekOfYear does not leak a
# nullable dtype into the result.
week_of_year = data_train['WeekOfYear'].astype('float64')
promo_months = (
    12 * (data_train['Year'] - data_train['Promo2SinceYear'])
    + (week_of_year - data_train['Promo2SinceWeek']) / 4.0
)

# Clamp negative durations (event lies in the future) to 0. where() also
# turns NaN into 0, matching the previous `x if x > 0 else 0` lambda.
# NOTE(review): rows whose CompetitionOpenSinceYear was imputed as 0 still get
# a very large value here (12 * Year); decide whether that sentinel is wanted.
data_train['CompetitionOpen'] = competition_months.where(competition_months > 0, 0)

# A Promo2SinceYear of 0 is the imputed "no start date" marker; without this
# guard those rows would get 12 * Year (roughly 24,000 months) instead of 0.
promo_started = data_train['Promo2SinceYear'] > 0
data_train['PromoOpen'] = promo_months.where(promo_started & (promo_months > 0), 0)
# Spell out each row's month as its English name (e.g. 1 -> 'January').
import calendar
data_train['Monthenglish'] = data_train['Month'].apply(lambda x: calendar.month_name[x])

# Flag rows whose month is listed in the store's PromoInterval.
# The interval is a comma-separated list of month tokens. The published
# Rossmann data uses abbreviations such as 'Jan,Apr,Jul,Oct' and 'Sept'
# (verify against store.csv), so an exact comparison with the full month name
# never matches. Comparing on the first three letters, case-insensitively,
# works for abbreviated and full names alike.
month_prefix = data_train['Monthenglish'].str[:3].str.lower()
data_train['isPromoMonth'] = 0
for interval in data_train['PromoInterval'].unique():
    # Missing intervals show up as 0 (after imputation) or NaN; neither has
    # months to split, so skip anything that is not a string.
    if not isinstance(interval, str):
        continue
    promo_prefixes = {token.strip()[:3].lower() for token in interval.split(',')}
    in_promo_month = (
        (data_train['PromoInterval'] == interval)
        & month_prefix.isin(promo_prefixes)
    )
    data_train.loc[in_promo_month, 'isPromoMonth'] = 1
划分数据集
from sklearn.model_selection import train_test_split

# Columns fed to the model, in the order they appear in the feature matrix.
# NOTE(review): 'Customers' is a same-day count; confirm it is actually known
# at prediction time before relying on the reported score.
feature = [
    'Store',
    'CompetitionDistance',
    'CompetitionOpenSinceMonth',
    'StateHoliday',
    'StoreType',
    'Assortment',
    'SchoolHoliday',
    'CompetitionOpenSinceYear',
    'Promo',
    'Promo2',
    'Promo2SinceWeek',
    'Promo2SinceYear',
    'Year',
    'Month',
    'Day',
    'DayOfWeek',
    'WeekOfYear',
    'CompetitionOpen',
    'PromoOpen',
    'isPromoMonth',
    'Customers',
]

# Target vector (Sales) and feature matrix as NumPy arrays. 'Sales' is not in
# `feature`, so selecting the listed columns already excludes it.
data_y = data_train['Sales'].values
data_x = data_train[feature].values

# Hold out 30% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    data_x,
    data_y,
    test_size=0.3,
    random_state=102,
)
建模并预测
from lightgbm import LGBMRegressor
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

# Settings shared by every candidate model.
base_params = dict(
    boosting_type='gbdt',
    objective='regression',
    # Fraction of rows sampled for each tree (0.8 = a random 80%).
    subsample=0.8,
    # LightGBM only applies `subsample` when subsample_freq > 0; with the
    # default of 0 the 0.8 above is silently ignored. 1 = resample every
    # iteration.
    subsample_freq=1,
    # Fraction of feature columns sampled for each tree.
    colsample_bytree=0.8,
)
lgb = LGBMRegressor(**base_params)

# Tune with GridSearchCV (default cross-validation and estimator scoring).
param_grid = {
    'learning_rate': [0.01, 0.1, 0.3],  # learning rate
    'n_estimators': [100, 200],         # number of boosted trees
    'num_leaves': [31, 90, 270],        # maximum leaves per tree
}
gridsearch = GridSearchCV(lgb, param_grid)
gridsearch.fit(X_train, Y_train)

# The original author recorded (with row subsampling inactive):
#   {'learning_rate': 0.1, 'n_estimators': 200, 'num_leaves': 270}
print(gridsearch.best_params_)

# GridSearchCV refits the best parameter combination on all of X_train by
# default, so reuse that model instead of copying the parameters by hand and
# training a second time.
lgb = gridsearch.best_estimator_
predict_y = lgb.predict(X_test)

# R^2 on the held-out rows. The original author reported 0.9809077483792858
# before row subsampling was actually enabled; expect a slightly different
# value now.
r2 = metrics.r2_score(Y_test, predict_y)
print(r2)

# Predicted vs. actual sales with a fitted regression line.
sns.jointplot(x=Y_test, y=predict_y, kind='reg')
回归线斜率大致为 1,即预测值与真实值基本一致,表明 LightGBM 的预测结果较为满意。