# Dataset source: https://www.kesci.com/mw/dataset/5e981cf7ebb37f002c6007d6/content
import pandas as pd
import numpy as np
import math
class Sample:
    """One row of the daily sales table, exposed as named attributes.

    The six constructor arguments are stored unchanged under attributes of
    the same name: ``date``, ``date_block_num``, ``shop_id``, ``item_id``,
    ``item_price`` and ``item_cnt_day``.
    """

    def __init__(self, date, date_block_num, shop_id, item_id, item_price, item_cnt_day):
        self.date, self.date_block_num = date, date_block_num
        self.shop_id, self.item_id = shop_id, item_id
        self.item_price, self.item_cnt_day = item_price, item_cnt_day

    def __repr__(self):
        # Rendered exactly like a list literal of the six fields, in
        # constructor order.
        fields = [
            self.date,
            self.date_block_num,
            self.shop_id,
            self.item_id,
            self.item_price,
            self.item_cnt_day,
        ]
        return repr(fields)
def data_read_and_clean(
    train_path="D:\\!data\\sales_forecast\\sales_train.csv",
    test_path="D:\\!data\\sales_forecast\\test.csv",
):
    """Load the training and test CSV files and clean the training table.

    Cleaning steps applied to the training data only:

    1. Drop outlier rows: keep ``item_cnt_day < 1001`` and
       ``item_price < 300000``.
    2. Replace negative ``item_price`` values with the median positive price
       of item 2973 in shop 32 during ``date_block_num`` 4.
    3. Merge shop ids that the author identified as the same shop:
       0 -> 57, 1 -> 58, 11 -> 10, 40 -> 39.

    Args:
        train_path: Path of the training CSV. Defaults to the original
            hard-coded location.
        test_path: Path of the test CSV. Defaults to the original hard-coded
            location.

    Returns:
        A ``(data_train, data_test)`` tuple of DataFrames; ``data_test`` is
        returned exactly as read.
    """
    data_train = pd.read_csv(train_path)
    data_test = pd.read_csv(test_path)

    # Remove outliers. Take an explicit copy so the .loc assignments below
    # write to an independent frame instead of a slice of the raw table.
    in_range = (data_train['item_cnt_day'] < 1001) & (data_train['item_price'] < 300000)
    data_train = data_train[in_range].copy()

    # Fill negative prices with the median price of the matching
    # month/shop/item group (positive prices only).
    reference_rows = data_train[
        (data_train['date_block_num'] == 4)
        & (data_train['shop_id'] == 32)
        & (data_train['item_id'] == 2973)
        & (data_train['item_price'] > 0)
    ]
    median = reference_rows.item_price.median()
    data_train.loc[data_train['item_price'] < 0, 'item_price'] = median

    # Analysis showed the following shop names refer to the same shop, so
    # their shop_ids are merged: 0 => 57, 1 => 58, 11 => 10, 40 => 39.
    for duplicate_id, canonical_id in ((0, 57), (1, 58), (11, 10), (40, 39)):
        data_train.loc[data_train['shop_id'] == duplicate_id, 'shop_id'] = canonical_id

    return data_train, data_test
def res_range_limit(res, left, right):
    """Clamp the second element of every row into ``[left, right]``.

    Args:
        res: Iterable of indexable rows; ``row[0]`` is an identifier and
            ``row[1]`` the value to clamp.
        left: Lower bound; values below it are replaced by it.
        right: Upper bound; values that are neither below ``left`` nor
            ``<= right`` are replaced by it.

    Returns:
        A new list. Rows already in range are reused as-is (same object);
        adjusted rows are fresh two-element lists ``[row[0], bound]``.
    """
    def _clamp(entry):
        value = entry[1]
        if value < left:
            return [entry[0], left]
        if value <= right:
            return entry
        return [entry[0], right]

    return [_clamp(entry) for entry in res]
def w_WMA(p):
    """Return the ``p`` weights of a linearly weighted moving average.

    The i-th weight (i = 1..p) is ``2*i / p / (p+1)``, so weights grow
    linearly with position and add up to 1.

    Args:
        p: Window length (integer).

    Returns:
        List of ``p`` floats, smallest weight first; empty when ``p`` is 0.
    """
    weights = []
    for rank in range(1, p + 1):
        # Keep the division order unchanged so the floats are bit-identical.
        weights.append(2 * rank / p / (p + 1))
    return weights
def predict_WMA(w, X):
    """Return the weighted sum of ``X`` using the weights ``w``.

    Args:
        w: Sequence of weights.
        X: Sequence of observations; only the first ``len(w)`` entries are
            used, and it must hold at least that many.

    Returns:
        ``sum(w[i] * X[i])`` over every index of ``w``.
    """
    return sum(weight * X[position] for position, weight in enumerate(w))
def predict_EMA(X, alpha):
    """Return the exponentially weighted sum of ``X``.

    The observation at index ``i`` receives weight
    ``alpha * (1 - alpha) ** (len(X) - 1 - i)``, so the last element is
    weighted by ``alpha`` and earlier ones decay geometrically.

    Args:
        X: Sequence of observations, oldest first.
        alpha: Smoothing factor.

    Returns:
        The weighted sum (0 for an empty ``X``).
    """
    last = len(X) - 1
    return sum(
        alpha * math.pow(1 - alpha, last - position) * value
        for position, value in enumerate(X)
    )
def f(X, p, alpha):
    """Fit one-step-ahead EMA predictions to ``X`` and score them.

    The first ``p`` predictions are copies of the observations themselves.
    The prediction at index ``p`` is ``predict_EMA(X[:p], alpha)``; every
    later prediction follows the recursive form
    ``alpha * X[k-1] + (1 - alpha) * prediction[k-1]``.

    Args:
        X: Observed series.
        p: Number of leading observations used as the warm-up window.
        alpha: EMA smoothing factor.

    Returns:
        ``(loss, X_predict)`` where ``loss`` is the sum of squared
        differences between ``X`` and ``X_predict`` and ``X_predict`` has the
        same length as ``X``.
    """
    X_predict = list(X[:p])
    if len(X) > p:
        X_predict.append(predict_EMA(X[:p], alpha))
        # Fast version: update the previous prediction recursively.
        # Basic version: X_predict.append(predict_EMA(X[:k + 1], alpha))
        for k in range(p, len(X) - 1):
            X_predict.append(alpha * X[k] + (1 - alpha) * X_predict[-1])
    loss = sum(
        math.pow(actual - fitted, 2) for actual, fitted in zip(X, X_predict)
    )
    return loss, X_predict
def predict_generate_EMA(X, p, is_print=False):
    """Grid-search the EMA smoothing factor that minimises the loss of ``f``.

    Candidates are 0.99, 0.98, ..., 0.51, tried in that order; a candidate
    replaces the current best only when its loss is strictly smaller, so the
    largest alpha wins ties.

    Args:
        X: Observed series passed to ``f``.
        p: Warm-up window length passed to ``f``.
        is_print: When true, print the best loss found.

    Returns:
        ``(alpha, X_predict)``: the selected smoothing factor and the fitted
        series that ``f`` produced for it.
    """
    best_alpha = 0.99
    best_loss, best_fit = f(X, p, best_alpha)
    for percent in range(99, 50, -1):
        candidate = percent / 100
        candidate_loss, candidate_fit = f(X, p, candidate)
        if candidate_loss < best_loss:
            best_loss, best_fit = candidate_loss, candidate_fit
            best_alpha = candidate
    if is_print:
        print("\nloss_EMA:", best_loss, "\n")
    return best_alpha, best_fit
# (Weighted) average of the last c periods.
def MA(data_train, data_test, c, range_limit=False, default=0.0, Type = "MA"):
    """Forecast each test pair from its last ``c`` monthly totals and save a CSV.

    Daily rows of ``data_train`` are summed into 34 monthly buckets per
    ``(shop_id, item_id)`` pair. For every test row whose pair appears in the
    training data (and ``c > 0``) the forecast is:

    * ``Type == "WMA"``: linearly weighted average of the last ``c`` months;
    * ``Type == "EMA"``: exponential average of the last ``c`` months, using
      the alpha selected by ``predict_generate_EMA`` on the full history;
    * anything else: plain mean of the last ``c`` months.

    All other test rows receive ``default``. The result is written to
    ``D:\\!data\\sales_forecast\\result\\<Type>_<c>_<range_limit>.csv`` with
    columns ``ID`` and ``item_cnt_month``; nothing is returned.

    Args:
        data_train: Table whose row positions 1, 2, 3 and 5 hold
            ``date_block_num``, ``shop_id``, ``item_id`` and ``item_cnt_day``.
        data_test: Table whose row positions 0, 1 and 2 hold the row id,
            ``shop_id`` and ``item_id``.
        c: Number of trailing months to average.
        range_limit: When true, clamp forecasts to ``[0, 20]``.
        default: Forecast for pairs without training history (or ``c <= 0``).
        Type: Averaging scheme, see above.
    """
    n_months = 34  # date_block_num is used as an index into this many buckets

    # Aggregate daily sales into monthly totals per (shop_id, item_id),
    # reading the rows directly instead of wrapping each one in an object.
    monthly_sales = {}
    for row in np.array(data_train).tolist():
        pair = (row[2], row[3])
        if pair not in monthly_sales:
            monthly_sales[pair] = [0 for _ in range(n_months)]
        monthly_sales[pair][row[1]] += row[5]

    # The WMA weights depend only on c, so compute them once, not per row.
    weights = w_WMA(c) if (Type == "WMA" and c > 0) else None

    res = []
    for row in np.array(data_test).tolist():
        history = monthly_sales.get((row[1], row[2]))
        if history is None or c <= 0:
            prediction = default
        elif Type == "WMA":
            prediction = predict_WMA(weights, history[-c:])
        elif Type == "EMA":
            alpha, _ = predict_generate_EMA(history, p=c, is_print=False)
            prediction = predict_EMA(history[-c:], alpha)
        else:
            prediction = sum(history[-c:]) / c
        # Pair the id with the forecast directly rather than reading it back
        # from a fixed column position of the test row.
        res.append([row[0], prediction])

    if range_limit:
        res = res_range_limit(res, 0, 20)
    res = pd.DataFrame(res, columns=["ID", "item_cnt_month"])
    res.to_csv("D:\\!data\\sales_forecast\\result\\{}_{}_{}.csv".format(Type, c, range_limit), index=False)
# Self-adaptive MA: choose the window length separately for every series.
def MA_self_adaptive(data_train, data_test, range_limit=False):
    """Forecast with a moving average whose window is tuned per (shop, item).

    Daily rows of ``data_train`` are summed into 34 monthly buckets per
    ``(shop_id, item_id)`` pair. For each test pair with history, every
    window length ``c`` in 1..12 is scored by the mean squared error of
    predicting month ``j + c`` from the mean of months ``j .. j + c - 1``;
    the best ``c`` (smallest ``c`` on ties) is then used to forecast from the
    last ``c`` months. Pairs without history are forecast as 0.0.

    The error is a true mean (divided by the ``34 - c`` forecasts made), so
    windows of different length are compared on the same scale. A raw sum
    would favour long windows simply because they produce fewer terms.

    The result is written to
    ``D:\\!data\\sales_forecast\\result\\MA_self_adaptive_<range_limit>.csv``
    with columns ``ID`` and ``item_cnt_month``.

    Args:
        data_train: Table whose row positions 1, 2, 3 and 5 hold
            ``date_block_num``, ``shop_id``, ``item_id`` and ``item_cnt_day``.
        data_test: Table whose row positions 0, 1 and 2 hold the row id,
            ``shop_id`` and ``item_id``.
        range_limit: When true, clamp forecasts to ``[0, 20]``.

    Returns:
        List with the best mean squared error of every test pair that had
        training history, in test order.
    """
    n_months = 34  # date_block_num is used as an index into this many buckets

    # Aggregate daily sales into monthly totals per (shop_id, item_id).
    monthly_sales = {}
    for row in np.array(data_train).tolist():
        pair = (row[2], row[3])
        if pair not in monthly_sales:
            monthly_sales[pair] = [0 for _ in range(n_months)]
        monthly_sales[pair][row[1]] += row[5]

    res = []
    MSEs = []
    for row in np.array(data_test).tolist():
        history = monthly_sales.get((row[1], row[2]))
        if history is None:
            res.append([row[0], 0.0])
            continue

        best_c, best_mse = None, None
        for c in range(1, 13):
            n_forecasts = n_months - c
            squared_error = sum(
                pow(sum(history[j:j + c]) / c - history[j + c], 2)
                for j in range(n_forecasts)
            )
            mse = squared_error / n_forecasts
            # Strict comparison keeps the smallest window on ties.
            if best_mse is None or mse < best_mse:
                best_c, best_mse = c, mse

        res.append([row[0], sum(history[-best_c:]) / best_c])
        MSEs.append(best_mse)

    if range_limit:
        res = res_range_limit(res, 0, 20)
    res = pd.DataFrame(res, columns=["ID", "item_cnt_month"])
    res.to_csv("D:\\!data\\sales_forecast\\result\\MA_self_adaptive_{}.csv".format(range_limit), index=False)
    return MSEs
if __name__ == '__main__':
    # Script entry point: load and clean the data, then run one experiment.
    # The numbers below are the scores the author recorded for each setting.
    data_train, data_test = data_read_and_clean()

    # 0 baseline: predict 0.5 for everything      1.23646

    # 1 MA
    # c=1   range_limit=False                8.53027   rank 9640
    # c=1   range_limit=True                 1.16777   6281   this shows how important range_limit is; every later run applies it by default
    # c=2   range_limit=True                 1.14102   5923
    # c=3   range_limit=True                 1.09979   5653
    # c=6   range_limit=True                 1.10126
    # c=12  range_limit=True                 1.18527
    # c=3   range_limit=True default=0.5     1.11810
    # c=3   range_limit=True default=0.3     1.09770
    # c=3   range_limit=True default=0.1     1.09469   5627
    # c=3   range_limit=True default=0.05    1.09669
    MA(data_train, data_test, c=3, range_limit=True, default=0.1)

    # 2 self-adaptive MA
    # range_limit=False    3.15999
    # range_limit=True     1.21841
    # MSEs_MA_self_adaptive = MA_self_adaptive(data_train, data_test, range_limit=True)
    # print("MSEs_MA_self_adaptive_avg: ", sum(MSEs_MA_self_adaptive) / len(MSEs_MA_self_adaptive))

    # 3 WMA
    # c=3, range_limit=True, default=0.1, Type="WMA"    1.09686
    # c=3, range_limit=True, default=0.0, Type="WMA"    1.10195
    # MA(data_train, data_test, c=3, range_limit=True, default=0.0, Type="WMA")

    # 4 EMA
    # c=3, range_limit=True, default=0.1, Type="EMA"    1.13864
    # MA(data_train, data_test, c=3, range_limit=True, default=0.1, Type="EMA")