数据预处理:数据平滑法

1. 移动平均


   
   
  1. from sklearn.metrics import r2_score, mean_absolute_error, median_absolute_error
  2. # 滑动窗口估计,发现数据变化趋势
  3. def plotMovingAverage( series, window, plot_intervals=False, scale=1.96, plot_anomalies=False):
  4. """
  5. series - dataframe with timeseries
  6. window - rolling window size
  7. plot_intervals - show confidence intervals
  8. plot_anomalies - show anomalies
  9. """
  10. rolling_mean = series.rolling(window=window).mean()
  11. plt.figure(figsize=( 13, 5))
  12. plt.title( "Moving average\n window size = {}". format(window))
  13. plt.plot(rolling_mean, "g", label= "Rolling mean trend")
  14. # Plot confidence intervals for smoothed values
  15. if plot_intervals:
  16. mae = mean_absolute_error(series[window:], rolling_mean[window:])
  17. deviation = np.std(series[window:] - rolling_mean[window:])
  18. lower_bond = rolling_mean - (mae + scale * deviation)
  19. upper_bond = rolling_mean + (mae + scale * deviation)
  20. plt.plot(upper_bond, "r--", label= "Upper Bond / Lower Bond")
  21. plt.plot(lower_bond, "r--")
  22. # Having the intervals, find abnormal values
  23. if plot_anomalies:
  24. anomalies = pd.DataFrame(index=series.index)
  25. anomalies[ 'series<lower_bond'] = series[series<lower_bond]
  26. anomalies[ 'series>upper_bond'] = series[series>upper_bond]
  27. plt.plot(anomalies, "ro", markersize= 10)
  28. plt.plot(series[window:], label= "Actual values")
  29. plt.legend(loc= "upper left")
  30. plt.grid( True)
  31. plotMovingAverage(data[ 'trend'], 12,plot_intervals= True, scale= 1.96, plot_anomalies= True)

拟合图像:

2. 单指数平滑

公式:    s_{i} = \alpha \cdot x_{i} + (1-\alpha )\cdot s_{i-1}

              x_{i+h} = s_{i}

单指数平滑值s_{i}表示为当前统计值x_{i}与上一期平滑值s_{i-1}的加权平均,\alpha\in [0,1]叫做平滑因子,调节远期和近期数据对预测值的影响权重,平滑因子取值越接近0时拟合曲线越平滑,单指数平滑法仅考虑数据的baseline,适用于没有总体趋势的时间序列,如果用来处理有总体趋势的序列,平滑值将滞后于原始数据。


   
   
  1. # 单指数平滑
  2. def exponential_smoothing( series, alpha):
  3. """
  4. series - dataset with timestamps
  5. alpha - float [0.0, 1.0], smoothing parameter
  6. """
  7. result = [series[ 0]] # first value is same as series
  8. for n in range( 1, len(series)):
  9. result.append(alpha * series[n] + ( 1 - alpha) * result[n- 1])
  10. return result
  11. def plotExponentialSmoothing( series, alphas):
  12. """
  13. Plots exponential smoothing with different alphas
  14. series - dataset with timestamps
  15. alphas - list of floats, smoothing parameters
  16. """
  17. with plt.style.context( 'seaborn-white'):
  18. plt.figure(figsize=( 15, 7))
  19. for alpha in alphas:
  20. plt.plot(exponential_smoothing(series, alpha), label= "Alpha {}". format(alpha))
  21. plt.plot(series.values, "c", label = "Actual")
  22. plt.legend(loc= "best")
  23. plt.axis( 'tight')
  24. plt.title( "Exponential Smoothing")
  25. plt.grid( True);
  26. plotExponentialSmoothing(data[ 'trend'], [ 0.5, 0.1])

拟合图像:

3. 双指数平滑

公式:     s_{i} = \alpha \cdot x_{i} + (1-\alpha )\cdot (s_{i-1} + t_{i-1})

               t_{i} = \beta \cdot (s_{i} - s_{i-1} ) + (1-\beta )\cdot t_{i-1}

               x_{i+h} = s_{i} + h\cdot t_{i}

双指数平滑法在单指数平滑法基础上增加趋势信息,第二个等式描述趋势平滑过程,趋势的未平滑值是当前时刻平滑值s_{i}减去前一时刻平滑值s_{i-1},再引入参数\beta对趋势进行一次指数平滑处理;双指数平滑法同时考虑了时间序列数据的baseline和趋势,适用于具有趋势的时间序列预测;


   
   
  1. # 双指数平滑
  2. def double_exponential_smoothing( series, alpha, beta):
  3. """
  4. series - dataset with timeseries
  5. alpha - float [0.0, 1.0], smoothing parameter for level
  6. beta - float [0.0, 1.0], smoothing parameter for trend
  7. """
  8. # first value is same as series
  9. result = [series[ 0]]
  10. for n in range( 1, len(series)+ 1):
  11. if n == 1:
  12. level, trend = series[ 0], series[ 1] - series[ 0]
  13. if n >= len(series): # forecasting
  14. value = result[- 1]
  15. else:
  16. value = series[n]
  17. last_level, level = level, alpha*value + ( 1-alpha)*(level+trend)
  18. trend = beta*(level-last_level) + ( 1-beta)*trend
  19. result.append(level+trend)
  20. return result
  21. def plotDoubleExponentialSmoothing( series, alphas, betas):
  22. """
  23. Plots double exponential smoothing with different alphas and betas
  24. series - dataset with timestamps
  25. alphas - list of floats, smoothing parameters for level
  26. betas - list of floats, smoothing parameters for trend
  27. """
  28. with plt.style.context( 'seaborn-white'):
  29. plt.figure(figsize=( 13, 5))
  30. for alpha in alphas:
  31. for beta in betas:
  32. plt.plot(double_exponential_smoothing(series, alpha, beta), label= "Alpha {}, beta {}". format(alpha, beta))
  33. plt.plot(series.values, label = "Actual")
  34. plt.legend(loc= "best")
  35. plt.axis( 'tight')
  36. plt.title( "Double Exponential Smoothing")
  37. plt.grid( True)
  38. plotDoubleExponentialSmoothing(data[ 'trend'], alphas=[ 0.5, 0.3], betas=[ 0.9, 0.3])

拟合结果:

4. 三次指数平滑法(Holt-Winters)

公式:     s_{i} = \alpha \cdot (x_{i} - p_{i-k}) + (1-\alpha )\cdot (s_{i-1} + t_{i-1})

               t_{i} = \beta \cdot (s_{i} - s_{i-1} ) + (1-\beta )\cdot t_{i-1}

               p_{i} = \gamma \cdot (x_{i} - s_{i} ) + (1-\gamma )\cdot p_{i-k}

               x_{i+h} = s_{i} + h\cdot t_{i} + p_{i-k+h}

三次指数平滑法增加了一个季节分量,适用于带有季节特征的时间序列预测。3个参数值都位于 [0, 1] 之间,可以通过交叉验证、网格搜索寻找最优参数值;s, t, p 初值选取对于算法整体影响不大,通常的取值为 s_{0} = x_{0}, t_{0} = x_{1} - x_{0};加法(累加)季节模型中 p = 0,乘法(累乘)季节模型中 p = 1


   
   
class HoltWinters:
    """
    Holt-Winters model with the anomalies detection using Brutlag method

    # series - initial time series
    # slen - length of a season
    # alpha, beta, gamma - Holt-Winters model coefficients
    # n_preds - predictions horizon
    # scaling_factor - sets the width of the confidence interval by Brutlag (usually takes values from 2 to 3)
    """

    def __init__(self, series, slen, alpha, beta, gamma, n_preds, scaling_factor=1.96):
        self.series = series
        self.slen = slen
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.n_preds = n_preds
        self.scaling_factor = scaling_factor

    def initial_trend(self):
        """Estimate the initial trend as the average of the season-over-season
        first differences, averaged across the first season."""
        s = 0.0
        for i in range(self.slen):
            s += float(self.series[i + self.slen] - self.series[i]) / self.slen
        return s / self.slen

    def initial_seasonal_components(self):
        """Compute additive initial seasonal components: for each position in
        the season, the average deviation from that season's mean."""
        seasons = {}
        season_averages = []
        n_seasons = int(len(self.series) / self.slen)
        # calculate season averages
        for j in range(n_seasons):
            season_averages.append(self.series[self.slen * j:self.slen * j + self.slen].sum() / float(self.slen))
        # calculate initial values
        for i in range(self.slen):
            sum_of_vals_over_avg = 0.0
            for j in range(n_seasons):
                sum_of_vals_over_avg += self.series[self.slen * j + i] - season_averages[j]
            seasons[i] = sum_of_vals_over_avg / n_seasons
        return seasons

    def triple_exponential_smoothing(self):
        """Run additive Holt-Winters over the series plus n_preds forecast
        steps, storing the fit, components, and Brutlag confidence bounds
        on the instance (result, Smooth, Trend, Season, PredictedDeviation,
        UpperBond, LowerBond)."""
        self.result = []
        self.Smooth = []
        self.Season = []
        self.Trend = []
        self.PredictedDeviation = []
        self.UpperBond = []
        self.LowerBond = []
        seasons = self.initial_seasonal_components()
        for i in range(len(self.series) + self.n_preds):
            if i == 0:  # components initialization
                smooth = self.series[0]
                trend = self.initial_trend()
                self.result.append(self.series[0])
                self.Smooth.append(smooth)
                self.Trend.append(trend)
                self.Season.append(seasons[i % self.slen])
                self.PredictedDeviation.append(0)
                self.UpperBond.append(self.result[0] +
                                      self.scaling_factor *
                                      self.PredictedDeviation[0])
                self.LowerBond.append(self.result[0] -
                                      self.scaling_factor *
                                      self.PredictedDeviation[0])
                continue
            if i >= len(self.series):  # predicting
                # m steps beyond the last observation: extrapolate level + trend
                # and reuse the last fitted seasonal component.
                m = i - len(self.series) + 1
                self.result.append((smooth + m * trend) + seasons[i % self.slen])
                # when predicting we increase uncertainty on each step
                self.PredictedDeviation.append(self.PredictedDeviation[-1] * 1.1314)
            else:
                val = self.series[i]
                # Level update uses the deseasonalized observation; note the
                # order matters: trend and season use the freshly updated level.
                last_smooth, smooth = smooth, self.alpha * (val - seasons[i % self.slen]) + (1 - self.alpha) * (
                    smooth + trend)
                trend = self.beta * (smooth - last_smooth) + (1 - self.beta) * trend
                seasons[i % self.slen] = self.gamma * (val - smooth) + (1 - self.gamma) * seasons[i % self.slen]
                self.result.append(smooth + trend + seasons[i % self.slen])
                # Deviation is calculated according to Brutlag algorithm.
                self.PredictedDeviation.append(self.gamma * np.abs(self.series[i] - self.result[i])
                                               + (1 - self.gamma) * self.PredictedDeviation[-1])
            self.UpperBond.append(self.result[-1] +
                                  self.scaling_factor *
                                  self.PredictedDeviation[-1])
            self.LowerBond.append(self.result[-1] -
                                  self.scaling_factor *
                                  self.PredictedDeviation[-1])
            self.Smooth.append(smooth)
            self.Trend.append(trend)
            self.Season.append(seasons[i % self.slen])
  86. # 图形结果展示
  87. def plot_holt_winters( series, model, plot_intervals=False, plot_anomalies=False, save_pic=False):
  88. """
  89. series - dataset with timeseries
  90. plot_intervals - show confidence intervals
  91. plot_anomalies - show anomalies
  92. """
  93. plt.figure(figsize=( 13, 4))
  94. xt = series.index
  95. plt.plot(xt, model.result, 'g', label= "Model")
  96. plt.plot(xt, series.values, 'b', label= "Actual")
  97. error = mean_absolute_percentage_error(series.values[-model.n_preds:], model.result[-model.n_preds:])
  98. pic_title = ' ( ' + series.name + ' ) ' + 'Mean Absolute Percentage Error: {0:.2f}%'. format(error)
  99. plt.title(pic_title)
  100. if plot_anomalies:
  101. anomalies = np.array([np.NaN] * len(series))
  102. anomalies[series.values < model.LowerBond[: len(series)]] = \
  103. series.values[series.values < model.LowerBond[: len(series)]]
  104. anomalies[series.values > model.UpperBond[: len(series)]] = \
  105. series.values[series.values > model.UpperBond[: len(series)]]
  106. plt.plot(xt, anomalies, "r*", markersize= 10, label= "Anomalies")
  107. if plot_intervals:
  108. plt.plot(xt, model.UpperBond, "r--", alpha= 0.5, label= "Up/Low confidence")
  109. plt.plot(xt, model.LowerBond, "r--", alpha= 0.5)
  110. plt.fill_between(x=xt[ 0: len(model.result)], y1=model.UpperBond,
  111. y2=model.LowerBond, alpha= 0.2, color= "grey")
  112. plt.vlines(xt[ len(series)-model.n_preds], ymin= min(model.LowerBond), ymax= max(model.UpperBond),
  113. linestyles= 'dashed')
  114. plt.axvspan(xt[ len(series)-model.n_preds], xt[ len(model.result)- 1], alpha= 0.3, color= 'lightgrey')
  115. plt.grid( True)
  116. plt.axis( 'tight')
  117. plt.legend(loc= "best", fontsize= 13)
  118. if save_pic:
  119. pic_name = './out/TestResult/202007/{}.png'. format(series.name)
  120. plt.savefig(pic_name)
  121. # 交叉验证求参数
  122. def cv_score( params, series, loss_function=mean_squared_error, slen=12):
  123. """
  124. Returns error on CV
  125. params - vector of parameters for optimization
  126. series - dataset with timeseries
  127. slen - season length for Holt-Winters model
  128. """
  129. # errors array
  130. errors = []
  131. values = series.values
  132. alpha, beta, gamma = params
  133. # set the number of folds for cross-validation
  134. tscv = TimeSeriesSplit(n_splits= 4)
  135. # iterating over folds, train model on each, forecast and calculate error
  136. for train, test in tscv.split(values):
  137. if len(train) < 24:
  138. print( ' : The train set is not large enough!')
  139. else:
  140. model = HoltWinters(series=values[train], slen=slen,
  141. alpha=alpha, beta=beta, gamma=gamma, n_preds= len(test))
  142. model.triple_exponential_smoothing()
  143. predictions = model.result[- len(test):]
  144. actual = values[test]
  145. error = loss_function(predictions, actual)
  146. errors.append(error)
  147. return np.mean(np.array(errors))
  148. # 网格搜索参数初值
  149. def get_best_params( Series):
  150. warnings.filterwarnings( "ignore")
  151. best_score = 100
  152. best_param_ini = 0
  153. best_param_fin = 0
  154. for i in list(np.arange( 0, 1.1, 0.1)):
  155. try:
  156. x = [i, i, i]
  157. opt = minimize(cv_score, x0=x, args=(Series, mean_squared_log_error),
  158. method= "TNC", bounds = (( 0, 1), ( 0, 1), ( 0, 1)))
  159. alpha_final, beta_final, gamma_final = opt.x
  160. except ValueError:
  161. continue
  162. else:
  163. hw = HoltWinters(Series, slen = 12, alpha = alpha_final, beta = beta_final, gamma = gamma_final,
  164. n_preds = 12, scaling_factor = 3)
  165. hw.triple_exponential_smoothing()
  166. error = mean_absolute_percentage_error(Series.values[- 12:], hw.result[- 24:- 12])
  167. # print(x,': ',mape)
  168. if error < best_score:
  169. best_score = error
  170. best_param_ini = i
  171. best_param_fin = alpha_final, beta_final, gamma_final
  172. # print("best_score:{:.2f}".format(best_score))
  173. # print("best_para_initial:{}".format(best_param_ini))
  174. # print("best_para_final:{}".format(best_param_fin))
  175. return best_param_fin
# Tune the Holt-Winters coefficients on the training series, then fit the
# model forecasting 3 steps ahead.  NOTE(review): scaling_factor=20 is far
# above the usual Brutlag range of 2-3, giving a very wide confidence band.
alpha_final, beta_final, gamma_final = get_best_params(train_scalar)
model = HoltWinters(train_scalar, slen=12,
                    alpha=alpha_final,
                    beta=beta_final,
                    gamma=gamma_final,
                    n_preds=3,
                    scaling_factor=20)
model.triple_exponential_smoothing()

 

文章转载自 http://t.csdn.cn/Sj4y6

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值