电力预测
import numpy as np
import pandas as pd
import matplotlib. pyplot as plt
import seaborn as sns
import os
# Switch to the project data directory so the bare CSV names below resolve.
os.chdir("D:\\LengPY\\AI电力能耗预测")

# Load the two power-consumption tables (main file and the "_9" file).
data, data_9 = (
    pd.read_csv(csv_name)
    for csv_name in ('zhenjiang_power.csv', 'zhenjiang_power_9.csv')
)

# Report (rows, columns) of each table.
for frame in (data, data_9):
    print(frame.shape)
(885486, 3)
(43620, 3)
# Preview the first rows of the raw table (user_id, record_date, power_consumption).
data. head( )
user_id record_date power_consumption 0 1 2015-01-01 1135.0 1 1 2015-01-02 570.0 2 1 2015-01-03 3418.0 3 1 2015-01-04 3968.0 4 1 2015-01-05 3986.0
# Show column dtypes and non-null counts; record_date is still a plain object column here.
data. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 885486 entries, 0 to 885485
Data columns (total 3 columns):
user_id 885486 non-null int64
record_date 885486 non-null object
power_consumption 885486 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 20.3+ MB
# Summary statistics of the numeric columns.
data. describe( )
user_id power_consumption count 885486.000000 8.854860e+05 mean 727.500000 2.619980e+03 std 419.733783 3.154743e+04 min 1.000000 1.000000e+00 25% 364.000000 4.200000e+01 50% 727.500000 2.610000e+02 75% 1091.000000 8.250000e+02 max 1454.000000 1.310016e+06
拼接data和data_9
# Stack the two tables row-wise. Their columns are not in the same order, so
# pass sort=True explicitly: it keeps the alphabetical column order this
# notebook was run with and silences the pandas FutureWarning about the
# default changing to "no sort".
train_df = pd.concat([data, data_9], sort=True)
D:\Anaconda\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.
To accept the future behavior, pass 'sort=False'.
To retain the current behavior and silence the warning, pass 'sort=True'.
"""Entry point for launching an IPython kernel.
# (rows, columns) of the combined table.
train_df. shape
(929106, 3)
# Preview the combined table.
train_df. head( )
power_consumption record_date user_id 0 1135.0 2015-01-01 1 1 570.0 2015-01-02 1 2 3418.0 2015-01-03 1 3 3968.0 2015-01-04 1 4 3986.0 2015-01-05 1
查看user_id的种类,即有多少公司使用电
# Number of distinct user_id values (dropna=False so a missing id would
# still be counted once, exactly like len(unique())).
train_df['user_id'].nunique(dropna=False)
1454
目标:预测未来整个高新区,每一天的总用电量
先将record_date改成datetime格式
# Parse record_date into datetime64. Assign the column directly rather than
# through `.loc[:, col] = ...`: on pandas >= 2.0 the .loc form writes into the
# existing object-dtype column in place and can leave the dtype as object.
train_df['record_date'] = pd.to_datetime(train_df['record_date'])
train_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 929106 entries, 0 to 43619
Data columns (total 3 columns):
power_consumption 929106 non-null float64
record_date 929106 non-null datetime64[ns]
user_id 929106 non-null int64
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 28.4 MB
把数据集按照‘日期’进行分组后,求每天的电力总和
# Collapse the per-user rows into one row per date holding the summed
# power_consumption; record_date becomes the index.
daily_groups = train_df.groupby('record_date')
train_df = daily_groups[['power_consumption']].sum()
train_df.head()
power_consumption record_date 2015-01-01 2900575.0 2015-01-02 3158211.0 2015-01-03 3596487.0 2015-01-04 3939672.0 2015-01-05 4101790.0
恢复索引
# Turn the record_date index back into an ordinary column (0..n-1 row index).
train_df = train_df.reset_index(drop=False)
train_df.head()
record_date power_consumption 0 2015-01-01 2900575.0 1 2015-01-02 3158211.0 2 2015-01-03 3596487.0 3 2015-01-04 3939672.0 4 2015-01-05 4101790.0
快速查看总时间轴上的电力消耗
% matplotlib inline
train_df[ 'power_consumption' ] . plot( )
<matplotlib.axes._subplots.AxesSubplot at 0x20e6f763c18>
# Same plot restricted to 2015-09-01 .. 2015-10-31 (both ends inclusive).
in_window = train_df['record_date'].between('2015-09-01', '2015-10-31')
train_df.loc[in_window, 'power_consumption'].plot()
<matplotlib.axes._subplots.AxesSubplot at 0x20e6f820be0>
细化x轴的信息
% matplotlib inline
tmp_df = train_df[ ( train_df[ 'record_date' ] >= '2015-09-01' ) & ( train_df[ 'record_date' ] <= '2015-10-31' ) ] . copy( )
tmp_df = tmp_df. set_index( [ 'record_date' ] )
tmp_df[ 'power_consumption' ] . plot( )
<matplotlib.axes._subplots.AxesSubplot at 0x20e6f89dda0>
结论: 可以发现2015年10月1号,电力消耗比较少
添加测试数据
# Check the last dates present in the training data before building the test range.
train_df. tail( )
record_date power_consumption 634 2016-09-26 4042132.0 635 2016-09-27 4287965.0 636 2016-09-28 4086998.0 637 2016-09-29 3941842.0 638 2016-09-30 3783264.0
创建测试集:从2016年10月1号开始,共31天(即整个10月)
# Build a one-column frame of 31 consecutive daily timestamps starting at
# 2016-10-01; the column is still named 0 at this point.
october_days = pd.date_range(start='2016-10-01', periods=31, freq='D')
test_df = pd.DataFrame(october_days)
test_df.head()
0 0 2016-10-01 1 2016-10-02 2 2016-10-03 3 2016-10-04 4 2016-10-05
# Name the date column like the training frame and add a zero placeholder
# for the target so both frames share the same columns.
test_df = test_df.set_axis(['record_date'], axis=1)
test_df['power_consumption'] = 0
test_df.head(15)
record_date power_consumption 0 2016-10-01 0 1 2016-10-02 0 2 2016-10-03 0 3 2016-10-04 0 4 2016-10-05 0 5 2016-10-06 0 6 2016-10-07 0 7 2016-10-08 0 8 2016-10-09 0 9 2016-10-10 0 10 2016-10-11 0 11 2016-10-12 0 12 2016-10-13 0 13 2016-10-14 0 14 2016-10-15 0
拼成一整份数据,做特征工程
# Stack train and test rows into one frame for feature engineering.
# ignore_index=True gives every row a unique label; without it the test rows
# reuse labels 0..30 that already belong to training rows, which makes any
# label-based lookup ambiguous.
total_df = pd.concat([train_df, test_df], ignore_index=True)
total_df.tail()
record_date power_consumption 26 2016-10-27 0.0 27 2016-10-28 0.0 28 2016-10-29 0.0 29 2016-10-30 0.0 30 2016-10-31 0.0
构造时间特征
构造一些强时间指代特征:
星期几 一个月当中的第几天(月初还是月末) 一年当中的第几天(季节信息) 一年当中的第几个月(季节)、哪一年
# Derive calendar features from each timestamp:
#   week  -> dayofweek, day -> day of month, month -> month, year -> year.
for feature, ts_attr in (
    ('week', 'dayofweek'),
    ('day', 'day'),
    ('month', 'month'),
    ('year', 'year'),
):
    # Bind ts_attr as a default so each lambda reads its own attribute.
    total_df.loc[:, feature] = total_df['record_date'].apply(
        lambda ts, ts_attr=ts_attr: getattr(ts, ts_attr)
    )
total_df.head()
record_date power_consumption week day month year 0 2015-01-01 2900575.0 3 1 1 2015 1 2015-01-02 3158211.0 4 2 1 2015 2 2015-01-03 3596487.0 5 3 1 2015 3 2015-01-04 3939672.0 6 4 1 2015 4 2015-01-05 4101790.0 0 5 1 2015
添加周末特征
# Create the three weekend indicator columns, all initialised to 0.
for flag_column in ('weekend', 'weekend_sat', 'weekend_sun'):
    total_df.loc[:, flag_column] = 0
total_df.head(10)
record_date power_consumption week day month year weekend weekend_sat weekend_sun 0 2015-01-01 2900575.0 3 1 1 2015 0 0 0 1 2015-01-02 3158211.0 4 2 1 2015 0 0 0 2 2015-01-03 3596487.0 5 3 1 2015 0 0 0 3 2015-01-04 3939672.0 6 4 1 2015 0 0 0 4 2015-01-05 4101790.0 0 5 1 2015 0 0 0 5 2015-01-06 4149164.0 1 6 1 2015 0 0 0 6 2015-01-07 4161928.0 2 7 1 2015 0 0 0 7 2015-01-08 4182622.0 3 8 1 2015 0 0 0 8 2015-01-09 4153509.0 4 9 1 2015 0 0 0 9 2015-01-10 3913704.0 5 10 1 2015 0 0 0
# Switch the indicators on from the 'week' (dayofweek) column:
# values above 4 are weekend days, 5 is flagged as Saturday, 6 as Sunday.
day_of_week = total_df['week']
total_df.loc[day_of_week.gt(4), 'weekend'] = 1
total_df.loc[day_of_week.eq(5), 'weekend_sat'] = 1
total_df.loc[day_of_week.eq(6), 'weekend_sun'] = 1
total_df.head(10)
record_date power_consumption week day month year weekend weekend_sat weekend_sun 0 2015-01-01 2900575.0 3 1 1 2015 0 0 0 1 2015-01-02 3158211.0 4 2 1 2015 0 0 0 2 2015-01-03 3596487.0 5 3 1 2015 1 1 0 3 2015-01-04 3939672.0 6 4 1 2015 1 0 1 4 2015-01-05 4101790.0 0 5 1 2015 0 0 0 5 2015-01-06 4149164.0 1 6 1 2015 0 0 0 6 2015-01-07 4161928.0 2 7 1 2015 0 0 0 7 2015-01-08 4182622.0 3 8 1 2015 0 0 0 8 2015-01-09 4153509.0 4 9 1 2015 0 0 0 9 2015-01-10 3913704.0 5 10 1 2015 1 1 0
添加一个月4周的信息
def week_of_month(day):
    """Return the week-of-month bucket for a day-of-month value.

    Args:
        day: Day of the month (expected 1-31).

    Returns:
        1 for days 1-7, 2 for days 8-14, 3 for days 15-21, and 4 for
        every other value (days 22-31, and anything outside 1-21).
    """
    if day in range(1, 8):
        return 1
    if day in range(8, 15):
        return 2
    if day in range(15, 22):
        return 3
    return 4
# week_of_month buckets a day-of-month value, so it must be fed the 'day'
# column. The 'week' column holds dayofweek (0-6), which collapsed this
# feature to only the values 1 and 4.
# NOTE: outputs recorded further down in this notebook were produced before
# this fix and will differ after re-running.
total_df.loc[:, 'week_of_month'] = total_df['day'].apply(week_of_month)
total_df.head()
record_date power_consumption week day month year weekend weekend_sat weekend_sun week_of_month 0 2015-01-01 2900575.0 3 1 1 2015 0 0 0 1 1 2015-01-02 3158211.0 4 2 1 2015 0 0 0 1 2 2015-01-03 3596487.0 5 3 1 2015 1 1 0 1 3 2015-01-04 3939672.0 6 4 1 2015 1 0 1 1 4 2015-01-05 4101790.0 0 5 1 2015 0 0 0 4
添加月的上中下旬
def period_of_month(day):
    """Return which third of the month a day-of-month value falls in.

    Args:
        day: Day of the month (expected 1-31).

    Returns:
        1 for days 1-10, 2 for days 11-20, and 3 for every other value
        (days 21-31, and anything outside 1-20).
    """
    if day in range(1, 11):
        return 1
    if day in range(11, 21):
        return 2
    return 3
# period_of_month buckets a day-of-month value, so apply it to the 'day'
# column. Applying it to 'week' (dayofweek 0-6) only ever produced 1 and 3.
# NOTE: outputs recorded further down in this notebook were produced before
# this fix and will differ after re-running.
total_df.loc[:, 'period_of_month'] = total_df['day'].apply(period_of_month)
添加上半月和下半月
def period2_of_month(day):
    """Return which half of the month a day-of-month value falls in.

    Args:
        day: Day of the month (expected 1-31).

    Returns:
        1 for days 1-15, and 2 for every other value (days 16-31, and
        anything outside 1-15).
    """
    if day in range(1, 16):
        return 1
    return 2
# period2_of_month buckets a day-of-month value, so apply it to the 'day'
# column. Applying it to 'week' (dayofweek 0-6) put every day except
# dayofweek 0 into the first half.
# NOTE: outputs recorded further down in this notebook were produced before
# this fix and will differ after re-running.
total_df.loc[:, 'period2_of_month'] = total_df['day'].apply(period2_of_month)
total_df.head()
record_date power_consumption week day month year weekend weekend_sat weekend_sun week_of_month period_of_month period2_of_month 0 2015-01-01 2900575.0 3 1 1 2015 0 0 0 1 1 1 1 2015-01-02 3158211.0 4 2 1 2015 0 0 0 1 1 1 2 2015-01-03 3596487.0 5 3 1 2015 1 1 0 1 1 1 3 2015-01-04 3939672.0 6 4 1 2015 1 0 1 1 1 1 4 2015-01-05 4101790.0 0 5 1 2015 0 0 0 4 3 2
填充法定节假日(这里只标记了每年10月1日至7日这一段)
# Holiday indicator: 0 everywhere, then 1 for the first seven days of October
# (presumably the National Day break; no other holiday is marked here).
total_df.loc[:, 'festival'] = 0
early_october = total_df['month'].eq(10) & total_df['day'].lt(8)
total_df.loc[early_october, 'festival'] = 1
total_df.head(15)
record_date power_consumption week day month year weekend weekend_sat weekend_sun week_of_month period_of_month period2_of_month festival 0 2015-01-01 2900575.0 3 1 1 2015 0 0 0 1 1 1 0 1 2015-01-02 3158211.0 4 2 1 2015 0 0 0 1 1 1 0 2 2015-01-03 3596487.0 5 3 1 2015 1 1 0 1 1 1 0 3 2015-01-04 3939672.0 6 4 1 2015 1 0 1 1 1 1 0 4 2015-01-05 4101790.0 0 5 1 2015 0 0 0 4 3 2 0 5 2015-01-06 4149164.0 1 6 1 2015 0 0 0 1 1 1 0 6 2015-01-07 4161928.0 2 7 1 2015 0 0 0 1 1 1 0 7 2015-01-08 4182622.0 3 8 1 2015 0 0 0 1 1 1 0 8 2015-01-09 4153509.0 4 9 1 2015 0 0 0 1 1 1 0 9 2015-01-10 3913704.0 5 10 1 2015 1 1 0 1 1 1 0 10 2015-01-11 3635468.0 6 11 1 2015 1 0 1 1 1 1 0 11 2015-01-12 4011329.0 0 12 1 2015 0 0 0 4 3 2 0 12 2015-01-13 3969860.0 1 13 1 2015 0 0 0 1 1 1 0 13 2015-01-14 4225259.0 2 14 1 2015 0 0 0 1 1 1 0 14 2015-01-15 4106437.0 3 15 1 2015 0 0 0 1 1 1 0
# Confirm every engineered column is present and fully populated.
total_df. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 670 entries, 0 to 30
Data columns (total 13 columns):
record_date 670 non-null datetime64[ns]
power_consumption 670 non-null float64
week 670 non-null int64
day 670 non-null int64
month 670 non-null int64
year 670 non-null int64
weekend 670 non-null int64
weekend_sat 670 non-null int64
weekend_sun 670 non-null int64
week_of_month 670 non-null int64
period_of_month 670 non-null int64
period2_of_month 670 non-null int64
festival 670 non-null int64
dtypes: datetime64[ns](1), float64(1), int64(11)
memory usage: 73.3 KB
# List the column names to pick which ones to one-hot encode.
total_df. columns
Index(['record_date', 'power_consumption', 'week', 'day', 'month', 'year',
'weekend', 'weekend_sat', 'weekend_sun', 'week_of_month',
'period_of_month', 'period2_of_month', 'festival'],
dtype='object')
# Categorical columns to one-hot encode, grouped by kind:
# calendar parts, weekend flags, then within-month buckets.
# 'festival' is left out and stays a single 0/1 column.
var_to_encoding = (
    ['week', 'day', 'month', 'year']
    + ['weekend', 'weekend_sat', 'weekend_sun']
    + ['week_of_month', 'period_of_month', 'period2_of_month']
)
var_to_encoding
['week',
'day',
'month',
'year',
'weekend',
'weekend_sat',
'weekend_sun',
'week_of_month',
'period_of_month',
'period2_of_month']
# One-hot encode the listed columns; each becomes a set of
# "<column>_<value>" indicator columns, the other columns pass through.
dummy_df = pd.get_dummies(data=total_df, columns=var_to_encoding)
dummy_df.head()
record_date power_consumption festival week_0 week_1 week_2 week_3 week_4 week_5 week_6 ... weekend_sat_0 weekend_sat_1 weekend_sun_0 weekend_sun_1 week_of_month_1 week_of_month_4 period_of_month_1 period_of_month_3 period2_of_month_1 period2_of_month_2 0 2015-01-01 2900575.0 0 0 0 0 1 0 0 0 ... 1 0 1 0 1 0 1 0 1 0 1 2015-01-02 3158211.0 0 0 0 0 0 1 0 0 ... 1 0 1 0 1 0 1 0 1 0 2 2015-01-03 3596487.0 0 0 0 0 0 0 1 0 ... 0 1 1 0 1 0 1 0 1 0 3 2015-01-04 3939672.0 0 0 0 0 0 0 0 1 ... 1 0 0 1 1 0 1 0 1 0 4 2015-01-05 4101790.0 0 1 0 0 0 0 0 0 ... 1 0 1 0 0 1 0 1 0 1
5 rows × 67 columns
# Inspect the generated indicator column names.
dummy_df. columns
Index(['record_date', 'power_consumption', 'festival', 'week_0', 'week_1',
'week_2', 'week_3', 'week_4', 'week_5', 'week_6', 'day_1', 'day_2',
'day_3', 'day_4', 'day_5', 'day_6', 'day_7', 'day_8', 'day_9', 'day_10',
'day_11', 'day_12', 'day_13', 'day_14', 'day_15', 'day_16', 'day_17',
'day_18', 'day_19', 'day_20', 'day_21', 'day_22', 'day_23', 'day_24',
'day_25', 'day_26', 'day_27', 'day_28', 'day_29', 'day_30', 'day_31',
'month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6',
'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12',
'year_2015', 'year_2016', 'weekend_0', 'weekend_1', 'weekend_sat_0',
'weekend_sat_1', 'weekend_sun_0', 'weekend_sun_1', 'week_of_month_1',
'week_of_month_4', 'period_of_month_1', 'period_of_month_3',
'period2_of_month_1', 'period2_of_month_2'],
dtype='object')
# Split by date: rows before 2016-10-01 are training data (features and
# target), rows from 2016-10-01 onward are the period to predict.
is_train = dummy_df['record_date'] < '2016-10-01'
is_test = dummy_df['record_date'] >= '2016-10-01'
train_X = dummy_df[is_train]
train_y = train_X['power_consumption']
test_X = dummy_df[is_test]
train_X.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 639 entries, 0 to 638
Data columns (total 67 columns):
record_date 639 non-null datetime64[ns]
power_consumption 639 non-null float64
festival 639 non-null int64
week_0 639 non-null uint8
week_1 639 non-null uint8
week_2 639 non-null uint8
week_3 639 non-null uint8
week_4 639 non-null uint8
week_5 639 non-null uint8
week_6 639 non-null uint8
day_1 639 non-null uint8
day_2 639 non-null uint8
day_3 639 non-null uint8
day_4 639 non-null uint8
day_5 639 non-null uint8
day_6 639 non-null uint8
day_7 639 non-null uint8
day_8 639 non-null uint8
day_9 639 non-null uint8
day_10 639 non-null uint8
day_11 639 non-null uint8
day_12 639 non-null uint8
day_13 639 non-null uint8
day_14 639 non-null uint8
day_15 639 non-null uint8
day_16 639 non-null uint8
day_17 639 non-null uint8
day_18 639 non-null uint8
day_19 639 non-null uint8
day_20 639 non-null uint8
day_21 639 non-null uint8
day_22 639 non-null uint8
day_23 639 non-null uint8
day_24 639 non-null uint8
day_25 639 non-null uint8
day_26 639 non-null uint8
day_27 639 non-null uint8
day_28 639 non-null uint8
day_29 639 non-null uint8
day_30 639 non-null uint8
day_31 639 non-null uint8
month_1 639 non-null uint8
month_2 639 non-null uint8
month_3 639 non-null uint8
month_4 639 non-null uint8
month_5 639 non-null uint8
month_6 639 non-null uint8
month_7 639 non-null uint8
month_8 639 non-null uint8
month_9 639 non-null uint8
month_10 639 non-null uint8
month_11 639 non-null uint8
month_12 639 non-null uint8
year_2015 639 non-null uint8
year_2016 639 non-null uint8
weekend_0 639 non-null uint8
weekend_1 639 non-null uint8
weekend_sat_0 639 non-null uint8
weekend_sat_1 639 non-null uint8
weekend_sun_0 639 non-null uint8
weekend_sun_1 639 non-null uint8
week_of_month_1 639 non-null uint8
week_of_month_4 639 non-null uint8
period_of_month_1 639 non-null uint8
period_of_month_3 639 non-null uint8
period2_of_month_1 639 non-null uint8
period2_of_month_2 639 non-null uint8
dtypes: datetime64[ns](1), float64(1), int64(1), uint8(64)
memory usage: 59.9 KB
# Remove the date and the target from both feature matrices so only the
# engineered indicator columns are fed to the model.
drop_columns = ['record_date', 'power_consumption']
train_X = train_X.drop(columns=drop_columns)
test_X = test_X.drop(columns=drop_columns)
train_X.head()
festival week_0 week_1 week_2 week_3 week_4 week_5 week_6 day_1 day_2 ... weekend_sat_0 weekend_sat_1 weekend_sun_0 weekend_sun_1 week_of_month_1 week_of_month_4 period_of_month_1 period_of_month_3 period2_of_month_1 period2_of_month_2 0 0 0 0 0 1 0 0 0 1 0 ... 1 0 1 0 1 0 1 0 1 0 1 0 0 0 0 0 1 0 0 0 1 ... 1 0 1 0 1 0 1 0 1 0 2 0 0 0 0 0 0 1 0 0 0 ... 0 1 1 0 1 0 1 0 1 0 3 0 0 0 0 0 0 0 1 0 0 ... 1 0 0 1 1 0 1 0 1 0 4 0 1 0 0 0 0 0 0 0 0 ... 1 0 1 0 0 1 0 1 0 1
5 rows × 65 columns
建立线性模型
from sklearn. linear_model import RidgeCV
# Ridge regression; the regularisation strength is picked from the
# candidate alphas by 5-fold cross-validation.
candidate_alphas = [0.2, 0.5, 0.8]
linear_reg = RidgeCV(alphas=candidate_alphas, cv=5)
linear_reg.fit(X=train_X, y=train_y)
RidgeCV(alphas=array([0.2, 0.5, 0.8]), cv=5, fit_intercept=True,
gcv_mode=None, normalize=False, scoring=None, store_cv_values=False)
评估模型,这里返回的是R^2分数(注意:这是在训练集上计算的,并非留出集上的评估)
# R^2 computed on the same rows the model was fitted on, so this is a
# training-fit score, not a held-out estimate.
linear_reg. score( train_X, train_y)
0.537404260499189
# Predict the daily totals for the October test rows. The name "prdictions"
# (sic) is kept because later cells reference it.
prdictions = linear_reg.predict(X=test_X)
预测10月份电力
# Display the predicted values, one per test date.
prdictions
array([3213269.92309517, 3123678.42816242, 3488047.72956573,
3549293.25537305, 3483007.25110293, 3526193.91861047,
3553449.2940896 , 3822642.14908673, 3647254.75965857,
3920055.32252978, 3963349.38044719, 3993343.10094771,
3971175.731758 , 3963793.56136557, 3774211.69037114,
3657509.20919986, 3949241.60693345, 4011946.71989673,
4016839.22480092, 3917574.35561122, 3946839.7769619 ,
3891639.02982068, 3766109.02571362, 4043628.80876831,
4073053.0043004 , 4071511.5183789 , 4071951.35102406,
3950797.94209951, 3954010.07971396, 3730478.65367573,
3908992.01170212])
# test_df still holds the zero placeholder target at this point.
test_df. head( )
record_date power_consumption 0 2016-10-01 0 1 2016-10-02 0 2 2016-10-03 0 3 2016-10-04 0 4 2016-10-05 0
# Replace the integer zero placeholder with the float predictions. Assign the
# column directly: writing floats through `.loc[:, col] = ...` is an in-place
# set into an int64 column, which recent pandas flags as an incompatible-dtype
# assignment.
test_df['power_consumption'] = prdictions
test_df.head()
record_date power_consumption 0 2016-10-01 3.213270e+06 1 2016-10-02 3.123678e+06 2 2016-10-03 3.488048e+06 3 2016-10-04 3.549293e+06 4 2016-10-05 3.483007e+06