0 2022/4/25
1 泰迪杯数据挖掘
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib. pyplot as plt
from adtk. detector import GeneralizedESDTestAD
from matplotlib. font_manager import FontProperties
from statsmodels. graphics. tsaplots import plot_acf, plot_pacf
import data_handle
def data_box ( ) :
dataset = pd. read_csv( './全部数据/附件1-区域15分钟负荷数据.csv' )
box_1, box_2, box_3, box_4 = dataset[ '总有功功率(kw)' ] [ 0 : 35040 ] , dataset[ '总有功功率(kw)' ] [ 35040 : 70016 ] , dataset[ '总有功功率(kw)' ] [
70016 : 105131 ] , \
dataset[ '总有功功率(kw)' ] [ 105131 : 128156 ]
plt. figure( figsize= ( 10 , 5 ) )
plt. rcParams[ 'font.sans-serif' ] = [ 'SimHei' ]
plt. rcParams[ 'axes.unicode_minus' ] = False
plt. title( '箱线图' , fontsize= 20 )
labels = '2018' , '2019' , '2020' , '2021'
plt. boxplot( [ box_1, box_2, box_3, box_4] , labels= labels)
plt. show( )
def data_time ( ) :
dataset = pd. read_csv( './全部数据/附件1-区域15分钟负荷数据.csv' )
plt. rcParams[ 'font.sans-serif' ] = [ 'SimHei' ]
plt. rcParams[ 'axes.unicode_minus' ] = False
plt. plot( dataset[ '总有功功率(kw)' ] , label= 'xx' )
plt. ylabel( '用电负荷/KW' , fontsize= 18 )
plt. xlabel( '时间' , fontsize= 18 )
plt. show( )
def data_bar ( ) :
dataset = pd. read_csv( './全部数据/附件2-行业日负荷数据.csv' )
dataset[ '数据时间' ] = pd. to_datetime( dataset[ '数据时间' ] , format = "%Y-%m-%d %H:%M:%S" )
plt. rcParams[ 'font.sans-serif' ] = [ 'SimHei' ]
plt. rcParams[ 'axes.unicode_minus' ] = False
x = np. array( [ '大工业用电' , '非普工业' , '普通工业' , '商业' ] )
data1 = dataset[ '数据时间' ] . loc[ dataset[ '行业类型' ] == "大工业用电" ]
data2 = dataset[ '数据时间' ] . loc[ dataset[ '行业类型' ] == "非普工业" ]
data3 = dataset[ '数据时间' ] . loc[ dataset[ '行业类型' ] == "普通工业" ]
data4 = dataset[ '数据时间' ] . loc[ dataset[ '行业类型' ] == "商业" ]
def hot_data ( ) :
dataset = pd. read_csv( './全部数据/附件1-区域15分钟负荷数据.csv' )
dataset[ '数据时间' ] = pd. to_datetime( dataset[ '数据时间' ] , format = "%Y-%m-%d %H:%M:%S" )
dataset = dataset[ 0 : 35040 ]
dataset. set_index( '数据时间' , inplace= True )
data = dataset. resample( 'd' ) . sum ( )
k = 0
m = 0
a = np. ones( ( 12 , 30 ) )
for i in range ( 360 ) :
if k < 30 :
a[ m] [ k] = data[ '总有功功率(kw)' ] [ i]
k = k + 1
else :
k = 0
m = m + 1
plt. rcParams[ 'font.sans-serif' ] = [ 'SimHei' ]
plt. rcParams[ 'axes.unicode_minus' ] = False
fig, ax = plt. subplots( figsize= ( 9 , 9 ) )
sns. heatmap( a)
labels = [ 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 11 , 12 ]
y = [ 0.5 , 1.5 , 2.5 , 3.5 , 4.5 , 5.5 , 6.5 , 7.5 , 8.5 , 9.5 , 10.5 , 11.5 ]
plt. yticks( y, labels= labels)
x = [ ]
x_labels = [ ]
for i in range ( 30 ) :
x. append( i + 0.5 )
x_labels. append( i + 1 )
plt. xticks( x, labels= x_labels)
ax. set_title( '2018年每日负荷热力图' , fontsize= 18 )
ax. set_ylabel( 'T/month' , fontsize= 18 )
ax. set_xlabel( 'T/day' , fontsize= 18 )
plt. show( )
print ( a)
if __name__ == '__main__' :
hot_data( )
import numpy as np
import pandas as pd
import matplotlib. pyplot as plt
from sklearn. preprocessing import MinMaxScaler
from sklearn. metrics import r2_score, mean_absolute_error, accuracy_score, mean_squared_error
import tensorflow as tf
from tensorflow. keras import Sequential, layers, utils
import warnings
def create_new_dataset ( dataset, seq_len= 12 ) :
'''
基于原始数据集构造新的序列特征数据集
Params:
dataset : 原始数据集
seq_len : 序列长度(时间跨度)
Returns:
X, y
'''
X = [ ]
y = [ ]
scaler = MinMaxScaler( )
scaler. fit( dataset)
dataset = scaler. transform( dataset)
val_data_len = len ( dataset) // 10
validata = dataset[ len ( dataset) - val_data_len - seq_len- 1 : ]
traindata = dataset[ : len ( dataset) - val_data_len]
start = 0
end = traindata. shape[ 0 ] - 2 * seq_len
for i in range ( start, end) :
sample = traindata[ i: i + seq_len]
label = traindata[ i + seq_len: i+ 2 * seq_len]
X. append( sample)
y. append( label)
return np. array( X) , np. reshape( np. array( y) , ( len ( y) , SEQ_LEN) ) , np. array( validata)
def split_dataset ( X, y, train_ratio= 0.8 ) :
'''基于X和y,切分为train和test
Params:
X : 特征数据集
y : 标签数据集
train_ratio : 训练集占X的比例
Returns:
X_train, X_test, y_train, y_test
'''
X_len = len ( X)
train_data_len = int ( X_len * train_ratio)
X_train = X[ : train_data_len]
y_train = y[ : train_data_len]
X_test = X[ train_data_len: ]
y_test = y[ train_data_len: ]
return X_train, X_test, y_train, y_test
def create_batch_data ( X, y, batch_size= 32 , data_type= 1 ) :
'''基于训练集和测试集,创建批数据
Params:
X : 特征数据集
y : 标签数据集
batch_size : batch的大小,即一个数据块里面有几个样本
data_type : 数据集类型(测试集表示1,训练集表示2)
Returns:
train_batch_data 或 test_batch_data
'''
if data_type == 1 :
dataset = tf. data. Dataset. from_tensor_slices( ( tf. constant( X) , tf. constant( y) ) )
test_batch_data = dataset. batch( batch_size)
return test_batch_data
else :
dataset = tf. data. Dataset. from_tensor_slices( ( tf. constant( X) , tf. constant( y) ) )
train_batch_data = dataset. cache( ) . shuffle( 1000 ) . batch( batch_size)
return train_batch_data
def build_model ( ) :
model = Sequential( [
layers. LSTM( 32 , input_shape= ( SEQ_LEN, 1 ) , return_sequences= True ) ,
layers. Dropout( 0.4 ) ,
layers. LSTM( units= 32 , return_sequences= False ) ,
layers. Dense( SEQ_LEN)
] )
model. compile ( optimizer= 'adam' , loss= "mse" )
return model
def train_model ( model, train_batch_dataset, test_batch_dataset) :
file_path = './best_checkpoint.hdf5'
history = model. fit( train_batch_dataset,
epochs= 10 ,
validation_data= test_batch_dataset
)
y_pred = model. predict( X_test)
mse = mean_squared_error( y_pred, y_test)
mae = mean_absolute_error( y_test, y_pred)
r2 = r2_score( y_test[ : , 0 ] , y_pred[ : , 0 ] )
print ( "mes" , mse)
print ( "MAE" , mae)
print ( "r2_score" , r2)
plt. figure( figsize= ( 16 , 8 ) )
plt. plot( y_test[ 0 , : ] , label= 'Ture label' )
plt. plot( y_pred[ 0 , : ] , label= 'Pred label' )
plt. title( "True vs Pred" )
plt. legend( loc= 'best' )
plt. show( )
return
def predict_next ( model, sample, epoch= 36 ) :
temp1 = model. predict( sample. reshape( 1 , SEQ_LEN, 1 ) )
return temp1. reshape( SEQ_LEN, 1 )
if __name__ == '__main__' :
warnings. filterwarnings( 'ignore' )
dataset = pd. read_csv( './全部数据/附件1-区域15分钟负荷数据.csv' )
dataset[ '数据时间' ] = pd. to_datetime( dataset[ '数据时间' ] , format = "%Y-%m-%d %H:%M:%S" )
dataset. index = dataset[ '数据时间' ]
dataset. drop( columns= [ '数据时间' ] , axis= 1 , inplace= True )
dataset. columns = [ 'DOM_kW' ]
SEQ_LEN = 36
X, y, valid_data = create_new_dataset( dataset. values, seq_len= SEQ_LEN)
X_train, X_test, y_train, y_test = split_dataset( X, y)
train_batch_dataset = create_batch_data( X_train, y_train, data_type= 0 )
test_batch_dataset = create_batch_data( X_test, y_test, data_type= 1 )
train_model( build_model( ) , train_batch_dataset, test_batch_dataset)
model = build_model( )
model. load_weights( r'./best_checkpoint.hdf5' )
pred = predict_next( model, sample= X_test[ - 1 ] , epoch= 1 )
plt. figure( figsize= ( 12 , 6 ) )
plt. plot( pred, color= 'red' , label= 'Prediction' )
plt. plot( valid_data[ : 36 ] , color= 'blue' , label= 'Truth' )
plt. xlabel( "Epochs" )
plt. ylabel( 'Value' )
plt. legend( loc= 'best' )
plt. show( )
0 2022/4/26
1 数据挖掘
def pearson_data ( ) :
dataset = pd. read_csv( './全部数据/附件1-区域15分钟负荷数据_handle_weather_holiday.csv' )
index = [ 'month' , 'day' , 'hour' , 'minute' , 'dayofweek' , 'dayofyear' , '最高温度' , '最低温度' , '白天最低风力' , '白天最高风力' ,
'夜晚最低风力' , '夜晚最高风力' , '最好天气' , '最坏天气' , 'holiday' ]
index_labels = [ 'month' , 'day' , 'hour' , 'minute' , 'dofwe' , 'dofye' , 'maxte' , 'minte' , 'minwd' , 'maxwd' ,
'minwn' , 'maxwn' , 'bewe' , 'bawe' , 'holiday' ]
r_data = np. ones( ( 15 , 15 ) )
for i in range ( len ( index) ) :
for j in range ( len ( index) ) :
r = pearsonr( dataset[ index[ i] ] , dataset[ index[ j] ] )
r_data[ i] [ j] = r[ 0 ]
plt. rcParams[ 'font.sans-serif' ] = [ 'SimHei' ]
plt. rcParams[ 'axes.unicode_minus' ] = False
fig, ax = plt. subplots( figsize= ( 15 , 15 ) )
sns. heatmap( r_data, cmap= 'YlGnBu' )
x = [ 0.5 , 1.5 , 2.5 , 3.5 , 4.5 , 5.5 , 6.5 , 7.5 , 8.5 , 9.5 , 10.5 , 11.5 , 12.5 , 13.5 , 14.5 ]
plt. xticks( x, labels= index_labels)
plt. yticks( x, labels= index_labels)
ax. set_title( '特征相关系数' , fontsize= 18 )
plt. show( )
def vmd_data ( ) :
dataset = pd. read_csv( './全部数据/附件1-区域15分钟负荷数据_handle_weather_holiday.csv' , usecols= [ 0 ] )
dataset1 = pd. read_csv( './全部数据/VMDban-1.csv' )
dataset2 = pd. read_csv( './全部数据/VMDban-2.csv' )
dataset3 = pd. read_csv( './全部数据/VMDban-3.csv' )
dataset4 = pd. read_csv( './全部数据/VMDban-4.csv' )
dataset5 = pd. read_csv( './全部数据/VMDban-5.csv' )
dataset6 = pd. read_csv( './全部数据/VMDban-6.csv' )
dataset7 = dataset[ '总有功功率(kw)' ] - dataset1[ 'v1' ] - dataset2[ 'v2' ] - dataset3[ 'v3' ] - dataset4[ 'v4' ] - dataset5[ 'v5' ] - dataset6[ 'v6' ]
plt. rcParams[ 'font.sans-serif' ] = [ 'SimHei' ]
plt. rcParams[ 'axes.unicode_minus' ] = False
plt. subplot( 7 , 1 , 1 )
plt. plot( dataset1[ 0 : 6000 ] , label= 'u1' )
plt. legend( loc= 'upper right' )
plt. ylabel( 'KW' , fontsize= 12 )
plt. subplot( 7 , 1 , 2 )
plt. plot( dataset2[ 0 : 6000 ] , label= 'u2' )
plt. legend( loc= 'upper right' )
plt. ylabel( 'KW' , fontsize= 12 )
plt. subplot( 7 , 1 , 3 )
plt. plot( dataset3[ 0 : 6000 ] , label= 'u3' )
plt. legend( loc= 'upper right' )
plt. ylabel( 'KW' , fontsize= 12 )
plt. subplot( 7 , 1 , 4 )
plt. plot( dataset4[ 0 : 6000 ] , label= 'u4' )
plt. legend( loc= 'upper right' )
plt. ylabel( 'KW' , fontsize= 12 )
plt. subplot( 7 , 1 , 5 )
plt. plot( dataset5[ 0 : 6000 ] , label= 'u5' )
plt. legend( loc= 'upper right' )
plt. ylabel( 'KW' , fontsize= 12 )
plt. subplot( 7 , 1 , 6 )
plt. plot( dataset6[ 0 : 6000 ] , label= 'u6' )
plt. legend( loc= 'upper right' )
plt. ylabel( 'KW' , fontsize= 12 )
plt. subplot( 7 , 1 , 7 )
plt. plot( dataset7[ 0 : 6000 ] , label= '残差' )
plt. legend( loc= 'upper right' )
plt. ylabel( 'KW' , fontsize= 12 )
plt. show( )
def data_step_per ( step) :
dataset = pd. read_csv( './全部数据/附件1-区域15分钟负荷数据_handle_weather_holiday.csv' )
dataset[ '数据时间' ] = pd. to_datetime( dataset[ '数据时间' ] , format = "%Y-%m-%d %H:%M:%S" )
dataset. set_index( '数据时间' , inplace= True )
data = dataset. resample( 'd' ) . sum ( )
data_person = np. ones( ( step, step) )
for i in range ( step) :
dataset1 = data[ 0 + i: 1310 + i]
for j in range ( step) :
dataset2 = data[ 0 + j: 1310 + j]
r = pearsonr( dataset1[ '总有功功率(kw)' ] , dataset2[ '总有功功率(kw)' ] )
data_person[ i] [ j] = r[ 0 ]
plt. rcParams[ 'font.sans-serif' ] = [ 'SimHei' ]
plt. rcParams[ 'axes.unicode_minus' ] = False
fig, ax = plt. subplots( figsize= ( 15 , 15 ) )
sns. heatmap( data_person, cmap= 'YlGnBu' , square= True , annot= True )
plt. show( )
def season_data ( ) :
dataset = pd. read_csv( './全部数据/附件1-区域15分钟负荷数据_handle_weather_holiday.csv' )
decomposition = sm. tsa. seasonal_decompose( dataset[ '总有功功率(kw)' ] . values, model= 'addictive' , period= 96 * 120 )
season = decomposition. seasonal
plt. plot( season)
plt. show( )
0 2022/4/27
1 数据挖掘
def data_step_per ( step) :
dataset = pd. read_csv( './全部数据/附件1-区域15分钟负荷数据_handle_weather_holiday.csv' )
dataset[ '数据时间' ] = pd. to_datetime( dataset[ '数据时间' ] , format = "%Y-%m-%d %H:%M:%S" )
dataset. set_index( '数据时间' , inplace= True )
data = dataset. resample( 'd' ) . sum ( )
data_person = np. ones( ( step, step) )
for i in range ( step) :
dataset1 = data[ 0 + i: 1310 + i]
for j in range ( step) :
dataset2 = data[ 0 + j: 1310 + j]
r = pearsonr( dataset1[ '总有功功率(kw)' ] , dataset2[ '总有功功率(kw)' ] )
data_person[ i] [ j] = r[ 0 ]
plt. rcParams[ 'font.sans-serif' ] = [ 'SimHei' ]
plt. rcParams[ 'axes.unicode_minus' ] = False
fig, ax = plt. subplots( figsize= ( 15 , 15 ) )
sns. heatmap( data_person, cmap= 'YlGnBu' , square= True , annot= True )
plt. show( )
def season_data ( ) :
dataset = pd. read_csv( './全部数据/附件1-区域15分钟负荷数据_handle_weather_holiday.csv' )
decomposition = sm. tsa. seasonal_decompose( dataset[ '总有功功率(kw)' ] . values, model= 'addictive' , period= 96 * 120 )
season = decomposition. seasonal
plt. plot( season)
plt. show( )
def seasonal ( df) :
mean_load = df[ "总有功功率(kw)" ] . mean( )
dataset = df. copy( )
df. drop( range ( 75745 , 75745 + 96 ) , axis= 0 , inplace= True )
res = [ ]
df[ "year" ] = df[ "数据时间" ] . dt. year
year_group = df. groupby( "year" )
group_2018 = year_group. get_group( 2018 )
group_2019 = year_group. get_group( 2019 )
group_2020 = year_group. get_group( 2020 )
group_2021 = year_group. get_group( 2021 )
for i in range ( 35040 ) :
if i <= 23327 :
a = ( group_2018. iloc[ i] [ 0 ] +
group_2019. iloc[ i] [ 0 ] +
group_2020. iloc[ i] [ 0 ] +
group_2021. iloc[ i] [ 0 ] ) / 4
else :
a = ( group_2018. iloc[ i] [ 0 ] + group_2019. iloc[ i] [ 0 ] + group_2020. iloc[ i] [ 0 ] ) / 3
res. append( a / mean_load)
res_2020 = res. copy( )
for i in range ( 96 ) :
res_2020. insert( 0 , 5664 )
res = res + res + res_2020 + res[ 0 : len ( group_2021) ]
dataset[ "总有功功率(kw)" ] /= res
return dataset
def delete_personal ( ) :
df = pd. read_csv( './全部数据/附件1-区域15分钟负荷数据_handle_weather_holiday.csv' )
df[ '数据时间' ] = pd. to_datetime( df[ '数据时间' ] , format = "%Y-%m-%d %H:%M:%S" )
df2 = df. copy( )
mean_load = df[ "总有功功率(kw)" ] . mean( )
df2. loc[ : , '比例' ] = df[ 'month' ]
a = [ ]
for index, row in df2. iterrows( ) :
data = df. loc[ ( df[ 'month' ] == row[ 'month' ] ) & ( df[ 'day' ] == row[ 'day' ] ) & ( df[ 'minute' ] == row[ 'minute' ] ) & (
df[ 'hour' ] == row[ 'hour' ] ) ]
x = data[ '总有功功率(kw)' ] . sum ( ) / data. shape[ 0 ]
a. append( x / mean_load)
df2[ '总有功功率(kw)' ] [ index] = df[ '总有功功率(kw)' ] [ index] / ( x / mean_load)
df2[ '比例' ] [ index] = x / mean_load
df2. to_csv( "./全部数据/附件1-区域15分钟负荷数据_handle_weather_holiday_personal_2.csv" , index= None )
plt. rcParams[ 'font.sans-serif' ] = [ 'SimHei' ]
plt. rcParams[ 'axes.unicode_minus' ] = False
plt. plot( df2[ '总有功功率(kw)' ] , label= 'xx' )
plt. ylabel( '用电负荷/KW' , fontsize= 18 )
plt. xlabel( '时间/年/月' , fontsize= 18 )
labels = [ 0 , 50000 , 100000 , 150000 , 200000 , 250000 , 300000 , 350000 ]
y = [ 0 , 50000 , 100000 , 150000 , 200000 , 250000 , 300000 , 350000 ]
plt. yticks( y, labels= labels)
x_labels = [ '2018/1' , '2018/6' , '2019/1' , '2019/6' , '2020/1' , '2020/6' , '2021/1' ]
x = [ 0 , 20000 , 40000 , 58000 , 74000 , 90000 , 109000 ]
plt. xticks( x, labels= x_labels)
plt. show( )
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
plt. rcParams[ 'font.family' ] = [ 'MicroSoft YaHei' ]
plt. rcParams[ 'axes.unicode_minus' ] = False
df = pd. read_csv( './全部数据/附件2-行业日负荷数据.csv' )
df = df. loc[ df[ '行业类型' ] == '普通工业' ]
x = df[ '数据时间' ]
y = df[ '有功功率最小值(kw)' ]
n = len ( y)
Sk = np. zeros( n)
UFk = np. zeros( n)
s = 0
for i in range ( 1 , n) :
for j in range ( 0 , i) :
if y. iloc[ i] > y. iloc[ j] :
s += 1
Sk[ i] = s
E = ( i+ 1 ) * ( i/ 4 )
Var = ( i+ 1 ) * i* ( 2 * ( i+ 1 ) + 5 ) / 72
UFk[ i] = ( Sk[ i] - E) / np. sqrt( Var)
y2 = np. zeros( n)
Sk2 = np. zeros( n)
UBk = np. zeros( n)
s = 0
y2 = y[ : : - 1 ]
for i in range ( 1 , n) :
for j in range ( 0 , i) :
if y2. iloc[ i] > y2. iloc[ j] :
s += 1
Sk2[ i] = s
E = ( i+ 1 ) * ( i/ 4 )
Var = ( i+ 1 ) * i* ( 2 * ( i+ 1 ) + 5 ) / 72
UBk[ i] = - ( Sk2[ i] - E) / np. sqrt( Var)
UBk2 = UBk[ : : - 1 ]
plt. figure( figsize= ( 18 , 18 ) )
plt. plot( range ( 973 ) , UFk, label= 'UF' )
plt. plot( range ( 973 ) , UBk2, label= 'UB' )
plt. ylabel( 'Mann-Kendall检验值' )
plt. xlabel( '年份 Year' )
x_lim = plt. xlim( )
plt. plot( x_lim, [ - 1.96 , - 1.96 ] , ':' , color= 'black' , label= '5%显著水平' )
plt. plot( x_lim, [ 0 , 0 ] , '--' , color= 'black' )
plt. plot( x_lim, [ 1.96 , 1.96 ] , ':' , color= 'black' )
plt. legend( bbox_to_anchor= ( 0.75 , 0.07 ) , facecolor= 'w' , frameon= False )
plt. text( 0 , - 1.6 , '突变点检验' )
plt. savefig( "./图片/MK检验.png" )
plt. show( )