系统配置
python3.7.9
tensorflow1.14
numpy1.16.5(没找到包,直接pip install numpy==1.16.5了)
清华源tensorflow包(清华大学PyPI镜像)
tensorflow-1.14.0-cp37-cp37m-win_amd64.whl (pip install tensorflow-1.14.0-cp37-cp37m-win_amd64.whl)
tensorflow安装
数据
算法总结
1.数据处理部分
# -*- coding=utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import csv

# --- Load the raw data (aggregate load + single-user load) ---
p = r'D:\LoadPred\train.csv'
with open(p, encoding='utf-8') as f:
    # skiprows=1 drops the CSV header row
    data = np.loadtxt(f, float, delimiter=",", skiprows=1)

p = r'D:\LoadPred\train_single.csv'
with open(p, encoding='utf-8') as f:
    data_csv = np.loadtxt(f, float, delimiter=",", skiprows=1)

print(data)
print(data_csv)

# --- Column extraction (column layout inferred from slice names:
# season, month, week, day, hour, temperature, avg_temp, all_load, per-user loads) ---
dataSeason = data[0:, :1]
dataMonth = data[0:, 1:2]
dataWeek = data[0:, 2:3]
dataDay = data[0:, 3:4]
dataHour = data[0:, 4:5]
dataTemperature = data[0:, 5:6]
dataAvgTemp = data[0:, 6:7]
dataAllLoad = data[0:, 7:8]
dataAllUserLoad = data[0:, 8:]
print(dataAllUserLoad)

# Features = first 7 columns, target = total load
x = data[0:, :7]
y = data[0:, 7:8]
df = pd.DataFrame(x, columns=['season', 'month', 'week', 'wk_day', 'hour', 'temperature', 'avg_temp'])
df['Target'] = pd.DataFrame(y, columns=['all_load'])
df.head()

plt.figure(figsize=(8, 8))
# Heatmap of the pairwise correlation coefficients between features and target
p = sns.heatmap(df.corr(), annot=True, square=True)

# Pairwise scatter plots of the single-user file.
# BUG FIX: the original passed `float` as read_csv's positional `sep`
# argument alongside delimiter="," which raises a TypeError.
p = r'D:\LoadPred\train_single.csv'
with open(p, encoding='utf-8') as f:
    data_csv = pd.read_csv(f, delimiter=",", skiprows=0)
sns.pairplot(data_csv)
2.K-Means聚类
# Load the 300-user consumption file
# (original note: files live under D:\Program Files (x86)\Python37\Scripts)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

data = np.genfromtxt("./LoadPred/train_300usr.csv", delimiter=',')
print(data)
# Drop the header row and the index column: rows are hourly readings,
# columns are the 300 users (original note says 7896 rows).
data_300usr = data[1:, 1:]
print(data_300usr.shape)

# For every user, accumulate consumption into 24 hour-of-day bins:
# rows i, i+24, i+48, ... all correspond to hour-of-day i.
data_300usr_24 = [[0 for i in range(24)] for j in range(300)]
for k in range(300):
    for i in range(24):
        total = 0.0  # renamed from `sum`, which shadowed the builtin
        for j in range(i, data_300usr.shape[0], 24):
            total += data_300usr[j][k]
        data_300usr_24[k][i] = total
data_300usr_24 = np.array(data_300usr_24)
print(data_300usr_24.shape)

# Persist the (300, 24) aggregate, then read it back as a DataFrame
np.savetxt("result1.csv", data_300usr_24, delimiter=',')
data_csv = pd.read_csv("./result1.csv", header=None)
# df = pd.DataFrame(data_csv)
# df.head()
data_csv  # notebook-style display

# Plot every user's 24-hour profile (one curve per user)
fig, ax = plt.subplots(figsize=(12, 6))
plt.plot(data_csv.T)
plt.show()
#聚类代码
from sklearn.cluster import KMeans
class EnergyFingerPrints():
    """K-means clustering of per-user load profiles.

    `data` is expected to be a 2-D array: one row per user,
    one column per time bin (here 24 hour-of-day sums).
    """

    def __init__(self, data):
        # Will contain the centroid (mean profile) of each cluster,
        # filled in by plot().
        self.means = []
        self.data = data

    def elbow_method(self, n_clusters):
        """Performs elbow method for a predefined number
        of clusters.

        Parameters
        ----------
        n_clusters : int
            the number of clusters to perform the elbow method

        Returns
        ---------
        A plot of the elbow method
        """
        fig, ax = plt.subplots(figsize=(8, 4))
        distortions = []
        for i in range(1, n_clusters):
            km = KMeans(n_clusters=i,
                        init='k-means++',  # fast-converging centroid seeding
                        n_init=10,         # number of seeding restarts
                        max_iter=300,      # iterations per run
                        random_state=0)    # reproducible seeding
            km.fit(self.data)
            # inertia_ = sum of distances of samples to their nearest centroid
            distortions.append(km.inertia_)
        plt.plot(range(1, n_clusters), distortions, marker='o', lw=1)
        plt.xlabel('聚类数量')
        plt.ylabel('至中心点距离之和')
        plt.show()

    def get_cluster_counts(self):
        # NOTE: requires fit() to have been called first (uses self.predictions).
        return pd.Series(self.predictions).value_counts()

    def labels(self, n_clusters):
        # NOTE(review): this refits a *fresh* KMeans with different parameters
        # than fit(), so these labels are not guaranteed to match
        # self.predictions — confirm this is intended.
        self.n_clusters = n_clusters
        return KMeans(self.n_clusters, init='k-means++', n_init=10,
                      max_iter=300, random_state=0).fit(self.data).labels_

    def fit(self, n_clusters):
        """Performs K-means clustering for the load-profiles

        Parameters
        ----------
        n_clusters : int

        Returns
        --------
        count_dict : dict
            The number of load-profiles in each cluster
        """
        self.n_clusters = n_clusters
        self.kmeans = KMeans(self.n_clusters)
        self.predictions = self.kmeans.fit_predict(self.data)

    def plot(self):
        """Plots all loads in each cluster, plus each cluster's mean profile."""
        self.cluster_names = [str(x) for x in range(self.n_clusters)]
        fig, ax = plt.subplots(figsize=(12, 16))
        for i in range(0, self.n_clusters):
            all_data = []
            # One subplot row per cluster (was hard-coded to 4 rows).
            plt.subplot(self.n_clusters, 1, i + 1)
            for x, y in zip(self.data, self.predictions):
                if y == i:
                    all_data.append(x)
                    # Draw every member curve; low alpha so overlaps show
                    # density (the pasted original only plotted the last one).
                    plt.plot(x, alpha=0.06, color="blue", lw=2)
            # plt.ylim(0,4)
            plt.xlim(0, 96)  # NOTE(review): data has 24 bins — confirm this limit
            plt.title('Cluster%s' % (i + 1))
            plt.ylabel('用电量/kW')
            all_data_array = np.array(all_data)
            mean = all_data_array.mean(axis=0)
            self.means.append(mean)
            plt.plot(mean, color="black", linewidth=4)
        plt.show()

    def plot_energy_fingerprints(self):
        """Plots the mean of each cluster in a single plot."""
        fig, ax = plt.subplots(figsize=(8, 5))
        for i, item in enumerate(self.means):
            plt.plot(item, label="cluster %s" % (str(i + 1)))
        plt.xlim(0, 96)
        plt.ylabel('用电量/kW')
        plt.xticks([0, 20, 40, 60, 80],
                   ['00:00', '05:00', '10:00', '15:00', '20:00'], rotation=60)
        plt.grid()
        plt.legend()
        plt.show()
# Cluster design: run the elbow method to pick the number of clusters
load_data=np.array(data_csv)
energy_clusters = EnergyFingerPrints(load_data)
energy_clusters.elbow_method(n_clusters=13)
分为4类
# Cluster the 300 users into 4 groups (chosen from the elbow plot)
energy_clusters.fit(n_clusters=4)
# Size of each cluster (notebook-style display expression)
energy_clusters.get_cluster_counts()
# Cluster label of each of the 300 users
group = energy_clusters.labels(n_clusters=4)
print(group)

num = data_csv.index
print(num)
# Build a user_id -> cluster lookup table
cls = pd.DataFrame(list(num))
cls['cluster'] = list(group)
cls.columns = ['user_id', 'cluster']
# Sorting by cluster groups each cluster's user ids together
cls = cls.sort_values(by='cluster', ascending=True)
# BUG FIX: reset_index returns a new DataFrame; the original discarded it
cls = cls.reset_index(drop=True)
# User ids belonging to clusters 1..4 (notebook-style display expressions)
np.array(cls.loc[cls.cluster == 0].user_id)
np.array(cls.loc[cls.cluster == 1].user_id)
np.array(cls.loc[cls.cluster == 2].user_id)
np.array(cls.loc[cls.cluster == 3].user_id)
3.回归决策树、分类决策树
# Temperature forecasting with a regression decision tree
from sklearn import tree

model = tree.DecisionTreeRegressor()
model.fit(x_train_wd, y_train_temperature)
answer_temperature = model.predict(x_test)
# predict() already returns a numeric ndarray, so the original chain of
# pd.to_numeric + numpy.mat (via `from numpy import *`) was redundant.
# reshape(-1, 1) makes a column vector for any test-set size
# (the original hard-coded 864 rows).
answer_temperature = np.asarray(answer_temperature, dtype=float).reshape(-1, 1)
print(answer_temperature)
np.savetxt("answer_temperature_bp.csv", answer_temperature, delimiter=',')
# Classification decision tree: prediction for cluster-2 users
import numpy as np
from sklearn import tree
from sklearn.model_selection import train_test_split
import pydotplus

# Train with entropy as the split criterion.
# NOTE(review): max_depth=40 with splitter='random' is very permissive;
# the original comments already observed overfitting.
clf = tree.DecisionTreeClassifier(criterion='entropy', splitter='random',
                                  max_depth=40, min_samples_split=2)
print(clf)
clf.fit(x_train_1, y_train_1)
# feature_importances_: larger value => feature contributes more to the splits
print(clf.feature_importances_)
# Predict on the test data (low accuracy observed => overfitting)
y_pred_1 = clf.predict(x_test_2)  # labels come back as strings
import pandas as pd
# Convert string labels to numbers, then reshape to a column vector.
# (The original used deprecated numpy.mat via `from numpy import *`
# and hard-coded 864 rows; reshape(-1, 1) handles any test-set size.)
y_pred_1 = np.asarray(pd.to_numeric(y_pred_1)).reshape(-1, 1)
print(y_pred_1)
np.savetxt("y_pred_1.csv", y_pred_1, delimiter=',')
4.BP NN
# BP (feed-forward) neural network: prediction for cluster 1, 3 and 4 users
import tensorflow as tf

model = tf.keras.Sequential()
# Hidden layer: 20 units over the 6 input features, ReLU activation
model.add(tf.keras.layers.Dense(20, input_shape=(6,), activation="relu"))
# Output layer: single value (load), ReLU keeps predictions non-negative
model.add(tf.keras.layers.Dense(1, activation="relu"))
model.compile(optimizer='adam',   # adam optimizer
              loss='mse',         # mean squared error loss
              metrics=['acc'])    # NOTE(review): 'acc' is meaningless for
                                  # regression — consider 'mae' instead
# batch_size=8: weights updated after every 8 samples; epochs = passes over data
history = model.fit(x_train_0, y_train_0, batch_size=8, epochs=50)  # fixed typo: histroy

# Predict and persist
y_pred_0 = model.predict(x_test_2)
print(y_pred_0)
np.savetxt("y_pred_0.csv", y_pred_0, delimiter=',')

# Plot the predictions.
# BUG FIX: the original plotted the undefined name `y_pred` (NameError).
import matplotlib.pyplot as plt
plt.plot(y_pred_0)
plt.show()