(A download link for the complete project is provided at the end of this article.)
Preparation
Everything in this post builds on the environment and the dataset set up in my previous two posts:
NILM (Non-Intrusive Load Monitoring) Study Notes — Preparation (1): Setting up the NILMTK Toolkit environment
NILM (Non-Intrusive Load Monitoring) Study Notes — Preparation (2): Downloading and converting the REDD dataset
Experimental environment
1. Language: Python 3.6
2. IDE: PyCharm Community Edition 2019.3.1
3. Main tools: nilmtk and nilm_metadata, plus the libraries they depend on
4. Two disaggregation methods are used and compared in the experiment:
- Combinatorial Optimisation (CO)
- Factorial Hidden Markov Model (FHMM)
5. Test data: the REDD dataset
The Combinatorial Optimisation algorithm
The implementation is borrowed from the post "NILM — Combinatorial Optimisation (with code)";
the original was written as a Jupyter notebook, and I reworked it into code that runs directly in PyCharm.
Below is my slightly modified version —
the scripts in my project directory.
CombinatorialOptimisation.py
from __future__ import print_function, division
from warnings import warn
import pandas as pd
import numpy as np
import pickle
import copy
# This code builds on the NILMTK package, so nilmtk has to be installed first
from nilmtk.utils import find_nearest
from nilmtk.feature_detectors import cluster
from nilmtk.disaggregate import Disaggregator
from nilmtk.datastore import HDFDataStore
# Fix the seed for repeatability of experiments
SEED = 42
np.random.seed(SEED)
class CombinatorialOptimisation(Disaggregator):
"""1 dimensional combinatorial optimisation NILM algorithm.
Attributes
----------
model : list of dicts
Each dict has these keys:
states : list of ints (the power (Watts) used in different states)
training_metadata : ElecMeter or MeterGroup object used for training
this set of states. We need this information because we
need the appliance type (and perhaps some other metadata)
for each model.
state_combinations : 2D array
Each column is an appliance.
Each row is a possible combination of power demand values e.g.
[[0, 0, 0, 0],
[0, 0, 0, 100],
[0, 0, 50, 0],
[0, 0, 50, 100], ...]
MIN_CHUNK_LENGTH : int
"""
    def __init__(self):
        self.model = []  # list holding the trained model (state centroids) of each individual appliance
        self.state_combinations = None  # cartesian product of the learnt appliance states
        self.MIN_CHUNK_LENGTH = 100  # chunks shorter than this many samples are skipped
        self.MODEL_NAME = 'CO'  # short name of the algorithm
    def train(self, metergroup, num_states_dict=None, **load_kwargs):  # training entry point; parameters are described in the docstring below
"""Train using 1D CO. Places the learnt model in the `model` attribute.
Parameters
----------
        metergroup : a nilmtk.MeterGroup object (one of nilmtk's container types, alongside DataSet, ElecMeter, etc.)
        num_states_dict : dict, optional
            Desired total number of states for each meter.
**load_kwargs : keyword arguments passed to `meter.power_series()`
Notes
-----
* only uses first chunk for each meter (TODO: handle all chunks).
"""
if num_states_dict is None:
num_states_dict = {}
if self.model:
raise RuntimeError(
"This implementation of Combinatorial Optimisation"
" does not support multiple calls to `train`.")
        num_meters = len(metergroup.meters)  # number of sub-meters, e.g. 5 appliances -> num_meters == 5
        if num_meters > 12:  # with more than 12 meters allow only 2 cluster centres per appliance, otherwise 3 (clustering is explained below)
max_num_clusters = 2
else:
max_num_clusters = 3
        # metergroup bundles the power data of several appliances; submeters() exposes the
        # individual appliance meters, so this loop visits every appliance in the group
        for i, meter in enumerate(metergroup.submeters().meters):
print("Training model for submeter '{}'".format(meter))
            power_series = meter.power_series(**load_kwargs)  # generator yielding pandas Series; kwargs such as sample_period=6 request a 6 s sampling period
            chunk = next(power_series)  # first chunk, as a pandas Series
            num_total_states = num_states_dict.get(meter)  # total number of states requested for this appliance (if any)
            if num_total_states is not None:
                num_on_states = num_total_states - 1  # number of 'on' states (total minus the 'off' state)
            else:
                num_on_states = None
            self.train_on_chunk(chunk, meter, max_num_clusters, num_on_states)  # train on this chunk
# Check to see if there are any more chunks.
# TODO handle multiple chunks per appliance.
try:
next(power_series)
except StopIteration:
pass
else:
warn("The current implementation of CombinatorialOptimisation"
" can only handle a single chunk. But there are multiple"
" chunks available. So have only trained on the"
" first chunk!")
print("Done training!")
    def train_on_chunk(self, chunk, meter, max_num_clusters, num_on_states):  # core of the training step
        # Check if we've already trained on this meter
        meters_in_model = [d['training_metadata'] for d in self.model]  # meters already trained on; 'training_metadata' carries the appliance metadata (wiring, model, ...)
if meter in meters_in_model:
raise RuntimeError(
"Meter {} is already in model!"
" Can't train twice on the same meter!"
.format(meter))
        states = cluster(chunk, max_num_clusters, num_on_states)  # the heart of the training; the clustering code is walked through below
self.model.append({
'states': states,
'training_metadata': meter})
def _set_state_combinations_if_necessary(self):
"""Get centroids"""
# If we import sklearn at the top of the file then auto doc fails.
if (self.state_combinations is None or
self.state_combinations.shape[1] != len(self.model)):
from sklearn.utils.extmath import cartesian
centroids = [model['states'] for model in self.model]
self.state_combinations = cartesian(centroids)
    def disaggregate(self, mains, output_datastore, **load_kwargs):  # disaggregation: called after training
'''Disaggregate mains according to the model learnt previously.
Parameters
----------
        mains : nilmtk.ElecMeter or nilmtk.MeterGroup
            The aggregate (whole-house) measurements to be disaggregated.
        output_datastore : instance of nilmtk.DataStore subclass
            For storing power predictions from disaggregation algorithm.
        sample_period : number, optional
            The desired sample period in seconds. Set to 60 by default.
        sections : TimeFrameGroup, optional
            Restricts disaggregation to these sections of the data.
Set to mains.good_sections() by default.
**load_kwargs : key word arguments
Passed to `mains.power_series(**kwargs)`
'''
load_kwargs = self._pre_disaggregation_checks(load_kwargs)
load_kwargs.setdefault('sample_period', 60)
load_kwargs.setdefault('sections', mains.good_sections())
        timeframes = []  # time ranges covered by the processed chunks
        building_path = '/building{}'.format(mains.building())  # building number, e.g. REDD contains 6 houses
mains_data_location = building_path + '/elec/meter1'
data_is_available = False
for chunk in mains.power_series(**load_kwargs):
# Check that chunk is sensible size
if len(chunk) < self.MIN_CHUNK_LENGTH:
continue
# Record metadata
timeframes.append(chunk.timeframe)
measurement = chunk.name
appliance_powers = self.disaggregate_chunk(chunk)
for i, model in enumerate(self.model):
appliance_power = appliance_powers.iloc[:, i]
if len(appliance_power) == 0:
continue
data_is_available = True
cols = pd.MultiIndex.from_tuples([chunk.name])
meter_instance = model['training_metadata'].instance()
df = pd.DataFrame(
appliance_power.values, index=appliance_power.index,
columns=cols)
key = '{}/elec/meter{}'.format(building_path, meter_instance)
output_datastore.append(key, df)
# Copy mains data to disag output
mains_df = pd.DataFrame(chunk, columns=cols)
output_datastore.append(key=mains_data_location, value=mains_df)
if data_is_available:
self._save_metadata_for_disaggregation(
output_datastore=output_datastore,
sample_period=load_kwargs['sample_period'],
measurement=measurement,
timeframes=timeframes,
building=mains.building(),
meters=[d['training_metadata'] for d in self.model]
)
def disaggregate_chunk(self, mains):
"""In-memory disaggregation.
Parameters
----------
mains : pd.Series
Returns
-------
appliance_powers : pd.DataFrame where each column represents a
disaggregated appliance. Column names are the integer index
into `self.model` for the appliance in question.
"""
if not self.model:
raise RuntimeError(
"The model needs to be instantiated before"
" calling `disaggregate`. The model"
" can be instantiated by running `train`.")
if len(mains) < self.MIN_CHUNK_LENGTH:
raise RuntimeError("Chunk is too short.")
# sklearn produces lots of DepreciationWarnings with PyTables
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Because CombinatorialOptimisation could have been trained using
# either train() or train_on_chunk(), we must
# set state_combinations here.
self._set_state_combinations_if_necessary()
"""
# Add vampire power to the model
if vampire_power is None:
vampire_power = get_vampire_power(mains)
if vampire_power > 0:
print("Including vampire_power = {} watts to model..."
.format(vampire_power))
n_rows = self.state_combinations.shape[0]
vampire_power_array = np.zeros((n_rows, 1)) + vampire_power
state_combinations = np.hstack(
(self.state_combinations, vampire_power_array))
else:
state_combinations = self.state_combinations
"""
state_combinations = self.state_combinations
summed_power_of_each_combination = np.sum(state_combinations, axis=1)
# summed_power_of_each_combination is now an array where each
# value is the total power demand for each combination of states.
# Start disaggregation
indices_of_state_combinations, residual_power = find_nearest(
summed_power_of_each_combination, mains.values)
appliance_powers_dict = {}
for i, model in enumerate(self.model):
print("Estimating power demand for '{}'"
.format(model['training_metadata']))
predicted_power = state_combinations[
indices_of_state_combinations, i].flatten()
column = pd.Series(predicted_power, index=mains.index, name=i)
appliance_powers_dict[self.model[i]['training_metadata']] = column
appliance_powers = pd.DataFrame(appliance_powers_dict, dtype='float32')
return appliance_powers
def import_model(self, filename):
with open(filename, 'rb') as in_file:
imported_model = pickle.load(in_file)
self.model = imported_model.model
# Recreate datastores from filenames
for pair in self.model:
store_filename = pair['training_metadata'].store
pair['training_metadata'].store = HDFDataStore(store_filename)
self.state_combinations = imported_model.state_combinations
self.MIN_CHUNK_LENGTH = imported_model.MIN_CHUNK_LENGTH
def export_model(self, filename):
# Can't pickle datastore, so convert to filenames
original_stores = []
for pair in self.model:
original_store = pair['training_metadata'].store
original_stores.append(original_store)
pair['training_metadata'].store = original_store.store.filename
try:
with open(filename, 'wb') as out_file:
pickle.dump(self, out_file)
finally:
# Restore the stores even if the pickling fails
for original_store, pair in zip(original_stores, self.model):
pair['training_metadata'].store = original_store
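Before moving on to the clustering code, here is a minimal standalone sketch (plain numpy/itertools, no nilmtk, with made-up wattages) of the idea behind disaggregate_chunk: enumerate every combination of the learnt appliance states and, for each mains reading, pick the combination whose summed power is closest.
import itertools
import numpy as np
# hypothetical state centroids (Watts) for three appliances, each including an 'off' state
states = [[0, 100], [0, 50, 200], [0, 700]]
state_combinations = np.array(list(itertools.product(*states)))  # every possible on/off combination
summed_power = state_combinations.sum(axis=1)                    # total power of each combination
mains = np.array([120, 760, 950])                                # made-up aggregate readings
best = np.argmin(np.abs(summed_power[:, None] - mains[None, :]), axis=0)
for t, idx in enumerate(best):
    print("mains = {:4d} W -> per-appliance estimate {}".format(mains[t], state_combinations[idx]))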
The clustering code below is used to obtain the cluster centroids of each appliance's power consumption.
cluster.py
from __future__ import print_function, division
import numpy as np
import pandas as pd
# Fix the seed for repeatability of experiments
SEED = 42
np.random.seed(SEED)
def cluster(X, max_num_clusters=3, exact_num_clusters=None):
'''Applies clustering on reduced data,
i.e. data where power is greater than threshold.
Parameters
----------
X : pd.Series or single-column pd.DataFrame
max_num_clusters : int
Returns
-------
centroids : ndarray of int32s
Power in different states of an appliance, sorted
'''
# Find where power consumption is greater than 10
data = _transform_data(X)
# Find clusters
centroids = _apply_clustering(data, max_num_clusters, exact_num_clusters)
centroids = np.append(centroids, 0) # add 'off' state
centroids = np.round(centroids).astype(np.int32)
centroids = np.unique(centroids) # np.unique also sorts
# TODO: Merge similar clusters
return centroids
def _transform_data(data):
'''Subsamples if needed and converts to column vector (which is what
scikit-learn requires).
Parameters
----------
data : pd.Series or single column pd.DataFrame
Returns
-------
data_above_thresh : ndarray
column vector
'''
MAX_NUMBER_OF_SAMPLES = 2000
MIN_NUMBER_OF_SAMPLES = 20
DATA_THRESHOLD = 10
data_above_thresh = data[data > DATA_THRESHOLD].dropna().values
n_samples = len(data_above_thresh)
if n_samples < MIN_NUMBER_OF_SAMPLES:
return np.zeros((MAX_NUMBER_OF_SAMPLES, 1))
elif n_samples > MAX_NUMBER_OF_SAMPLES:
# Randomly subsample (we don't want to smoothly downsample
# because that is likely to change the values)
random_indices = np.random.randint(0, n_samples, MAX_NUMBER_OF_SAMPLES)
resampled = data_above_thresh[random_indices]
return resampled.reshape(MAX_NUMBER_OF_SAMPLES, 1)
else:
return data_above_thresh.reshape(n_samples, 1)
def _apply_clustering_n_clusters(X, n_clusters):
"""
:param X: ndarray
:param n_clusters: exact number of clusters to use
:return:
"""
from sklearn.cluster import KMeans
k_means = KMeans(init='k-means++', n_clusters=n_clusters)
k_means.fit(X)
return k_means.labels_, k_means.cluster_centers_
def _apply_clustering(X, max_num_clusters, exact_num_clusters=None):
'''
Parameters
----------
X : ndarray
max_num_clusters : int
Returns
-------
centroids : list of numbers
List of power in different states of an appliance
'''
# If we import sklearn at the top of the file then it makes autodoc fail
from sklearn import metrics
# sklearn produces lots of DepreciationWarnings with PyTables
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
    # Finds whether 2 or 3 clusters gives a better silhouette coefficient;
    # whichever is higher serves as the number of clusters for that appliance
num_clus = -1
sh = -1
k_means_labels = {}
k_means_cluster_centers = {}
k_means_labels_unique = {}
# If the exact number of clusters are specified, then use that
if exact_num_clusters is not None:
labels, centers = _apply_clustering_n_clusters(X, exact_num_clusters)
return centers.flatten()
# Exact number of clusters are not specified, use the cluster validity measures
# to find the optimal number
for n_clusters in range(1, max_num_clusters):
try:
labels, centers = _apply_clustering_n_clusters(X, n_clusters)
k_means_labels[n_clusters] = labels
k_means_cluster_centers[n_clusters] = centers
k_means_labels_unique[n_clusters] = np.unique(labels)
try:
sh_n = metrics.silhouette_score(
X, k_means_labels[n_clusters], metric='euclidean')
if sh_n > sh:
sh = sh_n
num_clus = n_clusters
except Exception:
num_clus = n_clusters
except Exception:
if num_clus > -1:
return k_means_cluster_centers[num_clus]
else:
return np.array([0])
return k_means_cluster_centers[num_clus].flatten()
def hart85_means_shift_cluster(pair_buffer_df, cols):
from sklearn.cluster import MeanShift
# Creating feature vector
cluster_df = pd.DataFrame()
power_types = [col[1] for col in cols]
if 'active' in power_types:
cluster_df['active'] = pd.Series(pair_buffer_df.apply(lambda row:
((np.fabs(row['T1 Active']) + np.fabs(row['T2 Active'])) / 2), axis=1), index=pair_buffer_df.index)
if 'reactive' in power_types:
cluster_df['reactive'] = pd.Series(pair_buffer_df.apply(lambda row:
((np.fabs(row['T1 Reactive']) + np.fabs(row['T2 Reactive'])) / 2), axis=1), index=pair_buffer_df.index)
X = cluster_df.values.reshape((len(cluster_df.index), len(cols)))
ms = MeanShift(bin_seeding=True)
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
labels_unique = np.unique(labels)
return pd.DataFrame(cluster_centers, columns=cols)
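As a quick sanity check of the clustering step, the cluster() function above can be run on a synthetic appliance trace; a minimal sketch (assuming cluster.py is importable from the working directory) looks like this:
import numpy as np
import pandas as pd
from cluster import cluster  # assumes the cluster.py shown above sits on the Python path
# synthetic trace: mostly off, sometimes ~80 W, sometimes ~1500 W
rng = np.random.RandomState(0)
power = np.concatenate([np.zeros(500),
                        80 + 3 * rng.randn(300),
                        1500 + 20 * rng.randn(200)])
print(cluster(pd.Series(power), max_num_clusters=3))  # roughly array([0, 80, 1500], dtype=int32)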
The driver script runs the whole pipeline: reading the data, training, and prediction.
do.py
from __future__ import print_function, division
import time
from matplotlib import rcParams
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
from nilmtk.legacy.disaggregate import FHMM
from six import iteritems
warnings.filterwarnings('ignore')
# %matplotlib inline
rcParams['figure.figsize'] = (13, 6)
from nilmtk import DataSet, TimeFrame, MeterGroup, HDFDataStore
from ComOpt.CombinatorialOptimisation import CombinatorialOptimisation
train = DataSet('../low_freq/redd_low.h5')  # load the dataset (training copy)
test = DataSet('../low_freq/redd_low.h5')   # load the dataset (test copy)
building = 1  # select the house to experiment on
train.set_window(end="30-4-2011")  # training set: everything up to 30 April 2011
test.set_window(start="1-5-2011")  # test set: everything from 1 May 2011 onwards
train_elec = train.buildings[building].elec  # `elec` holds every appliance meter plus the mains of this house
test_elec = test.buildings[building].elec
top_5_train_elec = train_elec.submeters().select_top_k(k=5)  # use only the 5 appliances with the highest energy consumption
def predict(clf, test_elec, sample_period, timezone):  # prediction helper
    pred = {}
    gt = {}
    for i, chunk in enumerate(test_elec.mains().load(sample_period=sample_period)):
        chunk_drop_na = chunk.dropna()  # drop missing values
        pred[i] = clf.disaggregate_chunk(chunk_drop_na)  # disaggregate_chunk is the per-chunk disaggregation method of CO (shown above) and FHMM
        gt[i] = {}  # ground truth: the measured consumption of each individual appliance
        for meter in test_elec.submeters().meters:
            # Only use the meters that we trained on (this saves time!)
            gt[i][meter] = next(meter.load(sample_period=sample_period))
        gt[i] = pd.DataFrame({k: v.squeeze() for k, v in iteritems(gt[i])}, index=next(iter(gt[i].values())).index).dropna()  # assemble this chunk's ground truth as a pandas DataFrame
# If everything can fit in memory
gt_overall = pd.concat(gt)
gt_overall.index = gt_overall.index.droplevel()
pred_overall = pd.concat(pred)
pred_overall.index = pred_overall.index.droplevel()
# Having the same order of columns
gt_overall = gt_overall[pred_overall.columns]
#Intersection of index
gt_index_utc = gt_overall.index.tz_convert("UTC")
pred_index_utc = pred_overall.index.tz_convert("UTC")
common_index_utc = gt_index_utc.intersection(pred_index_utc)
common_index_local = common_index_utc.tz_convert(timezone)
    gt_overall = gt_overall.loc[common_index_local]  # .loc instead of the deprecated .ix indexer
    pred_overall = pred_overall.loc[common_index_local]
appliance_labels = [m.label() for m in gt_overall.columns.values]
gt_overall.columns = appliance_labels
pred_overall.columns = appliance_labels
    '''
    Everything above aligns gt and pred (common index, identical column order and labels)
    so that the evaluation metrics can be computed on them later.
    '''
return gt_overall, pred_overall
classifiers = {'CO': CombinatorialOptimisation(), 'FHMM': FHMM()}  # the two algorithms to compare
predictions = {}
sample_period = 120  # sampling period of two minutes
for clf_name, clf in classifiers.items():
print("*"*20)
print(clf_name)
print("*" *20)
    clf.train(top_5_train_elec, sample_period=sample_period)  # training
    gt, predictions[clf_name] = predict(clf, test_elec, sample_period, train.metadata['timezone'])  # prediction / disaggregation
def compute_rmse(gt, pred):  # evaluation metric: root mean square error
from sklearn.metrics import mean_squared_error
rms_error = {}
for appliance in gt.columns:
        rms_error[appliance] = np.sqrt(mean_squared_error(gt[appliance], pred[appliance]))  # RMSE per appliance
return pd.Series(rms_error)
rmse = {}
for clf_name in classifiers.keys():
rmse[clf_name] = compute_rmse(gt, predictions[clf_name])
rmse = pd.DataFrame(rmse)
print(rmse)
A brief walk-through of the pipeline:
1. Read the recorded data and pick the training set and the test set.
The experiment uses the data of house 1, and the five appliances with the highest energy consumption are selected for training and prediction.
The data that is read in:
Taking the test set as an example, the information of all 6 houses is loaded, and the appliance data inside them is visible.
Detailed appliance information of the first house used for testing.
Appliance data of the first house used for testing.
`instance` is the auto-incrementing id of each appliance.
Because training on every appliance would be a very long wait (and my machine is limited),
↓ only a part of it is shown here.
The five appliances selected for training and prediction (a small inspection sketch follows below).
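To double-check which meters select_top_k picked, a minimal inspection sketch (assuming the same redd_low.h5 path as in do.py above; not needed for the pipeline itself) is:
from nilmtk import DataSet
ds = DataSet('../low_freq/redd_low.h5')    # the converted REDD file used in do.py
elec = ds.buildings[1].elec                # MeterGroup of house 1
top5 = elec.submeters().select_top_k(k=5)  # five appliances with the highest energy consumption
for meter in top5.meters:
    # label() gives a readable appliance name, instance() its id
    print(meter.instance(), meter.label())
ds.store.close()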
2. Training and prediction
Training and prediction are carried out with both methods (CO and FHMM).
3. Disaggregation results (CO)
At every timestamp, the disaggregation yields the estimated power of each of the top-five appliances,
in this order: fridge, light, sockets, microwave, dish washer.
More disaggregation results (one morning).
More disaggregation results (one afternoon).
Disaggregation results (FHMM)
The results above were inspected by setting breakpoints and viewing them on each pass (a plotting sketch is given below).
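Instead of stopping at breakpoints, the comparison plots can also be produced directly; a minimal sketch (assuming the gt and predictions variables returned by the do.py run above) is:
import matplotlib.pyplot as plt
# assumes `gt` and `predictions` exist, as produced by predict() in do.py
for appliance in gt.columns:
    fig, ax = plt.subplots(figsize=(13, 4))
    gt[appliance].plot(ax=ax, label='ground truth')
    predictions['CO'][appliance].plot(ax=ax, label='CO prediction')
    ax.set_ylabel('Power (W)')
    ax.set_title(appliance)
    ax.legend()
plt.show()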
The last thing the code prints is the root-mean-square error of the predicted consumption of the five appliances.
With that, the NILMTK demo runs end to end; the detailed workflow and further optimisations remain to be explored.
The complete project is available for download, including NILMTK, the dataset and the algorithm calls.
Set up the environment following my earlier posts and do.py can be run directly.
Download link: https://download.csdn.net/download/wwb1990/12093037