(A download link for the complete project is provided at the end of this article.)
Preparation
Everything in this post builds on the environment and the dataset set up in my previous two posts:
NILM (Non-Intrusive Load Monitoring) Study Notes — Preparation (1): Setting up the NILMTK Toolkit environment
NILM (Non-Intrusive Load Monitoring) Study Notes — Preparation (2): Downloading and converting the REDD dataset
Experimental environment
1. Language: Python 3.6
2. IDE: PyCharm Community Edition 2019.3.1
3. Main tools: nilmtk and nilm_metadata, plus the libraries they depend on
4. Two disaggregation methods are used and compared in the experiment:
- Combinatorial Optimisation (CO)
- Factorial Hidden Markov Model (FHMM)
5. Test data: the REDD dataset
The Combinatorial Optimisation algorithm
The implementation is borrowed from the post "NILM — Combinatorial Optimisation (with code)";
the original was written as a Jupyter notebook, and I reworked it into code that runs directly in PyCharm.
Below is my slightly modified version —
the scripts in my project directory.
CombinatorialOptimisation.py
from __future__ import print_function, division
from warnings import warn
import pandas as pd
import numpy as np
import pickle
import copy
# This code builds on the NILMTK package, so nilmtk has to be installed first
from nilmtk.utils import find_nearest
from nilmtk.feature_detectors import cluster
from nilmtk.disaggregate import Disaggregator
from nilmtk.datastore import HDFDataStore
# Fix the seed for repeatability of experiments
SEED = 42
np.random.seed(SEED)
class CombinatorialOptimisation(Disaggregator):
"""1 dimensional combinatorial optimisation NILM algorithm.
Attributes
----------
model : list of dicts
Each dict has these keys:
states : list of ints (the power (Watts) used in different states)
training_metadata : ElecMeter or MeterGroup object used for training
this set of states. We need this information because we
need the appliance type (and perhaps some other metadata)
for each model.
state_combinations : 2D array
Each column is an appliance.
Each row is a possible combination of power demand values e.g.
[[0, 0, 0, 0],
[0, 0, 0, 100],
[0, 0, 50, 0],
[0, 0, 50, 100], ...]
MIN_CHUNK_LENGTH : int
"""
    def __init__(self):
        self.model = []  # list holding the trained model (state centroids) of each individual appliance
        self.state_combinations = None  # cartesian product of the learnt appliance states
        self.MIN_CHUNK_LENGTH = 100  # chunks shorter than this many samples are skipped
        self.MODEL_NAME = 'CO'  # short name of the algorithm
    def train(self, metergroup, num_states_dict=None, **load_kwargs):  # training entry point; parameters are described in the docstring below
"""Train using 1D CO. Places the learnt model in the `model` attribute.
Parameters
----------
        metergroup : a nilmtk.MeterGroup object (one of nilmtk's container types, alongside DataSet, ElecMeter, etc.)
        num_states_dict : dict, optional
            Desired total number of states for each meter.
**load_kwargs : keyword arguments passed to `meter.power_series()`
Notes
-----
* only uses first chunk for each meter (TODO: handle all chunks).
"""
if num_states_dict is None:
num_states_dict = {}
if self.model:
raise RuntimeError(
"This implementation of Combinatorial Optimisation"
" does not support multiple calls to `train`.")
        num_meters = len(metergroup.meters)  # number of sub-meters, e.g. 5 appliances -> num_meters == 5
        if num_meters > 12:  # with more than 12 meters allow only 2 cluster centres per appliance, otherwise 3 (clustering is explained below)
max_num_clusters = 2
else:
max_num_clusters = 3
        # metergroup bundles the power data of several appliances; submeters() exposes the
        # individual appliance meters, so this loop visits every appliance in the group
        for i, meter in enumerate(metergroup.submeters().meters):
print("Training model for submeter '{}'".format(meter))
            power_series = meter.power_series(**load_kwargs)  # generator yielding pandas Series; kwargs such as sample_period=6 request a 6 s sampling period
            chunk = next(power_series)  # first chunk, as a pandas Series
            num_total_states = num_states_dict.get(meter)  # total number of states requested for this appliance (if any)
            if num_total_states is not None:
                num_on_states = num_total_states - 1  # number of 'on' states (total minus the 'off' state)
            else:
                num_on_states = None
            self.train_on_chunk(chunk, meter, max_num_clusters, num_on_states)  # train on this chunk
# Check to see if there are any more chunks.
# TODO handle multiple chunks per appliance.
try:
next(power_series)
except StopIteration:
pass
else:
warn("The current implementation of CombinatorialOptimisation"
" can only handle a single chunk. But there are multiple"
" chunks available. So have only trained on the"
" first chunk!")
print("Done training!")
    def train_on_chunk(self, chunk, meter, max_num_clusters, num_on_states):  # core of the training step
        # Check if we've already trained on this meter
        meters_in_model = [d['training_metadata'] for d in self.model]  # meters already trained on; 'training_metadata' carries the appliance metadata (wiring, model, ...)
if meter in meters_in_model:
raise RuntimeError(
"Meter {} is already in model!"
" Can't train twice on the same meter!"
.format(meter))
        states = cluster(chunk, max_num_clusters, num_on_states)  # the heart of the training; the clustering code is walked through below
self.model.append({
'states': states,
'training_metadata': meter})
def _set_state_combinations_if_necessary(self):
"""Get centroids"""
# If we import sklearn at the top of the file then auto doc fails.
if (self.state_combinations is None or
self.state_combinations.shape[1] != len(self.model)):
from sklearn.utils.extmath import cartesian
centroids = [model['states'] for model in self.model]
self.state_combinations = cartesian(centroids)
    def disaggregate(self, mains, output_datastore, **load_kwargs):  # disaggregation: called after training
'''Disaggregate mains according to the model learnt previously.
Parameters
----------
        mains : nilmtk.ElecMeter or nilmtk.MeterGroup
            The aggregate (whole-house) measurements to be disaggregated.
        output_datastore : instance of nilmtk.DataStore subclass
            For storing power predictions from disaggregation algorithm.
        sample_period : number, optional
            The desired sample period in seconds. Set to 60 by default.
        sections : TimeFrameGroup, optional
            Restricts disaggregation to these sections of the data.
Set to mains.good_sections() by default.
**load_kwargs : key word arguments
Passed to `mains.power_series(**kwargs)`
'''
load_kwargs = self._pre_disaggregation_checks(load_kwargs)
load_kwargs.setdefault('sample_period', 60)
load_kwargs.setdefault('sections', mains.good_sections())
        timeframes = []  # time ranges covered by the processed chunks
        building_path = '/building{}'.format(mains.building())  # building number, e.g. REDD contains 6 houses
mains_data_location = building_path + '/elec/meter1'
data_is_available = False
for chunk in mains.power_series(**load_kwargs):
# Check that chunk is sensible size
if len(chunk) < self.MIN_CHUNK_LENGTH:
continue
# Record metadata
timeframes.append(chunk.timeframe)
measurement = chunk.name
appliance_powers = self.disaggregate_chunk(chunk)
for i, model in enumerate(self.model):
appliance_power = appliance_powers.iloc[:, i]
if len(appliance_power) == 0:
continue
data_is_available = True
cols = pd.MultiIndex.from_tuples([chunk.name])
meter_instance = model['training_metadata'].instance()
df = pd.DataFrame(
appliance_power.values, index=appliance_power.index,
columns=cols)
key = '{}/elec/meter{}'.format(building_path, meter_instance)
output_datastore.append(key, df)
# Copy mains data to disag output
mains_df = pd.DataFrame(chunk, columns=cols)
output_datastore.append(key=mains_data_location, value=mains_df)
if data_is_available:
self._save_metadata_for_disaggregation(
output_datastore=output_datastore,
sample_period=load_kwargs['sample_period'],
measurement=measurement,
timeframes=timeframes,
building=mains.building(),
meters=[d['training_metadata'] for d in self.model]
)
def disaggregate_chunk(self, mains):
"""In-memory disaggregation.
Parameters
----------
mains : pd.Series
Returns
-------
appliance_powers : pd.DataFrame where each column represents a
disaggregated appliance. Column names are the integer index
into `self.model` for the appliance in question.
"""
if not self.model:
raise RuntimeError(
"The model needs to be instantiated before"
" calling `disaggregate`. The model"
" can be instantiated by running `train`.")
if len(mains) < self.MIN_CHUNK_LENGTH:
raise RuntimeError("Chunk is too short.")
# sklearn produces lots of DepreciationWarnings with PyTables
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Because CombinatorialOptimisation could have been trained using
# either train() or train_on_chunk(), we must
# set state_combinations here.
self._set_state_combinations_if_necessary()
"""
# Add vampire power to the model
if vampire_power is None:
vampire_power = get_vampire_power(mains)
if vampire_power > 0:
print("Including vampire_power = {} watts to model..."
.format(vampire_power))
n_rows = self.state_combinations.shape[0]
vampire_power_array = np.zeros((n_rows, 1)) + vampire_power
state_combinations = np.hstack(
(self.state_combinations, vampire_power_array))
else:
state_combinations = self.state_combinations
"""
state_combinations = self.state_combinations
summed_power_of_each_combination = np.sum(state_combinations, axis=1)
# summed_power_of_each_combination is now an array where each
# value is the total power demand for each combination of states.
# Start disaggregation
indices_of_state_combinations, residual_power = find_nearest(
summed_power_of_each_combination, mains.values)
appliance_powers_dict = {}
for i, model in enumerate(self.model):
print("Estimating power demand for '{}'"
.format(model['training_metadata']))
predicted_power = state_combinations[
indices_of_state_combinations, i].flatten()
column = pd.Series(predicted_power, index=mains.index, name=i)
appliance_powers_dict[self.model[i]['training_metadata']] = column
appliance_powers = pd.DataFrame(appliance_powers_dict, dtype='float32')
return appliance_powers
def import_model(self, filename):
with open(filename, 'rb') as in_file:
imported_model = pickle.load(in_file)
self.model = imported_model.model
# Recreate datastores from filenames
for pair in self.model:
store_filename = pair['training_metadata'].store
pair['training_metadata'].store = HDFDataStore(store_filename)
self.state_combinations = imported_model.state_combinations
self.MIN_CHUNK_LENGTH = imported_model.MIN_CHUNK_LENGTH
def export_model(self, filename):
# Can't pickle datastore, so convert to filenames
original_stores = []
for pair in self.model:
original_store = pair['training_metadata'].store
original_stores.append(original_store)
pair['training_metadata'].store = original_store.store.filename
try:
with open(filename, 'wb') as out_file:
pickle.dump(self, out_file)
finally:
# Restore the stores even if the pickling fails
for original_store, pair in zip(original_stores, self.model):
pair['training_metadata'].store = original_store
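Before moving on to the clustering code, here is a minimal standalone sketch (plain numpy/itertools, no nilmtk, with made-up wattages) of the idea behind disaggregate_chunk: enumerate every combination of the learnt appliance states and, for each mains reading, pick the combination whose summed power is closest.
import itertools
import numpy as np
# hypothetical state centroids (Watts) for three appliances, each including an 'off' state
states = [[0, 100], [0, 50, 200], [0, 700]]
state_combinations = np.array(list(itertools.product(*states)))  # every possible on/off combination
summed_power = state_combinations.sum(axis=1)                    # total power of each combination
mains = np.array([120, 760, 950])                                # made-up aggregate readings
best = np.argmin(np.abs(summed_power[:, None] - mains[None, :]), axis=0)
for t, idx in enumerate(best):
    print("mains = {:4d} W -> per-appliance estimate {}".format(mains[t], state_combinations[idx]))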
The clustering code below is used to obtain the cluster centroids of each appliance's power consumption.
cluster.py
from __future__ import print_function, division
import numpy as np
import pandas as pd
# Fix the seed for repeatability of experiments
SEED = 42
np.random.seed(SEED)
def cluster(X, max_num_clusters=3, exact_num_clusters=None):
'''Applies clustering on reduced data,
i.e. data where power is greater than threshold.
Parameters
----------
X : pd.Series or single-column pd.DataFrame
max_num_clusters : int
Returns
-------
centroids : ndarray of int32s
Power in different states of an appliance, sorted
'''
# Find where power consumption is greater than 10
data = _transform_data(X)
# Find clusters
centroids = _apply_clustering(data, max_num_clusters, exact_num_clusters)
centroids = np.append(centroids, 0) # add 'off' state
centroids = np.round(centroids).astype(np.int32)
centroids = np.unique(centroids) # np.unique also sorts
# TODO: Merge similar clusters
return centroids
def _transform_data(data):
'''Subsamples if needed and converts to column vector (which is what
scikit-learn requires).
Parameters
----------
data : pd.Series or single column pd.DataFrame
Returns
-------
data_above_thresh : ndarray
column vector
'''
MAX_NUMBER_OF_SAMPLES = 2000
MIN_NUMBER_OF_SAMPLES = 20
DATA_THRESHOLD = 10
data_above_thresh = data[data > DATA_THRESHOLD].dropna().values
n_samples = len(data_above_thresh)
if n_samples < MIN_NUMBER_OF_SAMPLES:
return np.zeros((MAX_NUMBER_OF_SAMPLES, 1))
elif n_samples > MAX_NUMBER_OF_SAMPLES:
# Randomly subsample (we don't want to smoothly downsample
# because that is likely to change the values)
random_indices = np.random.randint(0, n_samples, MAX_NUMBER_OF_SAMPLES)
resampled = data_above_thresh[random_indices]
return resampled.reshape(MAX_NUMBER_OF_SAMPLES, 1)
else:
return data_above_thresh.reshape(n_samples, 1)
def _apply_clustering_n_clusters(X, n_clusters):
"""
:param X: ndarray
:param n_clusters: exact number of clusters to use
:return:
"""
from sklearn.cluster import KMeans
k_means = KMeans(init='k-means++', n_clusters=n_clusters)
k_means.fit(X)
return k_means.labels_, k_means.cluster_centers_
def _apply_clustering(X, max_num_clusters, exact_num_clusters=None):
'''
Parameters
----------
X : ndarray
max_num_clusters : int
Returns
-------
centroids : list of numbers
List of power in different states of an appliance
'''
# If we import sklearn at the top of the file then it makes autodoc fail
from sklearn import metrics
# sklearn produces lots of DepreciationWarnings with PyTables
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
    # Finds whether 2 or 3 clusters gives a better silhouette coefficient;
    # whichever is higher serves as the number of clusters for that appliance
num_clus = -1
sh = -1
k_means_labels = {}
k_means_cluster_centers = {}
k_means_labels_unique = {}
# If the exact number of clusters are specified, then use that
if exact_num_clusters is not None:
labels, centers = _apply_clustering_n_clusters(X, exact_num_clusters)
return centers.flatten()
# Exact number of clusters are not specified, use the cluster validity measures
# to find the optimal number
for n_clusters in range(1, max_num_clusters):
try:
labels, centers = _apply_clustering_n_clusters(X, n_clusters)
k_means_labels[n_clusters] = labels
k_means_cluster_centers[n_clusters] = centers
k_means_labels_unique[n_clusters] = np.unique(labels)
try:
sh_n = metrics.silhouette_score(
X, k_means_labels[n_clusters], metric='euclidean')
if sh_n > sh:
sh = sh_n
num_clus = n_clusters
except Exception:
num_clus = n_clusters
except Exception:
if num_clus > -1:
return k_means_cluster_centers[num_clus]
else:
return np.array([0])
return k_means_cluster_centers[num_clus].flatten()
def hart85_means_shift_cluster(pair_buffer_df, cols):
from sklearn.cluster import MeanShift
# Creating feature vector
cluster_df = pd.DataFrame()
power_types = [col[1] for col in cols]
if 'active' in power_types:
cluster_df['active'] = pd.Series(pair_buffer_df.apply(lambda row:
((np.fabs(row['T1 Active']) + np.fabs(row['T2 Active'])) / 2), axis=1), index=pair_buffer_df.index)
if 'reactive' in power_types:
cluster_df['reactive'] = pd.Series(pair_buffer_df.apply(lambda row:
((np.fabs(row['T1 Reactive']) + np.fabs(row['T2 Reactive'])) / 2), axis=1), index=pair_buffer_df.index)
X = cluster_df.values.reshape((len(cluster_df.index), len(cols)))
ms = MeanShift(bin_seeding=True)
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
labels_unique = np.unique(labels)
return pd.DataFrame(cluster_centers, columns=cols)
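As a quick sanity check of the clustering step, the cluster() function above can be run on a synthetic appliance trace; a minimal sketch (assuming cluster.py is importable from the working directory) looks like this:
import numpy as np
import pandas as pd
from cluster import cluster  # assumes the cluster.py shown above sits on the Python path
# synthetic trace: mostly off, sometimes ~80 W, sometimes ~1500 W
rng = np.random.RandomState(0)
power = np.concatenate([np.zeros(500),
                        80 + 3 * rng.randn(300),
                        1500 + 20 * rng.randn(200)])
print(cluster(pd.Series(power), max_num_clusters=3))  # roughly array([0, 80, 1500], dtype=int32)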
The driver script runs the whole pipeline: reading the data, training, and prediction.
do.py
from __future__ import print_function, division
import time
from matplotlib import rcParams
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
from nilmtk.legacy.disaggregate import FHMM
from six import iteritems
warnings.filterwarnings('ignore')
# %matplotlib inline
rcParams['figure.figsize'] = (13, 6)
from nilmtk import DataSet, TimeFrame, MeterGroup, HDFDataStore
from ComOpt.CombinatorialOptimisation import CombinatorialOptimisation
train = DataSet('../low_freq/redd_low.h5')  # load the dataset (training copy)
test = DataSet('../low_freq/redd_low.h5')   # load the dataset (test copy)
building = 1  # select the house to experiment on
train.set_window(end="30-4-2011")  # training set: everything up to 30 April 2011
test.set_window(start="1-5-2011")  # test set: everything from 1 May 2011 onwards
train_elec = train.buildings[building].elec  # `elec` holds every appliance meter plus the mains of this house
test_elec = test.buildings[building].elec
top_5_train_elec = train_elec.submeters().select_top_k(k=5)  # use only the 5 appliances with the highest energy consumption
def predict(clf, test_elec, sample_period, timezone):  # prediction helper
    pred = {}
    gt = {}
    for i, chunk in enumerate(test_elec.mains().load(sample_period=sample_period)):
        chunk_drop_na = chunk.dropna()  # drop missing values
        pred[i] = clf.disaggregate_chunk(chunk_drop_na)  # disaggregate_chunk is the per-chunk disaggregation method of CO (shown above) and FHMM
        gt[i] = {}  # ground truth: the measured consumption of each individual appliance
        for meter in test_elec.submeters().meters:
            # Only use the meters that we trained on (this saves time!)
            gt[i][meter] = next(meter.load(sample_period=sample_period))
        gt[i] = pd.DataFrame({k: v.squeeze() for k, v in iteritems(gt[i])}, index=next(iter(gt[i].values())).index).dropna()  # assemble this chunk's ground truth as a pandas DataFrame
# If everything can fit in memory
gt_overall = pd.concat(gt)
gt_overall.index = gt_overall.index.droplevel()
pred_overall = pd.concat(pred)
pred_overall.index = pred_overall.index.droplevel()
# Having the same order of columns
gt_overall = gt_overall[pred_overall.columns]
#Intersection of index
gt_index_utc = gt_overall.index.tz_convert("UTC")
pred_index_utc = pred_overall.index.tz_convert("UTC")
common_index_utc = gt_index_utc.intersection(pred_index_utc)
common_index_local = common_index_utc.tz_convert(timezone)
    gt_overall = gt_overall.loc[common_index_local]  # .loc instead of the deprecated .ix indexer
    pred_overall = pred_overall.loc[common_index_local]
appliance_labels = [m.label() for m in gt_overall.columns.values]
gt_overall.columns = appliance_labels
pred_overall.columns = appliance_labels
    '''
    Everything above aligns gt and pred (common index, identical column order and labels)
    so that the evaluation metrics can be computed on them later.
    '''
return gt_overall, pred_overall
classifiers = {'CO': CombinatorialOptimisation(), 'FHMM': FHMM()}  # the two algorithms to compare
predictions = {}
sample_period = 120  # sampling period of two minutes
for clf_name, clf in classifiers.items():
print("*"*20)
print(clf_name)
print("*" *20)
    clf.train(top_5_train_elec, sample_period=sample_period)  # training
    gt, predictions[clf_name] = predict(clf, test_elec, sample_period, train.metadata['timezone'])  # prediction / disaggregation
def compute_rmse(gt, pred):  # evaluation metric: root mean square error
from sklearn.metrics import mean_squared_error
rms_error = {}
for appliance in gt.columns:
        rms_error[appliance] = np.sqrt(mean_squared_error(gt[appliance], pred[appliance]))  # RMSE per appliance
return pd.Series(rms_error)
rmse = {}
for clf_name in classifiers.keys():
rmse[clf_name] = compute_rmse(gt, predictions[clf_name])
rmse = pd.DataFrame(rmse)
print(rmse)
A brief walk-through of the pipeline:
1. Read the recorded data and pick the training set and the test set.
The experiment uses the data of house 1, and the five appliances with the highest energy consumption are selected for training and prediction.
The data that is read in:
Taking the test set as an example, the information of all 6 houses is loaded, and the appliance data inside them is visible.
Detailed appliance information of the first house used for testing.
Appliance data of the first house used for testing.
`instance` is the auto-incrementing id of each appliance.
Because training on every appliance would be a very long wait (and my machine is limited),
↓ only a part of it is shown here.
The five appliances selected for training and prediction (a small inspection sketch follows below).
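To double-check which meters select_top_k picked, a minimal inspection sketch (assuming the same redd_low.h5 path as in do.py above; not needed for the pipeline itself) is:
from nilmtk import DataSet
ds = DataSet('../low_freq/redd_low.h5')    # the converted REDD file used in do.py
elec = ds.buildings[1].elec                # MeterGroup of house 1
top5 = elec.submeters().select_top_k(k=5)  # five appliances with the highest energy consumption
for meter in top5.meters:
    # label() gives a readable appliance name, instance() its id
    print(meter.instance(), meter.label())
ds.store.close()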
2. Training and prediction
Training and prediction are carried out with both methods (CO and FHMM).
3. Disaggregation results (CO)
At every timestamp, the disaggregation yields the estimated power of each of the top-five appliances,
in this order: fridge, light, sockets, microwave, dish washer.
More disaggregation results (one morning).
More disaggregation results (one afternoon).
Disaggregation results (FHMM)
The results above were inspected by setting breakpoints and viewing them on each pass (a plotting sketch is given below).
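Instead of stopping at breakpoints, the comparison plots can also be produced directly; a minimal sketch (assuming the gt and predictions variables returned by the do.py run above) is:
import matplotlib.pyplot as plt
# assumes `gt` and `predictions` exist, as produced by predict() in do.py
for appliance in gt.columns:
    fig, ax = plt.subplots(figsize=(13, 4))
    gt[appliance].plot(ax=ax, label='ground truth')
    predictions['CO'][appliance].plot(ax=ax, label='CO prediction')
    ax.set_ylabel('Power (W)')
    ax.set_title(appliance)
    ax.legend()
plt.show()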
The last thing the code prints is the root-mean-square error of the predicted consumption of the five appliances.
With that, the NILMTK demo runs end to end; the detailed workflow and further optimisations remain to be explored.
The complete project is available for download, including NILMTK, the dataset and the algorithm calls.
Set up the environment following my earlier posts and do.py can be run directly.
Download link: https://download.csdn.net/download/wwb1990/12093037