1、背景和分析
本次数据挖掘建模目标如下:
1、归纳出窃漏电用户的关键特征,构建窃漏电用户的识别模型
2、利用实时检测数据,调用窃漏电用户识别模型实现实时的检测
2、代码
import warnings
from random import shuffle

warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
from keras.layers.core import Activation, Dense
from keras.models import Sequential
from scipy.interpolate import lagrange
from sklearn.metrics import confusion_matrix, roc_curve
from sklearn.tree import DecisionTreeClassifier

try:
    from sklearn.externals import joblib
except ImportError:  # sklearn.externals.joblib was removed in scikit-learn 0.23
    import joblib
使用拉格朗日插值法,插补缺少的数据
# Spreadsheet containing the gaps, and the destination for the gap-filled copy.
input_file = 'E:/1-GitHubCode-book/python_data_analysis_and_mining_action/chapter6/data/missing_data.xls'
out_file = './missing_data_processed.xls'
# header=None: do not treat the first row as column names, so the columns
# are labelled with integers.
data = pd.read_excel(input_file, header=None)
def ployinterp_column(s, n, k=5):
    """Estimate the value of ``s`` at label ``n`` by Lagrange interpolation.

    The interpolating polynomial is built from the non-null entries among
    the ``k`` labels before ``n`` and the ``k`` labels after ``n``.

    Args:
        s: pandas Series; the neighbour labels are computed as integers
            around ``n``, so the index is expected to hold integer labels.
        n: Label of the entry to estimate.
        k: Number of neighbours taken on each side of ``n``.

    Returns:
        The interpolated value at ``n``, or NaN when none of the neighbours
        holds a value.
    """
    neighbours = list(range(n - k, n)) + list(range(n + 1, n + k + 1))
    # reindex() yields NaN for labels that do not exist (near either end of
    # the series) instead of raising KeyError like s[list] does.
    y = s.reindex(neighbours)
    y = y[y.notnull()]
    if y.empty:
        # Nothing to interpolate from; leave the gap marked as missing
        # rather than letting lagrange() return a meaningless 0.
        return float('nan')
    return lagrange(y.index, list(y))(n)
# Replace every missing cell with its Lagrange estimate, column by column,
# then write the completed table out without header or index.
for i in data.columns:
    # Null mask taken once per column; each cell is tested before it is filled.
    missing = data[i].isnull()
    for j in range(len(data)):
        if missing[j]:
            # .loc writes into the frame itself; data[i][j] = ... is chained
            # assignment and may silently modify a temporary copy.
            data.loc[j, i] = ployinterp_column(data[i], j)
data.to_excel(out_file, header=None, index=False)
使用CART, 对290个用户进行训练
# Load the modelling spreadsheet; with no header argument pandas takes the
# first row as the column names.
data_file = 'E:/1-GitHubCode-book/python_data_analysis_and_mining_action/chapter6/data/model.xls'
data = pd.read_excel(data_file)
# Confusion-matrix plot helper
def cm_plot(y, yp):
    """Draw the confusion matrix of true labels ``y`` against predictions ``yp``.

    The matrix is shown as a red heat map with a colour bar, and every cell
    is annotated with its count.

    Args:
        y: True labels.
        yp: Predicted labels.

    Returns:
        The ``matplotlib.pyplot`` module, so callers can chain ``.show()``.
    """
    cm = confusion_matrix(y, yp)
    plt.matshow(cm, cmap=plt.cm.Reds)
    plt.colorbar()
    # matshow draws rows along the vertical axis and columns along the
    # horizontal axis, while annotate takes xy=(horizontal, vertical), so
    # cm[row, col] belongs at xy=(col, row).
    for row in range(len(cm)):
        for col in range(len(cm)):
            plt.annotate(
                cm[row, col],
                xy=(col, row),
                horizontalalignment='center',
                verticalalignment='center',
            )
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    return plt
# Convert to a NumPy array and shuffle the rows. DataFrame.as_matrix() was
# removed in pandas 1.0, and random.shuffle swaps rows of a 2-D array through
# views, which can duplicate rows; np.random.permutation returns a correctly
# shuffled copy.
data = np.random.permutation(data.values)

p = 0.8  # fraction of the rows used for training
train = data[:int(len(data) * p), :]
test = data[int(len(data) * p):, :]  # remaining rows (the slice needs the ':')

# Fit a CART tree on the first three columns (features) against the fourth
# column (label), then persist it.
treefile = './tree.pkl'
tree = DecisionTreeClassifier()
tree.fit(train[:, :3], train[:, 3])
joblib.dump(tree, treefile)

# Confusion matrix on the training split.
cm_plot(train[:, 3], tree.predict(train[:, :3])).show()

# ROC curve on the held-out split, scored with the predicted probability of
# the second class returned by predict_proba.
fpr, tpr, thresholds = roc_curve(
    test[:, 3], tree.predict_proba(test[:, :3])[:, 1], pos_label=1
)
plt.plot(fpr, tpr, linewidth=2, label='ROC OF CART', color='green')
plt.xlabel('FALSE POSITIVE RATE')
plt.ylabel('TRUE POSITIVE RATE')
plt.ylim(0, 1.05)
plt.xlim(0, 1.05)
plt.legend(loc=4)  # show the curve label, as the neural-network plot does
plt.show()
print(thresholds)
使用LM神经网络
设定LM神经网络的输入节点数为3,输出节点数为1,隐藏层节点数为10,使用Adam方法求解;隐藏层使用ReLU(x) = max(x, 0)作为激活函数,输出层使用sigmoid函数。
data_file = 'E:/1-GitHubCode-book/python_data_analysis_and_mining_action/chapter6/data/model.xls'
data = pd.read_excel(data_file)
# DataFrame.as_matrix() was removed in pandas 1.0, and random.shuffle can
# duplicate rows of a 2-D array; take a row-shuffled NumPy copy instead.
data = np.random.permutation(data.values)

p = 0.8  # fraction of the rows used for training
train = data[:int(len(data) * p), :]
test = data[int(len(data) * p):, :]

# Build the network: 3 inputs -> 10 hidden units (ReLU) -> 1 output (sigmoid).
netfile = './net.model'
net = Sequential()
net.add(Dense(10, input_shape=(3,)))  # input layer (3) to hidden layer (10)
net.add(Activation('relu'))
net.add(Dense(1))  # hidden layer (10) to output layer (1)
net.add(Activation('sigmoid'))  # sigmoid on the output layer
# sample_weight_mode='binary' is not a valid value for compile() and is
# dropped; the binary setup is expressed by the loss and the sigmoid output.
net.compile(
    loss='binary_crossentropy',
    optimizer='adam',
)
net.fit(train[:, :3], train[:, 3], epochs=100, batch_size=1)
net.save_weights(netfile)

# Class predictions on the training split: threshold the sigmoid output at
# 0.5 (predict_classes is gone from newer Keras). The result has one entry
# per training row, so it is flattened to len(train), not len(data).
result = net.predict(train[:, :3])
predict_result = (result > 0.5).astype('int32').reshape(len(train))
cm_plot(train[:, 3], predict_result).show()

# ROC curve on the held-out split, scored with the raw network output.
predict_result = net.predict(test[:, :3]).reshape(len(test))
fpr, tpr, thresholds = roc_curve(test[:, 3], predict_result, pos_label=1)
plt.plot(fpr, tpr, linewidth=2, label='ROC OF LM')
plt.xlabel('False positive Rate')
plt.ylabel('True positive Rate')
plt.ylim(0, 1.05)
plt.xlim(0, 1.05)
plt.legend(loc=4)
plt.show()
print(thresholds)
实验表明,分类准确率为(161 + 58) / (161 + 58 + 6 + 7) = 94.4%,正常用户被判为窃漏电用户占正常用户的7 / (161 + 7) = 4.2%,窃漏电用户被判为正常用户占窃漏电用户的6 / (6 + 58) = 9.4%。
望您:
“情深不寿,强极则辱,谦谦君子,温润如玉”。