我们以2022年全国服务外包大赛的A03题目作为示例代码演示异常值识别过程。
问题的主要任务是找出商品的销量异常和价格异常。赛题提供4个月的商品信息数据(共1700万余条)和4个月的店铺信息数据(共60万余条),并强调时间复杂度、空间复杂度、异常值识别率和准确率。我们用店铺分析辅助商品的异常识别,以提高可信度和准确率。
店铺部分数据链接:https://pan.baidu.com/s/1iAp-s2JwG_YTB35BevMNyQ 提取码:jhnb
思路如下:
缺失值填补:https://blog.csdn.net/Hjh1906008151/article/details/124338450
衍生变量计算:https://blog.csdn.net/Hjh1906008151/article/details/124330708
聚类:https://blog.csdn.net/Hjh1906008151/article/details/124341064
多种方法联合
因为比赛强调时间空间复杂度,因此我们将程序设定为两种模式,根据需求选取快速异常值识别和精确异常值识别:
import numpy as np
import pandas as pd
import time
pd.set_option('display.max_columns', None)
from sklearn.preprocessing import MinMaxScaler
from pyod.models.pca import PCA
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.abod import ABOD
from pyod.models.copod import COPOD
from pyod.models.iforest import IForest
from pyod.models.loda import LODA
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.auto_encoder import AutoEncoder
from pyod.models.vae import VAE
from pyod.models.hbos import HBOS
from pyod.models.knn import KNN
from pyod.models.mcd import MCD
def normalize_model(file, col):
    """Min-max scale the ``col`` columns of ``file`` into the range [0, 1].

    Args:
        file: DataFrame holding the columns to rescale. It is modified
            directly; no copy is made.
        col: List of column labels to rescale.

    Returns:
        The same ``file`` object, with the selected columns overwritten by
        their scaled values.
    """
    scaled_values = MinMaxScaler(feature_range=(0, 1)).fit_transform(file[col])
    file[col] = scaled_values
    return file
def _pick_hidden_layers(n_features):
    """Return ``(ae_neurons, vae_neurons)`` sized for ``n_features`` inputs.

    ``ae_neurons`` is the AutoEncoder layer-width list; ``vae_neurons`` is an
    ``(encoder, decoder)`` pair of layer-width lists for the VAE.
    """
    if n_features < 64:
        if n_features > 16:
            ae_neurons = [16, 8, 8, 16]
        else:
            # Layer widths must be whole numbers of at least one unit, so use
            # floor division and never go below 1 (a single feature would
            # otherwise ask for 0.5 neurons).
            half = max(1, n_features // 2)
            ae_neurons = [n_features, half, half, n_features]
        vae_neurons = [16, 8, 4], [4, 8, 16]
    else:
        ae_neurons = [64, 32, 32, 64]
        vae_neurons = [128, 64, 32], [32, 64, 128]
    return ae_neurons, vae_neurons


def _build_classifiers(mode, n_clusters, random_state, ae_neurons, vae_neurons):
    """Instantiate the detectors for ``mode`` as an ordered name -> model dict.

    ``mode == "fast"`` selects the quick subset; any other value selects the
    full set. Key order is kept stable because the detectors share one
    ``random_state`` and the score columns are written in this order.
    """
    if mode == "fast":
        # KNN and LOF are left out of fast mode (author's note: moderately slow).
        return {
            'PCA': PCA(random_state=random_state),
            'COPOD': COPOD(),
            'HBOS': HBOS(),
            'LODA': LODA(),
            'CBLOF': CBLOF(n_clusters=n_clusters, check_estimator=False, random_state=random_state),
            'Iforest': IForest(random_state=random_state),
            # Author's note: MCD and FeatureBagging are slightly slow and
            # need at least two feature dimensions.
            'MCD': MCD(),
            'Fb': FeatureBagging(random_state=random_state),
        }
    # OCSVM (author's note: extremely slow) and COF (author's note: runs out
    # of memory) are intentionally not part of the full set either.
    return {
        'PCA': PCA(random_state=random_state),
        'COPOD': COPOD(),
        'Iforest': IForest(random_state=random_state),
        'LODA': LODA(),
        'CBLOF': CBLOF(n_clusters=n_clusters, check_estimator=False, random_state=random_state),
        'HBOS': HBOS(),
        # Author's note: slightly slow, need at least two feature dimensions.
        'MCD': MCD(),
        'Fb': FeatureBagging(random_state=random_state),
        # Author's note: moderately slow.
        'KNN': KNN(),
        'LOF': LOF(),
        # Author's note: slow.
        'ABOD': ABOD(n_neighbors=20),
        # The author marked the four neural detectors below as "wrong"
        # (AutoEncoder2: "fewer than 16 dimensions") -- presumably they fail
        # on this data; a detector that raises is skipped by ``models``.
        'VAE1': VAE(batch_size=100, encoder_neurons=vae_neurons[0], decoder_neurons=vae_neurons[1], random_state=random_state),
        'AutoEncoder1': AutoEncoder(batch_size=100, hidden_neurons=ae_neurons, random_state=random_state),
        'AutoEncoder2': AutoEncoder(batch_size=100, random_state=random_state),
        'VAE2': VAE(batch_size=100, random_state=random_state),
    }


def models(data, col, file_name, mode):
    """Score ``data`` with several pyod detectors and save the scores as CSV.

    Rows with a missing value in any of ``col`` are dropped, the ``col``
    columns are min-max scaled, every detector of the chosen mode is fitted
    on them, and each detector's training scores are stored (scaled to
    [0, 1]) in a ``<name>_Score`` column. The resulting frame is written to
    ``file_name`` and printed.

    Because ``normalize_model`` rescales in place, the saved frame contains
    the scaled feature values, not the raw ones.

    Args:
        data: Input DataFrame; it is not modified.
        col: List of feature column labels to fit the detectors on.
        file_name: Destination path of the CSV (written as utf-8-sig).
        mode: ``"fast"`` for the quick detector subset, anything else for
            the full set.

    Returns:
        None.
    """
    data_NoNull = data.dropna(axis=0, how='any', subset=col)
    if data_NoNull.empty:
        # The scaler rejects an empty sample, so there is nothing to score
        # or save; skip instead of aborting the caller's cluster loop.
        print(file_name, "skipped: no rows left after dropping missing values")
        return
    data_Normed = normalize_model(data_NoNull, col)
    random_state = np.random.RandomState(1129)
    n_clusters = 10
    # The detectors only ever see the ``col`` columns, so the network widths
    # are derived from that feature count rather than the frame's width.
    ae_neurons, vae_neurons = _pick_hidden_layers(len(col))
    print("Competing with conventional unsupervised outlier detection algorithms...")
    classifiers = _build_classifiers(mode, n_clusters, random_state, ae_neurons, vae_neurons)
    for clf_name, clf in classifiers.items():
        print(f"Using {clf_name} method")
        starttime = time.time()
        try:
            clf.fit(data_Normed[col])
        except Exception as exc:
            # Best effort: a detector that cannot handle this input is
            # skipped, but the reason is reported instead of being hidden.
            print(f"{clf_name} failed and is skipped: {exc!r}")
            continue
        time_taken = time.time() - starttime
        print(f"Using time {time_taken}")
        # Replace NaN / infinite scores with 0 in one vectorised pass.
        test_scores = np.asarray(clf.decision_scores_, dtype=float)
        test_scores = np.where(np.isfinite(test_scores), test_scores, 0.0)
        score_col = f"{clf_name}_Score"
        data_NoNull[score_col] = normalize_model(
            pd.DataFrame(test_scores, columns=[score_col], index=data_NoNull.index),
            [score_col],
        )
    data_NoNull.to_csv(file_name, encoding="utf-8-sig")
    print(data_NoNull)
    print(file_name, "保存成功!!!")
def wrong_in_price(data, mode="fast"):
    """Run the detector ensemble on price features, one cluster at a time.

    For every distinct value of the ``volumn_cluster`` column (in
    ``value_counts`` order) the matching rows are passed to ``models`` twice:
    first for abnormal price changes, then for abnormally high prices. Each
    call writes its own CSV file.

    Args:
        data: DataFrame containing ``volumn_cluster`` and the feature columns
            named below.
        mode: Forwarded to ``models`` (``"fast"`` or anything else).
    """
    cluster_ids = data.value_counts("volumn_cluster").index

    def score_cluster(cluster_id, features, file_name):
        models(data[data["volumn_cluster"] == cluster_id], features, file_name, mode)

    # Pass 1: abnormal price changes.
    for cluster_id in cluster_ids:
        score_cluster(cluster_id, ["Growth_price"], f"价格异常变动cluster{cluster_id}.csv")
    print("价格异常变动检测完毕!!!")

    # Pass 2: abnormally high prices.
    for cluster_id in cluster_ids:
        score_cluster(
            cluster_id,
            ["Rate_price", "Credit_Score", "Reputation_Score", "ave_price_x"],
            f"价格过高异常cluster{cluster_id}.csv",
        )
    print("过高价格检测完毕!!!")
def wrong_in_volumn(data, mode="fast"):
    """Run the detector ensemble on sales-volume features, one cluster at a time.

    For every distinct value of the ``volumn_cluster`` column (in
    ``value_counts`` order) the matching rows are passed to ``models`` twice:
    first for abnormal volume changes, then for abnormally high volumes. Each
    call writes its own CSV file.

    Args:
        data: DataFrame containing ``volumn_cluster`` and the feature columns
            named below.
        mode: Forwarded to ``models`` (``"fast"`` or anything else).
    """
    cluster_ids = data.value_counts("volumn_cluster").index

    def score_cluster(cluster_id, features, file_name):
        models(data[data["volumn_cluster"] == cluster_id], features, file_name, mode)

    # Pass 1: abnormal sales-volume changes.
    for cluster_id in cluster_ids:
        score_cluster(cluster_id, ["Growth_volumn"], f"销量异常变动cluster{cluster_id}.csv")
    print("销量异常变动检测完毕!!!")

    # Pass 2: abnormally high sales volume.
    for cluster_id in cluster_ids:
        score_cluster(
            cluster_id,
            ["SHOP_SALES_VOLUME_x", "Credit_Score", "Reputation_Score", "Rate_volumn"],
            f"销量过高异常cluster{cluster_id}.csv",
        )
    print("过高销量检测完毕!!!")
每种模型的时间对比如下:
单个方法实现
除了上文的多模型联合,我们还针对1w的商品信息(数据链接:https://pan.baidu.com/s/1KatV_6ozYHjPkNjfVGBPmw 提取码:ee8i )尝试了几个单模型方法,流程上大同小异:
IForest
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from numpy import percentile
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from pyod.models.iforest import IForest
def IForest_test(data, contamination=0.1):
    """Fit an Isolation Forest on ``data``, label outliers and plot them.

    Args:
        data: DataFrame of features to fit on. An ``outlier`` column holding
            the predicted labels is added to it in place.
        contamination: Passed to the detector and also used as the
            percentile (times 100) that defines the plotting threshold.
    """
    detector = IForest(contamination=contamination, random_state=0)
    detector.fit(data)

    # Score and label before the extra column is attached to the frame.
    # The sign of the scores is flipped before thresholding.
    flipped_scores = detector.decision_function(data) * -1
    labels = detector.predict(data)

    outlier_total = np.count_nonzero(labels == 1)
    # Everything that is not flagged (label 0) counts as an inlier.
    inlier_total = len(labels) - np.count_nonzero(labels)

    data['outlier'] = labels.tolist()
    print('OUTLIERS: ', outlier_total, 'INLIERS: ', inlier_total)

    cutoff = np.percentile(flipped_scores, 100 * contamination)
    draw(detector, data, cutoff)
def draw(clf, X1, threshold):
    """Scatter inliers and outliers and, when possible, the decision regions.

    Args:
        clf: Fitted detector exposing ``decision_function``.
        X1: DataFrame with ``ITEM_PRICE``, ``ITEM_SALES_AMOUNT`` and an
            ``outlier`` column (1 marks an outlier, 0 an inlier). The plot
            window is fixed to [-0.1, 1.1] on both axes, so the two features
            are presumably already scaled to [0, 1].
        threshold: Cut-off on the negated decision score; drawn as the red
            contour line.
    """
    inlier_mask = X1['outlier'] == 0
    outlier_mask = X1['outlier'] == 1
    inliers_sales = X1.loc[inlier_mask, 'ITEM_PRICE'].values.reshape(-1, 1)
    inliers_amount = X1.loc[inlier_mask, 'ITEM_SALES_AMOUNT'].values.reshape(-1, 1)
    outliers_sales = X1.loc[outlier_mask, 'ITEM_PRICE'].values.reshape(-1, 1)
    outliers_amount = X1.loc[outlier_mask, 'ITEM_SALES_AMOUNT'].values.reshape(-1, 1)

    handles, labels = [], []

    # The decision surface is evaluated on a 2-D grid, which is only valid
    # when the detector was fitted on exactly the two plotted features.
    # Feeding the grid to a detector fitted on more columns raises, so in
    # that case only the scatter plot is drawn.
    feature_cols = [name for name in X1.columns if name != 'outlier']
    if len(feature_cols) == 2 and set(feature_cols) == {'ITEM_PRICE', 'ITEM_SALES_AMOUNT'}:
        xx, yy = np.meshgrid(np.linspace(-0.1, 1.1, 120), np.linspace(-0.1, 1.1, 120))
        # Order the grid columns like the training columns so each axis is
        # scored as the feature it is plotted as.
        axis_values = {'ITEM_PRICE': xx.ravel(), 'ITEM_SALES_AMOUNT': yy.ravel()}
        grid = np.column_stack([axis_values[name] for name in feature_cols])
        Z = (clf.decision_function(grid) * -1).reshape(xx.shape)
        # Blue shading from the lowest grid score up to the threshold.
        plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7), cmap=plt.cm.Blues_r)
        # Red line marks the threshold itself.
        plt.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='red')
        # Orange fill from the threshold up to the highest grid score.
        plt.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')
        # Proxy artist for the legend: ContourSet.collections is deprecated
        # in recent Matplotlib releases.
        handles.append(matplotlib.lines.Line2D([], [], color='red', linewidth=2))
        labels.append('learned decision function')

    b = plt.scatter(inliers_sales, inliers_amount, c='white', s=20, edgecolor='k')
    c = plt.scatter(outliers_sales, outliers_amount, c='black', s=20, edgecolor='k')
    handles.extend([b, c])
    labels.extend(['inliers', 'outliers'])

    plt.axis('tight')
    plt.legend(handles, labels,
               prop=matplotlib.font_manager.FontProperties(size=20), loc='lower right')
    plt.xlim((-0.1, 1.1))
    plt.ylim((-0.1, 1.1))
    plt.title('Isolation Forest')
    plt.show()
def normalize(file, group):
    """Min-max scale the columns at positions ``group`` into [0, 1] in place.

    Args:
        file: DataFrame to rescale; it is modified directly, no copy is made.
        group: List of integer column positions to rescale.

    Returns:
        The same ``file`` object.
    """
    target_cols = file.columns[group]
    scaler = MinMaxScaler(feature_range=(0, 1))
    file[target_cols] = scaler.fit_transform(file[target_cols])
    return file
def main():
    """Load the sample TSV, scale it and run Isolation Forest on both subsets.

    Rows are split by whether any of three columns is missing; each subset
    is scaled and scored on its own set of column positions.
    """
    nullable_cols = ["ITEM_FAV_NUM", "TOTAL_EVAL_NUM", "ITEM_STOCK"]
    complete_positions = [5, 6, 7, 13, 14, 15]
    partial_positions = [5, 6, 7]

    raw = pd.read_csv(r"../data_202106_head.tsv", encoding="utf-8", sep="\t")

    # Rows with all three columns present.
    complete_rows = raw.dropna(axis=0, how='any', subset=nullable_cols)
    # Rows with at least one of the three columns missing.
    partial_rows = raw[raw[nullable_cols].isnull().any(axis=1)]

    complete_rows = normalize(complete_rows, complete_positions)
    normalize(partial_rows, partial_positions)  # scales partial_rows in place

    IForest_test(complete_rows[complete_rows.columns[complete_positions]], contamination=0.05)
    IForest_test(partial_rows[partial_rows.columns[partial_positions]], contamination=0.05)


if __name__ == '__main__':
    main()
KNN
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from sklearn.preprocessing import MinMaxScaler
from pyod.models.knn import KNN
import warnings
warnings.filterwarnings("ignore")
def KNN_test(X, contamination=0.1, plot=False):
    """Fit a pyod KNN detector on ``X``, label outliers and report the counts.

    Args:
        X: DataFrame of features to fit on. An ``outlier`` column
            (1 = outlier, 0 = inlier) is added to it in place.
        contamination: Passed to the detector; when plotting it is also the
            percentile (times 100) that defines the threshold.
        plot: When true, open an 8x8 figure and pass the fitted detector to
            ``draw``. Off by default, matching the original script where the
            ``draw`` call was disabled.
    """
    clf = KNN(contamination=contamination)
    clf.fit(X)
    y_pred = clf.predict(X)
    # The negated scores are only needed for the plot threshold; compute
    # them before the label column is attached to X.
    scores_pred = clf.decision_function(X) * -1 if plot else None

    n_outliers = np.count_nonzero(y_pred == 1)
    n_inliers = len(y_pred) - np.count_nonzero(y_pred)

    X['outlier'] = y_pred.tolist()
    print('OUTLIERS: ', n_outliers, 'INLIERS: ', n_inliers)

    if plot:
        plt.figure(figsize=(8, 8))
        threshold = np.percentile(scores_pred, 100 * contamination)
        draw(clf, X, threshold)
def draw(clf, X1, threshold):
    """Scatter inliers and outliers and, when possible, the decision regions.

    Args:
        clf: Fitted detector exposing ``decision_function``.
        X1: DataFrame with ``ITEM_PRICE``, ``ITEM_SALES_AMOUNT`` and an
            ``outlier`` column (1 marks an outlier, 0 an inlier). The plot
            window is fixed to [-0.1, 1.1] on both axes, so the two features
            are presumably already scaled to [0, 1].
        threshold: Cut-off on the negated decision score; drawn as the red
            contour line.
    """
    inlier_mask = X1['outlier'] == 0
    outlier_mask = X1['outlier'] == 1
    inliers_sales = X1.loc[inlier_mask, 'ITEM_PRICE'].values.reshape(-1, 1)
    inliers_amount = X1.loc[inlier_mask, 'ITEM_SALES_AMOUNT'].values.reshape(-1, 1)
    outliers_sales = X1.loc[outlier_mask, 'ITEM_PRICE'].values.reshape(-1, 1)
    outliers_amount = X1.loc[outlier_mask, 'ITEM_SALES_AMOUNT'].values.reshape(-1, 1)

    handles, labels = [], []

    # The decision surface is evaluated on a 2-D grid, which is only valid
    # when the detector was fitted on exactly the two plotted features.
    # Feeding the grid to a detector fitted on more columns raises, so in
    # that case only the scatter plot is drawn.
    feature_cols = [name for name in X1.columns if name != 'outlier']
    if len(feature_cols) == 2 and set(feature_cols) == {'ITEM_PRICE', 'ITEM_SALES_AMOUNT'}:
        xx, yy = np.meshgrid(np.linspace(-0.1, 1.1, 120), np.linspace(-0.1, 1.1, 120))
        # Order the grid columns like the training columns so each axis is
        # scored as the feature it is plotted as.
        axis_values = {'ITEM_PRICE': xx.ravel(), 'ITEM_SALES_AMOUNT': yy.ravel()}
        grid = np.column_stack([axis_values[name] for name in feature_cols])
        Z = (clf.decision_function(grid) * -1).reshape(xx.shape)
        # Blue shading from the lowest grid score up to the threshold.
        plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7), cmap=plt.cm.Blues_r)
        # Red line marks the threshold itself.
        plt.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='red')
        # Orange fill from the threshold up to the highest grid score.
        plt.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')
        # Proxy artist for the legend: ContourSet.collections is deprecated
        # in recent Matplotlib releases.
        handles.append(matplotlib.lines.Line2D([], [], color='red', linewidth=2))
        labels.append('learned decision function')

    b = plt.scatter(inliers_sales, inliers_amount, c='white', s=20, edgecolor='k')
    c = plt.scatter(outliers_sales, outliers_amount, c='black', s=20, edgecolor='k')
    handles.extend([b, c])
    labels.extend(['inliers', 'outliers'])

    plt.axis('tight')
    plt.legend(handles, labels,
               prop=matplotlib.font_manager.FontProperties(size=20), loc='lower right')
    plt.xlim((-0.1, 1.1))
    plt.ylim((-0.1, 1.1))
    plt.title('K Nearest Neighbors (KNN)')
    plt.show()
def normalize(file, group):
    """Min-max scale the columns at positions ``group`` into [0, 1] in place.

    Args:
        file: DataFrame to rescale; it is modified directly, no copy is made.
        group: List of integer column positions to rescale.

    Returns:
        The same ``file`` object.
    """
    target_cols = file.columns[group]
    scaler = MinMaxScaler(feature_range=(0, 1))
    file[target_cols] = scaler.fit_transform(file[target_cols])
    return file
def main():
    """Load the sample TSV, scale it and run the KNN detector on both subsets.

    Rows are split by whether any of three columns is missing; each subset
    is scaled and scored on its own set of column positions.
    """
    nullable_cols = ["ITEM_FAV_NUM", "TOTAL_EVAL_NUM", "ITEM_STOCK"]
    complete_positions = [5, 6, 7, 13, 14, 15]
    partial_positions = [5, 6, 7]

    raw = pd.read_csv(r"../data_202106_head.tsv", encoding="utf-8", sep="\t")

    # Rows with all three columns present.
    complete_rows = raw.dropna(axis=0, how='any', subset=nullable_cols)
    # Rows with at least one of the three columns missing.
    partial_rows = raw[raw[nullable_cols].isnull().any(axis=1)]

    complete_rows = normalize(complete_rows, complete_positions)
    normalize(partial_rows, partial_positions)  # scales partial_rows in place

    KNN_test(complete_rows[complete_rows.columns[complete_positions]], contamination=0.05)
    KNN_test(partial_rows[partial_rows.columns[partial_positions]], contamination=0.05)


if __name__ == '__main__':
    main()
CBLOF
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from numpy import percentile
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from pyod.models.cblof import CBLOF
def CBLOF_test(data, contamination=0.1):
    """Fit a CBLOF detector on ``data``, label outliers and plot them.

    Args:
        data: DataFrame of features to fit on. An ``outlier`` column holding
            the predicted labels is added to it in place.
        contamination: Passed to the detector and also used as the
            percentile (times 100) that defines the plotting threshold.
    """
    detector = CBLOF(contamination=contamination, check_estimator=False, random_state=0)
    detector.fit(data)

    # Score and label before the extra column is attached to the frame.
    # The sign of the scores is flipped before thresholding.
    flipped_scores = detector.decision_function(data) * -1
    labels = detector.predict(data)

    outlier_total = np.count_nonzero(labels == 1)
    # Everything that is not flagged (label 0) counts as an inlier.
    inlier_total = len(labels) - np.count_nonzero(labels)

    data['outlier'] = labels.tolist()
    print('OUTLIERS: ', outlier_total, 'INLIERS: ', inlier_total)

    cutoff = np.percentile(flipped_scores, 100 * contamination)
    draw(detector, data, cutoff)
def draw(clf, X1, threshold):
    """Scatter inliers and outliers and, when possible, the decision regions.

    Args:
        clf: Fitted detector exposing ``decision_function``.
        X1: DataFrame with ``ITEM_PRICE``, ``ITEM_SALES_AMOUNT`` and an
            ``outlier`` column (1 marks an outlier, 0 an inlier). The plot
            window is fixed to [-0.1, 1.1] on both axes, so the two features
            are presumably already scaled to [0, 1].
        threshold: Cut-off on the negated decision score; drawn as the red
            contour line.
    """
    inlier_mask = X1['outlier'] == 0
    outlier_mask = X1['outlier'] == 1
    inliers_sales = X1.loc[inlier_mask, 'ITEM_PRICE'].values.reshape(-1, 1)
    inliers_amount = X1.loc[inlier_mask, 'ITEM_SALES_AMOUNT'].values.reshape(-1, 1)
    outliers_sales = X1.loc[outlier_mask, 'ITEM_PRICE'].values.reshape(-1, 1)
    outliers_amount = X1.loc[outlier_mask, 'ITEM_SALES_AMOUNT'].values.reshape(-1, 1)

    handles, labels = [], []

    # The decision surface is evaluated on a 2-D grid, which is only valid
    # when the detector was fitted on exactly the two plotted features.
    # Feeding the grid to a detector fitted on more columns raises, so in
    # that case only the scatter plot is drawn.
    feature_cols = [name for name in X1.columns if name != 'outlier']
    if len(feature_cols) == 2 and set(feature_cols) == {'ITEM_PRICE', 'ITEM_SALES_AMOUNT'}:
        xx, yy = np.meshgrid(np.linspace(-0.1, 1.1, 120), np.linspace(-0.1, 1.1, 120))
        # Order the grid columns like the training columns so each axis is
        # scored as the feature it is plotted as.
        axis_values = {'ITEM_PRICE': xx.ravel(), 'ITEM_SALES_AMOUNT': yy.ravel()}
        grid = np.column_stack([axis_values[name] for name in feature_cols])
        Z = (clf.decision_function(grid) * -1).reshape(xx.shape)
        # Blue shading from the lowest grid score up to the threshold.
        plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7), cmap=plt.cm.Blues_r)
        # Red line marks the threshold itself.
        plt.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='red')
        # Orange fill from the threshold up to the highest grid score.
        plt.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')
        # Proxy artist for the legend: ContourSet.collections is deprecated
        # in recent Matplotlib releases.
        handles.append(matplotlib.lines.Line2D([], [], color='red', linewidth=2))
        labels.append('learned decision function')

    b = plt.scatter(inliers_sales, inliers_amount, c='white', s=20, edgecolor='k')
    c = plt.scatter(outliers_sales, outliers_amount, c='black', s=20, edgecolor='k')
    handles.extend([b, c])
    labels.extend(['inliers', 'outliers'])

    plt.axis('tight')
    plt.legend(handles, labels,
               prop=matplotlib.font_manager.FontProperties(size=20), loc='lower right')
    plt.xlim((-0.1, 1.1))
    plt.ylim((-0.1, 1.1))
    plt.title('Cluster-based Local Outlier Factor (CBLOF)')
    plt.show()
def normalize(file, group):
    """Min-max scale the columns at positions ``group`` into [0, 1] in place.

    The leftover debug output (dumping the whole frame and the position list
    on every call) has been removed so this copy behaves like the other
    ``normalize`` helpers in the article.

    Args:
        file: DataFrame to rescale; it is modified directly, no copy is made.
        group: List of integer column positions to rescale.

    Returns:
        The same ``file`` object.
    """
    target_cols = file.columns[group]
    min_max = MinMaxScaler(feature_range=(0, 1))
    file[target_cols] = min_max.fit_transform(file[target_cols])
    return file
def main():
    """Load the sample TSV, scale it and run the CBLOF detector on both subsets.

    Rows are split by whether any of three columns is missing; each subset
    is scaled and scored on its own set of column positions.
    """
    nullable_cols = ["ITEM_FAV_NUM", "TOTAL_EVAL_NUM", "ITEM_STOCK"]
    complete_positions = [5, 6, 7, 13, 14, 15]
    partial_positions = [5, 6, 7]

    raw = pd.read_csv(r"../data_202106_head.tsv", encoding="utf-8", sep="\t")

    # Rows with all three columns present.
    complete_rows = raw.dropna(axis=0, how='any', subset=nullable_cols)
    # Rows with at least one of the three columns missing.
    partial_rows = raw[raw[nullable_cols].isnull().any(axis=1)]

    complete_rows = normalize(complete_rows, complete_positions)
    normalize(partial_rows, partial_positions)  # scales partial_rows in place

    CBLOF_test(complete_rows[complete_rows.columns[complete_positions]], contamination=0.05)
    CBLOF_test(partial_rows[partial_rows.columns[partial_positions]], contamination=0.05)


if __name__ == '__main__':
    main()
MCD
from pyod.models.mcd import MCD
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings("ignore")
def MCD_test(X, contamination=0.1):
    """Fit a pyod MCD detector on ``X``, label outliers and plot them.

    Args:
        X: DataFrame of features to fit on. An ``outlier`` column
            (1 = outlier, 0 = inlier) is added to it in place.
        contamination: Passed to the detector and also used as the
            percentile (times 100) that defines the plotting threshold.
    """
    clf = MCD(contamination=contamination)
    clf.fit(X)

    # Score and label before the extra column is attached to X.
    # The sign of the scores is flipped before thresholding.
    scores_pred = clf.decision_function(X) * -1
    y_pred = clf.predict(X)

    n_outliers = np.count_nonzero(y_pred == 1)
    n_inliers = len(y_pred) - np.count_nonzero(y_pred)

    # The figure is opened here; ``draw`` plots into it.
    plt.figure(figsize=(8, 8))
    X['outlier'] = y_pred.tolist()
    print('OUTLIERS: ', n_outliers, 'INLIERS: ', n_inliers)

    threshold = np.percentile(scores_pred, 100 * contamination)
    draw(clf, X, threshold)
def draw(clf, X1, threshold):
    """Scatter inliers and outliers and, when possible, the decision regions.

    Args:
        clf: Fitted detector exposing ``decision_function``.
        X1: DataFrame with ``ITEM_PRICE``, ``ITEM_SALES_AMOUNT`` and an
            ``outlier`` column (1 marks an outlier, 0 an inlier). The plot
            window is fixed to [-0.1, 1.1] on both axes, so the two features
            are presumably already scaled to [0, 1].
        threshold: Cut-off on the negated decision score; drawn as the red
            contour line.
    """
    inlier_mask = X1['outlier'] == 0
    outlier_mask = X1['outlier'] == 1
    inliers_sales = X1.loc[inlier_mask, 'ITEM_PRICE'].values.reshape(-1, 1)
    inliers_amount = X1.loc[inlier_mask, 'ITEM_SALES_AMOUNT'].values.reshape(-1, 1)
    outliers_sales = X1.loc[outlier_mask, 'ITEM_PRICE'].values.reshape(-1, 1)
    outliers_amount = X1.loc[outlier_mask, 'ITEM_SALES_AMOUNT'].values.reshape(-1, 1)

    handles, labels = [], []

    # The decision surface is evaluated on a 2-D grid, which is only valid
    # when the detector was fitted on exactly the two plotted features.
    # Feeding the grid to a detector fitted on more columns raises, so in
    # that case only the scatter plot is drawn.
    feature_cols = [name for name in X1.columns if name != 'outlier']
    if len(feature_cols) == 2 and set(feature_cols) == {'ITEM_PRICE', 'ITEM_SALES_AMOUNT'}:
        xx, yy = np.meshgrid(np.linspace(-0.1, 1.1, 120), np.linspace(-0.1, 1.1, 120))
        # Order the grid columns like the training columns so each axis is
        # scored as the feature it is plotted as.
        axis_values = {'ITEM_PRICE': xx.ravel(), 'ITEM_SALES_AMOUNT': yy.ravel()}
        grid = np.column_stack([axis_values[name] for name in feature_cols])
        Z = (clf.decision_function(grid) * -1).reshape(xx.shape)
        # Blue shading from the lowest grid score up to the threshold.
        plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7), cmap=plt.cm.Blues_r)
        # Red line marks the threshold itself.
        plt.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='red')
        # Orange fill from the threshold up to the highest grid score.
        plt.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')
        # Proxy artist for the legend: ContourSet.collections is deprecated
        # in recent Matplotlib releases.
        handles.append(matplotlib.lines.Line2D([], [], color='red', linewidth=2))
        labels.append('learned decision function')

    b = plt.scatter(inliers_sales, inliers_amount, c='white', s=20, edgecolor='k')
    c = plt.scatter(outliers_sales, outliers_amount, c='black', s=20, edgecolor='k')
    handles.extend([b, c])
    labels.extend(['inliers', 'outliers'])

    plt.axis('tight')
    plt.legend(handles, labels,
               prop=matplotlib.font_manager.FontProperties(size=20), loc='lower right')
    plt.xlim((-0.1, 1.1))
    plt.ylim((-0.1, 1.1))
    # This script fits an MCD detector; the title previously said KNN.
    plt.title('Minimum Covariance Determinant (MCD)')
    plt.show()
def normalize(file, group):
    """Min-max scale the columns at positions ``group`` into [0, 1] in place.

    Args:
        file: DataFrame to rescale; it is modified directly, no copy is made.
        group: List of integer column positions to rescale.

    Returns:
        The same ``file`` object.
    """
    target_cols = file.columns[group]
    scaler = MinMaxScaler(feature_range=(0, 1))
    file[target_cols] = scaler.fit_transform(file[target_cols])
    return file
def main():
    """Load the sample TSV, scale it and run the MCD detector on both subsets.

    Rows are split by whether any of three columns is missing; each subset
    is scaled and scored on its own set of column positions.
    """
    nullable_cols = ["ITEM_FAV_NUM", "TOTAL_EVAL_NUM", "ITEM_STOCK"]
    complete_positions = [5, 6, 7, 13, 14, 15]
    partial_positions = [5, 6, 7]

    raw = pd.read_csv(r"../data_202106_head.tsv", encoding="utf-8", sep="\t")

    # Rows with all three columns present.
    complete_rows = raw.dropna(axis=0, how='any', subset=nullable_cols)
    # Rows with at least one of the three columns missing.
    partial_rows = raw[raw[nullable_cols].isnull().any(axis=1)]

    complete_rows = normalize(complete_rows, complete_positions)
    normalize(partial_rows, partial_positions)  # scales partial_rows in place

    MCD_test(complete_rows[complete_rows.columns[complete_positions]], contamination=0.05)
    MCD_test(partial_rows[partial_rows.columns[partial_positions]], contamination=0.05)


if __name__ == '__main__':
    main()
PCA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from sklearn.preprocessing import MinMaxScaler
from pyod.models.pca import PCA
import warnings
warnings.filterwarnings("ignore")
def KNN_test(X, contamination=0.1, plot=False):
    """Fit a pyod PCA detector on ``X``, label outliers and report the counts.

    Despite its name, this function builds a ``PCA`` detector; the name is
    kept because ``main`` calls it.

    Args:
        X: DataFrame of features to fit on. An ``outlier`` column
            (1 = outlier, 0 = inlier) is added to it in place.
        contamination: Passed to the detector; when plotting it is also the
            percentile (times 100) that defines the threshold.
        plot: When true, open an 8x8 figure and pass the fitted detector to
            ``draw``. Off by default, matching the original script where the
            ``draw`` call was disabled.
    """
    clf = PCA(contamination=contamination)
    clf.fit(X)
    y_pred = clf.predict(X)
    # The negated scores are only needed for the plot threshold; compute
    # them before the label column is attached to X.
    scores_pred = clf.decision_function(X) * -1 if plot else None

    n_outliers = np.count_nonzero(y_pred == 1)
    n_inliers = len(y_pred) - np.count_nonzero(y_pred)

    X['outlier'] = y_pred.tolist()
    print('OUTLIERS: ', n_outliers, 'INLIERS: ', n_inliers)

    if plot:
        plt.figure(figsize=(8, 8))
        threshold = np.percentile(scores_pred, 100 * contamination)
        draw(clf, X, threshold)
def draw(clf, X1, threshold):
    """Scatter inliers and outliers and, when possible, the decision regions.

    Args:
        clf: Fitted detector exposing ``decision_function``.
        X1: DataFrame with ``ITEM_PRICE``, ``ITEM_SALES_AMOUNT`` and an
            ``outlier`` column (1 marks an outlier, 0 an inlier). The plot
            window is fixed to [-0.1, 1.1] on both axes, so the two features
            are presumably already scaled to [0, 1].
        threshold: Cut-off on the negated decision score; drawn as the red
            contour line.
    """
    inlier_mask = X1['outlier'] == 0
    outlier_mask = X1['outlier'] == 1
    inliers_sales = X1.loc[inlier_mask, 'ITEM_PRICE'].values.reshape(-1, 1)
    inliers_amount = X1.loc[inlier_mask, 'ITEM_SALES_AMOUNT'].values.reshape(-1, 1)
    outliers_sales = X1.loc[outlier_mask, 'ITEM_PRICE'].values.reshape(-1, 1)
    outliers_amount = X1.loc[outlier_mask, 'ITEM_SALES_AMOUNT'].values.reshape(-1, 1)

    handles, labels = [], []

    # The decision surface is evaluated on a 2-D grid, which is only valid
    # when the detector was fitted on exactly the two plotted features.
    # Feeding the grid to a detector fitted on more columns raises, so in
    # that case only the scatter plot is drawn.
    feature_cols = [name for name in X1.columns if name != 'outlier']
    if len(feature_cols) == 2 and set(feature_cols) == {'ITEM_PRICE', 'ITEM_SALES_AMOUNT'}:
        xx, yy = np.meshgrid(np.linspace(-0.1, 1.1, 120), np.linspace(-0.1, 1.1, 120))
        # Order the grid columns like the training columns so each axis is
        # scored as the feature it is plotted as.
        axis_values = {'ITEM_PRICE': xx.ravel(), 'ITEM_SALES_AMOUNT': yy.ravel()}
        grid = np.column_stack([axis_values[name] for name in feature_cols])
        Z = (clf.decision_function(grid) * -1).reshape(xx.shape)
        # Blue shading from the lowest grid score up to the threshold.
        plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7), cmap=plt.cm.Blues_r)
        # Red line marks the threshold itself.
        plt.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='red')
        # Orange fill from the threshold up to the highest grid score.
        plt.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')
        # Proxy artist for the legend: ContourSet.collections is deprecated
        # in recent Matplotlib releases.
        handles.append(matplotlib.lines.Line2D([], [], color='red', linewidth=2))
        labels.append('learned decision function')

    b = plt.scatter(inliers_sales, inliers_amount, c='white', s=20, edgecolor='k')
    c = plt.scatter(outliers_sales, outliers_amount, c='black', s=20, edgecolor='k')
    handles.extend([b, c])
    labels.extend(['inliers', 'outliers'])

    plt.axis('tight')
    plt.legend(handles, labels,
               prop=matplotlib.font_manager.FontProperties(size=20), loc='lower right')
    plt.xlim((-0.1, 1.1))
    plt.ylim((-0.1, 1.1))
    # This script fits a PCA detector; the title previously said KNN.
    plt.title('Principal Component Analysis (PCA)')
    plt.show()
def normalize(file, group):
    """Min-max scale the columns at positions ``group`` into [0, 1] in place.

    Args:
        file: DataFrame to rescale; it is modified directly, no copy is made.
        group: List of integer column positions to rescale.

    Returns:
        The same ``file`` object.
    """
    target_cols = file.columns[group]
    scaler = MinMaxScaler(feature_range=(0, 1))
    file[target_cols] = scaler.fit_transform(file[target_cols])
    return file
def main():
    """Load the sample TSV, scale it and run the PCA detector on both subsets.

    Rows are split by whether any of three columns is missing; each subset
    is scaled and scored on its own set of column positions. The detector
    entry point is named ``KNN_test`` in this script.
    """
    nullable_cols = ["ITEM_FAV_NUM", "TOTAL_EVAL_NUM", "ITEM_STOCK"]
    complete_positions = [5, 6, 7, 13, 14, 15]
    partial_positions = [5, 6, 7]

    raw = pd.read_csv(r"../data_202106_head.tsv", encoding="utf-8", sep="\t")

    # Rows with all three columns present.
    complete_rows = raw.dropna(axis=0, how='any', subset=nullable_cols)
    # Rows with at least one of the three columns missing.
    partial_rows = raw[raw[nullable_cols].isnull().any(axis=1)]

    complete_rows = normalize(complete_rows, complete_positions)
    normalize(partial_rows, partial_positions)  # scales partial_rows in place

    KNN_test(complete_rows[complete_rows.columns[complete_positions]], contamination=0.05)
    KNN_test(partial_rows[partial_rows.columns[partial_positions]], contamination=0.05)


if __name__ == '__main__':
    main()
其他的就不赘述了,大同小异。