我们以2022年全国服务外包大赛的A03题目作为示例代码演示异常值识别过程。
问题的主要任务是找出商品的销量异常和价格异常。题目提供4个月的商品信息数据,共1700万余条;以及4个月的店铺信息数据,共60万余条,并强调时间复杂度、空间复杂度、异常值识别率和准确率。我们用店铺分析辅助商品异常识别,以提高可信度和准确率。
部分数据链接:https://pan.baidu.com/s/1KatV_6ozYHjPkNjfVGBPmw 提取码:ee8i
在这里我们只对1万条商品信息进行尝试性的分析计算。如果想要更进一步了解其他异常值识别方法,可以看这篇博文:https://editor.csdn.net/md/?articleId=124340047
岭回归
import numpy
import pandas
from sklearn.linear_model import Ridge # 通过sklearn.linermodel加载岭回归方法
from sklearn import model_selection # 加载交叉验证模块
import matplotlib.pyplot as plt # 画图
from sklearn.preprocessing import StandardScaler # 标准化
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
import pandas as pd
# Absolute path to the ~10k-row item TSV sample analysed by this script.
target_file = r"D:\PythonFiles\Service Outsourcing\Distribution testing\ITEM_STOCK已补202106_10000_drop.tsv"
def draw(file, x, y, pre_y):
    """Plot actual values (blue) and ridge predictions (red) against x.

    ``file`` is accepted for interface compatibility but is not used.
    The original gave both curves the same label ('Fitting Curve') and
    never called ``plt.legend()``, so the labels were dead; both are fixed.
    """
    plt.plot(x, y, 'b', label='Actual')
    plt.plot(x, pre_y, 'r', label='Fitted')
    plt.legend()  # without this the labels were never displayed
    plt.show()
def error_calculation(file, pre_y, presentage):
    """Save the worst-predicted ``(1 - presentage)`` fraction of rows to 2.csv.

    ``pre_y`` is an (n, 1) prediction array aligned *positionally* with the
    rows of ``file``. The error column is therefore attached using
    ``file``'s own index: the previous version joined a DataFrame with a
    fresh RangeIndex, which silently misaligned errors whenever ``file``
    had been sorted beforehand (as ``ridge`` does).

    :param file: DataFrame containing an "ITEM_SALES_VOLUME" column.
    :param pre_y: 2-D array-like of predictions, one row per ``file`` row.
    :param presentage: fraction of rows to KEEP OUT; the top
        ``1 - presentage`` by absolute error is written to "2.csv".
    """
    y = numpy.array(file["ITEM_SALES_VOLUME"])
    pre_y = numpy.array(pre_y)
    n = pre_y.shape[0]
    # Align by position: give the error column file's index explicitly.
    diff = pandas.DataFrame(abs(y - pre_y[:, 0]), index=file.index,
                            columns=["error"])
    ranked = file.join(diff).sort_values(by="error", ascending=False)
    ranked.head(int((1 - presentage) * n)).to_csv("2.csv", encoding="utf-8")
def ridge(file):
    """Fit a ridge model on stock/price/favourites/evaluations, print the
    MSE, dump the worst 5% of rows by error, and plot the fit.

    NOTE(review): the regression target is ITEM_SALES_AMOUNT, but
    ``error_calculation`` ranks rows by ITEM_SALES_VOLUME error — confirm
    which column is the intended target.
    """
    file = file.sort_values(by="ITEM_STOCK", ascending=False)
    features = file[["ITEM_STOCK", "ITEM_PRICE", "ITEM_FAV_NUM", "TOTAL_EVAL_NUM"]]
    print(features)
    target = file[["ITEM_SALES_AMOUNT"]]
    model = Ridge(alpha=1)
    model.fit(features, target)
    predictions = model.predict(features)
    print("误差为:\n", mean_squared_error(target, predictions))
    error_calculation(file, predictions, 0.95)
    draw(file, features, target, predictions)
def main():
    """Load the item TSV and run the ridge analysis on the first 10k rows."""
    data = pd.read_csv(target_file, sep="\t", encoding="utf-8")
    ridge(data.head(10000))


if __name__ == '__main__':
    main()
结果如下:
可以发现效果非常非常一般,因为不太适合这个分布的数据。
经验公式拟合
根据最简单的经济学常识,我们知道价格与销量大致是反比关系,因此不太适合岭回归线性模型,或者多项式拟合模型。我们采取经验公式最小二乘法拟合,挑选三种经验公式拟合中效果最佳的。
import numpy as np
import pandas
import pandas as pd
import pylab
import math
from scipy.optimize import curve_fit
def choose(x, y):
    """Fit the three candidate empirical formulas and keep the best one.

    Each model is fitted with non-linear least squares (``curve_fit``);
    the one with the smallest total relative error wins. Prints the three
    totals and the winning parameter vector, then returns
    ``(per_point_relative_error, predictions)`` for the winner.
    """
    results = []
    for model in (func1, func2, func3):
        params, _ = curve_fit(model, x, y)
        predictions = [model(point, *params) for point in x]
        total_error, per_point = error_cul(y, predictions)
        results.append((total_error, params, per_point, predictions))
    print(results[0][0], results[1][0], results[2][0])
    # Tie-break preference mirrors the original if/elif order: func2,
    # then func1, then func3 (min() keeps the first minimal candidate).
    best = min((1, 0, 2), key=lambda idx: results[idx][0])
    print(results[best][1])
    return results[best][2], results[best][3]
def func1(x, a, b, c):
    """Exponential model: a * e^(b*x) + c."""
    scaled = a * np.exp(b * x)
    return scaled + c
def func2(x, a, b, c):
    """Power-law model: a * x^b + c."""
    scaled = a * x ** b
    return scaled + c
def func3(x, a, b, c, d, e):
    """Combined model: power-law term + exponential term + offset."""
    power_term = a * pow(x, b)
    exp_term = c * np.exp(d * x)
    return power_term + exp_term + e
def error_cul(y, pre_y):
    """Return (total relative error, per-point relative error array).

    NOTE(review): divides by ``y`` element-wise — any zero actual value
    yields inf/NaN; confirm upstream data excludes zero sales.
    """
    actual = np.array(y)
    predicted = np.array(pre_y)
    per_point = abs(actual - predicted) / actual
    return np.sum(per_point), per_point
def draw(x, y, y_pred):
    """Scatter the raw (x, y) points and overlay the fitted curve in red."""
    pylab.plot(x, y, '*', label='original values')
    pylab.plot(x, y_pred, 'r', label='fit values')
    pylab.title('')
    pylab.xlabel('')
    pylab.ylabel('')
    # Pin the legend to the lower-left corner of the axes.
    pylab.legend(loc=3, bbox_to_anchor=(0, 0), borderaxespad=0.)
    pylab.show()
def save(file, error_rate, presentage):
    """Write the top ``(1 - presentage)`` fraction of rows by relative error
    to 2.csv.

    ``error_rate`` is aligned *positionally* with ``file``'s rows, so the
    error column is attached using ``file``'s own index. (The previous
    version built the error DataFrame with a fresh RangeIndex and joined
    on it, which misaligned errors whenever ``file`` had been sorted or
    filtered — as ``main`` does before calling this.)

    :param file: DataFrame of item rows.
    :param error_rate: 1-D array of per-row relative errors.
    :param presentage: fraction of rows to exclude from the report.
    """
    n = error_rate.shape[0]
    errors = pandas.DataFrame(error_rate, index=file.index)
    errors.columns = ["error"]
    ranked = file.join(errors).sort_values(by="error", ascending=False)
    ranked.head(int((1 - presentage) * n)).to_csv("2.csv", encoding="utf-8")
def main():
    """Model fav-count vs sales-volume with the best empirical formula,
    plot the fit, and save the worst 5% of rows by relative error."""
    df = pd.read_csv("data_202106_head.tsv", sep="\t", encoding="utf-8")
    # Rows missing the x variable cannot be fitted; drop them up front.
    df = df.dropna(axis=0, how='any', subset=["ITEM_FAV_NUM"])
    df = df.sort_values(by="ITEM_FAV_NUM", ascending=False)
    favs = df["ITEM_FAV_NUM"]
    sales = df["ITEM_SALES_VOLUME"]
    error_rate, fitted = choose(favs, sales)
    draw(favs, sales, fitted)
    save(df, error_rate, 0.95)


if __name__ == '__main__':
    main()
效果如下:
放大之后:
效果比岭回归强点,但仍不尽如人意。而且存在受远端异常值影响严重的问题:
孤立森林
我们先尝试单变量检测,直接删除Nan项。
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.ensemble import IsolationForest # 这里尝试sklearn的包
def Isolation_Forest(file, target, n_estimators = 100):
    """Run a univariate IsolationForest on each column index in ``target``,
    plotting the anomaly-score curve and shading the predicted-outlier
    region for each.

    :param file: DataFrame of item rows.
    :param target: iterable of integer column positions to analyse.
    :param n_estimators: number of trees per forest.
    """
    for i in target:
        col = file.columns[i]
        # Drop NaNs in the column actually being fitted. The original
        # always dropped on "ITEM_FAV_NUM", so NaNs in any other target
        # column still reached IsolationForest.fit and crashed it.
        file_for_this = file.dropna(axis=0, how='any', subset=[col])
        clf = IsolationForest(n_estimators=n_estimators)
        # reshape(-1, 1): fit expects a 2-D (n_samples, 1) feature matrix.
        clf.fit(file_for_this[col].values.reshape(-1, 1))
        # Evenly spaced probe points spanning the column's observed range.
        xx = np.linspace(file_for_this[col].min(), file_for_this[col].max(),
                         len(file_for_this)).reshape(-1, 1)
        anomaly_score = clf.decision_function(xx)  # signed anomaly score
        outlier = clf.predict(xx)  # -1 marks predicted outliers
        plt.figure(figsize=(20, 10))
        plt.plot(xx, anomaly_score, color='r', linewidth=1, label='anomaly score')
        # Shade the x-intervals classified as outliers.
        plt.fill_between(xx.T[0], np.min(anomaly_score), np.max(anomaly_score),
                         alpha=0.4, where=outlier == -1,
                         label='outlier_region')
        plt.legend()
        plt.ylabel('anomaly score')
        plt.xlabel(col)
        print(col, "的异常值范围是:", np.min(anomaly_score), "----", np.max(anomaly_score))
        plt.show()
def main():
    """Load the sample TSV and scan selected numeric columns for outliers."""
    frame = pd.read_csv(r"../data_202106_head.tsv", encoding="utf-8", sep="\t")
    # Column positions to analyse; n_estimators is the tree count.
    columns_to_check = [5, 6, 7, 13, 14, 15]
    Isolation_Forest(frame, columns_to_check, n_estimators=101)


if __name__ == '__main__':
    main()
注意
这部分主要是刚拿到数据的尝试,更多的是熟悉此领域的一些方法思路,整套流程可以在博主的异常值识别专栏中查看。