DM05-奇异点处理总结

sklearn的奇异点检测

这里写图片描述
可以考虑四种方法去处理这个问题。

经过词典及停用词过滤并分词

收集文本,本次选择母婴主题的文章,建立相关词典及停词之后,再进行分词:
这里写图片描述

hash向量化并作降维

文本向量化,由于文本语料比较大,采用hash向量化。

class ForHash(object):
    """Iterable corpus reader for feeding HashingVectorizer.

    Yields "title content" strings for at most ``n`` lines of a
    semicolon-separated corpus file.  As a side effect of iteration the
    integer label (field 0 of each line) is appended to ``self.y``.
    """

    def __init__(self, in_file, n):
        # in_file: path to the ';'-separated corpus file (label;...;title;body)
        # n: maximum number of documents to yield
        self.in_file = in_file
        self.n = n
        self.y = []  # labels collected while iterating

    def __iter__(self):
        # Context manager guarantees the file handle is closed (the original
        # bare open() leaked it).
        with open(self.in_file, encoding='utf-8') as f:
            for line in f:
                if not self.n:
                    # Quota exhausted: stop instead of silently scanning the
                    # rest of a potentially large file.
                    break
                t = line.split(';')
                self.y.append(int(t[0]))
                # Per the original author's note: field 3 is the article
                # title, field 4 is the article body.
                yield t[3] + ' ' + t[4]
                self.n -= 1

def tran_data(R=100, N=2 ** 12):
    """Hash-vectorize the first R corpus documents, PCA-reduce to 256 dims,
    and save the features/labels as ``.npy`` files.

    R: number of documents to read from the corpus.
    N: number of hash features (dimensionality before PCA).
    """
    # HashingVectorizer is chosen because the corpus is large: it needs no
    # in-memory vocabulary.
    vectorizer = HashingVectorizer(n_features=N)
    data = ForHash(ROOTPATH + '/data/corpus/' + '%s.cut' % ('all'), R)
    X = vectorizer.transform(data)
    print('完成X')
    # toarray() densifies the sparse matrix in one step; the original
    # todense()[:, :].A went matrix -> slice -> ndarray for the same result.
    X = X.toarray()
    # data.y is populated as a side effect while transform() iterates data.
    y = np.array(data.y)
    pca = PCA(n_components=256)
    X_pca = pca.fit_transform(X)
    np.save(ROOTPATH + '/data/corpus/' + 'data_X_%d.npy' % (R), X_pca)
    np.save(ROOTPATH + '/data/corpus/' + 'data_y_%d.npy' % (R), y)
    print('完成y')

保存成*.npy文件。

四个奇异点模型计算
# coding=utf-8
import matplotlib.font_manager
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from sklearn import svm, preprocessing
from sklearn.covariance import EllipticEnvelope
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

# Default font: SimHei, so the Chinese suptitle/labels render correctly.
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
# Keep the minus sign '-' from being drawn as an empty box with CJK fonts.
matplotlib.rcParams['axes.unicode_minus'] = False

# Fixed seed so the IsolationForest run below is reproducible.
rng = np.random.RandomState(42)
n_samples = 200            # used as max_samples for IsolationForest
outliers_fraction = 0.1    # assumed share of outliers in the data
clusters_separation = [0, 1, 2]  # NOTE(review): unused below — likely a leftover from the sklearn demo this is based on

# The four outlier-detection models to compare.  Each one's contamination/nu
# parameter is derived from outliers_fraction so they all expect roughly the
# same share of outliers.
classifiers = {
    "One-Class SVM": svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
                                     kernel="rbf", gamma=0.1),
    "Robust covariance": EllipticEnvelope(contamination=outliers_fraction),
    "Isolation Forest": IsolationForest(max_samples=n_samples,
                                        contamination=outliers_fraction,
                                        random_state=rng),
    "Local Outlier Factor": LocalOutlierFactor(
        n_neighbors=35,
        contamination=outliers_fraction)}

# Grid on which each model's decision function is evaluated for plotting.
xx, yy = np.meshgrid(np.linspace(-2, 2, 100), np.linspace(-2, 2, 100))

# Load the PCA-reduced hash features saved by tran_data() (1000 x 256 here).
X = np.load(ROOTPATH + '/data/corpus/' + 'data_X_%d.npy' % (1000))
# Fixed typo in the log label ('shap:' -> 'shape:').
print('shape:', X.shape)
print('X:', X[:1, :])

print('normalize...')
# L2-normalize each document vector so their magnitudes are comparable.
X = preprocessing.normalize(X, norm='l2')
print('pca...')
# Project to 2-D so decision boundaries can be drawn on the grid above.
pca = PCA(n_components=2)
X = pca.fit_transform(X)

plt.figure(figsize=(9, 7))
for i, (clf_name, clf) in enumerate(classifiers.items()):
    # fit the data and tag outliers
    if clf_name == "Local Outlier Factor":
        # LOF has no separate predict(): fit_predict tags the training data
        # directly, and the negative outlier factors serve as scores.
        y_pred = clf.fit_predict(X)
        scores_pred = clf.negative_outlier_factor_
    else:
        clf.fit(X)
        scores_pred = clf.decision_function(X)
        y_pred = clf.predict(X)
    # Score cutoff: the outliers_fraction quantile of the sample scores —
    # points scoring below it are drawn on the outlier side.
    threshold = stats.scoreatpercentile(scores_pred,
                                        100 * outliers_fraction)
    if clf_name == "Local Outlier Factor":
        # NOTE(review): _decision_function is a private sklearn API
        # (0.19-era); newer versions removed it — confirm on upgrade.
        Z = clf._decision_function(np.c_[xx.ravel(), yy.ravel()])
    else:
        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    subplot = plt.subplot(2, 2, i + 1)
    # Blue filled contours: region scored below the threshold.
    subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
                     cmap=plt.cm.Blues_r)
    # Red line: the learned decision boundary at the threshold.
    a = subplot.contour(xx, yy, Z, levels=[threshold],
                        linewidths=2, colors='red')
    # Orange fill: region scored above the threshold.
    subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()],
                     colors='orange')
    # White dots: predicted inliers (label 1); black dots: outliers (-1).
    b = subplot.scatter(X[y_pred == 1, 0], X[y_pred == 1, 1], c='white',
                        s=20, edgecolor='k')
    c = subplot.scatter(X[y_pred == -1, 0], X[y_pred == -1, 1], c='black',
                        s=20, edgecolor='k')
    subplot.axis('tight')
    subplot.legend(
        [a.collections[0], b, c],
        ['learned decision function', 'regular inliers', 'abnormal outliers'],
        prop=matplotlib.font_manager.FontProperties(size=10),
        loc='lower right')
    subplot.set_xlabel("%d. %s" % (i + 1, clf_name))
plt.subplots_adjust(0.04, 0.1, 0.96, 0.94, 0.1, 0.26)
plt.suptitle("婴儿文章分类奇异点分析")

plt.show()
运行结果

这里写图片描述

shap: (1000, 256)
X: [[ -3.45411337e-02   4.86248015e-01   3.65496229e-02   8.50815275e-02
   -5.19749721e-02  -1.16925828e-02  -7.34607193e-02   1.65651158e-01
   -1.31629096e-01  -1.74638800e-01  -1.60052187e-02   4.03688810e-01
    2.42585151e-01   1.52919377e-01  -1.24645165e-01   5.83321312e-03
   -1.48419340e-02   1.34481031e-01   3.66996154e-03   3.23902195e-02
   -7.89237336e-02  -1.51580351e-02  -1.76917479e-03   3.28018899e-02
    3.36829459e-02  -1.15863109e-02   3.72514349e-02  -2.70418580e-03
   -7.37516898e-03  -3.68969879e-02  -1.04074658e-02   1.66102467e-04
   -5.95340364e-02  -1.83078064e-02   2.30508127e-02   2.37664616e-02
   -1.67538223e-02  -1.01534977e-03   1.42186630e-02  -5.69730784e-03
    1.34286601e-02   1.00701730e-02  -3.87582299e-03   1.47002871e-02
    1.52362999e-02   2.71289382e-03  -4.31252892e-02  -7.20457934e-03
   -1.58855913e-02   5.34313606e-03   6.38473558e-03   1.51416166e-02
    4.84966944e-03   3.36798352e-03  -2.31231073e-02  -2.31017405e-03
    1.32852326e-02  -3.06326743e-02   2.48876282e-02  -4.54163232e-03
    1.11865358e-04   2.40459164e-02   9.41057928e-04   4.13050466e-03
    8.78487819e-03   1.26569294e-02   1.21086785e-02  -1.35378985e-03
    1.73430761e-02  -1.81448588e-02   1.20429369e-02  -1.35287641e-02
   -3.07067659e-03  -8.57009859e-03  -1.70300947e-02  -1.21419554e-02
   -3.77661152e-03   1.68064407e-02  -2.37168550e-03   3.02081894e-04
    2.94208735e-02   3.05244205e-03   1.56829286e-02  -2.00402536e-03
    3.10488715e-03   1.67431875e-02   4.37773520e-03  -5.30261586e-03
   -1.18827481e-02  -1.27051448e-02   9.42004757e-03  -1.94108252e-02
    1.42393343e-02   4.53406317e-02   1.73546612e-03  -1.14221371e-02
   -5.79668978e-03   1.65842835e-02  -1.47619572e-02   8.04849706e-03
    9.98531898e-03  -1.40948123e-02  -5.25045974e-03  -3.65689056e-03
    1.65000619e-03   2.66638141e-02   1.10484072e-02   1.86986091e-02
    7.10051704e-03   1.07484566e-02  -1.71402067e-02   1.00205517e-03
   -2.06523678e-02  -7.23253816e-03   2.78610079e-02  -1.44773009e-02
    3.06548583e-02  -1.92142477e-02   2.10011424e-03   1.70134137e-04
    1.42329084e-02  -2.06363024e-02  -6.09332339e-04   6.29057371e-03
    1.69411189e-02   1.07996184e-02  -7.00908422e-04   8.47523589e-03
    8.95803233e-03  -4.46238477e-03   2.71364555e-03  -1.22199674e-02
   -2.19792773e-03  -3.46359886e-03   2.33316185e-02   2.23616967e-02
    1.07848265e-02   8.60374223e-03  -7.75775464e-03   7.68231924e-03
   -1.48827397e-02  -2.20904076e-02  -4.17214383e-03  -4.89461292e-03
    5.72871548e-03   4.16521970e-03   1.26487913e-02   2.46678504e-02
    1.24846938e-02  -5.43862267e-03   5.96307202e-03   2.43883985e-02
    1.31828913e-02  -2.79505922e-05   3.23426799e-02  -1.98114424e-02
    5.64604472e-03   1.98456692e-02   2.02394200e-02   4.27344159e-03
    2.30321433e-02   8.96719076e-03  -1.43201570e-02  -3.41568724e-02
   -1.24891308e-02  -1.58528439e-02   1.83471065e-02   1.64532832e-02
   -1.65468060e-02   3.45518798e-02  -4.32648915e-03   3.13717653e-02
   -4.74949429e-03  -2.36046525e-02   1.89461626e-02   5.97657517e-03
   -3.05636031e-02  -1.47363476e-03   1.33279411e-03  -2.08900180e-03
   -9.83489554e-03  -3.31655175e-02   2.92109538e-02   1.45905893e-02
    3.97159879e-03  -2.05730040e-02   2.47625608e-02  -1.38133811e-02
   -1.08149931e-02  -4.62567013e-02  -3.60707948e-02  -2.09190882e-02
    1.41688620e-02  -1.20039113e-02  -3.41724098e-02  -3.00254492e-02
   -7.52045226e-03   2.78101207e-02  -2.37204130e-02  -9.92166765e-03
   -2.11959086e-02   4.91142770e-02  -6.43086852e-03   3.84405593e-02
    2.15992053e-02   8.14847602e-03   7.95838521e-03   3.02877304e-03
   -2.83792569e-03   1.55858208e-02  -3.31380388e-03   3.17888628e-02
   -8.57665270e-03   1.16501606e-03  -2.00634544e-02  -1.78939820e-02
   -1.96620323e-02  -4.00372356e-02  -1.69711946e-02   1.69427191e-02
   -1.32113194e-02   2.63700515e-02  -7.58584313e-03   3.05405552e-03
    6.54275394e-03  -5.87168213e-03   2.50981901e-02  -3.42103511e-02
   -2.05620099e-02   9.71459707e-03   1.13942746e-02   1.29278894e-02
   -6.00790741e-03   4.27442557e-02  -8.53777373e-03  -7.79271024e-03
   -8.43114301e-03  -1.27781169e-02   2.76924214e-03  -9.35530659e-03
    7.15824366e-04  -1.40369933e-02   3.26816065e-02  -1.16926265e-02
   -5.86385296e-03   1.70579864e-02  -1.36484862e-02   1.53073763e-02
   -1.12915318e-02   2.77235932e-02  -1.48992473e-02  -3.37593835e-02
    9.26707616e-03  -1.38269892e-02   2.29654572e-02  -1.57563017e-03]]
normalize...
pca...
参考

http://sklearn.apachecn.org/cn/0.19.0/auto_examples/covariance/plot_outlier_detection.html#sphx-glr-auto-examples-covariance-plot-outlier-detection-py
http://sklearn.apachecn.org/cn/0.19.0/auto_examples/ensemble/plot_isolation_forest.html#sphx-glr-auto-examples-ensemble-plot-isolation-forest-py
http://sklearn.apachecn.org/cn/0.19.0/modules/outlier_detection.html

作者:happyprince
http://blog.csdn.net/ld326/article/details/79030806

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值