2.以breast-cancer-unsupervised-ad数据集为例做一些简单的数据可视化。
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
## 1) 载入训练集和测试集;
f=open('breast-cancer-unsupervised-ad.csv')
names=['f' + str(i) for i in range(31)]
names[30]='label'
Train_data = pd.read_csv(f, header=None,names=names)
## 2) 简略观察数据(head()+shape)
Train_data.head()
## 1) 通过describe()来熟悉数据的相关统计量
Train_data.describe()
## 2) 通过info()来熟悉数据类型
Train_data.info()
numeric_features = ['f' + str(i) for i in range(30)]
## 1) 相关性分析
numeric = Train_data[numeric_features]
correlation = numeric.corr()
f , ax = plt.subplots(figsize = (14, 14))
sns.heatmap(correlation,square = True)
plt.title('Correlation of Numeric Features with Price',y=1,size=16)
plt.show()
## 3) 每个数字特征得分布可视化
f = pd.melt(Train_data, value_vars=numeric_features)
g = sns.FacetGrid(f, col="variable", col_wrap=6, sharex=False, sharey=False)
g = g.map(sns.distplot, "value", hist=False, rug=True)
#变量两两之间的相关性
sns.set()
sns.pairplot(Train_data[numeric_features],size = 2 ,kind ='scatter',diag_kind='kde')
plt.savefig('correlation.png')
plt.show()
#数据降维可视化
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, init='pca', random_state=0)
result = tsne.fit_transform(numeric)
x_min, x_max = np.min(result, 0), np.max(result, 0)
result = (result - x_min) / (x_max - x_min)
label = Train_data['label']
fig = plt.figure(figsize = (7, 7))
#f , ax = plt.subplots()
color = {'o':0, 'n':7}
for i in range(result.shape[0]):
plt.text(result[i, 0], result[i, 1], str(label[i]),
color=plt.cm.Set1(color[label[i]] / 10.),
fontdict={'weight': 'bold', 'size': 9})
plt.xticks([])
plt.yticks([])
plt.title('Visualization of data dimension reduction')
使用pyod库生成example并使用该库的pca模块进行检测
from pyod.models.pca import PCA
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print
from pyod.utils.example import visualize
# 使用生成样本数据pyod.utils.data.generate_data():
contamination = 0.1 # percentage of outliers
n_train = 200 # number of training points
n_test = 100 # number of testing points
X_train, y_train, X_test, y_test = generate_data(
n_train=n_train, n_test=n_test, contamination=contamination)
# 初始化pyod.models.PCA.PCA检测器,拟合模型,然后进行预测。
# train PCA detector
clf_name = 'PCA'
clf = PCA() # n_components默认为全部
clf.fit(X_train)
# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_ # raw outlier scores
# get the prediction on the test data
y_test_pred = clf.predict(X_test) # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test) # outlier scores
# 使用ROC和Precision @ Rank n评估预测pyod.utils.data.evaluate_print()。
from pyod.utils.data import evaluate_print
# evaluate and print the results
print("\nPCA On Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nPCA On Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)
# 在培训和测试数据上查看示例输出。
# 通过可视化所有示例中包含的功能来生成可视化。
visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
y_test_pred, show_figure=True, save_figure=True)
PCA On Training Data:
PCA ROC:0.9433, precision @ rank n:0.85
PCA On Test Data:
PCA ROC:0.8644, precision @ rank n:0.8