数据分析的一些基本操作

本文链接：https://blog.csdn.net/ggn_2015/article/details/121110281

前言

机器学习什么的又是我学不明白的东西，学一乐。

划分训练集与测试集

from sklearn.model_selection import train_test_split # 将 X, y 随机划分为训练集和测试集
# Stratified 70/30 split: stratify=y samples within each value of y, so both
# subsets keep the class proportions of y. random_state=0 makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.3, stratify=y
)

计算混淆矩阵

from sklearn.metrics import confusion_matrix # 计算二分类的混淆矩阵
def get_confusion_matrix(y_true1, y_pred1):
    """Return the cells of a binary confusion matrix as ``(TN, FP, FN, TP)``.

    The matrix computed by ``confusion_matrix`` is flattened row by row into
    four values. ``reshape`` raises ``ValueError`` if the matrix does not have
    exactly four cells, i.e. if the labels do not form a 2x2 matrix.

    Args:
        y_true1: ground-truth labels.
        y_pred1: predicted labels.
    """
    flat = confusion_matrix(y_true1, y_pred1).reshape(4)
    true_neg, false_pos, false_neg, true_pos = flat
    return (true_neg, false_pos, false_neg, true_pos)
# TN, FP, FN, TP = get_confusion_matrix(y_test, predict_y) # 使用方式

符号介绍

$\text{prediction} \backslash \text{ground truth}$	0 阴	1 阳
0 负	$\text{True Negative}$	$\text{False Positive}$
1 正	$\text{False Negative}$	$\text{True Positive}$

$G\text{-}mean$ 计算公式

def get_gmean(TN, FP, FN, TP):
    """Compute the G-mean: the geometric mean of recall and specificity.

    Args:
        TN: number of true negatives.
        FP: number of false positives.
        FN: number of false negatives.
        TP: number of true positives.

    Returns:
        ``sqrt(recall * specificity)``. When either class has no samples
        (``TN + FP == 0`` or ``TP + FN == 0``) the corresponding rate is
        undefined and 0.0 is returned instead of dividing by zero.
    """
    negatives = TN + FP  # all samples whose true class is negative
    positives = TP + FN  # all samples whose true class is positive
    if negatives == 0 or positives == 0:
        return 0.0
    specificity = TN / negatives  # fraction of negatives classified correctly
    recall = TP / positives  # fraction of positives classified correctly
    return (recall * specificity) ** 0.5

交叉验证测试集

import numpy as np
from sklearn.model_selection import KFold,StratifiedKFold # Stratified 分层划分
X = np.random.randint(1, 100, 40).reshape((20, 2))  # 20 samples with 2 integer features
y = np.random.randint(0, 2, 20)  # random binary labels
# random_state only has an effect when shuffle=True; recent scikit-learn
# releases raise ValueError if random_state is set while shuffle is False.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)  # stratified splitter
# kf = KFold(n_splits=5, shuffle=True, random_state=0)  # non-stratified alternative
for train_id, test_id in skf.split(X, y):
    # Each test fold keeps roughly the same 0/1 proportion as y as a whole.
    print(train_id, test_id, y[test_id])

生成二分类报告

from sklearn.metrics import classification_report
# Print the text report for the classifier's predictions on the test set.
print(classification_report(y_true=y_test, y_pred=predict_y))

$pandas$ 读取 $csv$ 数据

import pandas as pd
# How to use pandas.read_csv: https://zhuanlan.zhihu.com/p/340441922
abalone = pd.read_csv(
    '.\\KEEL-DATA\\abalone.dat',
    header=None,  # the file has no header row
)

不写 header = None 时，默认会把 $csv$ 文件中的第一行视为表头。

$pandas$ 字段值统计

data_frame_name["column_name"].value_counts() # count how many times each distinct value occurs in this column
data_frame_name.head() # returns the first five rows by default; a different row count can be passed

$mlp$ 模型直接调用

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier # 多层感知机二分类器
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler # 特征标准化

def mlp_model(X, y, hidden_layer_sizes=(100,), random_state=None):
    """Train and evaluate a standardized MLP classifier on a stratified 70/30 split.

    Args:
        X: feature matrix.
        y: class labels; also used to stratify the train/test split.
        hidden_layer_sizes: sequence with the number of units in each hidden
            layer. The default is an immutable tuple so that no mutable object
            is shared between calls.
        random_state: seed for the train/test split only (None means a
            different split on every call). The network itself is always
            seeded with 0.

    Returns:
        A tuple ``(pipe, report, acc)``: the fitted pipeline, the
        classification report for the test set, and the test-set score.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state, test_size=0.3, stratify=y
    )
    mlp = MLPClassifier(
        solver='lbfgs', random_state=0, hidden_layer_sizes=hidden_layer_sizes
    )
    # The scaler is fitted on the training data only and is re-applied
    # automatically to anything passed to predict/score.
    pipe = make_pipeline(StandardScaler(), mlp)
    pipe.fit(X_train, y_train)
    predict_y = pipe.predict(X_test)
    acc = pipe.score(X_test, y_test)
    return (pipe, classification_report(y_test, predict_y), acc)

$random\_state$ 随机种子，$= None$ 时默认随机生成；

$hidden\_layer\_sizes$ 是一个 $list$，用于表示每一层的结点个数（不建议太多层，也不建议一层有太多结点）。

估计 $mlp$ 模型的性能

import numpy as np
def check_mlp_model(X, y, hidden_layer_sizes, batch=100):
    """Estimate the performance of an MLP configuration over repeated runs.

    Calls ``mlp_model`` once per seed in ``range(batch)`` and collects the
    accuracy from each run; the fitted model and the report are discarded.

    Returns:
        A tuple ``(mean, std)`` of the collected accuracies.
    """
    accuracies = np.array(
        [mlp_model(X, y, hidden_layer_sizes, seed)[2] for seed in range(batch)],
        dtype=float,
    )
    return (accuracies.mean(), accuracies.std())

生成 $pairplot$

import seaborn as sns
# hue names the column used to colour the points, which makes binary and
# multi-class data sets easier to inspect.
sns.pairplot(data=ecoli0vs1, hue="column_name")  # draw the pairplot

获得一个 $DataFrame$ 的所有列名

def get_columns(dataframe):
    """Return the column labels of ``dataframe`` as a list, in column order."""
    # DataFrame.iteritems() was removed in pandas 2.0; the columns index holds
    # the same labels and is available in every pandas version.
    return list(dataframe.columns)

$DBSCAN$ 聚类

from sklearn.cluster import DBSCAN
dbscan = DBSCAN(eps=2) # eps needs to be tuned for the data set
clusters = dbscan.fit_predict(X) # X is the data set; clusters it and returns the labels, -1 marks noise

数据标准化

from sklearn.preprocessing import StandardScaler # 特征标准化
# Fit the scaler on X, then apply the fitted standardization to X.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

绘制直方图

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import numpy as np

def make_hist(data : np.ndarray, xlabel : str, ylabel = "频率", title = "频率直方图"):
    """Draw a density histogram of ``data`` with a fitted normal curve on top.

    Args:
        data: array of values to plot.
        xlabel: label for the x axis.
        ylabel: label for the y axis.
        title: title of the figure.

    The histogram uses 20 bins and ``density=True``, so the bar areas sum to 1
    and are on the same scale as the normal density drawn over them. The
    figure is shown with ``plt.show()``; nothing is returned.
    """
    mean = data.mean()
    std = data.std()
    _, bins, _ = plt.hist(
        x=data,             # values to plot
        bins=20,            # number of bars
        color='green',      # fill colour
        edgecolor='black',  # bar outline colour
        alpha=0.5,
        density=True,       # normalise so that the total area is 1
    )
    # matplotlib.mlab.normpdf was removed in matplotlib 3.1, so the normal
    # density is evaluated directly. The curve is skipped when std is zero
    # (constant data) because the density is undefined there.
    if std > 0:
        Y = np.exp(-0.5 * ((bins - mean) / std) ** 2) / (std * np.sqrt(2 * np.pi))
        plt.plot(bins, Y, 'r--')  # dashed red normal curve at the bin edges
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.show()
    
# make_hist(y, "均绩")