前言
机器学习什么的又是我学不明白的东西,学一乐。
划分训练集与测试集
from sklearn.model_selection import train_test_split # 将 X, y 随机划分为训练集和测试集
# Hold out 30% of the samples as the test set. stratify=y samples each class
# of y proportionally, and random_state=0 makes the split reproducible.
# (Fixed typo: the last target was previously named y_text, which left
# y_test undefined for the evaluation snippets that use it.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.3, stratify=y
)
计算混淆矩阵
from sklearn.metrics import confusion_matrix # 计算二分类的混淆矩阵
def get_confusion_matrix(y_true1, y_pred1, labels=None):
    """Return the binary confusion-matrix counts as ``(TN, FP, FN, TP)``.

    Args:
        y_true1: Ground-truth labels.
        y_pred1: Predicted labels.
        labels: Optional two-element label list forwarded to
            ``confusion_matrix``. Pass it (e.g. ``[0, 1]``) when the data may
            contain only one of the two classes, so the matrix is still 2x2.
            ``None`` keeps the previous behavior.

    Returns:
        Tuple ``(TN, FP, FN, TP)`` read from the 2x2 matrix in row-major
        order.

    Raises:
        ValueError: If the resulting confusion matrix is not 2x2.
    """
    matrix = confusion_matrix(y_true1, y_pred1, labels=labels)
    if matrix.shape != (2, 2):
        # Fail with a clear message instead of an opaque unpacking error.
        raise ValueError(
            "expected a 2x2 confusion matrix, got shape %s; "
            "pass labels=[negative, positive] for single-class data"
            % (matrix.shape,)
        )
    TN, FP, FN, TP = matrix.ravel()
    return (TN, FP, FN, TP)


# Usage:
# TN, FP, FN, TP = get_confusion_matrix(y_test, predict_y)
符号介绍
prediction \ ground truth | 0 阴 | 1 阳 |
---|---|---|
0 负 | True Negative | False Positive |
1 正 | False Negative | True Positive |
G-mean 计算公式
def get_gmean(TN, FP, FN, TP):
    """Compute the G-mean: sqrt(recall * specificity).

    Args:
        TN: Number of true negatives.
        FP: Number of false positives.
        FN: Number of false negatives.
        TP: Number of true positives.

    Returns:
        The geometric mean of recall and specificity as a float. Returns 0.0
        when either class has no samples (TN + FP == 0 or TP + FN == 0),
        instead of dividing by zero.
    """
    negatives = TN + FP
    positives = TP + FN
    if negatives == 0 or positives == 0:
        # One of the two rates is undefined; treat the score as 0.
        return 0.0
    specificity = TN / negatives  # fraction of negative samples judged correctly
    recall = TP / positives  # fraction of positive samples judged correctly
    return (recall * specificity) ** 0.5
交叉验证测试集
import numpy as np
from sklearn.model_selection import KFold,StratifiedKFold # Stratified 分层划分
# Toy data: 20 samples with 2 integer features in [1, 100) and random 0/1 labels.
X = np.random.randint(1, 100, 40).reshape((20, 2))
y = np.random.randint(0, 2, 20)

# random_state only has an effect when shuffle=True; current scikit-learn
# raises ValueError if random_state is set while shuffle is left False.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
# Non-stratified alternative:
# kf = KFold(n_splits=5, shuffle=True, random_state=0)

for train_id, test_id in skf.split(X, y):
    # The 0/1 ratio in y[test_id] stays close to that of y
    # (roughly 1:1 for these random labels).
    print(train_id, test_id, y[test_id])
生成二分类报告
from sklearn.metrics import classification_report
# Print the text classification report for the test-set predictions.
print(classification_report(y_true=y_test, y_pred=predict_y))
pandas 读取 csv 数据
import pandas as pd
# pandas.read_csv usage guide: https://zhuanlan.zhihu.com/p/340441922
# header=None reads a file that has no header row.
# Forward slashes keep the relative path valid on both Windows and POSIX
# systems (the previous backslash-separated path only resolved on Windows).
abalone = pd.read_csv('./KEEL-DATA/abalone.dat', header=None)
不写 header = None,默认会把 csv 文件中的第一行视为表头。
pandas 字段值统计
data_frame_name["column_name"].value_counts() # count how many times each distinct value occurs in this column
data_frame_name.head() # returns the first five rows by default; pass a number to choose how many
mlp 模型直接调用
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier # 多层感知机二分类器
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler # 特征标准化
def mlp_model(X, y, hidden_layer_sizes=(100,), random_state=None):
    """Train and evaluate a multi-layer perceptron on a stratified 70/30 split.

    Args:
        X: Feature matrix.
        y: Class labels; also used to stratify the train/test split.
        hidden_layer_sizes: Number of units in each hidden layer (tuple or
            list). The default is an immutable tuple so that it cannot be
            mutated and shared between calls.
        random_state: Seed for the train/test split only; ``None`` gives a
            different split on every call. The classifier itself always uses
            ``random_state=0``.

    Returns:
        Tuple ``(pipe, report, acc)``: the fitted scaler + MLP pipeline, the
        ``classification_report`` text for the test set, and the test-set
        score returned by ``pipe.score``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state, test_size=0.3, stratify=y
    )
    mlp = MLPClassifier(
        solver='lbfgs', random_state=0, hidden_layer_sizes=hidden_layer_sizes
    )
    # The scaler is fitted on the training data only, inside the pipeline.
    pipe = make_pipeline(StandardScaler(), mlp)
    pipe.fit(X_train, y_train)
    # The pipeline applies the training-set scaling to X_test before predicting.
    predict_y = pipe.predict(X_test)
    acc = pipe.score(X_test, y_test)
    return (pipe, classification_report(y_test, predict_y), acc)
random_state 随机种子,=None 默认随机生成;
hidden_layer_sizes 是一个 list,用于表示每一层结点的元素个数(不建议太多层,也不建议一层有太多结点)。
估计 mlp 模型的性能
import numpy as np
def check_mlp_model(X, y, hidden_layer_sizes, batch = 100):
    """Estimate MLP performance over ``batch`` differently seeded splits.

    Calls ``mlp_model`` once per seed in ``range(batch)`` and keeps only the
    accuracy from each run.

    Returns:
        Tuple ``(mean, std)`` of the collected accuracies.
    """
    accuracies = np.array(
        # Index 2 of mlp_model's result is the accuracy; the fitted model
        # and the text report are discarded.
        [mlp_model(X, y, hidden_layer_sizes, seed)[2] for seed in range(batch)],
        dtype=float,
    )
    return (np.mean(accuracies), np.std(accuracies))
生成 pairplot
import seaborn as sns
# Draw the pairplot. hue names the column used to color the points, which
# helps when inspecting binary or multi-class datasets.
sns.pairplot(data=ecoli0vs1, hue="column_name")
获得一个 DataFrame 的所有列名
def get_columns(dataframe):
    """Return a list containing every column label of ``dataframe``.

    Reads ``dataframe.columns`` directly instead of iterating with
    ``DataFrame.iteritems()``, which was removed in pandas 2.0.

    Args:
        dataframe: The pandas DataFrame to inspect.

    Returns:
        A new list of the column labels, in column order.
    """
    return list(dataframe.columns)
DBSCAN 聚类
from sklearn.cluster import DBSCAN
# eps needs to be tuned for the dataset.
dbscan = DBSCAN(eps=2)
# Cluster the dataset X and read back one label per sample; -1 marks noise.
clusters = dbscan.fit(X).labels_
数据标准化
from sklearn.preprocessing import StandardScaler # 特征标准化
# Fit the standardizer on X, then apply the fitted transform to X.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
绘制直方图
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import numpy as np
def make_hist(data: np.ndarray, xlabel: str, ylabel="频率", title="频率直方图"):
    """Plot a density histogram of ``data`` with a normal curve overlaid.

    The curve uses the mean and standard deviation of ``data`` and is
    evaluated at the histogram bin edges. It is skipped when the standard
    deviation is zero, because the density is undefined there.

    Args:
        data: Values to plot.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis.
        title: Figure title.
    """
    mean = data.mean()
    std = data.std()
    _, bins, _ = plt.hist(
        x=data,             # data to plot
        bins=20,            # number of bars
        color='green',      # fill color
        edgecolor='black',  # bar border color
        alpha=0.5,
        density=True,       # normalize bar heights to a density
    )
    if std > 0:
        # matplotlib.mlab.normpdf was removed in Matplotlib 3.1, so the
        # normal density is evaluated directly with NumPy.
        Y = np.exp(-0.5 * ((bins - mean) / std) ** 2) / (std * np.sqrt(2 * np.pi))
        plt.plot(bins, Y, 'r--')
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.show()


# Usage:
# make_hist(y, "均绩")