机器学习实战：小麦种子（封装函数进行调参、标准化、绘图查看数据分布）

本文链接：https://blog.csdn.net/qq_33489955/article/details/124186463

声明：内容非原创，代码来自葁sir

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Load the dataset: a tab-separated file without a header row, so pandas
# assigns integer column labels
seeds = pd.read_csv('data/seeds.csv',sep = '\t',header = None)
seeds.head()

	0	1	2	3	4	5	6	7
0	15.26	14.84	0.8710	5.763	3.312	2.221	5.220	Kama
1	14.88	14.57	0.8811	5.554	3.333	1.018	4.956	Kama
2	14.29	14.09	0.9050	5.291	3.337	2.699	4.825	Kama
3	13.84	13.94	0.8955	5.324	3.379	2.259	4.805	Kama
4	16.14	14.99	0.9034	5.658	3.562	1.355	5.175	Kama

# Count how many samples belong to each wheat variety (column 7 holds the label)
seeds[7].value_counts()

Kama        70
Rosa        70
Canadian    70
Name: 7, dtype: int64

# Draw the per-variety sample counts as a bar chart
seeds[7].value_counts().plot(kind = 'bar')

<AxesSubplot:>

在这里插入图片描述

# 或者用seaborn 
import seaborn as sns
sns.set()

# seaborn 常用图像
# barplot()
# scatterplot()
# swarmplot()
# boxplot()
# violinplot()
# countplot()
# pairplot()
# heatmap()

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso,RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler,StandardScaler

# Feature matrix: the first seven columns, copied so later edits do not touch `seeds`
X = seeds.iloc[:,:7].copy()
# X = seeds.values[:,:7].copy() # alternative, but this copy is a numpy.ndarray rather than a DataFrame
X.shape

(210, 7)

	0	1	2	3	4	5	6
0	15.26	14.84	0.8710	5.763	3.312	2.221	5.220
1	14.88	14.57	0.8811	5.554	3.333	1.018	4.956
2	14.29	14.09	0.9050	5.291	3.337	2.699	4.825
3	13.84	13.94	0.8955	5.324	3.379	2.259	4.805
4	16.14	14.99	0.9034	5.658	3.562	1.355	5.175
...	...	...	...	...	...	...	...
205	12.19	13.20	0.8783	5.137	2.981	3.631	4.870
206	11.23	12.88	0.8511	5.140	2.795	4.325	5.003
207	13.20	13.66	0.8883	5.236	3.232	8.315	5.056
208	11.84	13.21	0.8521	5.175	2.836	3.598	5.044
209	12.30	13.34	0.8684	5.243	2.974	5.637	5.063

210 rows × 7 columns

# Target labels: the last column, copied so later edits do not touch `seeds`
y =  seeds.iloc[:,-1].copy()
# y = seeds.values[:,-1].copy()
y.shape

(210,)

# Hold out 20% of the rows as a test set; the fixed random_state makes the split reproducible
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

# 封装函数来进行knn试探性运算
def knn_score(k,X,y,n_iter=100,test_size=0.2):
    """Estimate KNN accuracy for one value of k, averaged over repeated splits.

    Args:
        k: Number of neighbours passed to ``KNeighborsClassifier``.
        X: Feature matrix.
        y: Target labels aligned with ``X``.
        n_iter: Number of train/test splits to average over (default 100).
        test_size: Fraction of the samples held out in each split (default 0.2).

    Returns:
        A tuple ``(mean_test_accuracy, mean_train_accuracy)``.

    Raises:
        ValueError: If ``n_iter`` is smaller than 1.
    """
    if n_iter < 1:
        raise ValueError("n_iter must be at least 1")

    # Build the estimator once; every fit() call below retrains it from scratch.
    knn = KNeighborsClassifier(n_neighbors=k)
    test_scores = []
    train_scores = []
    for seed in range(n_iter):
        # Use a different seed on every iteration so each round sees a different
        # split. Reusing one fixed seed would repeat the identical split n_iter
        # times and make the averaging meaningless. Seeding with the loop index
        # keeps the overall result reproducible.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=seed
        )
        knn.fit(X_train, y_train)
        # Generalization score on the held-out data
        test_scores.append(knn.score(X_test, y_test))
        # Empirical (training) score, useful for spotting overfitting
        train_scores.append(knn.score(X_train, y_train))

    return np.array(test_scores).mean(), np.array(train_scores).mean()

# Hyperparameter search: evaluate several odd values of k and record, for each,
# the [test score, train score] pair returned by knn_score
result_dict = {}
k_list = [1,3,5,7,9,11]
for k in k_list:
    score,train_score = knn_score(k,X,y)
    result_dict[k] = [score,train_score]

result_dict

{1: [0.9047619047619047, 1.0],
 3: [0.9047619047619047, 0.9642857142857139],
 5: [0.8571428571428572, 0.9285714285714287],
 7: [0.8571428571428572, 0.9345238095238096],
 9: [0.8809523809523812, 0.9226190476190478],
 11: [0.8809523809523812, 0.9226190476190478]}

# Show the results as a table: transposing puts one row per k
pd.DataFrame(result_dict).T

	0	1
1	0.904762	1.000000
3	0.904762	0.964286
5	0.857143	0.928571
7	0.857143	0.934524
9	0.880952	0.922619
11	0.880952	0.922619

# Keep the transposed table (one row per k) as its own DataFrame
result = pd.DataFrame(result_dict).T.copy()

# Name the columns after the order in which the scores were stored: [test, train]
result.columns = ['Test','Train']

result

	Test	Train
1	0.904762	1.000000
3	0.904762	0.964286
5	0.857143	0.928571
7	0.857143	0.934524
9	0.880952	0.922619
11	0.880952	0.922619

# Plot test and train accuracy against k, with one x tick per evaluated k
result.plot()
plt.xticks(k_list)
plt.show()

在这里插入图片描述

进阶版

# z-score:      (x - x.mean) / x.std                -> N(0,1)
# MinMaxScaler: (x - x.min) / (x.max - x.min)       -> [0, 1]

# 异常值 空值 数据分布查看

X.shape

(210, 7)

# Inspect summary statistics; transposed so each feature is one row
X.describe().T

	count	mean	std	min	25%	50%	75%	max
0	210.0	14.847524	2.909699	10.5900	12.27000	14.35500	17.305000	21.1800
1	210.0	14.559286	1.305959	12.4100	13.45000	14.32000	15.715000	17.2500
2	210.0	0.870999	0.023629	0.8081	0.85690	0.87345	0.887775	0.9183
3	210.0	5.628533	0.443063	4.8990	5.26225	5.52350	5.979750	6.6750
4	210.0	3.258605	0.377714	2.6300	2.94400	3.23700	3.561750	4.0330
5	210.0	3.700201	1.503557	0.7651	2.56150	3.59900	4.768750	8.4560
6	210.0	5.408071	0.491480	4.5190	5.04500	5.22300	5.877000	6.5500

def standard_X(X):
    """Return a copy of ``X`` in which every column is standardized on its own.

    Each column is passed through a fresh ``StandardScaler``; the input
    DataFrame itself is left unmodified.
    """
    scaled = X.copy()
    for name in scaled.columns:
        # Double brackets keep the selection 2-D, as the scaler requires
        column_2d = scaled[[name]].values
        scaler = StandardScaler()
        scaler.fit(column_2d)
        # Overwrite the column with its standardized values
        scaled[name] = scaler.transform(column_2d)
    return scaled

# Summary statistics of the standardized features, including the 1st and 99th
# percentiles so extreme values stand out
standard_X(X).describe([0.01,0.25,0.5,0.75,0.99]).T
# standard_X(X).describe([0.01,0.25,0.5,0.75,0.99]).T

	count	mean	std	min	1%	25%	50%	75%	99%	max
0	210.0	-5.392512e-17	1.002389	-1.466714	-1.397504	-0.887955	-0.169674	0.846599	2.072913	2.181534
1	210.0	9.146123e-17	1.002389	-1.649686	-1.474607	-0.851433	-0.183664	0.887069	2.023505	2.065260
2	210.0	1.322091e-15	1.002389	-2.668236	-2.588824	-0.598079	0.103993	0.711677	1.678118	2.006586
3	210.0	-2.182910e-15	1.002389	-1.650501	-1.464372	-0.828682	-0.237628	0.794595	2.154459	2.367533
4	210.0	-2.030122e-16	1.002389	-1.668209	-1.634930	-0.834907	-0.057335	0.804496	1.936725	2.055112
5	210.0	-3.679596e-16	1.002389	-1.956769	-1.857934	-0.759148	-0.067469	0.712379	2.519905	3.170590
6	210.0	-1.337554e-16	1.002389	-1.813288	-1.633810	-0.740495	-0.377459	0.956394	2.130797	2.328998

查看数据分布

经过对标准化数据describe查看99分位数发现标签为2和5的两个列有较大差距

stand_X = standard_X(X)

# Plot the distribution of every standardized feature, one figure per column
for col_name in stand_X.columns:
    # sns.distplot is deprecated; histplot with a KDE overlay on a density
    # scale is the documented replacement that reproduces the same kind of figure
    sns.histplot(stand_X[col_name], kde=True, stat="density", kde_kws=dict(cut=3))
    plt.title(col_name)
    plt.show()

在这里插入图片描述

请添加图片描述

分箱操作

10 3000 5000 10000000

以5000为分割点分割出高收入低收入进行映射（减少数据之间的差异）

# 0 0 1 1

# Bin feature 0 into 5 intervals and encode them with the labels 0-4.
# NOTE: this overwrites column 0 of X in place, replacing the continuous values.
X[0] = pd.cut(X[0],bins = 5,labels = [0,1,2,3,4])
# Discretize the data, with the aim of preventing overfitting

X[0]

0      2
1      2
2      1
3      1
4      2
      ..
205    0
206    0
207    1
208    0
209    0
Name: 0, Length: 210, dtype: category
Categories (5, int64): [0 < 1 < 2 < 3 < 4]

# Pass the data by keyword: seaborn warns about positional data arguments and
# rejects them from version 0.12 onwards
sns.countplot(x = X[0])

C:\Anaconda\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(





<AxesSubplot:xlabel='0', ylabel='count'>

请添加图片描述

# Bin every feature column into 5 intervals labelled 0-4 (overwrites X in place)
# NOTE(review): column 0 was already binned earlier, so it is cut a second time
# here — confirm this is intended and works on the pandas version in use
for col_name in X.columns:
    X[col_name] = pd.cut(X[col_name],bins = 5,labels = [0,1,2,3,4])

	0	1	2	3	4	5	6
0	2	2	2	2	2	0	1
1	2	2	3	1	2	0	1
2	1	1	4	1	2	1	0
3	1	1	3	1	2	0	0
4	2	2	4	2	3	0	1
...	...	...	...	...	...	...	...
205	0	0	3	0	1	1	0
206	0	0	1	0	0	2	1
207	1	1	3	0	2	4	1
208	0	0	1	0	0	1	1
209	0	0	2	0	1	3	1

210 rows × 7 columns

# KNN classifier with default hyperparameters, trained on the binned features
knn = KNeighborsClassifier()

# Same split settings as before (20% test, random_state=1), now on the binned X
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 1)

knn.fit(X_train,y_train)

KNeighborsClassifier()

# Accuracy on the training split
knn.score(X_train,y_train)

0.9166666666666666

# Accuracy on the held-out test split
knn.score(X_test,y_test)

0.9523809523809523

	0	1	2	3	4	5	6
0	2	2	2	2	2	0	1
1	2	2	3	1	2	0	1
2	1	1	4	1	2	1	0
3	1	1	3	1	2	0	0
4	2	2	4	2	3	0	1
...	...	...	...	...	...	...	...
205	0	0	3	0	1	1	0
206	0	0	1	0	0	2	1
207	1	1	3	0	2	4	1
208	0	0	1	0	0	1	1
209	0	0	2	0	1	3	1

	0	1	2	3	4	5	6
0	2	2	2	2	2	0	1
1	2	2	3	1	2	0	1
2	1	1	4	1	2	1	0
3	1	1	3	1	2	0	0
4	2	2	4	2	3	0	1
...	...	...	...	...	...	...	...
205	0	0	3	0	1	1	0
206	0	0	1	0	0	2	1
207	1	1	3	0	2	4	1
208	0	0	1	0	0	1	1
209	0	0	2	0	1	3	1

	0	1	2	3	4	5	6
0	2	2	2	2	2	0	1
1	2	2	3	1	2	0	1
2	1	1	4	1	2	1	0
3	1	1	3	1	2	0	0
4	2	2	4	2	3	0	1
...	...	...	...	...	...	...	...
205	0	0	3	0	1	1	0
206	0	0	1	0	0	2	1
207	1	1	3	0	2	4	1
208	0	0	1	0	0	1	1
209	0	0	2	0	1	3	1