机器学习实战:小麦种子(封装函数进行调参、标准化、绘图查看数据分布)

声明:内容非原创,代码来自葁sir

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# 导入数据集
seeds = pd.read_csv('data/seeds.csv',sep = '\t',header = None)
seeds.head()
01234567
015.2614.840.87105.7633.3122.2215.220Kama
114.8814.570.88115.5543.3331.0184.956Kama
214.2914.090.90505.2913.3372.6994.825Kama
313.8413.940.89555.3243.3792.2594.805Kama
416.1414.990.90345.6583.5621.3555.175Kama
# 观察小麦有多少类
seeds[7].value_counts()
Kama        70
Rosa        70
Canadian    70
Name: 7, dtype: int64
seeds[7].value_counts().plot(kind = 'bar')
<AxesSubplot:>

在这里插入图片描述

# 或者用seaborn 
import seaborn as sns
sns.set()
# seaborn 常用图像
# barplot()
# scatterplot()
# swarmplot()
# boxplot()
# violinplot()
# countplot()
# pairplot()
# heatmap()

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso,RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler,StandardScaler

X = seeds.iloc[:,:7].copy()
# X = seeds.values[:,:7].copy() # 但是这样复制 numpy.ndarray
X.shape
(210, 7)
X
0123456
015.2614.840.87105.7633.3122.2215.220
114.8814.570.88115.5543.3331.0184.956
214.2914.090.90505.2913.3372.6994.825
313.8413.940.89555.3243.3792.2594.805
416.1414.990.90345.6583.5621.3555.175
........................
20512.1913.200.87835.1372.9813.6314.870
20611.2312.880.85115.1402.7954.3255.003
20713.2013.660.88835.2363.2328.3155.056
20811.8413.210.85215.1752.8363.5985.044
20912.3013.340.86845.2432.9745.6375.063

210 rows × 7 columns

y =  seeds.iloc[:,-1].copy()
# y = seeds.values[:,-1].copy()
y.shape
(210,)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)
# 封装函数来进行knn试探性运算
def knn_score(k, X, y, n_repeats=100, test_size=0.2):
    """Estimate KNN accuracy for a given ``k`` by repeated random hold-out.

    Args:
        k: Number of neighbours passed to ``KNeighborsClassifier``.
        X: Feature matrix.
        y: Target labels aligned with ``X``.
        n_repeats: Number of train/test splits to average over.
        test_size: Fraction of the samples held out for testing in each split.

    Returns:
        Tuple ``(mean_test_score, mean_train_score)`` averaged over all splits.
    """
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = []
    train_scores = []
    for seed in range(n_repeats):
        # Seed each repeat with its iteration number: a constant seed would
        # reproduce the very same split every time and make the average
        # meaningless, while a per-iteration seed keeps the run reproducible.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=seed
        )
        knn.fit(X_train, y_train)
        # Generalisation score on the held-out part
        scores.append(knn.score(X_test, y_test))
        # Score on the data the model was fitted on
        train_scores.append(knn.score(X_train, y_train))

    return np.mean(scores), np.mean(train_scores)

# 调参
# Candidate neighbour counts to evaluate
k_list = [1,3,5,7,9,11]
# Map every k to [mean test score, mean train score]
result_dict = {k: list(knn_score(k, X, y)) for k in k_list}
    
result_dict
{1: [0.9047619047619047, 1.0],
 3: [0.9047619047619047, 0.9642857142857139],
 5: [0.8571428571428572, 0.9285714285714287],
 7: [0.8571428571428572, 0.9345238095238096],
 9: [0.8809523809523812, 0.9226190476190478],
 11: [0.8809523809523812, 0.9226190476190478]}
pd.DataFrame(result_dict).T
01
10.9047621.000000
30.9047620.964286
50.8571430.928571
70.8571430.934524
90.8809520.922619
110.8809520.922619
result = pd.DataFrame(result_dict).T.copy()
result.columns = ['Test','Train']
result
TestTrain
10.9047621.000000
30.9047620.964286
50.8571430.928571
70.8571430.934524
90.8809520.922619
110.8809520.922619
result.plot()
plt.xticks(k_list)
plt.show()

在这里插入图片描述

进阶版
# z-score (x-x.mean)/ x.std  N(0,1)
# MinMaxScaler (x-x.min)/(x.max-x.min)  0-1
# 异常值 空值 数据分布查看
X.shape
(210, 7)
# 查看统计学指标
X.describe().T
countmeanstdmin25%50%75%max
0210.014.8475242.90969910.590012.2700014.3550017.30500021.1800
1210.014.5592861.30595912.410013.4500014.3200015.71500017.2500
2210.00.8709990.0236290.80810.856900.873450.8877750.9183
3210.05.6285330.4430634.89905.262255.523505.9797506.6750
4210.03.2586050.3777142.63002.944003.237003.5617504.0330
5210.03.7002011.5035570.76512.561503.599004.7687508.4560
6210.05.4080710.4914804.51905.045005.223005.8770006.5500
def standard_X(X):
    """Return a copy of ``X`` with every column standardised independently.

    Each column is passed through its own freshly created ``StandardScaler``;
    the input frame itself is left unmodified.
    """
    scaled = X.copy()
    for name in scaled.columns:
        # Double brackets select a one-column frame, so .values is 2-D,
        # which is the shape the scaler is given here.
        single_column = scaled[[name]].values
        scaled[name] = StandardScaler().fit_transform(single_column)
    return scaled

standard_X(X).describe([0.01,0.25,0.5,0.75,0.99]).T
# standard_X(X).describe([0.01,0.25,0.5,0.75,0.99]).T
countmeanstdmin1%25%50%75%99%max
0210.0-5.392512e-171.002389-1.466714-1.397504-0.887955-0.1696740.8465992.0729132.181534
1210.09.146123e-171.002389-1.649686-1.474607-0.851433-0.1836640.8870692.0235052.065260
2210.01.322091e-151.002389-2.668236-2.588824-0.5980790.1039930.7116771.6781182.006586
3210.0-2.182910e-151.002389-1.650501-1.464372-0.828682-0.2376280.7945952.1544592.367533
4210.0-2.030122e-161.002389-1.668209-1.634930-0.834907-0.0573350.8044961.9367252.055112
5210.0-3.679596e-161.002389-1.956769-1.857934-0.759148-0.0674690.7123792.5199053.170590
6210.0-1.337554e-161.002389-1.813288-1.633810-0.740495-0.3774590.9563942.1307972.328998

查看数据分布

对标准化后的数据调用 describe 查看 99% 分位数,发现列名为 2 和 5 的两列有较大差距

# Plot the distribution of each standardised feature, one figure per column.
stand_X = standard_X(X)
for col_name in stand_X.columns:
    # sns.distplot is deprecated since seaborn 0.11; a histogram on a density
    # scale with a KDE overlay is the documented replacement.
    sns.histplot(stand_X[col_name], kde=True, stat="density")
    plt.title(col_name)
    plt.show()

在这里插入图片描述

在这里插入图片描述

在这里插入图片描述

在这里插入图片描述

请添加图片描述

请添加图片描述

请添加图片描述

分箱操作

10 3000 5000 10000000

以5000为分割点 分割出高收入 低收入 进行映射 (减少数据之间的差异)

# 0 0 1 1 
X[0] = pd.cut(X[0],bins = 5,labels = [0,1,2,3,4])
# 将数据进行切割,防止过拟合
X[0]
0      2
1      2
2      1
3      1
4      2
      ..
205    0
206    0
207    1
208    0
209    0
Name: 0, Length: 210, dtype: category
Categories (5, int64): [0 < 1 < 2 < 3 < 4]
# Pass the series by keyword: from seaborn 0.12 on, `data` is the only
# argument accepted positionally.
sns.countplot(x=X[0])
C:\Anaconda\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(





<AxesSubplot:xlabel='0', ylabel='count'>

请添加图片描述

# Discretise every feature column into five equal-width bins labelled 0-4
for feature in X.columns:
    binned = pd.cut(X[feature], bins=5, labels=[0, 1, 2, 3, 4])
    X[feature] = binned
X
0123456
02222201
12231201
21141210
31131200
42242301
........................
2050030110
2060010021
2071130241
2080010011
2090020131

210 rows × 7 columns

knn = KNeighborsClassifier()
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 1)
knn.fit(X_train,y_train)
KNeighborsClassifier()
knn.score(X_train,y_train)
0.9166666666666666
knn.score(X_test,y_test)
0.9523809523809523

评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值