Reading and Preprocessing a Machine Learning Dataset

This article uses the Dry Bean dataset as an example. The dataset can be downloaded here: Dry Bean Dataset

import pandas as pd
import sklearn
import numpy as np

Data Loading and Preprocessing

dry = pd.read_csv("Dry_Bean.csv")

The non-null counts in the info() output below also show that the dataset contains no missing values.

dry.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13611 entries, 0 to 13610
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Area             13611 non-null  int64  
 1   Perimeter        13611 non-null  float64
 2   MajorAxisLength  13611 non-null  float64
 3   MinorAxisLength  13611 non-null  float64
 4   AspectRation     13611 non-null  float64
 5   Eccentricity     13611 non-null  float64
 6   ConvexArea       13611 non-null  int64  
 7   EquivDiameter    13611 non-null  float64
 8   Extent           13611 non-null  float64
 9   Solidity         13611 non-null  float64
 10  roundness        13611 non-null  float64
 11  Compactness      13611 non-null  float64
 12  ShapeFactor1     13611 non-null  float64
 13  ShapeFactor2     13611 non-null  float64
 14  ShapeFactor3     13611 non-null  float64
 15  ShapeFactor4     13611 non-null  float64
 16  Class            13611 non-null  object 
dtypes: float64(14), int64(2), object(1)
memory usage: 1.8+ MB
dry.head()
    Area  Perimeter  MajorAxisLength  ...  ShapeFactor3  ShapeFactor4  Class
0  28395    610.291       208.178117  ...      0.834222      0.998724  SEKER
1  28734    638.018       200.524796  ...      0.909851      0.998430  SEKER
2  29380    624.110       212.826130  ...      0.825871      0.999066  SEKER
3  30008    645.884       210.557999  ...      0.861794      0.994199  SEKER
4  30140    620.134       201.847882  ...      0.941900      0.999166  SEKER

[5 rows x 17 columns]
dry["Class"].unique()
array(['SEKER', 'BARBUNYA', 'BOMBAY', 'CALI', 'HOROZ', 'SIRA', 'DERMASON'],
      dtype=object)
dry["Class"].nunique()
7
dry.columns
Index(['Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength',
       'AspectRation', 'Eccentricity', 'ConvexArea', 'EquivDiameter', 'Extent',
       'Solidity', 'roundness', 'Compactness', 'ShapeFactor1', 'ShapeFactor2',
       'ShapeFactor3', 'ShapeFactor4', 'Class'],
      dtype='object')
dry.index
RangeIndex(start=0, stop=13611, step=1)

Duplicate-value check

dry.duplicated().sum()
68
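
The 68 duplicate rows are not handled further in this walkthrough, which keeps the full 13611-row frame; if you did want to drop them, a minimal sketch (keeping the first occurrence of each row) might be:

dry = dry.drop_duplicates().reset_index(drop=True)   # drop exact duplicates, rebuild the 0..n-1 index
dry.shape                                            # 68 fewer rows than the original 13611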

Missing-value check

dry.isnull().sum()
Area               0
Perimeter          0
MajorAxisLength    0
MinorAxisLength    0
AspectRation       0
Eccentricity       0
ConvexArea         0
EquivDiameter      0
Extent             0
Solidity           0
roundness          0
Compactness        0
ShapeFactor1       0
ShapeFactor2       0
ShapeFactor3       0
ShapeFactor4       0
Class              0
dtype: int64

We can also define the following function to report, for each column, both the count and the percentage of missing values:

def missing(df):
    """
    Compute the number and percentage of missing values per column.
    """
    missing_number = df.isnull().sum().sort_values(ascending=False)                              # missing count per column, descending
    missing_percent = (df.isnull().sum() / df.isnull().count()).sort_values(ascending=False)     # missing ratio per column
    missing_values = pd.concat([missing_number, missing_percent], axis=1, keys=['Missing_Number', 'Missing_Percent'])   # combine into one DataFrame
    return missing_values
missing(dry)
                 Missing_Number  Missing_Percent
Area                          0              0.0
Solidity                      0              0.0
ShapeFactor4                  0              0.0
ShapeFactor3                  0              0.0
ShapeFactor2                  0              0.0
ShapeFactor1                  0              0.0
Compactness                   0              0.0
roundness                     0              0.0
Extent                        0              0.0
Perimeter                     0              0.0
EquivDiameter                 0              0.0
ConvexArea                    0              0.0
Eccentricity                  0              0.0
AspectRation                  0              0.0
MinorAxisLength               0              0.0
MajorAxisLength               0              0.0
Class                         0              0.0
dry.describe()
                Area     Perimeter  ...  ShapeFactor3  ShapeFactor4
count   13611.000000  13611.000000  ...  13611.000000  13611.000000
mean    53048.284549    855.283459  ...      0.643590      0.995063
std     29324.095717    214.289696  ...      0.098996      0.004366
min     20420.000000    524.736000  ...      0.410339      0.947687
25%     36328.000000    703.523500  ...      0.581359      0.993703
50%     44652.000000    794.941000  ...      0.642044      0.996386
75%     61332.000000    977.213000  ...      0.696006      0.997883
max    254616.000000   1985.370000  ...      0.974767      0.999733

[8 rows x 16 columns]
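
One thing the summary makes plain: the features live on very different scales (Area reaches into the hundreds of thousands while ShapeFactor2 stays near 0.001), which is exactly what the standardization section below addresses. A quick, purely illustrative way to expose the spread:

ranges = dry.describe().loc["max"] - dry.describe().loc["min"]   # per-feature value range
ranges.sort_values(ascending=False)                              # largest-scaled features first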
dry.groupby("Class").mean()
                    Area    Perimeter  ...  ShapeFactor3  ShapeFactor4
Class                                  ...
BARBUNYA    69804.133132  1046.105764  ...      0.649144      0.995739
BOMBAY     173485.059387  1585.619079  ...      0.629195      0.991841
CALI        75538.211043  1057.634282  ...      0.573022      0.990584
DERMASON    32118.710942   665.209536  ...      0.671636      0.996914
HOROZ       53648.508817   919.859676  ...      0.491791      0.991926
SEKER       39881.299951   727.672440  ...      0.805149      0.998383
SIRA        44729.128604   796.418737  ...      0.636358      0.995385

[7 rows x 16 columns]

Distribution of the label column

import seaborn as sns
import matplotlib.pyplot as plt
dry["Class"].value_counts()
DERMASON    3546
SIRA        2636
SEKER       2027
HOROZ       1928
CALI        1630
BARBUNYA    1322
BOMBAY       522
Name: Class, dtype: int64
sns.displot(dry['Class'])

(Figure: distribution plot of the Class label)
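
Since Class is categorical, seaborn's countplot is arguably the more natural choice for the same information; a minimal alternative (not in the original):

sns.countplot(x="Class", data=dry)   # one bar per bean variety
plt.show()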

Standardization and Normalization of the Data

Of course, besides re-encoding discrete variables, we sometimes also need to transform continuous variables to improve model performance or training efficiency. As covered previously, standardization removes the effect of differing scales from continuous variables and speeds up gradient-descent convergence, while normalization rescales each individual sample to unit norm. The material below reviews both.

Standardization vs. Normalization

Functionally, the scaling tools in sklearn split into two groups: standardization (Standardization) and normalization (Normalization). The Z-Score and 0-1 (min-max) scaling introduced earlier both belong to Standardization, whereas in sklearn, Normalization refers specifically to rescaling a single sample (one row of data) by its norm. Both are data-preprocessing steps, and both live in sklearn's preprocessing module.

Note that when normalization was first introduced we discussed how loosely the terms "standardization" and "normalization" are used; in most settings they need not be distinguished, but in sklearn each name denotes a distinct family of transforms, so the distinction matters here.
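
As a quick refresher, a minimal numpy sketch of both transforms on a toy column (the array x is purely illustrative):

import numpy as np

x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])        # toy feature column

z_score = (x - x.mean()) / x.std()             # Z-Score: zero mean, unit variance
min_max = (x - x.min()) / (x.max() - x.min())  # 0-1 scaling: squeeze into [0, 1]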

Standardization

sklearn's standardization covers both Z-Score and 0-1 scaling, and each can be performed either through utility functions or through estimators (transformers). Below we walk through the different implementations:

  • Estimator-based Z-Score standardization

Standardizing via the utility functions is easy to read but poorly suited to many real modeling scenarios, for two reasons. First, after splitting the data into training and test sets, we must compute the mean and variance on the training set alone and then apply those same statistics to standardize the test set, so a separate statistics-gathering step is needed anyway. Second, sklearn's estimators can be chained: the Pipeline tool strings multiple estimators together into a single machine-learning workflow, greatly reducing the code required when a model is reused. Standardizing through an estimator is therefore the more general choice, as the sketches further below illustrate.

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()   # fit() needs data; fit_transform(data) is called once the features are extracted below

Separate the features from the label:

# Extract the feature data and the label data
cols = [i for i in dry.columns if i not in ['Class']]   # feature names, excluding the label column
print(cols)
['Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength', 'AspectRation', 'Eccentricity', 'ConvexArea', 'EquivDiameter', 'Extent', 'Solidity', 'roundness', 'Compactness', 'ShapeFactor1', 'ShapeFactor2', 'ShapeFactor3', 'ShapeFactor4']
data = dry[cols]
data.head()
    Area  Perimeter  MajorAxisLength  ...  ShapeFactor2  ShapeFactor3  ShapeFactor4
0  28395    610.291       208.178117  ...      0.003147      0.834222      0.998724
1  28734    638.018       200.524796  ...      0.003564      0.909851      0.998430
2  29380    624.110       212.826130  ...      0.003048      0.825871      0.999066
3  30008    645.884       210.557999  ...      0.003215      0.861794      0.994199
4  30140    620.134       201.847882  ...      0.003665      0.941900      0.999166

[5 rows x 16 columns]
target = dry["Class"]
target.head()
0    SEKER
1    SEKER
2    SEKER
3    SEKER
4    SEKER
Name: Class, dtype: object
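
The label is still a string column, and most sklearn models want it numeric. The original stops at extraction, but a minimal sketch of encoding it with LabelEncoder might look like:

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(target)   # maps the 7 variety names to integers 0..6
le.classes_                    # the original class names, in encoded order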
data_Standard = scaler.fit_transform(data)
data_Standard
array([[-0.84074853, -1.1433189 , -1.30659814, ...,  2.4021726 ,
         1.92572347,  0.83837102],
       [-0.82918764, -1.01392388, -1.39591111, ...,  3.10089364,
         2.68970162,  0.77113831],
       [-0.80715717, -1.07882906, -1.25235661, ...,  2.23509111,
         1.84135576,  0.91675506],
       ...,
       [-0.37203825, -0.44783294, -0.45047814, ...,  0.28920501,
         0.33632829,  0.39025106],
       [-0.37176543, -0.42702856, -0.42897404, ...,  0.22837456,
         0.2489734 ,  0.03644007],
       [-0.37135619, -0.38755718, -0.2917356 , ..., -0.12777538,
        -0.27648141,  0.71371941]])
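
To make the estimator-based workflow discussed earlier concrete, here is a minimal sketch; the 70/30 split, the random seed, and the LogisticRegression model are illustrative choices, not from the original. The scaler is fit on the training set only so its statistics can be reused on the test set, and a Pipeline then bundles scaling and modeling into a single estimator:

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.3, random_state=42)

# Fit on the training set only, then reuse the training statistics on the test set
scaler = StandardScaler().fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

# Or chain scaling and model into one object that can be fit, scored, and reused as a unit
pipe = Pipeline([("scaler", StandardScaler()),
                 ("clf", LogisticRegression(max_iter=1000))])
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)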
Normalization

sklearn's Normalizer rescales each sample (row) to unit norm, L2 by default:

from sklearn.preprocessing import Normalizer
normalizer = Normalizer()
data_normalized = normalizer.fit_transform(data)
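
A quick sanity check (illustrative) that every row now has unit L2 norm:

np.linalg.norm(data_normalized, axis=1)   # should print an array of 1.0 for every sample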