【python机器学习手册】第四章处理数值型数据

最新推荐文章于 2023-01-10 13:55:45 发布

着凉xxx

最新推荐文章于 2023-01-10 13:55:45 发布

阅读量251

点赞数 1

文章标签： python 机器学习

本文链接：https://blog.csdn.net/weixin_48040339/article/details/110006532

版权

本文详细探讨了在Python中进行机器学习时如何有效地处理数值型数据，包括数据预处理、缺失值处理、异常值检测和标准化等关键步骤，旨在提升模型的准确性和稳定性。

摘要由CSDN通过智能技术生成

# 4.1 Rescale a feature with min-max scaling so the minimum maps to 0 and the maximum to 1.
import numpy as np
from sklearn import preprocessing
feature=np.array([[-500.5],# small hand-made single-column example
                [-100.1],
                [0],
                [100.1],
                [900.9]])
# (0,1) is the target range; copy=False asks the scaler to work in place
# where possible instead of on a copy of the input array.
minmax_scale=preprocessing.MinMaxScaler((0,1),copy=False)
scaled_feature=minmax_scale.fit_transform(feature)# learn min/max from feature, then rescale it
print(scaled_feature)

[[0.        ]
 [0.28571429]
 [0.35714286]
 [0.42857143]
 [1.        ]]

# 4.2 Standardize a feature (suited to data without many extreme values).
x=np.array([[-1000.1],# small hand-made single-column example
           [-200.2],
            [500.5],
            [600.6],
            [9000.9]])
scaler=preprocessing.StandardScaler()# create the scaler that defines the transformation
standardized=scaler.fit_transform(x)# fit on x, then transform it
standardized

array([[-0.76058269],
       [-0.54177196],
       [-0.35009716],
       [-0.32271504],
       [ 1.97516685]])

# When the data has many extreme values, scaling by the median and the interquartile range is a better fit.
robust_scaler=preprocessing.RobustScaler()# create the robust scaler, which is less sensitive to outliers
robust_scaler.fit_transform(x)# fit on x, then transform it

array([[-1.87387612],
       [-0.875     ],
       [ 0.        ],
       [ 0.125     ],
       [10.61488511]])

# 4.3 Normalize each observation (row) to unit norm.
from sklearn.preprocessing import Normalizer
y=np.array([[0.5,0.5],
            [1.1,3.4],
            [1.5,20.2],
            [1.53,34.4],
            [10.9,3.3]])
normalizer=Normalizer(norm="l2")# norm accepts "l1", "l2" or "max"
normalizer.fit_transform(y)# l2: divide each row by the square root of the sum of its squared values

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04443282, 0.99901237],
       [0.95709822, 0.28976368]])

z=np.array([[0.5,0.5],
            [1.1,3.4],
            [1.5,20.2],
            [1.53,34.4],
            [10.9,3.3]])
normalizer=Normalizer(norm="l1")# norm accepts "l1", "l2" or "max"
# l1: each row is rescaled so that the absolute values of its entries sum to 1
normalizer.fit_transform(z)

array([[0.5       , 0.5       ],
       [0.24444444, 0.75555556],
       [0.06912442, 0.93087558],
       [0.0425828 , 0.9574172 ],
       [0.76760563, 0.23239437]])

# 4.4 When a feature relates non-linearly to the target, generate polynomial features.
from sklearn.preprocessing import PolynomialFeatures
features=np.array([[2,3],
                  [2,3],
                  [2,3]])
polynomial=PolynomialFeatures(degree=2,include_bias=False)# degree sets the highest polynomial order; include_bias=False omits the constant column
polynomial.fit_transform(features)# five columns: a, b, a^2, a*b, b^2

array([[2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.]])

polynomial=PolynomialFeatures(degree=2,include_bias=False,interaction_only=True)# see the note on the last line of this cell
polynomial.fit_transform(features)# three columns: a, b, a*b
# interaction_only defaults to False; when True no feature is combined with itself, so the squared terms a^2 and b^2 are dropped.

array([[2., 3., 6.],
       [2., 3., 6.],
       [2., 3., 6.]])

polynomial=PolynomialFeatures(degree=3,include_bias=False,interaction_only=True)# interaction terms only, up to degree 3
polynomial.fit_transform(features)# columns: a, b, a*b (with only two features there is no third-order interaction term)

array([[2., 3., 6.],
       [2., 3., 6.],
       [2., 3., 6.]])

polynomial=PolynomialFeatures(degree=3,include_bias=False)# full polynomial expansion up to degree 3
polynomial.fit_transform(features)# columns: a, b, a^2, a*b, b^2, a^3, a^2*b, a*b^2, b^3 (same two inputs, just higher-order terms)

array([[ 2.,  3.,  4.,  6.,  9.,  8., 12., 18., 27.],
       [ 2.,  3.,  4.,  6.,  9.,  8., 12., 18., 27.],
       [ 2.,  3.,  4.,  6.,  9.,  8., 12., 18., 27.]])

# 4.6 Detecting outliers
# EllipticEnvelope: detection driven by an assumed proportion of outliers
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs
x,y=make_blobs(n_samples=10,# x is the feature matrix, y the label vector
              n_features=2,
              centers=1,# generate a single cluster
              random_state=1)
x[0,0]=10000# overwrite the first observation with extreme values
x[0,1]=10000
outlier=EllipticEnvelope(contamination=0.1)# contamination: expected proportion of outliers among the observations
outlier.fit(x)
outlier.predict(x)# predict rather than transform: -1 marks an outlier, 1 an inlier

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

array([[ 1.00000000e+04,  1.00000000e+04],
       [-2.76017908e+00,  5.55121358e+00],
       [-1.61734616e+00,  4.98930508e+00],
       [-5.25790464e-01,  3.30659860e+00],
       [ 8.52518583e-02,  3.64528297e+00],
       [-7.94152277e-01,  2.10495117e+00],
       [-1.34052081e+00,  4.15711949e+00],
       [-1.98197711e+00,  4.02243551e+00],
       [-2.18773166e+00,  3.33352125e+00],
       [-1.97451969e-01,  2.34634916e+00]])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

# IQR-based detection: flag values lying more than 1.5 IQRs outside the middle 50% of the data.
# NOTE(review): at this point `features` is the 3x2 array built in section 4.4, while the
# data with the injected extreme value from section 4.6 is `x` — confirm which one was intended.
ff=features[:,0]
def out(x):
    """Return the indices of the IQR-rule outliers in ``x``.

    A value is flagged when it lies more than 1.5 interquartile ranges
    below the first quartile or above the third quartile.

    Args:
        x: Numeric array-like; it is converted with ``np.asarray`` so plain
            lists are accepted as well as arrays.

    Returns:
        The tuple produced by ``np.where`` holding the indices of the
        flagged values.
    """
    x=np.asarray(x)
    q1,q3=np.percentile(x,[25,75])# first and third quartiles
    iqr=q3-q1# the IQR is the distance between the quartiles, not their product
    lower=q1-1.5*iqr# lower fence: 1.5 IQRs below the first quartile
    upper=q3+1.5*iqr# upper fence: 1.5 IQRs above the third quartile
    return np.where((x>upper)|(x<lower))# | is the element-wise "or"
out(ff)# indices of the entries of ff that out() flags as outliers

(array([0, 1], dtype=int64),)

# 4.7 Handling outliers
# Approach 1: simply drop the rows that fail the filter condition.
# This cell builds the example table that the filter is applied to.
import pandas as pd
houses = pd.DataFrame(
    {
        "Price": [534433, 392333, 293222, 4322032],
        "Bathrooms": [2, 3.5, 2, 116],
        "Square_Feet": [1500, 2500, 1500, 48000],
    }
)
houses

	Price	Bathrooms	Square_Feet
0	534433	2.0	1500
1	392333	3.5	2500
2	293222	2.0	1500
3	4322032	116.0	48000

# Show only the rows with fewer than 20 bathrooms; the result is not assigned, so houses itself is unchanged.
houses[houses["Bathrooms"]<20]

	Price	Bathrooms	Square_Feet
0	534433	2.0	1500
1	392333	3.5	2500
2	293222	2.0	1500

# Approach 2: flag the outliers and keep the flag as an extra feature.
import numpy as np
houses["Outlier"]=np.where(houses["Bathrooms"]<20,0,1)# 0 when Bathrooms < 20, otherwise 1
houses

	Price	Bathrooms	Square_Feet	Outlier
0	534433	2.0	1500	0
1	392333	3.5	2500	0
2	293222	2.0	1500	0
3	4322032	116.0	48000	1

# Approach 3: transform the feature to dampen the influence of the outlier.
x=houses["Square_Feet"]# note: this rebinds the module-level name x
houses["log"]=np.log(x)# natural logarithm of the square footage
houses

	Price	Bathrooms	Square_Feet	Outlier	log
0	534433	2.0	1500	0	7.313220
1	392333	3.5	2500	0	7.824046
2	293222	2.0	1500	0	7.313220
3	4322032	116.0	48000	1	10.778956

# 4.8 Discretizing features
# Binarize a numeric feature against a single threshold.
from sklearn.preprocessing import Binarizer
age=np.array([[6],# small hand-made single-column example
             [12],
             [20],
             [36],
              [65]])
# Pass the threshold by keyword: recent scikit-learn releases make the
# constructor arguments keyword-only, so Binarizer(18) raises a TypeError.
# Values strictly greater than 18 become 1; values <= 18 become 0.
binarizer=Binarizer(threshold=18)
binarizer.fit_transform(age)# apply the threshold to every entry of age

array([[0],
       [0],
       [1],
       [1],
       [1]])

# Discretize a numeric feature against several thresholds.
# np.digitize returns an array with the same shape as age in which every
# entry is the index of the bin that the corresponding value falls into.
# With right=False each bin is closed on the left and open on the right
# (bins[i-1] <= x < bins[i]); right=True instead closes the right edge
# (bins[i-1] < x <= bins[i]).
np.digitize(age,bins=[20,30,64],right=False)

array([[0],
       [0],
       [1],
       [2],
       [3]], dtype=int64)

# 4.9 Grouping observations by clustering
### The k-means procedure:
# 1. Randomly pick k initial cluster centres.
# 2. Assign every observation to the nearest centre, which splits the data into k groups.
# 3. Recompute each centre from the observations assigned to it.
# 4. Repeat steps 2 and 3 until the centres stop changing.
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
x,y=make_blobs(n_samples=50,
              n_features=2,
              centers=3,# number of clusters to generate
              random_state=1)
plt.scatter(x[:,0],x[:,1],marker='o',c=y)# scatter the two feature columns, coloured by the generated label

<matplotlib.collections.PathCollection at 0x1c86553e820>

在这里插入图片描述

dataframe=pd.DataFrame(x,columns=["x_1","x_2"])# wrap the feature matrix in a DataFrame; the two columns are named x_1 and x_2
dataframe.head(5)

	x_1	x_2
0	-9.877554	-3.336145
1	-7.287210	-8.353986
2	-6.943061	-7.023744
3	-7.440167	-8.791959
4	-6.641388	-8.075888

clusterer=KMeans(3,random_state=0)# split the observations into three clusters; random_state fixes the random centre initialisation
clusterer.fit(x)
dataframe["group"]=clusterer.predict(x)# cluster index assigned to each observation (outlier detection above used predict as well)
dataframe.head(5)# show the first five rows of the DataFrame

	x_1	x_2	group
0	-9.877554	-3.336145	2
1	-7.287210	-8.353986	0
2	-6.943061	-7.023744	0
3	-7.440167	-8.791959	0
4	-6.641388	-8.075888	0

# 4.10 Deleting observations with missing values
# NumPy
features=np.array([[1,1],
                 [2,2],
                 [3,3],
                 [np.nan,5]])
features[~np.isnan(features).any(axis=1)]# any(axis=1) marks rows containing a NaN; ~ negates the mask so only complete rows are kept

array([[1., 1.],
       [2., 2.],
       [3., 3.]])

# pandas
dataframe=pd.DataFrame(features,columns=["features_1","features_2"])
dataframe.dropna()# drop every row that contains a missing value

	features_1	features_2
0	1.0	1.0
1	2.0	2.0
2	3.0	3.0

# 4.11 Imputing missing values
# (KNN is an option when the data set is not large)
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs
x,y=make_blobs(n_samples=1000,
              n_features=2,
              random_state=1)
scaler=StandardScaler()# create the standardizer
standardized=scaler.fit_transform(x)# standardize the features
true_value=standardized[0,0]# remember the value that is about to be blanked out
standardized[0,0]=np.nan# replace row 0, column 0 with a missing value
# NOTE(review): the classifier below is fitted on the untouched x and predicts the
# class labels y; it never reads `standardized`, so the NaN set above is not filled in.
# If KNN imputation was intended, sklearn.impute.KNNImputer is presumably the tool — confirm.
knn=KNeighborsClassifier()
knn.fit(x,y)
z=knn.predict(x)
z# one predicted label per observation (a 1-D array)

array([0, 1, 0, 1, 2, 2, 2, 2, 0, 1, 2, 2, 0, 0, 0, 2, 1, 0, 2, 2, 1, 1,
       2, 0, 1, 2, 1, 1, 0, 1, 2, 2, 1, 1, 0, 0, 0, 0, 0, 2, 1, 0, 1, 2,
       2, 1, 2, 0, 2, 1, 0, 2, 1, 2, 0, 2, 2, 1, 1, 0, 1, 2, 2, 2, 1, 0,
       0, 0, 0, 2, 2, 1, 2, 2, 1, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 2, 0, 1,
       0, 1, 1, 0, 2, 0, 2, 2, 0, 1, 2, 2, 2, 1, 0, 2, 1, 2, 1, 2, 2, 0,
       1, 2, 1, 0, 1, 1, 0, 1, 2, 2, 2, 0, 1, 2, 2, 2, 0, 1, 0, 0, 0, 2,
       0, 1, 1, 2, 2, 2, 1, 2, 0, 0, 2, 2, 1, 0, 2, 2, 2, 1, 2, 0, 1, 0,
       1, 2, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 2, 2, 1, 1, 1, 1, 1, 1,
       2, 1, 2, 1, 1, 0, 1, 1, 1, 0, 2, 1, 2, 0, 0, 1, 1, 2, 1, 0, 0, 2,
       0, 2, 2, 1, 1, 2, 2, 1, 1, 0, 1, 0, 2, 0, 1, 1, 1, 2, 0, 0, 1, 0,
       0, 2, 1, 2, 0, 1, 0, 2, 2, 1, 2, 0, 2, 0, 0, 2, 0, 1, 1, 0, 0, 1,
       2, 0, 0, 1, 2, 0, 2, 0, 1, 1, 0, 0, 2, 1, 1, 0, 0, 1, 0, 2, 1, 2,
       2, 2, 2, 2, 1, 2, 0, 1, 0, 2, 2, 0, 0, 2, 1, 2, 0, 1, 0, 0, 2, 2,
       0, 1, 1, 2, 2, 0, 0, 2, 1, 0, 1, 1, 2, 0, 1, 1, 1, 2, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 2, 1, 2, 0, 1, 1, 0, 1, 0, 0, 2, 2, 0, 0, 0, 1, 2,
       2, 2, 1, 2, 1, 2, 0, 2, 1, 0, 1, 0, 1, 2, 1, 2, 2, 1, 1, 2, 0, 2,
       2, 1, 2, 0, 2, 2, 2, 2, 0, 2, 2, 1, 1, 1, 2, 1, 1, 2, 2, 0, 2, 0,
       0, 0, 2, 2, 2, 0, 1, 2, 1, 2, 0, 2, 2, 1, 0, 1, 0, 0, 0, 1, 2, 2,
       1, 2, 1, 1, 1, 2, 2, 2, 1, 1, 0, 2, 2, 0, 2, 2, 2, 1, 0, 0, 2, 1,
       1, 2, 0, 0, 0, 1, 2, 1, 0, 0, 1, 0, 2, 2, 2, 0, 1, 1, 1, 2, 1, 1,
       2, 1, 0, 2, 0, 2, 0, 2, 2, 1, 1, 1, 0, 0, 2, 2, 0, 1, 1, 2, 0, 1,
       1, 2, 1, 2, 0, 1, 1, 1, 1, 0, 2, 2, 1, 2, 1, 2, 1, 0, 1, 0, 1, 0,
       0, 2, 0, 2, 1, 2, 0, 0, 0, 0, 2, 1, 1, 0, 2, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 2, 0, 0, 2, 0, 0, 1, 2, 1, 2, 1, 0, 2, 1, 2, 0, 0, 2, 1,
       1, 1, 2, 0, 2, 0, 2, 0, 2, 1, 0, 2, 0, 0, 0, 1, 1, 2, 2, 1, 0, 1,
       2, 1, 1, 1, 1, 0, 2, 2, 0, 2, 2, 2, 2, 1, 1, 2, 2, 0, 0, 2, 1, 2,
       2, 2, 2, 2, 0, 1, 0, 2, 1, 1, 0, 0, 0, 0, 2, 2, 0, 0, 2, 1, 0, 0,
       1, 2, 2, 1, 1, 0, 2, 2, 2, 1, 2, 0, 2, 0, 1, 0, 2, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 0, 2, 1, 1, 0, 1, 2, 1, 0, 0, 0, 2, 0, 0, 1, 2, 0, 2,
       2, 1, 1, 0, 2, 0, 0, 0, 0, 2, 2, 1, 0, 2, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 2, 2, 0, 1, 1, 1, 0, 1, 0, 1, 1, 2, 0, 0, 1, 2, 1, 2, 1, 1,
       1, 0, 2, 1, 0, 1, 1, 2, 1, 0, 2, 2, 0, 0, 0, 2, 0, 0, 1, 0, 1, 1,
       0, 0, 2, 1, 1, 0, 2, 1, 2, 0, 0, 1, 1, 2, 1, 1, 2, 2, 1, 2, 0, 0,
       0, 0, 0, 2, 1, 0, 0, 2, 0, 2, 1, 1, 2, 2, 1, 2, 1, 0, 0, 2, 1, 1,
       0, 0, 0, 2, 1, 2, 2, 2, 0, 0, 2, 1, 2, 2, 2, 0, 0, 1, 2, 0, 0, 0,
       2, 0, 1, 1, 1, 1, 1, 0, 2, 1, 2, 0, 2, 1, 1, 0, 0, 2, 2, 1, 0, 1,
       1, 1, 2, 1, 1, 0, 2, 0, 1, 0, 0, 2, 2, 0, 2, 2, 1, 0, 1, 2, 0, 1,
       2, 1, 1, 2, 0, 2, 1, 1, 1, 2, 1, 1, 1, 2, 0, 0, 2, 0, 2, 2, 0, 2,
       1, 0, 2, 1, 0, 1, 2, 1, 0, 1, 0, 0, 0, 2, 2, 1, 1, 0, 0, 2, 1, 2,
       2, 1, 1, 2, 1, 0, 0, 2, 2, 1, 2, 0, 2, 0, 2, 0, 1, 1, 1, 2, 0, 2,
       1, 0, 2, 2, 2, 2, 1, 2, 0, 2, 2, 1, 0, 1, 0, 1, 1, 1, 0, 1, 2, 0,
       2, 1, 2, 2, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 2, 0, 0, 2, 0, 1, 0, 0,
       1, 1, 0, 2, 0, 0, 0, 1, 1, 1, 0, 0, 1, 2, 1, 0, 2, 2, 0, 0, 1, 1,
       0, 1, 2, 1, 1, 2, 0, 1, 2, 1, 1, 0, 1, 2, 1, 0, 0, 1, 1, 0, 1, 2,
       2, 0, 2, 2, 0, 1, 2, 0, 2, 0, 1, 2, 0, 2, 0, 2, 0, 0, 2, 2, 0, 0,
       1, 0, 1, 0, 0, 0, 2, 0, 0, 2])

from sklearn import neighbors
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs
x,y=make_blobs(n_samples=1000,
              n_features=2,
              random_state=1)
# Fill missing values with the column mean.
# NOTE(review): x is freshly generated here and no entry is set to NaN in this cell,
# so the imputer appears to have nothing to fill — confirm whether `standardized` was intended.
from sklearn.impute import SimpleImputer
mean_imputer=SimpleImputer(strategy="mean")# works column by column
features_mean=mean_imputer.fit_transform(x)
features_mean[0,0]

-3.058372724614996

着凉xxx

关注

1
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
【python机器学习手册】第四章处理数值型数据

#4.1 缩放，最大为1，最小为0import numpy as npfrom sklearn import preprocessingfeature=np.array([[-500.5],#随手建立一个数组 [-100.1], [0], [100.1], [900.9]])minmax_scale=preprocessing.MinMaxScaler((0,1),copy
复制链接

扫一扫