# --- Min-max scaling: rescale each feature into a target range ---
from sklearn.preprocessing import MinMaxScaler

data = [[1, 2], [2, 4], [3, 2], [4, 3]]

# Instantiate with the default range [0, 1]; fit() learns the
# per-feature min and max (equivalent one-step call: fit_transform()).
scaler = MinMaxScaler()
scaler = scaler.fit(data)
result = scaler.transform(data)  # export the scaled values via the transform interface
result
# inverse_transform undoes the scaling and recovers the original values.
scaler.inverse_transform(result)

# The feature_range parameter normalizes to a range other than [0, 1].
data = [[1, 2], [2, 4], [3, 2], [4, 3]]
scaler = MinMaxScaler(feature_range=[5, 10])
result = scaler.fit_transform(data)  # fit and transform in a single call
result
# When X has very many features, fit() can fail because the whole dataset
# is too large to compute in one pass; in that case use partial_fit as the
# (incremental) training interface instead.
scaler = scaler.partial_fit(data)  # incremental fit: learns min/max batch by batch when fit() on the full data is infeasible
# 1.2  Data standardization (standardization is usually the preferred choice)
# preprocessing.StandardScaler
# Standardization (Z-score normalization): center to mean 0, scale to unit variance.
# --- Z-score standardization demo ---
from sklearn.preprocessing import StandardScaler

data = [[1, 2], [2, 4], [3, 2], [4, 3]]

scaler = StandardScaler()
# fit() essentially computes the per-feature mean and variance.
scaler = scaler.fit(data)
result = scaler.transform(data)  # standardized output
result
import pandas as pd

# Load the course dataset, using column 0 as the row index.
data = pd.read_csv(
    r"F:\计算机学习资料\机器学习b站菜菜\【机器学习】菜菜的sklearn课堂(1-12全课)\03数据预处理和特征工程\Narrativedata.csv",
    index_col=0,
)
data.head()
data.info()  # inspect dtypes, non-null counts, and memory usage
# 2.1  impute.SimpleImputer — filling in missing values
# Signature: sklearn.impute.SimpleImputer(missing_values=nan, strategy='mean',
#            fill_value=None, verbose=0, copy=True)
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns (every column except the first and last).
x = data.iloc[:, 1:-1]
enc = OneHotEncoder(categories='auto').fit(x)
# transform() returns a sparse matrix; toarray() densifies it into an ndarray.
result = enc.transform(x).toarray()
result
# NOTE(review): the original fused the import onto the end of the assignment
# line, which is a syntax error; the two statements are split apart here.
from sklearn.preprocessing import KBinsDiscretizer

# Reshape the first column into the 2-D (n_samples, 1) shape sklearn expects.
# NOTE(review): `newdata` is not defined in this file chunk — presumably the
# encoded DataFrame produced in an earlier step; confirm upstream.
x = newdata.iloc[:, 0].values.reshape(-1, 1)

# Discretize the continuous feature into 3 equal-width ('uniform') bins,
# encoded as ordinal integers 0..2.
est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
est.fit_transform(x)