1.二值化
- 大于阈值的值映射为1,而小于或等于阈值的值映射为0。 默认阈值为0时,特征中所有的正值都映射到1。
- 常用于文本计数统计
- sklearn.preprocessing.Binarizer
以此为例
from sklearn.preprocessing import Binarizer
# Take the first column of data_2 and reshape it into a 2-D single-column array:
# the transformer works on feature matrices, so a 1-D array cannot be passed.
X = data_2.iloc[:,0].values.reshape(-1,1)
# Use 30 as the cut-off: values above 30 map to 1, values at or below 30 map to 0.
# NOTE(review): `transformer` holds the transformed array, not the Binarizer object.
transformer = Binarizer(threshold=30).fit_transform(X)
2.分段
- from sklearn.preprocessing import KBinsDiscretizer
- n_bins分箱的个数默认为5
- encode:[“onehot”,“ordinal”,“onehot-dense”],默认“onehot”
“ordinal”:每个特征的每个箱都被编码为一个整数,返回的矩阵中每一列是一个特征,每个特征下含有不同整数编码的箱。
- strategy:用来定义箱宽的方式,默认是“quantile”
“uniform”:等宽分箱,例如 20-40-60
“quantile”:等位分箱,每个箱子内的样本数量都相同
“kmeans”:按聚类分箱
from sklearn.preprocessing import KBinsDiscretizer
# Take column 0 (the age column, according to the original note) and reshape it
# into a 2-D single-column array.
X = data.iloc[:,0].values.reshape(-1,1)
# Three equal-width bins; each bin is encoded as an integer bin index.
est = KBinsDiscretizer(n_bins=3,encode='ordinal',strategy='uniform')
# Fit on X and display the binned column.
est.fit_transform(X)
array([[0.],
[1.],
[0.],
[1.],
[1.],
[1.],
[2.],
[0.],
[1.],
[0.],
[0.],
[2.],
[0.],
[1.],
[0.],
[2.],
[0.],
[1.],
[1.],
[1.],
[1.],
[1.],
[0.],
[1.],
[0.],
[1.],
[1.],
[0.],
[1.],
[1.],
[1.],
[1.],
[1.],
[2.],
[1.],
[1.],
[1.],
[0.],
[0.],
[0.],
[1.],
[1.],
[1.],
[0.],
[0.],
[1.],
[1.],
[1.],
[1.],
[0.],
[0.],
[0.],
[1.],
[1.],
[2.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[1.],
[0.],
[1.],
[1.],
[1.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[0.],
[1.],
[0.],
[1.],
[1.],
[0.],
[1.],
[0.],
[1.],
[1.],
[1.],
[0.],
[1.],
[0.],
[1.],
[0.],
[0.],
[1.],
[0.],
[1.],
[0.],
[2.],
[1.],
[2.],
[0.],
[1.],
[1.],
[1.],
[1.],
[0.],
[1.],
[1.],
[1.],
[0.],
[1.],
[1.],
[1.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[2.],
[1.],
[0.],
[0.],
[0.],
[1.],
[1.],
[1.],
[2.],
[0.],
[1.],
[0.],
[1.],
[1.],
[1.],
[0.],
[1.],
[1.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[1.],
[1.],
[1.],
[0.],
[2.],
[1.],
[1.],
[1.],
[0.],
[1.],
[1.],
[1.],
[1.],
[1.],
[0.],
[0.],
[0.],
[0.],
[1.],
[1.],
[1.],
[1.],
[2.],
[0.],
[0.],
[0.],
[2.],
[0.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[0.],
[0.],
[0.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[0.],
[0.],
[0.],
[1.],
[2.],
[1.],
[1.],
[1.],
[0.],
[1.],
[1.],
[1.],
[1.],
[0.],
[0.],
[1.],
[0.],
[0.],
[1.],
[0.],
[1.],
[0.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[0.],
[1.],
[1.],
[1.],
[1.],
[0.],
[0.],
[0.],
[0.],
[1.],
[1.],
[1.],
[2.],
[0.],
[0.],
[1.],
[1.],
[0.],
[0.],
[1.],
[1.],
[1.],
[1.],
[0.],
[1.],
[1.],
[0.],
[0.],
[1.],
[2.],
[1.],
[1.],
[2.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[0.],
[1.],
[1.],
[1.],
[1.],
[0.],
[0.],
[2.],
[1.],
[1.],
[0.],
[1.],
[1.],
[1.],
[2.],
[1.],
[1.],
[0.],
[1.],
[2.],
[1.],
[0.],
[0.],
[1.],
[1.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[1.],
[0.],
[0.],
[1.],
[1.],
[1.],
[1.],
[0.],
[1.],
[1.],
[0.],
[1.],
[0.],
[1.],
[1.],
[0.],
[0.],
[0.],
[1.],
[1.],
[0.],
[0.],
[2.],
[1.],
[1.],
[0.],
[1.],
[1.],
[0.],
[1.],
[1.],
[2.],
[1.],
[1.],
[0.],
[1.],
[1.],
[1.],
[0.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[0.],
[0.],
[1.],
[0.],
[1.],
[0.],
[1.],
[1.],
[0.],
[1.],
[0.],
[1.],
[0.],
[0.],
[1.],
[1.],
[0.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[2.],
[1.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[1.],
[0.],
[0.],
[1.],
[0.],
[1.],
[1.],
[1.],
[0.],
[0.],
[1.],
[1.],
[0.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[1.],
[1.],
[0.],
[1.],
[1.],
[0.],
[0.],
[1.],
[0.],
[1.],
[1.],
[0.],
[0.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[0.],
[1.],
[0.],
[1.],
[0.],
[1.],
[1.],
[0.],
[1.],
[1.],
[0.],
[1.],
[1.],
[1.],
[1.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[2.],
[1.],
[1.],
[0.],
[0.],
[1.],
[1.],
[0.],
[0.],
[1.],
[0.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[2.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[2.],
[1.],
[0.],
[1.],
[1.],
[1.],
[0.],
[0.],
[1.],
[1.],
[1.],
[0.],
[0.],
[0.],
[1.],
[1.],
[2.],
[0.],
[1.],
[1.],
[2.],
[1.],
[0.],
[1.],
[0.],
[2.],
[2.],
[0.],
[1.],
[2.],
[1.],
[0.],
[0.],
[0.],
[0.],
[1.],
[1.],
[0.],
[0.],
[1.],
[1.],
[1.],
[0.],
[1.],
[1.],
[1.],
[2.],
[0.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[0.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[0.],
[0.],
[1.],
[0.],
[1.],
[1.],
[0.],
[1.],
[1.],
[1.],
[0.],
[1.],
[0.],
[0.],
[1.],
[1.],
[2.],
[0.],
[1.],
[1.],
[0.],
[0.],
[1.],
[1.],
[0.],
[0.],
[2.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[0.],
[0.],
[1.],
[1.],
[1.],
[2.],
[1.],
[1.],
[1.],
[0.],
[0.],
[1.],
[1.],
[1.],
[1.],
[0.],
[1.],
[2.],
[1.],
[1.],
[0.],
[1.],
[2.],
[0.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[0.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[0.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[1.],
[1.],
[0.],
[0.],
[0.],
[2.],
[2.],
[0.],
[0.],
[1.],
[2.],
[1.],
[1.],
[1.],
[0.],
[1.],
[1.],
[1.],
[1.],
[1.],
[0.],
[0.],
[0.],
[1.],
[0.],
[1.],
[0.],
[2.],
[1.],
[0.],
[1.],
[0.],
[0.],
[1.],
[0.],
[0.],
[1.],
[1.],
[0.],
[2.],
[1.],
[1.],
[1.],
[1.],
[0.],
[1.],
[0.],
[1.],
[1.],
[1.],
[1.],
[1.],
[2.],
[1.],
[1.],
[0.],
[0.],
[0.],
[1.],
[1.],
[1.],
[1.],
[0.],
[0.],
[2.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[1.],
[0.],
[2.],
[1.],
[1.],
[1.],
[1.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[1.],
[1.],
[1.],
[0.],
[1.],
[0.],
[1.],
[1.],
[1.],
[1.],
[0.],
[1.],
[1.],
[1.],
[1.],
[0.],
[0.],
[1.],
[1.],
[1.],
[0.],
[1.],
[1.],
[0.],
[0.],
[1.],
[0.],
[1.],
[0.],
[0.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[0.],
[0.],
[1.],
[2.],
[0.],
[1.],
[0.],
[1.],
[0.],
[0.],
[1.],
[0.],
[1.],
[0.],
[1.],
[0.],
[1.],
[1.],
[1.],
[1.],
[0.],
[1.],
[0.],
[1.],
[1.],
[1.],
[1.],
[1.],
[0.],
[1.],
[2.],
[1.],
[2.],
[0.],
[1.],
[0.],
[1.],
[1.],
[0.],
[0.],
[1.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[1.],
[0.],
[1.],
[1.],
[0.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[0.],
[0.],
[1.],
[1.],
[1.],
[0.],
[1.],
[1.],
[0.],
[1.],
[1.],
[0.],
[1.],
[1.],
[0.],
[1.],
[1.],
[0.],
[1.],
[1.],
[1.],
[1.],
[0.],
[1.],
[1.],
[0.],
[1.],
[0.],
[0.],
[1.],
[0.],
[0.],
[1.],
[0.],
[1.],
[1.],
[1.],
[0.],
[0.],
[1.],
[1.],
[0.],
[1.],
[1.],
[1.],
[1.],
[1.],
[0.],
[2.],
[0.],
[0.],
[1.],
[0.],
[1.],
[1.],
[0.],
[1.],
[1.],
[0.],
[1.],
[1.],
[0.],
[1.],
[1.],
[1.],
[1.],
[0.],
[0.],
[1.],
[1.],
[1.],
[1.],
[0.],
[0.],
[0.],
[1.],
[2.],
[0.],
[1.],
[0.],
[1.],
[0.],
[1.],
[1.],
[0.],
[1.],
[0.],
[1.]])
# Inspect the bins after the transform: one column that holds three bin labels.
# ravel() flattens the 2-D result to 1-D so the distinct labels can be collected in a set.
set(est.fit_transform(X).ravel())
{0.0, 1.0, 2.0}
est = KBinsDiscretizer(n_bins=3, encode='onehot', strategy='uniform')
# With n_bins=3 every sample becomes a 3-element indicator row such as [1, 0, 0].
# Inspect the bins after the transform: they are now dummy (one-hot) variables.
# toarray() turns the returned matrix into a dense array for display.
est.fit_transform(X).toarray()
array([[1., 0., 0.],
[0., 1., 0.],
[1., 0., 0.],
...,
[0., 1., 0.],
[1., 0., 0.],
[0., 1., 0.]])