from sklearn.impute import SimpleImputer
import pandas as pd
# Load the Titanic-style dataset, using the CSV's first column as the index.
data=pd.read_csv("Narrativedata.csv",index_col=0)
# Preview the first 5 rows: Age / Sex / Embarked / Survived.
data.head()
Age | Sex | Embarked | Survived | |
---|---|---|---|---|
0 | 22.0 | male | S | No |
1 | 38.0 | female | C | Yes |
2 | 26.0 | female | S | Yes |
3 | 35.0 | female | S | Yes |
4 | 35.0 | male | S | No |
1,1 impute.SimpleImputer
- class sklearn.impute.SimpleImputer (missing_values=nan, strategy=’mean’, fill_value=None, verbose=0,
copy=True)
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
Age 714 non-null float64
Sex 891 non-null object
Embarked 889 non-null object
Survived 891 non-null object
dtypes: float64(1), object(3)
memory usage: 34.8+ KB
age 填充
- sklearn 中特征矩阵必须为二维
data.Age.values.shape
(891,)
# sklearn feature matrices must be 2-D, so reshape the 1-D Age values to (n, 1).
Age=data.Age.values.reshape(-1,1)
imp_mean=SimpleImputer()
# The default strategy fills with the mean.
# Here we fill with the median instead:
imp_median=SimpleImputer(strategy="median")
# Fill with the constant 0:
imp_0=SimpleImputer(strategy="constant",fill_value=0)
# imp_mean=imp_mean.fit_transform(Age)
# Mean imputation would introduce fractional ages,
# so the median is used instead (original comment said "mode", but the
# statement below clearly uses imp_median / strategy="median").
imp_median=imp_median.fit_transform(Age)
# imp_median
# Write the imputed values back; Age is now 891 non-null (was 714).
data.Age=imp_median
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
Age 891 non-null float64
Sex 891 non-null object
Embarked 889 non-null object
Survived 891 non-null object
dtypes: float64(1), object(3)
memory usage: 34.8+ KB
# Impute the 2 missing Embarked values with the most frequent category (mode).
Embarked=data.Embarked.values.reshape(-1,1)
imp_mode=SimpleImputer(strategy="most_frequent")
# ravel() flattens the (n, 1) transform output back to 1-D; modern pandas
# rejects a 2-D array assigned to a single column.
data.Embarked=imp_mode.fit_transform(Embarked).ravel()
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
Age 891 non-null float64
Sex 891 non-null object
Embarked 891 non-null object
Survived 891 non-null object
dtypes: float64(1), object(3)
memory usage: 34.8+ KB
# Redo the same cleaning using pure pandas instead of sklearn.
data=pd.read_csv("Narrativedata.csv",index_col=0)
data.head()
Age | Sex | Embarked | Survived | |
---|---|---|---|---|
0 | 22.0 | male | S | No |
1 | 38.0 | female | C | Yes |
2 | 26.0 | female | S | Yes |
3 | 35.0 | female | S | Yes |
4 | 35.0 | male | S | No |
# Fill missing ages with the column median, then confirm Age is fully populated.
data["Age"] = data["Age"].fillna(data["Age"].median())
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
Age 891 non-null float64
Sex 891 non-null object
Embarked 889 non-null object
Survived 891 non-null object
dtypes: float64(1), object(3)
memory usage: 34.8+ KB
# Drop the rows where Embarked is still null. (The original comment said
# "three" rows, but the info() dumps show 891 -> 889, i.e. two rows removed.)
data.dropna(axis=0,inplace=True)
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 4 columns):
Age 889 non-null float64
Sex 889 non-null object
Embarked 889 non-null object
Survived 889 non-null object
dtypes: float64(1), object(3)
memory usage: 34.7+ KB
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 4 columns):
Age 889 non-null float64
Sex 889 non-null object
Embarked 889 non-null object
Survived 889 non-null object
dtypes: float64(1), object(3)
memory usage: 34.7+ KB
2 处理分类型特征:编码与哑变量
在机器学习中,大多数算法,譬如逻辑回归,支持向量机SVM,k近邻算法等都只能够处理数值型数据,不能处理
文字,在sklearn当中,除了专用来处理文字的算法,其他算法在fit的时候全部要求输入数组或矩阵,也不能够导
入文字型数据(其实手写决策树和普斯贝叶斯可以处理文字,但是sklearn中规定必须导入数值型)。
然而在现实中,许多标签和特征在数据收集完毕的时候,都不是以数字来表现的。比如说,学历的取值可以是[“小
学”,“初中”,“高中”,“大学”],付费方式可能包含[“支付宝”,“现金”,“微信”]等等。在这种情况下,为了让数据适
应算法和库,我们必须将数据进行编码,即是说,将文字型数据转换为数值型。
from sklearn.preprocessing import LabelEncoder
# LabelEncoder is for the *target* only: encode the last column (Survived)
# as integer class labels.
label=LabelEncoder().fit_transform(data.iloc[:,-1])
from sklearn.preprocessing import OrdinalEncoder
# Keep a snapshot before encoding (copy() kept from the original notebook).
data=data.copy()
data.head()
Age | Sex | Embarked | Survived | |
---|---|---|---|---|
0 | 22.0 | male | S | No |
1 | 38.0 | female | C | Yes |
2 | 26.0 | female | S | Yes |
3 | 35.0 | female | S | Yes |
4 | 35.0 | male | S | No |
- preprocessing.OrdinalEncoder:特征专用,能够将分类特征转换为分类数值
-类别OrdinalEncoder可以用来处理有序变量,但对于名义变量,我们只有使用哑变量的方式来处理,才能够尽量
向算法传达最准确的信息:
# OrdinalEncoder is for *features*: encode Sex and Embarked (columns 1..-2)
# as ordinal integers, in place.
data.iloc[:,1:-1]=OrdinalEncoder().fit_transform(data.iloc[:,1:-1])
data.head()
Age | Sex | Embarked | Survived | |
---|---|---|---|---|
0 | 22.0 | 1.0 | 2.0 | No |
1 | 38.0 | 0.0 | 0.0 | Yes |
2 | 26.0 | 0.0 | 2.0 | Yes |
3 | 35.0 | 0.0 | 2.0 | Yes |
4 | 35.0 | 1.0 | 2.0 | No |
OrdinalEncoder().fit(data.iloc[:,1:-1]).categories_
[array([0., 1.]), array([0., 1., 2.])]
preprocessing.OneHotEncoder:独热编码,创建哑变量
# Work on a fresh copy for the one-hot-encoding demo.
data2=data.copy()
# Display it. (The original called data2.copy() here, allocating a second
# throwaway DataFrame just to show the same contents.)
data2
Age | Sex | Embarked | Survived | |
---|---|---|---|---|
0 | 22.0 | male | S | No |
1 | 38.0 | female | C | Yes |
2 | 26.0 | female | S | Yes |
3 | 35.0 | female | S | Yes |
4 | 35.0 | male | S | No |
5 | 28.0 | male | Q | No |
6 | 54.0 | male | S | No |
7 | 2.0 | male | S | No |
8 | 27.0 | female | S | Yes |
9 | 14.0 | female | C | Yes |
10 | 4.0 | female | S | Unknown |
11 | 58.0 | female | S | Yes |
12 | 20.0 | male | S | No |
13 | 39.0 | male | S | No |
14 | 14.0 | female | S | No |
15 | 55.0 | female | S | Unknown |
16 | 2.0 | male | Q | No |
17 | 28.0 | male | S | Yes |
18 | 31.0 | female | S | No |
19 | 28.0 | female | C | Yes |
20 | 35.0 | male | S | Unknown |
21 | 34.0 | male | S | Yes |
22 | 15.0 | female | Q | Yes |
23 | 28.0 | male | S | Yes |
24 | 8.0 | female | S | No |
25 | 38.0 | female | S | Unknown |
26 | 28.0 | male | C | No |
27 | 19.0 | male | S | No |
28 | 28.0 | female | Q | Yes |
29 | 28.0 | male | S | No |
... | ... | ... | ... | ... |
861 | 21.0 | male | S | No |
862 | 48.0 | female | S | Yes |
863 | 28.0 | female | S | No |
864 | 24.0 | male | S | Unknown |
865 | 42.0 | female | S | Unknown |
866 | 27.0 | female | C | Unknown |
867 | 31.0 | male | S | No |
868 | 28.0 | male | S | No |
869 | 4.0 | male | S | Yes |
870 | 26.0 | male | S | No |
871 | 47.0 | female | S | Unknown |
872 | 33.0 | male | S | No |
873 | 47.0 | male | S | No |
874 | 28.0 | female | C | Yes |
875 | 15.0 | female | C | Yes |
876 | 20.0 | male | S | No |
877 | 19.0 | male | S | No |
878 | 28.0 | male | S | No |
879 | 56.0 | female | C | Yes |
880 | 25.0 | female | S | Yes |
881 | 33.0 | male | S | No |
882 | 22.0 | female | S | No |
883 | 28.0 | male | S | Unknown |
884 | 25.0 | male | S | No |
885 | 39.0 | female | Q | No |
886 | 27.0 | male | S | No |
887 | 19.0 | female | S | Yes |
888 | 28.0 | female | S | No |
889 | 26.0 | male | C | Unknown |
890 | 32.0 | male | Q | No |
891 rows × 4 columns
from sklearn.preprocessing import OneHotEncoder
# Feature matrix: the two categorical columns, Sex and Embarked.
x=data2.iloc[:,1:-1]
Sex | Embarked | |
---|---|---|
0 | male | S |
1 | female | C |
set(x.Sex)
{'female', 'male'}
# One-hot encode Sex and Embarked. Fit the encoder once and reuse it: the
# original code bound `enc` first to the dummy array, then overwrote it with
# the fitted encoder, so pd.DataFrame(enc) wrapped the *encoder object*
# instead of the dummy columns.
enc=OneHotEncoder(categories="auto").fit(x)
dummies=enc.transform(x).toarray()
enc.get_feature_names()
# -> array(['x0_female', 'x0_male', 'x1_C', 'x1_Q', 'x1_S'], dtype=object)
# Concatenate the dummy-variable columns onto data2 column-wise.
new_data=pd.concat([data2,pd.DataFrame(dummies)],axis=1)
new_data.head()
Age | Sex | Embarked | Survived | 0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|---|---|---|---|
0 | 22.0 | male | S | No | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
1 | 38.0 | female | C | Yes | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 |
2 | 26.0 | female | S | Yes | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
3 | 35.0 | female | S | Yes | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
4 | 35.0 | male | S | No | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
# Drop the original categorical columns, then name all 7 remaining columns:
# Age, Survived, plus the five dummy columns reported by get_feature_names().
new_data.drop(["Sex","Embarked"],axis=1,inplace=True)
new_data.columns=["Age","Survived",'x0_female', 'x0_male', 'x1_C', 'x1_Q', 'x1_S']
new_data.head()
Age | Survived | x0_female | x0_male | x1_C | x1_Q | x1_S | |
---|---|---|---|---|---|---|---|
0 | 22.0 | No | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
1 | 38.0 | Yes | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 |
2 | 26.0 | Yes | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
3 | 35.0 | Yes | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
4 | 35.0 | No | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
# Continuous-feature handling: binarization or binning (discretization).
data_2=data.copy()
data_2.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
Age 891 non-null float64
Sex 891 non-null object
Embarked 891 non-null object
Survived 891 non-null object
dtypes: float64(1), object(3)
memory usage: 34.8+ KB
from sklearn.preprocessing import Binarizer
# Binarize Age at threshold 30: ages > 30 map to 1.0, the rest to 0.0.
ages = data_2.Age.values.reshape(-1, 1)
binarized = Binarizer(threshold=30).fit_transform(ages)
binarized
array([[0.],
[1.],
[0.],
[1.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[1.],
[0.],
[1.],
[0.],
[0.],
[1.],
[0.],
[1.],
[1.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[1.],
[0.],
[1.],
[0.],
[1.],
[1.],
[0.],
[0.],
[0.],
[1.],
[1.],
[0.],
[0.],
[0.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[1.],
[1.],
[0.],
[0.],
[0.],
[0.],
[1.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[1.],
[1.],
[0.],
[1.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[1.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[1.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[1.],
[1.],
[1.],
[0.],
[0.],
[0.],
[1.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[1.],
[1.],
[0.],
[0.],
[1.],
[0.],
[0.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[1.],
[0.],
[1.],
[1.],
[0.],
[0.],
[0.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[1.],
[1.],
[0.],
[0.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[1.],
[1.],
[0.],
[0.],
[1.],
[1.],
[0.],
[1.],
[0.],
[0.],
[1.],
[1.],
[0.],
[0.],
[1.],
[1.],
[0.],
[1.],
[1.],
[0.],
[0.],
[1.],
[1.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[1.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[1.],
[1.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[1.],
[1.],
[1.],
[0.],
[0.],
[1.],
[1.],
[0.],
[0.],
[0.],
[0.],
[1.],
[1.],
[1.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[1.],
[0.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[1.],
[0.],
[1.],
[1.],
[0.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[1.],
[1.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[1.],
[0.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[1.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[1.],
[1.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[1.],
[1.],
[0.],
[0.],
[1.],
[0.],
[0.],
[1.],
[0.],
[1.],
[0.],
[1.],
[1.],
[1.],
[1.],
[0.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[1.],
[1.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[1.],
[0.],
[0.],
[1.],
[1.],
[0.],
[0.],
[0.],
[0.],
[1.],
[1.],
[0.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[1.],
[0.],
[1.],
[1.],
[0.],
[1.],
[1.],
[0.],
[0.],
[0.],
[1.],
[0.],
[1.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[1.],
[1.],
[1.],
[0.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[1.],
[0.],
[1.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[1.],
[1.],
[1.],
[0.],
[0.],
[0.],
[1.],
[1.],
[0.],
[1.],
[0.],
[1.],
[1.],
[1.],
[0.],
[0.],
[1.],
[1.],
[0.],
[0.],
[1.],
[1.],
[1.],
[0.],
[1.],
[1.],
[0.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[1.],
[1.],
[1.],
[0.],
[0.],
[0.],
[1.],
[1.],
[0.],
[0.],
[0.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[0.],
[1.],
[1.],
[0.],
[0.],
[0.],
[1.],
[1.],
[1.],
[0.],
[0.],
[0.],
[1.],
[1.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[1.],
[1.],
[1.],
[1.],
[1.],
[0.],
[1.],
[0.],
[0.],
[1.],
[0.],
[1.],
[1.],
[1.],
[1.],
[0.],
[0.],
[0.],
[0.],
[1.],
[1.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[0.],
[1.],
[1.],
[1.],
[0.],
[1.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[1.],
[1.],
[1.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[1.],
[0.],
[1.],
[0.],
[0.],
[1.],
[0.],
[0.],
[1.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[1.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[1.],
[1.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[1.],
[1.],
[0.],
[1.],
[0.],
[1.],
[0.],
[1.],
[0.],
[1.],
[0.],
[1.],
[0.],
[1.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[1.],
[1.],
[0.],
[0.],
[1.],
[1.],
[0.],
[0.],
[0.],
[1.],
[1.],
[0.],
[1.],
[1.],
[0.],
[1.],
[1.],
[0.],
[1.],
[0.],
[0.],
[1.],
[1.],
[0.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[1.],
[0.],
[1.],
[1.],
[0.],
[0.],
[1.],
[0.],
[1.],
[0.],
[0.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[1.],
[1.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.],
[1.],
[0.],
[0.],
[0.],
[1.],
[0.],
[0.],
[0.],
[0.],
[1.]])
-
preprocessing.KBinsDiscretizer
-
这是将连续型变量划分为分类变量的类,能够将连续型变量排序后按顺序分箱后编码。总共包含三个重要参数
-
n_bins 每个特征中分箱的个数,默认5,一次会被运用到所有导入的特征
-
encode 编码的方式,默认“onehot”
-
“onehot”:做哑变量,之后返回一个稀疏矩阵,每一列是一个特征中的一个类别,含有该
-
类别的样本表示为1,不含的表示为0
-
“ordinal”:每个特征的每个箱都被编码为一个整数,返回每一列是一个特征,每个特征下含
有不同整数编码的箱的矩阵 -
“onehot-dense”:做哑变量,之后返回一个密集数组
-
strategy 用来定义箱宽的方式,默认"quantile"
- 用来定义箱宽的方式,默认"quantile"
- “uniform”:表示等宽分箱,即每个特征中的每个箱的最大值之间的差为
- (特征.max() - 特征.min())/(n_bins)
- “quantile”:表示等位分箱,即每个特征中的每个箱内的样本数量都相同
- “kmeans”:表示按聚类分箱,每个箱中的值到最近的一维k均值聚类的簇心的距离都相同
from sklearn.preprocessing import KBinsDiscretizer
# Discretize Age (column 0) into 3 equal-width bins.
X = data.iloc[:,0].values.reshape(-1,1)
est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
est.fit_transform(X)
# Inspect the bins: a single column whose values are the 3 bin labels.
set(est.fit_transform(X).ravel())
est = KBinsDiscretizer(n_bins=3, encode='onehot', strategy='uniform')
# With onehot encoding the bins become dummy variables (sparse -> dense array).
est.fit_transform(X).toarray()
array([[1., 0., 0.],
[0., 1., 0.],
[1., 0., 0.],
...,
[0., 1., 0.],
[1., 0., 0.],
[0., 1., 0.]])
菜菜的sklearn学习得到 https://live.bilibili.com/12582510