# Similar in spirit to pandas DataFrame[col].value_counts()
# Count the distinct values held by every categorical column
object_nunique = [X_train[col].nunique() for col in object_cols]
d = dict(zip(object_cols, object_nunique))
# Show the (column, unique-count) pairs, fewest distinct values first
sorted(d.items(), key=lambda item: item[1])
哑变量处理,指定要处理的特征,以及列前缀
# Dummy-encode the listed features one at a time.  Each pass replaces the
# column `fea` in `df` with its indicator columns, using the feature name as
# the column prefix so the generated names stay distinguishable.
for fea in feature_cols:
    df = pd.get_dummies(df, columns=[fea], prefix=fea)
如下格式尽量避免使用,因为在 join 的时候有 bug
# get_dummies() emits one column per distinct value found in the source
# column (one column stands for one value), much like one-hot encoding.
embark_dummies = pd.get_dummies(train_data['Embarked'])
# Attach the indicator columns, then discard the original 'Embarked' column.
train_data = train_data.join(embark_dummies).drop(columns=['Embarked'])
# Re-select the three indicator columns from the joined frame for a preview.
embark_dummies = train_data[['S', 'C', 'Q']]
embark_dummies.head()
categorical_features = 'all',这个参数指定了对哪些特征进行编码,默认对所有类别都进行编码。也可以自己指定选择哪些特征,通过索引或者 bool 值来指定(注意:该参数在 scikit-learn 0.20 中已弃用、0.22 中已移除,新版本需改用 ColumnTransformer 来指定要编码的列),看下例:
# -*- coding: utf-8 -*-
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# OneHotEncoder's `categorical_features` argument was deprecated in
# scikit-learn 0.20 and removed in 0.22, so column selection is done with
# ColumnTransformer instead.  Columns 0 and 2 are one-hot encoded (the same
# selection as the boolean mask [True, False, True]); column 1 is passed
# through unchanged and placed after the encoded columns, which matches the
# layout the old argument produced.
# sparse_threshold=0 makes transform() return a dense ndarray directly, so
# no .toarray() call is needed.
enc = ColumnTransformer(
    [('onehot', OneHotEncoder(), [0, 2])],
    remainder='passthrough',
    sparse_threshold=0,
)
# NumPy arrays (rather than nested lists) keep the passthrough column numeric,
# so the stacked result is a float array instead of an object array.
enc.fit(np.array([[0, 0, 3],
                  [1, 1, 0],
                  [0, 2, 1],
                  [1, 0, 2]]))
# Column 0 has categories {0, 1} and column 2 has {0, 1, 2, 3}, so the row
# [0, 2, 3] becomes [[1., 0., 0., 0., 0., 1., 2.]].
ans = enc.transform(np.array([[0, 2, 3]]))