需求:将 pandas DataFrame 中类别型或字符串类型的列编码为整数,并跳过缺失值
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
# Demo frame: 'f1' contains a missing value that must survive encoding untouched.
df = pd.DataFrame({
    'f1': ['a', 'b', 'a', None],
    'f2': ['x', 'y', 'y', 'z'],
})

# One fitted encoder per encoded column, kept so the integer codes can be
# mapped back to the original labels through `categories_`.
ordinal_enc_dict = {}
for col_name in df.columns:
    col = df[col_name]
    dtype = col.dtype

    # Only categorical / string-like columns are encoded; numeric, boolean,
    # datetime, ... columns are left exactly as they are.
    is_categorical_or_string = (
        isinstance(dtype, pd.CategoricalDtype)
        or pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
    )
    if not is_categorical_or_string:
        continue

    not_null = col.notnull()
    if not not_null.any():
        # Nothing to fit on: the encoder rejects an input with zero samples.
        continue

    # OrdinalEncoder emits float64 by default; request integer codes instead.
    # This is safe because missing values are filtered out before fitting.
    encoder = OrdinalEncoder(dtype=np.int64)
    # to_numpy() (rather than .values) also yields a plain ndarray for
    # categorical / extension dtypes, so reshape is always available.
    values = col[not_null].to_numpy().reshape(-1, 1)
    # ravel() keeps the result 1-D even when only one row is non-null.
    codes = encoder.fit_transform(values).ravel()

    # Write the codes into an object-dtype copy so integer codes and the
    # original missing markers can coexist regardless of the source dtype
    # (a categorical or dedicated string column would reject integers).
    encoded = col.astype(object)
    encoded.loc[not_null] = codes
    df[col_name] = encoded
    ordinal_enc_dict[col_name] = encoder

print(df)
#      f1 f2
# 0     0  0
# 1     1  1
# 2     0  1
# 3  None  2

# Show the label -> code mapping learned for every encoded column
# (a label's code is its position in the array).
for col_name, encoder in ordinal_enc_dict.items():
    print(col_name, encoder.categories_)
# f1 [array(['a', 'b'], dtype=object)]
# f2 [array(['x', 'y', 'z'], dtype=object)]