1.创建示例数据集
import pandas as pd

# Toy dataset: one row per item, holding a color, a size, a price and the
# class label the item belongs to.
rows = [
    ['green', 'M', 10.1, 'class1'],
    ['red', 'L', 13.5, 'class2'],
    ['blue', 'XL', 15.3, 'class1'],
]
df = pd.DataFrame(rows, columns=['color', 'size', 'price', 'classlabel'])
结果:
color size price classlabel
0 green M 10.1 class1
1 red L 13.5 class2
2 blue XL 15.3 class1
2.映射序数特征
为了确保机器学习算法能够正确地解读序数特征,需要将表示类别的字符串转换为能体现其大小顺序的整数。
# Ordinal encoding: each size string is replaced by an integer rank so that
# the ordering M < L < XL is preserved in the numeric values.
size_mapping = {
    'XL': 3,
    'L': 2,
    'M': 1,
}
size_codes = df['size'].map(size_mapping)
df['size'] = size_codes
print(df)
结果
color size price classlabel
0 green 1 10.1 class1
1 red 2 13.5 class2
2 blue 3 15.3 class1
3.分类标签编码
import pandas as pd
import numpy as np

# Rebuild the sample dataset so this snippet runs on its own.
rows = [
    ['green', 'M', 10.1, 'class1'],
    ['red', 'L', 13.5, 'class2'],
    ['blue', 'XL', 15.3, 'class1'],
]
df = pd.DataFrame(rows, columns=['color', 'size', 'price', 'classlabel'])

# Ordinal feature: replace each size with its integer rank.
size_mapping = {'XL': 3, 'L': 2, 'M': 1}
df['size'] = df['size'].map(size_mapping)

# Class labels: number the distinct labels starting from 0, in the sorted
# order returned by np.unique.
unique_labels = np.unique(df['classlabel'])
class_mapping = dict(zip(unique_labels, range(len(unique_labels))))
df['classlabel'] = df['classlabel'].map(class_mapping)
print(df)
输出:
color size price classlabel
0 green 1 10.1 0
1 red 2 13.5 1
2 blue 3 15.3 0
也可以将映射字典的键值对反转,把整数标签还原为原始的字符串标签;或者直接使用sklearn中的LabelEncoder来完成分类标签的整数编码:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Rebuild the sample dataset so this snippet runs on its own.
df = pd.DataFrame([
    ['green', 'M', 10.1, 'class1'],
    ['red', 'L', 13.5, 'class2'],
    ['blue', 'XL', 15.3, 'class1']])
df.columns = ['color', 'size', 'price', 'classlabel']

# Ordinal feature: replace each size with its integer rank.
size_mapping = {'XL': 3, 'L': 2, 'M': 1}
df['size'] = df['size'].map(size_mapping)

# Encode the class labels with LabelEncoder. It is fitted on the ORIGINAL
# string labels (not on labels that were already mapped to integers by hand),
# so CL.classes_ holds the class names and CL.inverse_transform(y) can
# recover them.
CL = LabelEncoder()
y = CL.fit_transform(df['classlabel'].values)

# The encoder assigns code i to CL.classes_[i]; expose that as a dict so the
# same label -> integer mapping is available as in the manual approach.
class_mapping = {label: idx for idx, label in enumerate(CL.classes_)}
df['classlabel'] = y
print(y)
输出:
[0 1 0]
4.为名义特征做独热编码
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Rebuild the sample dataset so this snippet runs on its own.
df = pd.DataFrame([
    ['green', 'M', 10.1, 'class1'],
    ['red', 'L', 13.5, 'class2'],
    ['blue', 'XL', 15.3, 'class1']])
df.columns = ['color', 'size', 'price', 'classlabel']

# Ordinal feature: replace each size with its integer rank.
size_mapping = {'XL': 3, 'L': 2, 'M': 1}
df['size'] = df['size'].map(size_mapping)

# Class labels: fit LabelEncoder on the original strings so CL.classes_ keeps
# the class names; class_mapping exposes the learned label -> integer codes.
CL = LabelEncoder()
y = CL.fit_transform(df['classlabel'].values)
class_mapping = {label: idx for idx, label in enumerate(CL.classes_)}
df['classlabel'] = y

# Feature matrix; column 0 (color) is integer-coded in place, as before.
x = df[['color', 'size', 'price']].values
color_le = LabelEncoder()
x[:, 0] = color_le.fit_transform(x[:, 0])

# One-hot encode column 0 only. OneHotEncoder's `categorical_features`
# argument was removed in scikit-learn 0.22, so the column selection is done
# with ColumnTransformer instead. remainder='passthrough' appends the
# untouched size/price columns after the dummy columns, and
# sparse_threshold=0 forces a dense result so no .toarray() call is needed.
ohe = OneHotEncoder()
ct = ColumnTransformer(
    [('color', ohe, [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
# x has object dtype, so cast the stacked result to float for a numeric array.
H = ct.fit_transform(x).astype(float)
print(H)
输出
[[ 0. 1. 0. 1. 10.1]
[ 0. 0. 1. 2. 13.5]
[ 1. 0. 0. 3. 15.3]]