




import pandas as pd

import numpy as np




# Root directory of the chapter's notebook material. Dataset paths are built
# by appending to this prefix, so the trailing slash matters.
file_path = "G:/Py/practical-machine-learning-with-python-master/notebooks/Ch04_Feature_Engineering_and_Selection/"

# Load the video game sales table.
vg_df = pd.read_csv(
    ''.join([file_path, 'datasets/vgsales.csv']),
    encoding='utf-8',
)

# Notebook display expression: rows at positions 1..6 of a few descriptive
# columns.
vg_df.iloc[1:7][['Name', 'Platform', 'Year', 'Genre', 'Publisher']]

Name Platform Year Genre Publisher 1 Super Mario Bros. NES 1985.0 Platform Nintendo 2 Mario Kart Wii Wii 2008.0 Racing Nintendo 3 Wii Sports Resort Wii 2009.0 Sports Nintendo 4 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing Nintendo 5 Tetris GB 1989.0 Puzzle Nintendo 6 New Super Mario Bros. DS 2006.0 Platform Nintendo


# Distinct genre names found in the Genre column, as a NumPy array.
genres = np.unique(vg_df['Genre'].to_numpy())


array(['Action', 'Adventure', 'Fighting', 'Misc', 'Platform', 'Puzzle',

'Racing', 'Role-Playing', 'Shooter', 'Simulation', 'Sports',

'Strategy'], dtype=object)


from sklearn.preprocessing import LabelEncoder

# Create the label encoder.
gle = LabelEncoder()

# Fit it on the Genre column and get one integer code per row.
genre_labels = gle.fit_transform(vg_df['Genre'])

# Code -> genre-name lookup: the position of a class in gle.classes_ is the
# integer code assigned to it.
genre_mappings = dict(enumerate(gle.classes_))

# Notebook display expression: show the mapping.
genre_mappings

{0: 'Action',

1: 'Adventure',

2: 'Fighting',

3: 'Misc',

4: 'Platform',

5: 'Puzzle',

6: 'Racing',

7: 'Role-Playing',

8: 'Shooter',

9: 'Simulation',

10: 'Sports',

11: 'Strategy'}


# Attach the integer genre codes to the frame as a new column.
vg_df['GenreLabel'] = genre_labels

# Notebook display expression: rows at positions 1..6, with the original
# genre next to its encoded label.
vg_df.iloc[1:7][['Name', 'Platform', 'Year', 'Genre', 'GenreLabel']]

Name Platform Year Genre GenreLabel 1 Super Mario Bros. NES 1985.0 Platform 4 2 Mario Kart Wii Wii 2008.0 Racing 6 3 Wii Sports Resort Wii 2009.0 Sports 10 4 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing 7 5 Tetris GB 1989.0 Puzzle 5 6 New Super Mario Bros. DS 2006.0 Platform 4




# Load the Pokemon table, shuffle every row reproducibly (frac=1 keeps all
# rows, random_state fixes the order) and replace the index with a fresh
# 0..n-1 range.
poke_df = (
    pd.read_csv(file_path + 'datasets/Pokemon.csv', encoding='utf-8')
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)

# Notebook display expression: inspect the distinct generation values.
np.unique(poke_df['Generation'])

array(['Gen 1', 'Gen 2', 'Gen 3', 'Gen 4', 'Gen 5', 'Gen 6'], dtype=object)


# Ordinal lookup table: each generation name maps to its 1-based position in
# the list below.
gen_ord_map = {
    name: rank
    for rank, name in enumerate(
        ['Gen 1', 'Gen 2', 'Gen 3', 'Gen 4', 'Gen 5', 'Gen 6'],
        start=1,
    )
}


# Translate every generation name into its ordinal rank via the lookup table.
poke_df['GenerationLabel'] = poke_df['Generation'].map(gen_ord_map)

# Notebook display expression: sample rows at positions 4..9.
poke_df.iloc[4:10][['Name', 'Generation', 'GenerationLabel']]

Name Generation GenerationLabel 4 Octillery Gen 2 2 5 Helioptile Gen 6 6 6 Dialga Gen 4 4 7 DeoxysDefense Forme Gen 3 3 8 Rapidash Gen 1 1 9 Swanna Gen 5 5







# Notebook display expression: the two categorical columns encoded next,
# for rows at positions 4..9.
poke_df.iloc[4:10][['Name', 'Generation', 'Legendary']]

Name Generation Legendary 4 Octillery Gen 2 False 5 Helioptile Gen 6 False 6 Dialga Gen 4 True 7 DeoxysDefense Forme Gen 3 True 8 Rapidash Gen 1 False 9 Swanna Gen 5 False


from sklearn.preprocessing import OneHotEncoder, LabelEncoder


# One label encoder per categorical column. Both are kept around because
# later cells reuse them (class names, transforming new rows).
gen_le = LabelEncoder()
leg_le = LabelEncoder()

# Integer codes for the generation and for the legendary flag.
gen_labels = gen_le.fit_transform(poke_df['Generation'])
leg_labels = leg_le.fit_transform(poke_df['Legendary'])

# Store both encodings next to the original columns.
poke_df['Gen_Label'] = gen_labels
poke_df['Lgnd_Label'] = leg_labels

# Working subset: each raw column followed by its encoded counterpart.
poke_df_sub = poke_df[
    ['Name', 'Generation', 'Gen_Label', 'Legendary', 'Lgnd_Label']
]


Name Generation Gen_Label Legendary Lgnd_Label 4 Octillery Gen 2 1 False 0 5 Helioptile Gen 6 5 False 0 6 Dialga Gen 4 3 True 1 7 DeoxysDefense Forme Gen 3 2 True 1 8 Rapidash Gen 1 0 False 0 9 Swanna Gen 5 4 False 0


# Encode generation labels using the one-hot encoding scheme.
gen_ohe = OneHotEncoder()  # build the encoder
# Fit on the integer labels and densify the encoder output into an array.
gen_feature_arr = gen_ohe.fit_transform(poke_df[['Gen_Label']]).toarray()
# Column names for the indicator columns: the label encoder's class names.
gen_feature_labels = list(gen_le.classes_)
# Indicator columns as a data frame.
gen_features = pd.DataFrame(gen_feature_arr, columns=gen_feature_labels)

# Encode legendary status labels using the one-hot encoding scheme.
leg_ohe = OneHotEncoder()
leg_feature_arr = leg_ohe.fit_transform(poke_df[['Lgnd_Label']]).toarray()
leg_feature_labels = ['Legendary_' + str(cls_label) for cls_label in leg_le.classes_]
leg_features = pd.DataFrame(leg_feature_arr, columns=leg_feature_labels)

# Put the original subset and both sets of indicator columns side by side.
poke_df_ohe = pd.concat([poke_df_sub, gen_features, leg_features], axis=1)

# Display order: every raw/label column followed directly by the indicator
# columns derived from it.
columns = (
    ['Name', 'Generation', 'Gen_Label'] + gen_feature_labels
    + ['Legendary', 'Lgnd_Label'] + leg_feature_labels
)

# Notebook display expression: without it `columns` is never used and the
# reordered table is never shown.
poke_df_ohe[columns].iloc[4:10]


Name Generation Gen_Label Gen 1 Gen 2 Gen 3 Gen 4 Gen 5 Gen 6 Legendary Lgnd_Label Legendary_False Legendary_True 4 Octillery Gen 2 1 0.0 1.0 0.0 0.0 0.0 0.0 False 0 1.0 0.0 5 Helioptile Gen 6 5 0.0 0.0 0.0 0.0 0.0 1.0 False 0 1.0 0.0 6 Dialga Gen 4 3 0.0 0.0 0.0 1.0 0.0 0.0 True 1 0.0 1.0 7 DeoxysDefense Forme Gen 3 2 0.0 0.0 1.0 0.0 0.0 0.0 True 1 0.0 1.0 8 Rapidash Gen 1 0 1.0 0.0 0.0 0.0 0.0 0.0 False 0 1.0 0.0 9 Swanna Gen 5 4 0.0 0.0 0.0 0.0 1.0 0.0 False 0 1.0 0.0

可以看到,使用 one-hot 编码时,特征有多少个不同的取值,就会生成多少个新列:样本属于某个取值时,对应的列标记为 1,其余列均为 0。需要理解的一点是,编码器经过拟合(fit)之后已经记住了取值与列的对应关系,因此遇到新的数据时无需重新拟合,直接复用同一个编码器进行转换(transform)即可。(这对于让训练集和测试集使用完全一致的数据预处理极其有用)


# Small hand-built frame of two records, used below to exercise the encoders
# that were already fitted on poke_df.
new_poke_df = pd.DataFrame({
    'Name': ['PikaZoom', 'CharMyToast'],
    'Generation': ['Gen 3', 'Gen 4'],
    'Legendary': [True, False],
})


Name Generation Legendary 0 PikaZoom Gen 3 True 1 CharMyToast Gen 4 False


# Reuse the already-fitted label encoders (transform only, no refit) so the
# new rows receive the same integer codes as the original data.
new_gen_labels = gen_le.transform(new_poke_df['Generation'])
new_leg_labels = leg_le.transform(new_poke_df['Legendary'])

# Store the codes next to the raw columns.
new_poke_df['Gen_Label'] = new_gen_labels
new_poke_df['Lgnd_Label'] = new_leg_labels

# Notebook display expression: raw columns with their encoded counterparts.
new_poke_df.loc[:, ['Name', 'Generation', 'Gen_Label', 'Legendary', 'Lgnd_Label']]

Name Generation Gen_Label Legendary Lgnd_Label 0 PikaZoom Gen 3 2 True 1 1 CharMyToast Gen 4 3 False 0


# One-hot encode the new rows with the encoders fitted earlier (transform
# only), reusing the indicator column names built at fit time.
new_gen_feature_arr = gen_ohe.transform(new_poke_df[['Gen_Label']]).toarray()
new_gen_features = pd.DataFrame(new_gen_feature_arr, columns=gen_feature_labels)

new_leg_feature_arr = leg_ohe.transform(new_poke_df[['Lgnd_Label']]).toarray()
new_leg_features = pd.DataFrame(new_leg_feature_arr, columns=leg_feature_labels)

# Put the new rows and their indicator columns side by side.
new_poke_ohe = pd.concat([new_poke_df, new_gen_features, new_leg_features], axis=1)

# Display order: every raw/label column followed directly by the indicator
# columns derived from it.
columns = (
    ['Name', 'Generation', 'Gen_Label'] + gen_feature_labels
    + ['Legendary', 'Lgnd_Label'] + leg_feature_labels
)

# Notebook display expression: without it `columns` is never used and the
# reordered table is never shown.
new_poke_ohe[columns]


Name Generation Gen_Label Gen 1 Gen 2 Gen 3 Gen 4 Gen 5 Gen 6 Legendary Lgnd_Label Legendary_False Legendary_True 0 PikaZoom Gen 3 2 0.0 0.0 1.0 0.0 0.0 0.0 True 1 0.0 1.0 1 CharMyToast Gen 4 3 0.0 0.0 0.0 1.0 0.0 0.0 False 0 1.0 0.0

不过这个代码量也太大了,life is short,不能这样子。让我们试试pandas自带的get_dummies函数。

# One indicator column per distinct generation value, straight from pandas.
gen_onehot_features = pd.get_dummies(poke_df['Generation'])

# Notebook display expression: the raw columns next to the indicator
# columns, rows at positions 4..9.
pd.concat(
    [poke_df[['Name', 'Generation']], gen_onehot_features],
    axis=1,
).iloc[4:10]

Name Generation Gen 1 Gen 2 Gen 3 Gen 4 Gen 5 Gen 6 4 Octillery Gen 2 0 1 0 0 0 0 5 Helioptile Gen 6 0 0 0 0 0 1 6 Dialga Gen 4 0 0 0 1 0 0 7 DeoxysDefense Forme Gen 3 0 0 1 0 0 0 8 Rapidash Gen 1 1 0 0 0 0 0 9 Swanna Gen 5 0 0 0 0 1 0




## Dummy Coding Scheme
# Same as one-hot, except the first category's column is dropped
# (drop_first=True); rows of that category are all zeros.
gen_dummy_features = pd.get_dummies(poke_df['Generation'], drop_first=True)

# Notebook display expression: rows at positions 4..9.
pd.concat(
    [poke_df[['Name', 'Generation']], gen_dummy_features],
    axis=1,
).iloc[4:10]

Name Generation Gen 2 Gen 3 Gen 4 Gen 5 Gen 6 4 Octillery Gen 2 1 0 0 0 0 5 Helioptile Gen 6 0 0 0 0 1 6 Dialga Gen 4 0 0 1 0 0 7 DeoxysDefense Forme Gen 3 0 1 0 0 0 8 Rapidash Gen 1 0 0 0 0 0 9 Swanna Gen 5 0 0 0 1 0

# Effect Coding Scheme
gen_onehot_features = pd.get_dummies(poke_df['Generation'])

# Drop the last category's indicator column and take an independent float
# copy. Writing into the bare `.iloc` slice would modify a slice of
# gen_onehot_features (SettingWithCopy), and the indicator columns may be
# boolean (the pandas 2.x default), which cannot hold -1.0.
gen_effect_features = gen_onehot_features.iloc[:, :-1].astype(float)

# Rows whose remaining indicators are all zero belong to the dropped
# category; effect coding marks them with -1 in every column.
gen_effect_features.loc[(gen_effect_features == 0).all(axis=1)] = -1.

# Notebook display expression: rows at positions 4..9.
pd.concat([poke_df[['Name', 'Generation']], gen_effect_features], axis=1).iloc[4:10]

Name Generation Gen 1 Gen 2 Gen 3 Gen 4 Gen 5 4 Octillery Gen 2 0.0 1.0 0.0 0.0 0.0 5 Helioptile Gen 6 -1.0 -1.0 -1.0 -1.0 -1.0 6 Dialga Gen 4 0.0 0.0 0.0 1.0 0.0 7 DeoxysDefense Forme Gen 3 0.0 0.0 1.0 0.0 0.0 8 Rapidash Gen 1 1.0 0.0 0.0 0.0 0.0 9 Swanna Gen 5 0.0 0.0 0.0 0.0 1.0

# Feature Hashing scheme
# Distinct genre names; np.unique flattens the one-column selection.
unique_genres = np.unique(vg_df[['Genre']].to_numpy())

print("Total game genres:", unique_genres.size)


Total game genres: 12

['Action' 'Adventure' 'Fighting' 'Misc' 'Platform' 'Puzzle' 'Racing'

'Role-Playing' 'Shooter' 'Simulation' 'Sports' 'Strategy']

from sklearn.feature_extraction import FeatureHasher

# Hash each genre into a fixed-width vector of 6 features.
fh = FeatureHasher(n_features=6, input_type='string')

# With input_type='string' every sample must itself be an iterable of
# strings; newer scikit-learn releases reject a bare str sample. Passing
# each genre as a list of its characters is exactly what iterating the raw
# string did, so the hashed values are unchanged.
# NOTE(review): to hash the whole genre name as a single token instead, pass
# [[genre] for genre in vg_df['Genre']] - that changes the resulting values.
hashed_features = fh.fit_transform([list(genre) for genre in vg_df['Genre']])

# Densify the sparse result so it can be wrapped in a DataFrame.
hashed_features = hashed_features.toarray()

# Notebook display expression: rows at positions 1..6 with hashed columns.
pd.concat([vg_df[['Name', 'Genre']], pd.DataFrame(hashed_features)], axis=1).iloc[1:7]

Name Genre 0 1 2 3 4 5 1 Super Mario Bros. Platform 0.0 2.0 2.0 -1.0 1.0 0.0 2 Mario Kart Wii Racing -1.0 0.0 0.0 0.0 0.0 -1.0 3 Wii Sports Resort Sports -2.0 2.0 0.0 -2.0 0.0 0.0 4 Pokemon Red/Pokemon Blue Role-Playing -1.0 1.0 2.0 0.0 1.0 -1.0 5 Tetris Puzzle 0.0 1.0 1.0 -2.0 1.0 -1.0 6 New Super Mario Bros. Platform 0.0 2.0 2.0 -1.0 1.0 0.0






# Root directory of the chapter's notebook material (trailing slash matters).
# This section uses read_csv, %>% and the dplyr verbs, so the tidyverse
# packages must already be attached.
file_path = "G:/Py/practical-machine-learning-with-python-master/notebooks/Ch04_Feature_Engineering_and_Selection/"

# Load the video game sales table.
read_csv(paste0(file_path,'datasets/vgsales.csv')) -> vg_df

# Label-encode Genre: number the distinct genres in order of first
# appearance, then join the numbers back onto the full table.
vg_df %>%
  select(Genre) %>%
  distinct %>%
  mutate(GenreLabel = 1:n()) %>%
  right_join(vg_df) %>%
  select('Name', 'Platform', 'Year', 'Genre', 'GenreLabel') %>%
  # The pipeline previously ended on a dangling %>% (a syntax error);
  # finish it by showing the first seven rows.
  slice(1:7)


Joining, by = "Genre"

NamePlatformYearGenreGenreLabelWii Sports Wii 2006 Sports 1 Super Mario Bros. NES 1985 Platform 2 Mario Kart Wii Wii 2008 Racing 3 Wii Sports Resort Wii 2009 Sports 1 Pokemon Red/Pokemon BlueGB 1996 Role-Playing 4 Tetris GB 1989 Puzzle 5 New Super Mario Bros. DS 2006 Platform 2


# Load the Pokemon table.
read_csv(paste0(file_path,'datasets/Pokemon.csv')) -> poke_df

# Ordinal-encode Generation: take the last character of the generation name
# and convert it to an integer so the label is numeric rather than text.
poke_df %>%
  mutate(GenerationLabel = as.integer(str_sub(Generation, start = -1, end = -1))) %>%
  select('Name', 'Generation', 'GenerationLabel') %>%
  # The pipeline previously ended on a dangling %>% (a syntax error);
  # finish it by showing rows 4 to 10.
  slice(4:10)


NameGenerationGenerationLabelVenusaurMega Venusaur Gen 1 1 Charmander Gen 1 1 Charmeleon Gen 1 1 Charizard Gen 1 1 CharizardMega Charizard XGen 1 1 CharizardMega Charizard YGen 1 1 Squirtle Gen 1 1



# Build a one-hot encoder from the iris data frame (onehot package).
encoder <- onehot(iris)

# Apply the encoder to the same data and show the first rows.
head(predict(encoder, iris))

Sepal.LengthSepal.WidthPetal.LengthPetal.WidthSpecies=setosaSpecies=versicolorSpecies=virginica5. 0 0 0 0 0 0 0 0 0 0 0 0


# One-hot encode Species with base R: "- 1" removes the intercept, so every
# level gets its own indicator column. Show the first rows.
head(model.matrix(~iris$Species - 1))


关于在 R 中实现 effect coding 和 feature hashing,目前我还没有找到比较成熟的方法。不过 feature hashing 用途较为广泛,GitHub 上已经有相关的项目可以参考(例如 FeatureHashing 包;原文此处的链接已缺失)。







