数据集见GitHub链接:https://github.com/ChuanTaoLai/Frequency-Encoding-And-Label-Encoding
标签编码:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
data1 = pd.read_excel(r'D:\0文献整理\网络入侵检测\KDD99\KDDTrain.xlsx')
data2 = pd.read_excel(r'D:\0文献整理\网络入侵检测\KDD99\KDDTest_without_unkown.xlsx')
'''标签编码'''
label_encoder = LabelEncoder()
df1 = pd.DataFrame()
df2 = pd.DataFrame()
df1['Attack_Types'] = label_encoder.fit_transform(data1['Attack_Types'])
df2['Attack_Types'] = label_encoder.transform(data2['Attack_Types'])
df1.to_excel('KDDTrain_label_encoded.xlsx', index=False)
df2.to_excel('KDDTest_label_encoded.xlsx', index=False)
频率编码:
import pandas as pd
data1 = pd.read_excel(r'D:\0文献整理\网络入侵检测\KDD99\KDDTrain.xlsx')
data2 = pd.read_excel(r'D:\0文献整理\网络入侵检测\KDD99\KDDTest_without_unkown.xlsx')
df1 = data1[['protocol_type', 'service', 'flag']].copy()
df2 = data2[['protocol_type', 'service', 'flag']].copy()
'''频率编码'''
for col in df1.columns:
df1[col + '_frequency_encoded'] = df1[col].map(df1[col].value_counts(normalize=True))
for col in df2.columns:
df2[col + '_frequency_encoded'] = df2[col].map(df2[col].value_counts(normalize=True))
df1.to_excel('KDDTrain_frequency_encoded.xlsx', index=False)
df2.to_excel('KDDTest_frequency_encoded.xlsx', index=False)