知识点见示例代码
- 字典的简单介绍
- 标签编码
- 连续特征的处理:归一化和标准化
至此,常见的预处理方式都说完了
作业:对心脏病数据集的特征用上述知识完成,一次性用所有的处理方式完成预处理,尝试手动完成,多敲几遍代码。
# Dictionary basics: create a dict and access a value by key.
# NOTE(review): renamed from `dict`, which shadowed the built-in type.
person = {"name": "豆包", "sex": "male", "age": "19"}  # create a dictionary
person["name"]  # look up a value by its key
import pandas as pd

data = pd.read_csv(r"heart.csv")

# Columns treated as discrete/categorical in this dataset.
discrete_features = ["sex", "cp", "fbs", "restecg", "exang", "slope", "ca", "thal"]

# Every remaining column is treated as continuous.
# (Replaces the original append loop with an idiomatic comprehension.)
continuous_features = [col for col in data.columns if col not in discrete_features]
continuous_features
# Keep an untouched copy of the raw frame so the newly created dummy
# columns can be identified by set difference afterwards.
data2 = pd.read_csv(r"heart.csv")

# Split of the discrete variables (chosen with AI assistance):
# dd -> one-hot encode, db -> label encode; the rest are already binary.
dd = ["cp", "restecg", "ca", "thal"]
db = ["slope"]

# One-hot encode; drop_first removes one level per variable so the
# dummies are not perfectly collinear.
data = pd.get_dummies(data, columns=dd, drop_first=True)
data

# Columns present now but absent from the raw frame are the new dummies.
# (Comprehension replaces the original append loop.)
list_finall = [col for col in data.columns if col not in data2.columns]
list_finall

# get_dummies emits booleans; cast all new columns to 0/1 ints in one
# vectorized assignment instead of a per-column loop.
data[list_finall] = data[list_finall].astype(int)
data
# Label-encode `slope`.
data["slope"].value_counts()

# NOTE(review): two fixes here — the variable was named `dict` (shadowing
# the built-in), and the original mapped ints to strings ({0:"0", ...}),
# which made the column object-dtype and would break the numeric scaling
# steps below. An integer identity mapping keeps the column numeric; a
# real ordinal encoding would customize the target values here.
slope_map = {0: 0, 1: 1, 2: 2}
slope_map

data["slope"] = data["slope"].map(slope_map)
data["slope"]
# Normalize `age` to the [0, 1] range with sklearn's MinMaxScaler.
from sklearn.preprocessing import MinMaxScaler

mm_scaler = MinMaxScaler()
# fit_transform expects a 2-D input, hence the double brackets.
data['age'] = mm_scaler.fit_transform(data[['age']])
data['age'].head()
# Hand-rolled min-max normalization (same contract as MinMaxScaler on 1-D data).
def manual_normalize(data):
    """Min-max scale *data* (a pandas Series or array) into [0, 1].

    Returns all zeros when the input is constant (max == min) instead of
    dividing by zero, which would yield NaN/inf.
    """
    min_val = data.min()
    max_val = data.max()
    span = max_val - min_val
    if span == 0:
        # Constant input: map every value to 0 rather than NaN.
        return data - min_val
    return (data - min_val) / span
# Re-normalize `age` with the hand-rolled function. NOTE(review): `age`
# was already min-max scaled by MinMaxScaler above, so this pass should
# leave the values unchanged — kept here as practice.
data['age'] = manual_normalize(data['age'])
data['age'].head()
# Standardization (z-score) with sklearn; overwrites the normalized `age`.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# fit_transform needs a 2-D input, hence the double brackets.
data['age'] = scaler.fit_transform(data[['age']])
data['age'].head()