目录
一、导入数据
# Load the heart-disease dataset from a local CSV file into a DataFrame.
import pandas as pd
data = pd.read_csv(filepath_or_buffer=r"C:\Users\Terry\Desktop\heart.csv")
# Bare expression: displays the DataFrame when it ends a notebook cell.
data
数据集共有 1025 行、14 列:每行表示一位病人,其中 13 列为特征,1 列为标签(是否患心脏病)。
%%html
<style>
/* Render every table as an inline-block element. Presumably this keeps
   notebook tables from being stretched or centered; confirm against the
   rendered output. */
table {
display: inline-block
}
</style>
看一下字段名的含义:
二、数据预处理 探索性数据分析
查看缺失值:没有缺失值
# Missing-value check: count the NaN entries in each column.
data.isnull().sum()
import missingno as msno
import matplotlib.pyplot as plt
# Choose the font matplotlib uses for sans-serif text.
plt.rcParams.update({'font.sans-serif': ['Roboto']})
# Draw missingno's matrix view of the data, with labels enabled.
msno.matrix(data, labels=True)
# Heatmap plotting
def enhanced_corr_heatmap(data):
    """Plot a heatmap of the correlation matrix of ``data``, lower triangle only.

    The matrix comes from ``data.corr()`` called with no arguments (for a
    pandas DataFrame this is the Pearson correlation). The strict upper
    triangle is hidden; the diagonal and the lower triangle are drawn with
    their values annotated to two decimals.

    Args:
        data: Object exposing ``.corr()``, e.g. a pandas DataFrame.

    Returns:
        None. The figure is rendered through ``plt.show()``.
    """
    # Local imports keep the function self-contained inside a notebook cell.
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Figure size and font settings; scale both figsize values together to
    # enlarge the plot while keeping its proportions.
    plt.figure(figsize=(10, 8))
    plt.rcParams['font.sans-serif'] = ['Roboto']
    plt.rcParams['axes.unicode_minus'] = False  # draw minus signs as ASCII hyphens

    data_corr = data.corr()

    # Boolean mask that is True strictly above the diagonal (k=1), so only the
    # lower triangle and the diagonal remain visible. To keep the upper
    # triangle instead, use np.tril(..., k=-1).
    mask = np.triu(np.ones_like(data_corr, dtype=bool), k=1)

    sns.heatmap(data_corr, annot=True, fmt=".2f", cmap='Blues', mask=mask)
    plt.show()
# Draw the correlation heatmap for the loaded dataset.
enhanced_corr_heatmap(data)
三、特征工程
# NOTE: df is an alias of data (not a copy), so the changes below are visible
# through both names.
df = data

# Decode the integer-coded categorical features into descriptive labels.
# Each column is rebuilt and assigned as a whole. The previous chained form
# (df[col][mask] = value) raises SettingWithCopyWarning and never updates df
# once pandas Copy-on-Write is enabled; it also wrote strings into an integer
# column one element at a time.
# NOTE(review): the two 'fbs' labels are spelled inconsistently
# ('120mg/ml' vs '120mg ml'); kept as-is because later code may match on them.
category_labels = {
    'sex': {0: 'female', 1: 'male'},
    'cp': {
        0: 'typical angina',
        1: 'atypical angina',
        2: 'non-anginal pain',
        3: 'asymptomatic',
    },
    'fbs': {0: 'lower than 120mg/ml', 1: 'greater than 120mg ml'},
    'restecg': {0: 'normal', 1: 'ST-T wave abnormality'},
}
for column, labels in category_labels.items():
    # Codes with no label here are passed through unchanged, so statements
    # further down can still match the remaining codes of that column.
    df[column] = df[column].map(lambda code, labels=labels: labels.get(code, code))
df['restecg'][df[&