知识点回顾:
- 推断簇含义的2个思路:先选特征和后选特征
- 借助可视化图形,并利用 AI 辅助定义各簇的含义
- 科研逻辑闭环:通过精度判断特征工程价值
作业:参考示例代码对心脏病数据集采取类似操作,并且评估特征工程后模型效果有无提升。
# 先运行之前预处理好的代码
import warnings

import matplotlib.pyplot as plt  # plotting
import numpy as np  # numerical computing, efficient array operations
import pandas as pd  # tabular data processing and analysis
import seaborn as sns  # statistical plots built on top of matplotlib

# Silence all warnings from this point on (including any raised while the
# scikit-learn modules below are imported).
warnings.filterwarnings("ignore")

from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Configure matplotlib so Chinese labels render: SimHei is a CJK-capable font
# commonly available on Windows, and disabling the unicode minus keeps
# negative tick labels displayable with that font.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Load the heart-disease table and separate the label column from the features.
data = pd.read_csv('heart.csv')
y = data['target']  # label
X = data.drop(columns=['target'])  # every remaining column is a feature
# Baseline hold-out split (before feature engineering): 80/20 with a fixed
# seed so the run is reproducible.
X_train_before, X_test_before, y_train_before, y_test_before = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate the baseline model (before feature engineering).
# The raw features are unscaled, so the solver may need more than the default
# 100 iterations. Warnings are silenced at the top of the file, which means a
# non-converged baseline would go unnoticed and skew the comparison, hence the
# raised iteration cap.
model_before = LogisticRegression(max_iter=1000)
model_before.fit(X_train_before, y_train_before)
y_pred_before = model_before.predict(X_test_before)

# Test-set metrics for the baseline.
accuracy_before = accuracy_score(y_test_before, y_pred_before)
precision_before = precision_score(y_test_before, y_pred_before)
recall_before = recall_score(y_test_before, y_pred_before)
f1_before = f1_score(y_test_before, y_pred_before)

print("特征工程前模型评估指标:")
print(f"准确率: {accuracy_before:.4f}")
print(f"精确率: {precision_before:.4f}")
print(f"召回率: {recall_before:.4f}")
print(f"F1分数: {f1_before:.4f}")
# Feature engineering
# 1. New feature: the age * cholesterol interaction, appended as the last
#    column. It is computed row by row, so building it before the split does
#    not leak any information between rows.
X_engineered = X.assign(age_x_chol=X['age'] * X['chol'])

# Hold-out split (after feature engineering). Same test_size and random_state
# as the baseline split above, so both models are scored on the same rows.
X_train_fe, X_test_fe, y_train_after, y_test_after = train_test_split(
    X_engineered, y, test_size=0.2, random_state=42
)

# 2. Standardization. The scaler is fitted on the training rows only and then
#    applied to the test rows, so test-set statistics never reach training.
#    The interaction column is scaled together with the other features;
#    left raw, it would be orders of magnitude larger than the rest.
scaler = StandardScaler()
X_train_after = scaler.fit_transform(X_train_fe)
X_test_after = scaler.transform(X_test_fe)

# Full engineered matrix in the scaled space (same shape as before: all
# original columns plus the interaction), kept for any later use of X_scaled.
X_scaled = scaler.transform(X_engineered)

# Train and evaluate the model on the engineered features.
# A generous iteration cap so solver convergence is not a confounding factor.
model_after = LogisticRegression(max_iter=1000)
model_after.fit(X_train_after, y_train_after)
y_pred_after = model_after.predict(X_test_after)

# Test-set metrics after feature engineering.
accuracy_after = accuracy_score(y_test_after, y_pred_after)
precision_after = precision_score(y_test_after, y_pred_after)
recall_after = recall_score(y_test_after, y_pred_after)
f1_after = f1_score(y_test_after, y_pred_after)

print("\n特征工程后模型评估指标:")
print(f"准确率: {accuracy_after:.4f}")
print(f"精确率: {precision_after:.4f}")
print(f"召回率: {recall_after:.4f}")
print(f"F1分数: {f1_after:.4f}")
# Compare the two runs: each delta is (after - before), so a positive value
# means the engineered-feature model scored higher on that metric.
improvement_accuracy, improvement_precision = (
    accuracy_after - accuracy_before,
    precision_after - precision_before,
)
improvement_recall, improvement_f1 = (
    recall_after - recall_before,
    f1_after - f1_before,
)

# Emit the report in a single call; one item per output line.
print(
    "\n特征工程后模型效果提升情况:",
    f"准确率提升: {improvement_accuracy:.4f}",
    f"精确率提升: {improvement_precision:.4f}",
    f"召回率提升: {improvement_recall:.4f}",
    f"F1分数提升: {improvement_f1:.4f}",
    sep="\n",
)