```
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc, confusion_matrix
import statsmodels.api as sm
from sklearn.metrics import precision_score, roc_curve
# Apply the ggplot look first: a style sheet may overwrite rcParams, so the
# font settings below have to come after this call.
plt.style.use('ggplot')
# Matplotlib's default sans-serif font has no CJK glyphs, so the Chinese column
# names that end up as axis tick labels (e.g. in the feature-score bar plot)
# are drawn as empty boxes. Put fonts that cover Chinese at the front of the
# sans-serif fallback list (Windows, macOS and Linux candidates); the existing
# entries are kept so Latin text still renders if none of them is installed.
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = [
    'SimHei', 'Microsoft YaHei', 'PingFang SC', 'Noto Sans CJK SC',
    'WenQuanYi Micro Hei', 'Arial Unicode MS',
] + list(plt.rcParams['font.sans-serif'])
# Many CJK fonts lack the Unicode minus sign (U+2212); use the ASCII hyphen so
# negative tick labels do not turn into boxes as well.
plt.rcParams['axes.unicode_minus'] = False
# Load the dataset; the CSV file is read with GBK encoding.
file_path = r'C:\Users\29930\Desktop\插补数据.csv'
data = pd.read_csv(file_path, encoding='GBK')
# Encode the sex column numerically: '男' (male) -> 0, '女' (female) -> 1.
# Any other value becomes NaN and is handled by the fill step below.
data['性别'] = data['性别'].map({'男': 0, '女': 1})
arr = data.to_numpy()  # NumPy array form of the table (created before the fill step)
# Preview the first rows.
print(data.head())
# Report the number of missing values per column.
print(data.isnull().sum())
# Fill missing values, if there are any. Forward-fill alone cannot repair gaps
# in the first row(s), and scikit-learn's f_classif / LogisticRegression reject
# input that still contains NaN, so back-fill whatever is left afterwards.
data.ffill(inplace=True)
data.bfill(inplace=True)
# Separate the features from the target; the label column is selected by name.
X = data.drop(columns=['慢阻肺'])
y = data['慢阻肺']
# Standardise every feature column.
# NOTE(review): the scaler is fitted on all rows, including the ones that are
# later held out as the test set.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Wrap the scaled array in a DataFrame again so the column names stay available.
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
# Univariate feature scoring with f_classif; k='all' keeps every feature so
# that all scores can be inspected before a cut-off is chosen.
selector = SelectKBest(f_classif, k='all')
fit = selector.fit(X_scaled_df, y)
# Tabulate one score per feature and order the table from highest to lowest.
feature_scores = pd.DataFrame({'Feature': X_scaled_df.columns, 'Score': fit.scores_})
feature_scores = feature_scores.sort_values('Score', ascending=False)
print(feature_scores)
# Horizontal bar chart of the scores, one bar per feature.
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_scores, x="Score", y="Feature")
plt.title('Feature Importance Scores')
plt.show()
# Keep the features whose score is at or above the 75th percentile of all
# scores, then restrict the scaled feature table to those columns.
score_cutoff = feature_scores['Score'].quantile(0.75)
above_cutoff = feature_scores['Score'] >= score_cutoff
selected_features = feature_scores.loc[above_cutoff, 'Feature'].tolist()
X_selected = X_scaled_df[selected_features]
# Hold out 20% of the rows as a test set; the seed is fixed so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42
)
# Logistic regression using the liblinear solver.
logreg = LogisticRegression(solver='liblinear')
# Hyper-parameter tuning: only the regularisation strength C is searched,
# scored by accuracy under 5-fold cross-validation on the training set.
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(estimator=logreg, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)
# Report the winning parameter set and its cross-validated score.
print("Best parameters:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)
best_logreg = grid_search.best_estimator_
# Predict on the held-out rows with the best model.
y_pred = best_logreg.predict(X_test)
# Performance metrics on the test set.
conf_mat = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")
print(report)
# Confusion matrix drawn as an annotated heatmap (integer cell labels).
plt.figure(figsize=(8, 6))
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix Heatmap')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()
# ROC curve computed from the model's decision-function scores on the test set.
test_scores = best_logreg.decision_function(X_test)
fpr, tpr, _ = roc_curve(y_test, test_scores)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC Curve (area = {roc_auc:.2f})')
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()
# Add an intercept column to the selected features.
X_const = sm.add_constant(X_selected)
# Fit a statsmodels logistic regression (Logit, not OLS) on all rows and print
# its summary table.
logit_spec = sm.Logit(y, X_const)
model = logit_spec.fit()
print(model.summary())
# Build the equation of the tuned scikit-learn model.
# The intercept must come FIRST so that it lines up with the leading 'const'
# entry in the name list; appending it last would pair every coefficient with
# the wrong feature name.
coefs = [best_logreg.intercept_[0]] + list(best_logreg.coef_[0])
features_with_intercept = ['const'] + selected_features
# Each term carries its own explicit sign, e.g. "+0.1234*age" or "-0.5000*bmi".
# The coefficients apply to the standardised feature values, because the model
# was trained on columns taken from X_scaled_df.
formula_parts = [
    f"{coef:+.4f}*{feat}"
    for coef, feat in zip(coefs, features_with_intercept)
]
# Join the terms without separators and drop a leading '+': the print statement
# that follows pads every '+' and '-' with spaces, so adding " + " here would
# produce doubled operators such as "+ -".
final_formula = "".join(formula_parts).lstrip('+')
print("\nFinal Early Screening Formula:")
print(final_formula.replace('+', ' + ').replace('-', ' - '))
```
输出的特征重要性图中，变量标签无法显示中文。
最新发布