# @浙大疏锦行
# DAY 22 复习日
# 作业:
# 自行学习参考如何使用kaggle平台,写下使用注意点,并对下述比赛提交代码
# 先运行之前预处理好的代码
import pandas as pd
import pandas as pd #用于数据处理和分析,可处理表格数据。
import numpy as np #用于数值计算,提供了高效的数组操作。
import matplotlib.pyplot as plt #用于绘制各种类型的图表
import seaborn as sns #基于matplotlib的高级绘图库,能绘制更美观的统计图形。
import warnings
warnings.filterwarnings("ignore")
# Configure Matplotlib so CJK axis labels and the minus sign render correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']  # common Windows CJK font
plt.rcParams['axes.unicode_minus'] = False    # show '-' properly with CJK fonts

# --- Load and preprocess the Titanic training data ---
data = pd.read_csv(r'kaggle\train.csv')  # raw training set
# Drop identifier / high-cardinality columns that carry little signal as-is.
data = data.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])
# Re-encode Pclass so a LARGER number means a better cabin class.
Pclass_mapping = {
    1: 3,
    2: 2,
    3: 1
}
data['Pclass'] = data['Pclass'].map(Pclass_mapping)
# drop_first=True turns Sex into a single 0/1 column (1 = male).
data["Sex"] = pd.get_dummies(data["Sex"], dtype=int, drop_first=True)
data = data.rename(columns={'Sex': 'Male'})
# One-hot encode Embarked into Embarked_C / Embarked_Q / Embarked_S.
# NOTE(review): this runs BEFORE imputation, so the 2 rows with missing
# Embarked become all-zero dummy rows instead of being mode-filled — confirm
# this is intended.
data = pd.concat(
    [data.drop("Embarked", axis=1),
     pd.get_dummies(data["Embarked"], prefix="Embarked", dtype=int, drop_first=False)],
    axis=1,
)
continuous_features = data.select_dtypes(include=['float64', 'int64']).columns.tolist()
discrete_features = data.select_dtypes(exclude=['float64', 'int64']).columns.tolist()
# Impute discrete features with the mode.
# FIX: use assignment instead of chained `fillna(..., inplace=True)` — the
# chained form is deprecated and a silent no-op under pandas copy-on-write.
for feature in discrete_features:
    if data[feature].isnull().sum() > 0:
        data[feature] = data[feature].fillna(data[feature].mode()[0])
# Impute continuous features with the median (robust to Fare/Age outliers).
for feature in continuous_features:
    if data[feature].isnull().sum() > 0:
        data[feature] = data[feature].fillna(data[feature].median())
print(data.info())
print(data.head())
# --- Correlation heatmap over the numeric feature columns ---
plt.rcParams['figure.dpi'] = 300  # render sharper figures
correlation_matrix = data[continuous_features].corr()
plt.figure(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,        # print the coefficient inside each cell
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
)
plt.title('Correlation Heatmap of Continuous Features')
plt.show()
# --- Split features/target and hold out an evaluation set ---
from sklearn.model_selection import train_test_split

X = data.drop(['Survived'], axis=1)  # every column except the target
y = data['Survived']                 # binary survival label
# 80/20 split with a fixed seed so later runs are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# --- Classifier and metric imports for the model bake-off below ---
# Support vector classifier
from sklearn.svm import SVC
# K-nearest-neighbours classifier
from sklearn.neighbors import KNeighborsClassifier
# Logistic regression (used here as a classifier)
from sklearn.linear_model import LogisticRegression
# XGBoost classifier
import xgboost as xgb
# Random forest classifier
from sklearn.ensemble import RandomForestClassifier
# CatBoost classifier
from catboost import CatBoostClassifier
# Decision tree classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
import warnings  # for silencing warning output
warnings.filterwarnings("ignore")  # suppress all warnings (repeats the filter set at the top of the file)
def _fit_and_score(model):
    """Fit `model` on the global training split and score it on the test split.

    Returns (fitted_model, predictions, accuracy, weighted precision,
    weighted recall, weighted F1) — exactly the values the original
    per-model copy-paste sections produced.
    """
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    return (
        model,
        pred,
        accuracy_score(y_test, pred),
        precision_score(y_test, pred, average='weighted'),
        recall_score(y_test, pred, average='weighted'),
        f1_score(y_test, pred, average='weighted'),
    )

# FIX: the seven near-identical train/predict/score sections are collapsed
# into one helper. Every module-level name the original created is preserved
# so the comparison code below keeps working unchanged.
svc_model, y_pred, accuracy_svc, precision_svc, recall_svc, f1_svc = \
    _fit_and_score(SVC())
# n_neighbors=5 is the sklearn default; kept explicit for easy tuning.
knn_model, y_pred, accuracy_knn, precision_knn, recall_knn, f1_knn = \
    _fit_and_score(KNeighborsClassifier(n_neighbors=5))
logistic_model, y_pred_logistic, accuracy_logistic, precision_logistic, recall_logistic, f1_logistic = \
    _fit_and_score(LogisticRegression())
xgb_model, y_pred_xgb, accuracy_xgb, precision_xgb, recall_xgb, f1_xgb = \
    _fit_and_score(xgb.XGBClassifier())
rf_model, y_pred_rf, accuracy_rf, precision_rf, recall_rf, f1_rf = \
    _fit_and_score(RandomForestClassifier())
cat_model, y_pred_cat, accuracy_cat, precision_cat, recall_cat, f1_cat = \
    _fit_and_score(CatBoostClassifier(verbose=0))
dt_model, y_pred_dt, accuracy_dt, precision_dt, recall_dt, f1_dt = \
    _fit_and_score(DecisionTreeClassifier())
# Model names aligned index-for-index with the metric lists below.
models = ['支持向量分类', 'K近邻分类', '逻辑回归', 'XGBoost分类', '随机森林分类', 'CatBoost分类', '决策树分类']
accuracy_values = [accuracy_svc, accuracy_knn, accuracy_logistic, accuracy_xgb, accuracy_rf, accuracy_cat, accuracy_dt]
precision_values = [precision_svc, precision_knn, precision_logistic, precision_xgb, precision_rf, precision_cat, precision_dt]
recall_values = [recall_svc, recall_knn, recall_logistic, recall_xgb, recall_rf, recall_cat, recall_dt]
f1_values = [f1_svc, f1_knn, f1_logistic, f1_xgb, f1_rf, f1_cat, f1_dt]


def _argmax(values):
    """Index of the first maximum — same tie-breaking as values.index(max(values))."""
    return max(range(len(values)), key=values.__getitem__)


best_accuracy_model = models[_argmax(accuracy_values)]
best_precision_model = models[_argmax(precision_values)]
best_recall_model = models[_argmax(recall_values)]
best_f1_model = models[_argmax(f1_values)]
print("准确率最高的模型:", best_accuracy_model)
print("精确率最高的模型:", best_precision_model)
print("召回率最高的模型:", best_recall_model)
print("F1 分数最高的模型:", best_f1_model)
# Weighted overall score: F1 weighted 0.4, the other three metrics 0.2 each
# (same expression and evaluation order as before, so floats match exactly).
scores = [
    0.4 * f1 + 0.2 * acc + 0.2 * prec + 0.2 * rec
    for acc, prec, rec, f1 in zip(accuracy_values, precision_values, recall_values, f1_values)
]
best_overall_model = models[_argmax(scores)]
print("综合评估最好的模型:", best_overall_model)
# Tabulate every metric per model for a side-by-side comparison.
evaluation_df = pd.DataFrame(
    {
        '模型名称': models,
        '准确率': accuracy_values,
        '精确率': precision_values,
        '召回率': recall_values,
        'F1 分数': f1_values,
        '综合得分': scores,
    }
)
print(evaluation_df)
# --- 1. Baseline: default-parameter CatBoost (train -> test) ---
# No validation set needed here: this only establishes a reference score.
print("--- 1. 默认参数 CatBoost (训练集 -> 测试集) ---")
import time
start_time = time.time()  # wall-clock start of fit + predict
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

cat_model = CatBoostClassifier(random_state=42, verbose=0)
cat_model.fit(X_train, y_train)
y_pred_cat = cat_model.predict(X_test)
end_time = time.time()
# FIX: the elapsed time was measured but never reported (dead variables).
print(f"训练与预测耗时: {end_time - start_time:.4f} 秒")
accuracy = accuracy_score(y_test, y_pred_cat)
precision = precision_score(y_test, y_pred_cat, average='weighted')
recall = recall_score(y_test, y_pred_cat, average='weighted')
f1 = f1_score(y_test, y_pred_cat, average='weighted')
print(f"准确率: {accuracy}")
print(f"精确率: {precision}")
print(f"召回率: {recall}")
print(f"F1 分数: {f1}")
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import matplotlib.pyplot as plt
import seaborn as sns
# Standardise the features — distance-based clustering needs a common scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Scan k = 2..10 and record four cluster-quality diagnostics per k.
k_range = range(2, 11)
inertia_values = []
silhouette_scores = []
ch_scores = []
db_scores = []
for k in k_range:
    km = KMeans(n_clusters=k, random_state=42)
    labels = km.fit_predict(X_scaled)
    inertia = km.inertia_                              # elbow-method criterion
    sil = silhouette_score(X_scaled, labels)           # higher is better
    ch_val = calinski_harabasz_score(X_scaled, labels) # higher is better
    db_val = davies_bouldin_score(X_scaled, labels)    # lower is better
    inertia_values.append(inertia)
    silhouette_scores.append(sil)
    ch_scores.append(ch_val)
    db_scores.append(db_val)
    print(f"k={k}, 惯性: {inertia:.2f}, 轮廓系数: {sil:.3f}, CH 指数: {ch_val:.2f}, DB 指数: {db_val:.3f}")
# Plot the four diagnostics in a 2x2 grid; one panel per metric.
plt.figure(figsize=(15, 10))
_panels = [
    (inertia_values, None, '肘部法则确定最优聚类数 k(惯性,越小越好)', '惯性'),
    (silhouette_scores, 'orange', '轮廓系数确定最优聚类数 k(越大越好)', '轮廓系数'),
    (ch_scores, 'green', 'Calinski-Harabasz 指数确定最优聚类数 k(越大越好)', 'CH 指数'),
    (db_scores, 'red', 'Davies-Bouldin 指数确定最优聚类数 k(越小越好)', 'DB 指数'),
]
for pos, (series, color, title, ylabel) in enumerate(_panels, start=1):
    plt.subplot(2, 2, pos)
    if color is None:
        # First panel keeps Matplotlib's default line colour, as before.
        plt.plot(k_range, series, marker='o')
    else:
        plt.plot(k_range, series, marker='o', color=color)
    plt.title(title)
    plt.xlabel('聚类数 (k)')
    plt.ylabel(ylabel)
    plt.grid(True)
plt.tight_layout()
plt.show()
# Number of clusters chosen manually from the diagnostics above.
selected_k = 3
# Final KMeans fit on the standardised training features with the chosen k.
kmeans = KMeans(n_clusters=selected_k, random_state=42)
kmeans_labels = kmeans.fit_predict(X_scaled)
# Attach the cluster id to X (X was created with data.drop earlier, so this
# mutates X only, not `data`).
X['KMeans_Cluster'] = kmeans_labels
# Project to 2-D with PCA purely for visualisation.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
# Scatter plot of the KMeans clusters in PCA space.
plt.figure(figsize=(6, 5))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=kmeans_labels, palette='viridis')
plt.title(f'KMeans Clustering with k={selected_k} (PCA Visualization)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.show()
# Show the cluster size distribution.
print(f"KMeans Cluster labels (k={selected_k}) added to X:")
print(X[['KMeans_Cluster']].value_counts())
# One-hot encode the cluster id so it can be used as model features.
X = pd.get_dummies(X, columns=['KMeans_Cluster'])
list_final = []  # names of the newly created one-hot cluster columns
for i in X.columns:
    if i not in data:  # `in` on a DataFrame tests column membership
        list_final.append(i)
for i in list_final:
    if i in X.columns:
        data[i] = X[i].astype(int)  # copy the new cluster features into `data`
# --- Re-split and re-evaluate CatBoost on the cluster-augmented features ---
from sklearn.model_selection import train_test_split

X = data.drop(['Survived'], axis=1)  # features, now including cluster dummies
y = data['Survived']                 # target label
# Same 80/20 split and seed as before, so scores are directly comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

import time
start_time = time.time()  # wall-clock start of fit + predict
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

cat_model = CatBoostClassifier(random_state=42, verbose=0)
cat_model.fit(X_train, y_train)
y_pred_cat = cat_model.predict(X_test)
end_time = time.time()
# FIX: the elapsed time was measured but never reported (dead variables).
print(f"训练与预测耗时: {end_time - start_time:.4f} 秒")
accuracy = accuracy_score(y_test, y_pred_cat)
precision = precision_score(y_test, y_pred_cat, average='weighted')
recall = recall_score(y_test, y_pred_cat, average='weighted')
f1 = f1_score(y_test, y_pred_cat, average='weighted')
print(f"准确率: {accuracy}")
print(f"精确率: {precision}")
print(f"召回率: {recall}")
print(f"F1 分数: {f1}")
import shap
# FIX: the original passed `best_model`, a name that is never defined anywhere
# in this script and would raise NameError. Explain `cat_model` instead — the
# CatBoost classifier trained just above on the cluster-augmented features,
# which matches the X_test passed to the explainer.
explainer = shap.Explainer(cat_model)
shap_values = explainer(X_test)
# Global feature-importance beeswarm plot.
shap.summary_plot(shap_values, X_test)
plt.show()
print("--- SHAP重要性筛选 ---")
import shap
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import time
import numpy as np

# FIX: fit the explainer model ONCE. The per-feature mean |SHAP| ranking does
# not depend on k (same data, same random_state every iteration), so the
# original refit the model and recomputed SHAP values 12 times for identical
# results.
cat_shap = CatBoostClassifier(random_state=42, verbose=0)
cat_shap.fit(X_train, y_train)
explainer = shap.TreeExplainer(cat_shap)
shap_values = explainer.shap_values(X_train)
if isinstance(shap_values, list):
    # List output (one array per class): use the positive class for binary.
    mean_shap = np.abs(shap_values[1]).mean(axis=0)
else:
    # Single-array output (binary or regression).
    mean_shap = np.abs(shap_values).mean(axis=0)

# Track the best weighted F1 over k = 1..12 retained features.
max_f1 = 0
best_k = 0
best_top_k_indices = None  # FIX: remember the winning feature subset too
for k in range(1, 13):
    start_time = time.time()
    # Keep the k features with the largest mean |SHAP|.
    top_k_indices = np.argsort(mean_shap)[-k:]
    X_train_shap = X_train.iloc[:, top_k_indices]
    X_test_shap = X_test.iloc[:, top_k_indices]
    selected_features_shap = X_train.columns[top_k_indices].tolist()
    # Retrain on the reduced feature set and score on the held-out split.
    cat_model_shap = CatBoostClassifier(random_state=42, verbose=0)
    cat_model_shap.fit(X_train_shap, y_train)
    cat_pred_shap = cat_model_shap.predict(X_test_shap)
    end_time = time.time()
    report = classification_report(y_test, cat_pred_shap, output_dict=True)
    # FIX: do NOT name this `f1_score` — that shadowed the sklearn function.
    f1_weighted = report['weighted avg']['f1-score']
    if f1_weighted > max_f1:
        max_f1 = f1_weighted
        best_k = k
        best_top_k_indices = top_k_indices
print(f"最大加权平均 F1 分数: {max_f1},对应的 k 值为: {best_k}")
# --- Load the Kaggle test split and apply the SAME preprocessing as training ---
data = pd.read_csv(r'kaggle\test.csv')
data = data.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])
# Larger Pclass value = better cabin class (matches the training encoding).
Pclass_mapping = {
    1: 3,
    2: 2,
    3: 1
}
data['Pclass'] = data['Pclass'].map(Pclass_mapping)
data["Sex"] = pd.get_dummies(data["Sex"], dtype=int, drop_first=True)
data = data.rename(columns={'Sex': 'Male'})
data = pd.concat(
    [data.drop("Embarked", axis=1),
     pd.get_dummies(data["Embarked"], prefix="Embarked", dtype=int, drop_first=False)],
    axis=1,
)
continuous_features = data.select_dtypes(include=['float64', 'int64']).columns.tolist()
discrete_features = data.select_dtypes(exclude=['float64', 'int64']).columns.tolist()
# FIX: use assignment instead of chained `fillna(..., inplace=True)` — the
# chained form is deprecated and a silent no-op under pandas copy-on-write.
# NOTE(review): mode/median are computed from the TEST set here; ideally the
# training-set statistics should be reused — confirm before relying on this.
for feature in discrete_features:
    if data[feature].isnull().sum() > 0:
        data[feature] = data[feature].fillna(data[feature].mode()[0])
for feature in continuous_features:
    if data[feature].isnull().sum() > 0:
        data[feature] = data[feature].fillna(data[feature].median())
X = data
# --- Attach KMeans cluster features to the test set ---
# FIX: the original re-fit StandardScaler and KMeans on the *test* data, so
# test rows were clustered by a different model than the one that produced
# the training features (train/serve skew). Reuse the scaler fitted on the
# training features and the k=3 KMeans fitted above, and only transform /
# predict here.
X = data
X_scaled = scaler.transform(X)            # training-fitted scaler — no refit
selected_k = 3
kmeans_labels = kmeans.predict(X_scaled)  # training-fitted k=3 KMeans
# One-hot encode the cluster ids. The reindex guarantees all k dummy columns
# exist, in 0..k-1 order, even if some cluster never occurs in the test set —
# otherwise the test columns would silently misalign with the training ones.
cluster_cols = [f'KMeans_Cluster_{c}' for c in range(selected_k)]
cluster_dummies = (
    pd.get_dummies(pd.Series(kmeans_labels, index=data.index),
                   prefix='KMeans_Cluster', dtype=int)
    .reindex(columns=cluster_cols, fill_value=0)
)
for col in cluster_cols:
    data[col] = cluster_dummies[col].astype(int)
X = data
# FIX: select the SHAP-chosen features BY NAME instead of by position
# (`iloc[:, top_k_indices]`), so the result does not depend on the test frame
# having exactly the training column order. `selected_features_shap` lists the
# names in the same order as X_train_shap's columns.
X_test_shap = X[selected_features_shap]
cat_model_shap = CatBoostClassifier(random_state=42, verbose=0)
cat_model_shap.fit(X_train_shap, y_train)
cat_pred_shap = cat_model_shap.predict(X_test_shap)
# Reload the raw test file to recover PassengerId for the submission.
data = pd.read_csv(r'kaggle\test.csv')
# Cast to int: the Kaggle submission format expects integer 0/1 labels.
data['Survived'] = cat_pred_shap.astype(int)
result_df = data[['PassengerId', 'Survived']]
result_df.to_csv('prediction_result.csv', index=False)