修改提及到的代码,其余代码不做改变,生成完整代码文件
代码:
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 26 16:23:50 2023
@author: 86157
"""
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, adjusted_rand_score
from itertools import permutations
import random
from mpl_toolkits.mplot3d import Axes3D
import warnings
import os
import traceback
# Configure matplotlib so that Chinese text can be displayed.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],  # use the SimHei typeface
    'axes.unicode_minus': False,    # work around the minus-sign display problem
})
# Suppress all warnings for the rest of the run.
warnings.filterwarnings(action='ignore')
# =============================================================================
# 1. 数据可视化工具函数
# =============================================================================
def get_colors(style='bright'):
    """Return a seaborn colour palette selected by *style*.

    ``'bright'`` and ``'rainbow'`` select the palettes of the same name;
    every other value falls back to the ``"deep"`` palette.
    """
    if style == 'bright':
        palette_name = "bright"
    elif style == 'rainbow':
        palette_name = "rainbow"
    else:
        palette_name = "deep"
    return sns.color_palette(palette_name)
def boxplot(data, rows, cols, hue=None, vars=None, figsize=(12, 8), subplots_adjust=(0.5, 0.5)):
    """Draw one box plot per column on a ``rows`` x ``cols`` subplot grid.

    Args:
        data: DataFrame holding the columns to plot.
        rows: Number of subplot rows.
        cols: Number of subplot columns.
        hue: Optional column name used as the x-axis grouping variable.
        vars: Columns to plot; when empty or None every numeric column of
            *data* is used.
        figsize: Figure size passed to ``plt.figure``.
        subplots_adjust: ``(hspace, wspace)`` spacing applied after
            ``tight_layout``.

    The figure is written to ``玻璃成分箱线图.jpg`` (300 dpi) and then shown.
    """
    columns = vars if vars else data.select_dtypes(include=np.number).columns.tolist()
    plt.figure(figsize=figsize)
    for position, column in enumerate(columns, start=1):
        plt.subplot(rows, cols, position)
        if not hue:
            # Ungrouped: a single box in a randomly chosen palette colour.
            sns.boxplot(y=data[column], color=random.choice(get_colors()))
        else:
            # Grouped: one box per value of the hue column.
            sns.boxplot(x=hue, y=column, data=data, palette=get_colors('rainbow'))
        plt.title(column)
        plt.xticks(rotation=45)
    plt.tight_layout()
    plt.subplots_adjust(hspace=subplots_adjust[0], wspace=subplots_adjust[1])
    plt.savefig("玻璃成分箱线图.jpg", dpi=300, bbox_inches='tight')
    plt.show()
def distplot(data, rows=3, cols=4, bins=10, vars=None, hue=None, kind='hist', figsize=(12, 5), subplots_adjust=(0.3, 0.2)):
    """Draw one distribution plot per column on a ``rows`` x ``cols`` grid.

    Args:
        data: DataFrame holding the columns to plot.
        rows: Number of subplot rows.
        cols: Number of subplot columns.
        bins: Histogram bin count (used by ``'hist'`` and ``'both'``).
        vars: Columns to plot; when empty or None every numeric column of
            *data* is used.
        hue: Accepted but not used by this function.
        kind: ``'hist'`` (histogram with KDE overlay), ``'kde'`` (filled
            density curve) or ``'both'`` (plain histogram plus a dark-red
            density curve). Any other value leaves the subplots empty.
        figsize: Figure size passed to ``plt.figure``.
        subplots_adjust: ``(hspace, wspace)`` spacing between subplots.

    The figure is written to ``玻璃成分分布图.jpg`` (300 dpi) and then shown.
    """
    columns = vars if vars else data.select_dtypes(include=np.number).columns.tolist()
    plt.figure(figsize=figsize)
    for position, column in enumerate(columns, start=1):
        plt.subplot(rows, cols, position)
        if kind == 'both':
            sns.histplot(data[column], bins=bins, color=random.choice(get_colors()), alpha=0.5, kde=False)
            sns.kdeplot(data[column], color='darkred', alpha=0.7)
        elif kind == 'kde':
            sns.kdeplot(data[column], color=random.choice(get_colors()), fill=True)
        elif kind == 'hist':
            sns.histplot(data[column], bins=bins, color=random.choice(get_colors()), alpha=0.7, kde=True)
        plt.title(column)
    plt.subplots_adjust(hspace=subplots_adjust[0], wspace=subplots_adjust[1])
    plt.savefig("玻璃成分分布图.jpg", dpi=300)
    plt.show()
def plot_3d_clusters(data, features, glass_type):
    """Plot the clusters of *data* in 3D using the first three *features*.

    Args:
        data: DataFrame with a ``聚类标签`` column and the feature columns.
        features: Sequence of column names; only the first three are used.
        glass_type: Text used as the prefix of the title and output file name.

    When fewer than three features are given a message is printed and nothing
    is drawn. Otherwise the figure is saved as ``<glass_type>玻璃3D聚类.jpg``
    (300 dpi) and shown.
    """
    if len(features) < 3:
        print("不足3个特征,无法3D可视化")
        return
    figure = plt.figure(figsize=(10, 8))
    axes = figure.add_subplot(111, projection='3d')
    cluster_column = data['聚类标签']
    x_name, y_name, z_name = features[0], features[1], features[2]
    # One scatter series per cluster so each gets its own legend entry.
    for cluster_id in cluster_column.unique():
        members = data[cluster_column == cluster_id]
        axes.scatter(members[x_name], members[y_name], members[z_name], label=f'聚类 {cluster_id}')
    axes.set_xlabel(x_name)
    axes.set_ylabel(y_name)
    axes.set_zlabel(z_name)
    plt.title(f'{glass_type}玻璃3D聚类')
    plt.legend()
    plt.savefig(f"{glass_type}玻璃3D聚类.jpg", dpi=300)
    plt.show()
# =============================================================================
# 2. 数据加载和预处理函数
# =============================================================================
def load_and_process_data(file_path, sheet_name=0):
    """Read an Excel sheet and normalise its column names.

    Steps, in order: rename known misspelt metadata columns, rename a fixed
    set of chemical-composition columns, drop the ``总含量`` column if
    present, and add any of ``表面风化`` / ``采样点风化类型`` / ``类型``
    that is missing as an all-NaN column.

    Args:
        file_path: Path handed to ``pandas.read_excel``.
        sheet_name: Sheet selector handed to ``pandas.read_excel``.

    Returns:
        The processed DataFrame, or None if any step raised (the error is
        printed).
    """
    metadata_renames = {
        '表面风化化': '表面风化',
        '采采样点风化类型': '采样点风化类型',
        '样点风化类型': '采样点风化类型',
        '总成分': '总含量',
    }
    chemical_renames = {
        '氧化硅(Si)': '二氧化硅(SiO2)',
        '氧化锡(SnO)': '氧化锡(SnO2)',
        '氧化硫(SO3)': '二氧化硫(SO2)',
        '氧化亚铜(Cu2O)': '氧化亚铜(Cu2O)',
        '氧化铜(CuO)': '氧化铜(CuO)',
        '三氧化二铁(Fe2O3)': '三氧化二铁(Fe2O3)',
    }
    required_columns = ('表面风化', '采样点风化类型', '类型')
    try:
        frame = pd.read_excel(file_path, sheet_name=sheet_name)
        # Two passes, metadata names first and chemical names second.
        frame = frame.rename(columns=metadata_renames)
        frame = frame.rename(columns=chemical_renames)
        if '总含量' in frame.columns:
            frame = frame.drop(columns='总含量')
        for required in required_columns:
            if required not in frame.columns:
                frame[required] = np.nan
        print("列名处理完成")
        return frame
    except Exception as err:
        print("数据加载失败:", err)
        return None
# =============================================================================
# 3. 亚类划分与特征选择函数
# =============================================================================
def _rank_chemical_features(subset, chem_cols):
    """Rank *chem_cols* by random-forest importance for one glass type."""
    features = subset[chem_cols].fillna(0)
    target = subset['采样点风化类型']
    search = GridSearchCV(
        RandomForestClassifier(random_state=42),
        {'max_depth': [2, 3, 4], 'min_samples_leaf': [1, 2]},
        cv=5,
    )
    search.fit(features, target)
    ranking = pd.DataFrame({
        '化学成分': chem_cols,
        '特征重要性': search.best_estimator_.feature_importances_,
    })
    return ranking.sort_values('特征重要性', ascending=False)


def select_subclass_features(data):
    """Split *data* by glass type and rank chemical features for each type.

    Rows whose ``类型`` is ``高钾`` or ``铅钡`` are copied into separate,
    re-indexed frames. Chemical columns are those whose name contains
    ``氧化`` or ``二氧化``. For each frame a grid-searched random forest
    (target: ``采样点风化类型``, missing feature values filled with 0) gives
    the feature importances.

    Returns:
        ``(gaojia_data, qianbai_data, gaojia_fea_df, qianbai_fea_df,
        chem_cols)``, where the two ``*_fea_df`` frames are sorted by
        descending importance; five Nones if any step raised (the error is
        printed).
    """
    try:
        gaojia_data = data[data['类型'] == '高钾'].copy()
        qianbai_data = data[data['类型'] == '铅钡'].copy()
        for subset in (gaojia_data, qianbai_data):
            subset.reset_index(drop=True, inplace=True)
        chem_cols = [col for col in data.columns if any(x in col for x in ['氧化', '二氧化'])]
        gaojia_fea_df = _rank_chemical_features(gaojia_data, chem_cols)
        qianbai_fea_df = _rank_chemical_features(qianbai_data, chem_cols)
        return gaojia_data, qianbai_data, gaojia_fea_df, qianbai_fea_df, chem_cols
    except Exception as err:
        print("特征选择失败:", err)
        return None, None, None, None, None
# =============================================================================
# 4. 聚类评估函数
# =============================================================================
def evaluate_clustering(pred, true_labels):
    """Score a clustering against reference labels.

    The score is the mean of the adjusted Rand index and the best weighted
    F1 obtainable by assigning every cluster to a distinct reference class.

    Args:
        pred: Sequence of cluster ids, one per sample. The ids may be any
            hashable, sortable values; they need not start at 0 or be
            contiguous.
        true_labels: Sequence of reference labels, one per sample; it is
            encoded with ``pandas.factorize`` before comparison.

    Returns:
        ``(ARI + best_F1) / 2``. When there are more clusters than reference
        classes no one-to-one assignment exists and the F1 term is 0. On any
        error the problem is printed and 0 is returned, so callers that
        search for the best score simply skip the failing candidate.
    """
    try:
        from sklearn.metrics import adjusted_rand_score, f1_score
        from itertools import permutations

        true_codes = pd.factorize(true_labels)[0]
        ari = adjusted_rand_score(true_codes, pred)

        class_ids = sorted(set(true_codes))
        cluster_ids = sorted(set(pred))
        max_f1 = 0
        for assignment in permutations(class_ids, len(cluster_ids)):
            # Map through a dict rather than indexing the permutation by the
            # raw cluster id, which only works for ids 0..k-1.
            cluster_to_class = dict(zip(cluster_ids, assignment))
            remapped_pred = [cluster_to_class[p] for p in pred]
            f1 = f1_score(true_codes, remapped_pred, average='weighted', zero_division=0)
            max_f1 = max(max_f1, f1)
        return (ari + max_f1) / 2
    except Exception as e:
        # Best-effort scoring: report the failure instead of hiding it, but
        # keep returning 0 as before.
        print("聚类评估失败:", e)
        return 0
# =============================================================================
# 5. 聚类和优化特征选择函数
# =============================================================================
def _cluster_on_best_prefix(frame, ranked_features, max_features=10, random_state=42):
    """Cluster *frame* on the best-scoring prefix of *ranked_features*.

    Prefixes of length 1..min(max_features, len(ranked_features)) are each
    standardised, clustered with 2-means and scored with
    ``evaluate_clustering`` against ``采样点风化类型``. The labels of the
    winning run are stored in ``frame['聚类标签']``.

    Returns:
        ``(frame, best_features)``.

    Raises:
        ValueError: if *ranked_features* is empty.
    """
    if not ranked_features:
        raise ValueError("no candidate features to cluster on")
    best_score, best_features, best_labels = float('-inf'), None, None
    # Upper bound is inclusive so that the full list is tried when it is
    # short and a single-feature list is not skipped entirely.
    for num in range(1, min(max_features, len(ranked_features)) + 1):
        candidate = ranked_features[:num]
        # KMeans rejects NaN; fill with 0 as the feature-selection step does.
        X = StandardScaler().fit_transform(frame[candidate].fillna(0).values)
        labels = KMeans(n_clusters=2, n_init=10, random_state=random_state).fit_predict(X)
        score = evaluate_clustering(labels, frame['采样点风化类型'])
        if score > best_score:
            best_score, best_features, best_labels = score, candidate, labels
    # Keep the labels that were actually scored instead of refitting.
    frame['聚类标签'] = best_labels
    return frame, best_features


def optimize_features_and_cluster(gaojia_data, qianbai_data, gaojia_fea_df, qianbai_fea_df, chem_cols):
    """Pick the best feature prefix per glass type and attach cluster labels.

    Args:
        gaojia_data: DataFrame of high-potassium samples; gains a
            ``聚类标签`` column (modified in place).
        qianbai_data: DataFrame of lead-barium samples; gains a
            ``聚类标签`` column (modified in place).
        gaojia_fea_df: Feature ranking whose ``化学成分`` column lists the
            high-potassium features, most important first.
        qianbai_fea_df: Same ranking for the lead-barium samples.
        chem_cols: Accepted for interface compatibility; not used.

    Returns:
        ``(gaojia_data, qianbai_data, best_features_gaojia,
        best_features_qianbai)``, or four Nones if any step raised (the
        error is printed).
    """
    try:
        gaojia_list = gaojia_fea_df['化学成分'].tolist()
        qianbai_list = qianbai_fea_df['化学成分'].tolist()
        gaojia_data, best_features_gaojia = _cluster_on_best_prefix(gaojia_data, gaojia_list)
        qianbai_data, best_features_qianbai = _cluster_on_best_prefix(qianbai_data, qianbai_list)
        return gaojia_data, qianbai_data, best_features_gaojia, best_features_qianbai
    except Exception as e:
        print("聚类优化失败:", e)
        return None, None, None, None
# =============================================================================
# 6. 结果可视化与分析函数
# =============================================================================
def _scatter_clusters(frame, features, title):
    """Scatter the first two *features* of *frame*, coloured by cluster."""
    if len(features) < 2:
        # Mirrors the guard in plot_3d_clusters: explain and skip instead of
        # failing on features[1].
        print(f"{title}: 不足2个特征,无法绘制散点图")
        return
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=frame[features[0]], y=frame[features[1]], hue=frame['聚类标签'], legend='full')
    plt.title(title)
    plt.show()


def plot_subclass_results(gaojia_data, qianbai_data, gaojia_features, qianbai_features):
    """Print per-cluster weathering counts and plot the clusters.

    Args:
        gaojia_data: High-potassium samples with ``聚类标签`` and
            ``采样点风化类型`` columns.
        qianbai_data: Lead-barium samples with the same columns.
        gaojia_features: Selected feature names for the high-potassium
            samples; the first two are used as scatter axes.
        qianbai_features: Selected feature names for the lead-barium
            samples; the first two are used as scatter axes.

    A glass type with fewer than two selected features is reported and its
    scatter plot skipped. Any error is printed rather than raised.
    """
    try:
        print("高钾玻璃聚类统计:")
        print(gaojia_data.groupby('聚类标签')['采样点风化类型'].value_counts())
        print("铅钡玻璃聚类统计:")
        print(qianbai_data.groupby('聚类标签')['采样点风化类型'].value_counts())
        _scatter_clusters(gaojia_data, gaojia_features, '高钾玻璃聚类结果')
        _scatter_clusters(qianbai_data, qianbai_features, '铅钡玻璃聚类结果')
    except Exception as e:
        print("结果可视化失败:", e)
# =============================================================================
# 7. 未知样本预测函数
# =============================================================================
def predict_unknown_samples(unknown_data, gaojia_model, qianbai_model, gaojia_features, qianbai_features):
    """Assign a glass type to every row of *unknown_data*.

    Each model is given only the columns it was trained on
    (*gaojia_features* for *gaojia_model*, *qianbai_features* for
    *qianbai_model*), with missing values filled with 0.

    Decision rule per row: if both models predict 0 the type is chosen at
    random between ``高钾`` and ``铅钡``; otherwise the type is ``高钾`` when
    *gaojia_model* predicts 0 and ``铅钡`` when it does not.
    NOTE(review): this rule treats cluster id 0 as meaningful; confirm that
    the supplied models guarantee that labelling.

    Args:
        unknown_data: DataFrame of samples; gains/overwrites a ``类型``
            column in place. Its index may be arbitrary.
        gaojia_model: Object with ``predict(X)`` returning one label per row.
        qianbai_model: Object with ``predict(X)`` returning one label per row.
        gaojia_features: Column names fed to *gaojia_model*.
        qianbai_features: Column names fed to *qianbai_model*.

    Returns:
        *unknown_data* with the ``类型`` column filled, or None if any step
        raised (the error is printed).
    """
    try:
        if len(unknown_data) == 0:
            # Nothing to predict; keep the column so the schema is stable.
            unknown_data['类型'] = None
            print("预测完成")
            return unknown_data

        X_gaojia = unknown_data[list(gaojia_features)].fillna(0).values
        X_qianbai = unknown_data[list(qianbai_features)].fillna(0).values
        pred_gaojia = np.asarray(gaojia_model.predict(X_gaojia)).ravel()
        pred_qianbai = np.asarray(qianbai_model.predict(X_qianbai)).ravel()

        glass_types = []
        for g_label, q_label in zip(pred_gaojia, pred_qianbai):
            if g_label == 0 and q_label == 0:
                glass_types.append('高钾' if random.random() < 0.5 else '铅钡')
            else:
                glass_types.append('高钾' if g_label == 0 else '铅钡')
        # Positional assignment works for any index, unlike .at[i, ...].
        unknown_data['类型'] = glass_types
        print("预测完成")
        return unknown_data
    except Exception as e:
        print("预测失败:", e)
        return None
# =============================================================================
# 8. 主程序流程
# =============================================================================
def main():
    """Run the whole pipeline: load, select features, cluster, plot, predict.

    Input workbooks are read from hard-coded paths and the classified
    unknown samples are written to ``final_results/未知样本分类结果.xlsx``.
    Every stage that signals failure by returning None stops the run with a
    message naming that stage; unexpected errors are printed with a
    traceback.
    """
    try:
        # Load the labelled data.
        data = load_and_process_data(r"D:\BianChen\python_studycode\tf_env\玻璃\分析结果.xlsx")
        if data is None:
            print("加载数据失败")
            return

        # Feature selection.
        gaojia_data, qianbai_data, gaojia_fea_df, qianbai_fea_df, chem_cols = select_subclass_features(data)
        if gaojia_data is None or qianbai_data is None:
            print("特征选择失败,程序终止")
            return

        # Feature-prefix optimisation and clustering.
        gaojia_data, qianbai_data, gaojia_features, qianbai_features = optimize_features_and_cluster(
            gaojia_data, qianbai_data, gaojia_fea_df, qianbai_fea_df, chem_cols
        )
        if gaojia_data is None or qianbai_data is None or not gaojia_features or not qianbai_features:
            print("聚类优化失败,程序终止")
            return

        # Visualisation.
        plot_subclass_results(gaojia_data, qianbai_data, gaojia_features, qianbai_features)

        # Fit the models used for prediction; KMeans rejects NaN, so missing
        # values are filled with 0 as in the feature-selection step.
        gaojia_kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
        gaojia_kmeans.fit(gaojia_data[gaojia_features].fillna(0).values)
        qianbai_kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
        qianbai_kmeans.fit(qianbai_data[qianbai_features].fillna(0).values)

        # Predict the unknown samples.
        unknown_data = load_and_process_data(r"D:\Users\86157\Desktop\数学建模\附件.xlsx", sheet_name='表单3')
        if unknown_data is None:
            print("加载未知数据失败")
            return
        unknown_data = predict_unknown_samples(unknown_data, gaojia_kmeans, qianbai_kmeans, gaojia_features, qianbai_features)
        if unknown_data is None:
            print("未知样本预测失败,未保存结果")
            return

        # Save the result.
        output_dir = "final_results"
        os.makedirs(output_dir, exist_ok=True)
        unknown_data.to_excel(os.path.join(output_dir, '未知样本分类结果.xlsx'), index=False)
        print("程序已完成")
    except Exception as e:
        print("主程序运行失败:", e)
        traceback.print_exc()
# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()
最新发布