The Meaning of the CV_IS_ROW_SAMPLE Macro

This article examines the CV_IS_ROW_SAMPLE macro used by the cvCreateMTStumpClassifier function, explaining its role and usage in the OpenCV library so that readers understand how to use it correctly in different situations.


While working through the function cvCreateMTStumpClassifier I ran into CV_IS_ROW_SAMPLE. There is very little material about it online, so I summarize what I found below in the hope that it helps others. If you are not familiar with cvCreateMTStumpClassifier itself, see my earlier posts: http://blog.csdn.net/ding977921830/article/details/46356789 and http://blog.csdn.net/ding977921830/article/details/46412465


// Function purpose: compute the optimal weak classifier
CvClassifier* cvCreateMTStumpClassifier( CvMat* trainData,      // matrix of HAAR feature values of the training samples
                      int flags,                                // 1: samples stored as rows, 0: samples stored as columns
                      CvMat* trainClasses,
                      CvMat* /*typeMask*/,
                      CvMat* missedMeasurementsMask,
                      CvMat* compIdx,
                      CvMat* sampleIdx,
                      CvMat* weights,
                      CvClassifierTrainParams* trainParams )
{
    ...
}

The macro CV_IS_ROW_SAMPLE is defined as follows:

    /* columns of <trainData> matrix are training samples */
    // the columns of trainData are the training samples; corresponds to flags = 0 above
    #define CV_COL_SAMPLE 0

    /* rows of <trainData> matrix are training samples */
    // the rows of trainData are the training samples; corresponds to flags = 1 above
    #define CV_ROW_SAMPLE 1

    #define CV_IS_ROW_SAMPLE(flags) ((flags) & CV_ROW_SAMPLE)

CV_IS_ROW_SAMPLE(flags) is a function-like macro that takes a parameter. It performs a bitwise AND between flags and CV_ROW_SAMPLE (which is 1), so it simply tests the lowest bit of flags: it evaluates to 0 when flags = 0 (CV_COL_SAMPLE) and to 1 when flags = 1 (CV_ROW_SAMPLE).
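To make the behavior concrete, here is a minimal, self-contained sketch. The matrix dimensions and variable names below are hypothetical, chosen only for illustration; they are not taken from the OpenCV source. It shows how a trainer like cvCreateMTStumpClassifier can use the macro to decide which dimension of trainData indexes the samples:

    #include <stdio.h>

    /* Macro definitions as quoted above from OpenCV */
    #define CV_COL_SAMPLE 0
    #define CV_ROW_SAMPLE 1
    #define CV_IS_ROW_SAMPLE(flags) ((flags) & CV_ROW_SAMPLE)

    int main( void )
    {
        int flags = CV_ROW_SAMPLE;   /* caller stored one sample per row */

        /* hypothetical trainData dimensions: 1000 samples x 4000 features */
        int rows = 1000, cols = 4000;

        /* the macro selects which dimension counts the samples */
        int numSamples  = CV_IS_ROW_SAMPLE( flags ) ? rows : cols;
        int numFeatures = CV_IS_ROW_SAMPLE( flags ) ? cols : rows;

        printf( "samples: %d, features per sample: %d\n", numSamples, numFeatures );
        return 0;
    }

With flags = CV_ROW_SAMPLE this prints "samples: 1000, features per sample: 4000"; with flags = CV_COL_SAMPLE the two roles are swapped.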

As you can see, the C/C++ code in the OpenCV open-source library is written very flexibly, and even a small macro like this is given a meaningful, self-describing name.

import chardet import streamlit as st import pandas as pd import numpy as np import joblib import os import time import matplotlib.pyplot as plt import seaborn as sns from pyspark.sql import SparkSession from pyspark.ml.feature import VectorAssembler, StandardScaler from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier from pyspark.ml.evaluation import BinaryClassificationEvaluator from pyspark.ml.tuning import ParamGridBuilder, CrossValidator from sklearn.metrics import classification_report, confusion_matrix import warnings import dask.dataframe as dd from dask.diagnostics import ProgressBar from dask_ml.preprocessing import StandardScaler as DaskStandardScaler import tempfile import shutil import re warnings.filterwarnings("ignore") plt.rcParams['font.sans-serif'] = ['SimHei'] plt.rcParams['axes.unicode_minus'] = False # 页面设置 st.set_page_config( page_title="单宽转融用户预测系统", page_icon="📶", layout="wide", initial_sidebar_state="expanded" ) # 自定义CSS样式 st.markdown(""" <style> .stApp { background: linear-gradient(135deg, #f5f7fa 0%, #e4edf5 100%); font-family: 'Helvetica Neue', Arial, sans-serif; } .header { background: linear-gradient(90deg, #2c3e50 0%, #4a649极 100%); color: white; padding: 1.5rem; border-radius: 0.75rem; box-shadow: 0 4px 12px rgba(0,0,0,0.1); margin-bottom: 2rem; } .card { background: white; border-radius: 0.75rem; padding: 1.5rem; margin-bottom: 1.5rem; box-shadow: 0 4px 12px rgba(0,0,0,0.08); transition: transform 0.3s ease; } .card:hover { transform: translateY(-5px); box-shadow: 0 6px 16px rgba(0,0,0,0.12); } .stButton button { background: linear-gradient(90deg, #3498db 0%, #1a5276 100%) !important; color: white !important; border: none !important; border-radius: 0.5rem; padding: 0.75rem 1.5rem; font-size: 1rem; font-weight: 600; transition: all 0.3s ease; width: 100%; } .stButton button:hover { transform: scale(1.05); box-shadow: 0 4px 8px rgba(52, 152, 219, 0.4); } .feature-box { background: linear-gradient(135deg, #e3f2fd 0%, #bbdefb 100%); border-radius: 0.75rem; padding: 1.5rem; margin-bottom: 1.5rem; } .result-box { background: linear-gradient(135deg, #e8f5e9 0%, #c8e6c9 100%); border-radius: 0.75rem; padding极 1.5rem; margin-top: 1.5rem; } .model-box { background: linear-gradient(135deg, #fff3e0 0%, #ffe0b2 100%); border-radius: 0.75rem; padding: 1.5rem; margin-top: 1.5rem; } .stProgress > div > div > div { background: linear-gradient(90deg, #2ecc71 0%, #27ae60 100%) !important; } .metric-card { background: white; border-radius: 0.75rem; padding: 1rem; text-align: center; box-shadow: 0 4px 8px rgba(0,0,0,0.06); } .metric-value { font-size: 1.8rem; font-weight: 700; color: #2c3e50; } .metric-label { font-size: 0.9rem; color: #7f8c8d; margin-top: 0.5rem; } .highlight { background: linear-gradient(90deg, #ffeb3b 0%, #fbc02d 100%); padding: 0.2rem 0.5rem; border-radius: 0.25rem; font-weight: 600; } .stDataFrame { border-radius: 0.75rem; box-shadow: 0 4px 8px rgba(0,0,0,0.06); } .risk-high { background-color: #ffcdd2 !important; color: #c62828 !important; font-weight: 700; } .risk-medium { background-color: #fff9c4 !important; color: #f57f17 !important; font-weight: 600; } .risk-low { background-color: #c8e6c9 !important; color: #388e3c !important; } </style> """, unsafe_allow_html=True) def clean_numeric_string(value): """清理数值字符串中的非数字字符""" if pd.isna(value): return np.nan try: # 尝试直接转换为浮点数 return float(value) except (ValueError, TypeError): # 移除非数字字符(除小数点和负号外) cleaned = re.sub(r'[^\d\.-]', '', str(value)) try: return 
float(cleaned) if cleaned else np.nan except ValueError: return np.nan def is_numeric_column(series): """检测列是否可以转换为数值类型""" try: # 尝试转换样本数据 # 修复:移除对Pandas Series的.compute()调用 if isinstance(series, dd.Series): sample = series.head(1000).compute() else: sample = series.head(1000) pd.to_numeric(sample, errors='raise') return True except (ValueError, TypeError): return False def preprocess_data(ddf): """使用Dask进行大数据预处理""" processed_ddf = ddf.copy() # 删除无意义特征 drop_cols = ['BIL_MONTH', 'ASSET_ROW_ID', 'CCUST_ROW_ID', 'BELONG_CITY', 'MKT_CHANNEL_NAME', 'MKT_CHANNEL_SUB_NAME', 'PREPARE_FLG', 'SERV_START_DT', 'COMB_STAT_NAME', 'FIBER_ACCESS_CATEGORY'] existing_cols = [col for col in drop_cols if col in processed_ddf.columns] processed_ddf = processed_ddf.drop(columns=existing_cols) # 初始数值列 initial_numeric_cols = processed_ddf.select_dtypes(include=[np.number]).columns.tolist() if 'is_rh_next' in initial_numeric_cols: initial_numeric_cols.remove('is_rh_next') # 潜在数值列 potential_numeric_cols = [ 'MAX_PRICE_COMPANY', 'MAX_PRICE_MODEL', 'MAX_PRICE_TERM_TYPE', 'MOBLE_4G_CNT_LV', 'MOBLE_CNT_LV', 'OWE_AMT_LV', 'OWE_CNT_LV', 'PROM_INTEG_ID', 'TOUSU_CNT_LV' ] # 数值列集合 numeric_cols = initial_numeric_cols.copy() # 处理潜在数值列 for col in potential_numeric_cols: if col in processed_ddf.columns: # 检查列是否可转换为数值型 if is_numeric_column(processed_ddf[col]): try: # 尝试转换为数值型 processed_ddf[col] = processed_ddf[col].apply(clean_numeric_string, meta=(col, 'f8')) numeric_cols.append(col) except Exception as e: st.warning(f"列 {col} 转换为数值型失败: {str(e)},将视为分类特征") else: st.warning(f"列 {col} 包含非数值数据,将视为分类特征") # 处理缺失值 with ProgressBar(): # 计算数值列均值 means = processed_ddf[numeric_cols].mean().compute() # 填充数值列缺失值 for col in numeric_cols: processed_ddf[col] = processed_ddf[col].fillna(means[col]) # 处理非数值列缺失值 non_numeric_cols = [col for col in processed_ddf.columns if col not in numeric_cols and col != 'is_rh_next'] for col in non_numeric_cols: processed_ddf[col] = processed_ddf[col].fillna("Unknown") # 类型转换 - 修复:避免在Dask操作中直接调用.compute() for col in numeric_cols: if processed_ddf[col].dtype == 'float64': # 检查是否可以安全转换为整数 try: # 使用采样数据代替整个数据集 sample = processed_ddf[col].dropna().head(1000).compute() if (sample == sample.astype(int)).all(): processed_ddf[col] = processed_ddf[col].astype('int64') except: # 如果转换失败,保持浮点类型 pass # 二进制特征编码 binary_cols = ['IF_YHTS', 'is_kdts', 'is_itv_up', 'is_mobile_up', 'if_zzzw_up'] for col in binary_cols: if col in processed_ddf.columns: processed_ddf[col] = processed_ddf[col].map({'否': 0, '是': 1, 0: 0, 1: 1, 'Unknown': -1}, meta=(col, 'int64')) # 分类特征编码 if 'GENDER' in processed_ddf.columns: gender_mapping = {'男': 0, '女': 1, 'Unknown': -1} processed_ddf['GENDER'] = processed_ddf['GENDER'].map(gender_mapping, meta=('GENDER', 'int64')) if 'MKT_STAR_GRADE_NAME' in processed_ddf.columns: star_mapping = {'五星级': 5, '四星级': 4, '三星级': 3, '二星级': 2, '一星级': 1, 'Unknown': 0} processed_ddf['MKT_STAR_GRADE_NAME'] = processed_ddf['MKT_STAR_GRADE_NAME'].map(star_mapping, meta=('MKT_STAR_GRADE_NAME', 'int64')) # 特征工程 if 'PROM_AMT' in numeric_cols and 'STMT_AMT' in numeric_cols: processed_ddf['CONSUMPTION_RATIO'] = processed_ddf['PROM_AMT'] / (processed_ddf['STMT_AMT'] + 1) numeric_cols.append('CONSUMPTION_RATIO') if 'DWN_VOL' in numeric_cols and 'ONLINE_DAY' in numeric_cols: processed_ddf['TRAFFIC_DENSITY'] = processed_ddf['DWN_VOL'] / (processed_ddf['ONLINE_DAY'] + 1) numeric_cols.append('TRAFFIC_DENSITY') if 'TERM_CNT' in processed_ddf.columns: processed_ddf['HAS_TERMINAL'] = (processed_ddf['TERM_CNT'] > 0).astype('int64') 
numeric_cols.append('HAS_TERMINAL') # 标准化处理 scaler = DaskStandardScaler() numeric_cols_for_scaling = [col for col in numeric_cols if col != 'is极_rh_next'] if numeric_cols_for_scaling: processed_ddf[numeric_cols_for_scaling] = scaler.fit_transform(processed_ddf[numeric_cols_for_scaling]) feature_cols = [col for col in processed_ddf.columns if col != 'is_rh_next'] return processed_ddf, feature_cols, means, numeric_cols_for_scaling, scaler def create_spark_session(): """创建或获取现有的Spark会话""" return SparkSession.builder \ .appName("SingleToMeltUserPrediction") \ .config("spark.sql.shuffle.partitions", "8") \ .config("spark.driver.memory", "8g") \ .config("spark.executor.memory", "8g") \ .getOrCreate() def train_models(spark_df, feature_cols): """使用Spark训练多个模型并评估性能""" spark = create_spark_session() assembler = VectorAssembler(inputCols=feature_cols, outputCol="rawFeatures") assembled_df = assembler.transform(spark_df) scaler = StandardScaler(inputCol="rawFeatures", outputCol="features") scaler_model = scaler.fit(assembled_df) scaled_df = scaler_model.transform(assembled_df) train_df, test_df = scaled_df.randomSplit([0.8, 0.2], seed=42) # 定义模型和参数网格 models = { "逻辑回归": ( LogisticRegression(featuresCol="features", labelCol="is_rh_next"), ParamGridBuilder().addGrid(LogisticRegression.regParam, [0.01, 0.1]) .addGrid(LogisticRegression.elasticNetParam, [0.0, 0.5]) .build() ), "决策树": ( DecisionTreeClassifier(featuresCol="features", labelCol="is_rh_next"), ParamGridBuilder().addGrid(DecisionTreeClassifier.maxDepth, [5, 10]) .addGrid(DecisionTreeClassifier.minInstancesPerNode, [10, 20]) .build() ), "随机森林": ( RandomForestClassifier(featuresCol="features", labelCol="is_rh_next", numTrees=10), ParamGridBuilder().addGrid(RandomForestClassifier.numTrees, [10, 20]) .addGrid(RandomForestClassifier.maxDepth, [5, 10]) .build() ) } evaluator = BinaryClassificationEvaluator(labelCol="is_rh_next", metricName="areaUnderROC") results = {} for model_name, (model, param_grid) in models.items(): with st.spinner(f"正在训练{model_name}模型..."): cv = CrossValidator(estimator=model, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=3) cv_model = cv.fit(train_df) predictions = cv_model.transform(test_df) auc = evaluator.evaluate(predictions) accuracy = predictions.filter(predictions.is_rh_next == predictions.prediction).count() / test_df.count() results[model_name] = { "model": cv_model, "auc": auc, "accuracy": accuracy, "best_params": cv_model.bestModel._java_obj.parent().extractParamMap(), "feature_importances": getattr(cv_model.bestModel, "featureImportances", {}).toArray().tolist() if model_name != "逻辑回归" else None } return results # 页面布局 st.markdown(""" <div class="header"> <h1 style='text-align: center; margin: 0;'>单宽转融用户预测系统</h1> <p style='text-align: center; margin: 0.5rem 0 0; font-size: 1.1rem;'>基于大数据挖掘的精准营销分析平台</p> </div> """, unsafe_allow_html=True) col1, col2 = st.columns([1, 1.5]) with col1: st.markdown(""" <div class="feature-box"> <h4>📈 系统功能</h4> <ul> <li>用户转化预测</li> <li>多模型对比分析</li> <li>特征重要性分析</li> <li>可视化数据洞察</li> </ul> </div> """, unsafe_allow_html=True) st.image("https://images.unsplash.com/photo-1550751822256-00808c92fc8d?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=1200&q=80", caption="精准营销示意图", use_column_width=True) with col2: option = st.radio("", ["🚀 训练新模型 - 使用新数据训练预测模型", "🔍 模型分析 - 查看现有模型的分析结果"], index=0, label_visibility="hidden") if "训练新模型" in option: st.markdown("<div class='model-box'><h4>模型训练</h4><p>上传训练数据并训练新的预测模型</p></div>", 
unsafe_allow_html=True) train_file = st.file_uploader("上传训练数据 (CSV格式)", type=["csv"], accept_multiple_files=False) if train_file is not None: try: with tempfile.TemporaryDirectory() as tmpdir: tmp_path = os.path.join(tmpdir, "large_file.csv") with open(tmp_path, "wb") as f: f.write(train_file.getvalue()) def detect_encoding(file_path): with open(file_path, 'rb') as f: raw_data = f.read(10000) result = chardet.detect(raw_data) return result['encoding'] detected_encoding = detect_encoding(tmp_path) st.info(f"检测到文件编码: {detected_encoding}") chunksize = 256 * 1024 * 1024 na_values_list = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', '<NA>', 'N/A', 'NA', 'NULL', 'NaN', 'n/a', 'nan', 'null'] # 定义特殊列的数据类型 special_dtypes = { 'MAX_PRICE_COMPANY': 'object', 'MAX_PRICE_MODEL': 'object', 'MAX_PRICE_TERM_TYPE': 'object', 'MOBLE_4G_CNT_LV': 'object', 'MOBLE_CNT_LV': 'object', 'OWE_AMT_LV': 'object', 'OWE_CNT_LV': 'object', 'PROM_INTEG_ID': 'object', 'TOUSU_CNT_LV': 'object', 'is_rh_next': 'float64' } # 尝试读取文件 try: raw_ddf = dd.read_csv( tmp_path, blocksize=chunksize, dtype=special_dtypes, encoding=detected_encoding, na_values=na_values_list, assume_missing=True, low_memory=False ) except UnicodeDecodeError: st.warning("检测编码读取失败,尝试GB18030编码...") try: raw_ddf = dd.read_csv( tmp_path, blocksize=chunksize, dtype=special_dtypes, encoding='GB18030', na_values=na_values_list, assume_missing=True, low_memory=False ) except UnicodeDecodeError: st.warning("GB18030读取失败,尝试Latin-1编码...") raw_ddf = dd.read_csv( tmp_path, blocksize=chunksize, dtype=special_dtypes, encoding='latin-1', na_values=na_values_list, assume_missing=True, low_memory=False ) except Exception as e: st.error(f"读取文件时发生错误: {str(e)}") st.stop() with st.expander("数据预览", expanded=True): try: # 使用compute()获取前1000行 preview_data = raw_ddf.head(1000) st.dataframe(preview_data) col1, col2 = st.columns(2) try: total_rows = raw_ddf.shape[0].compute() col1.metric("总样本数", f"{total_rows:,}") except: col1.metric("总样本数", "计算中...") col2.metric("特征数量", len(raw_ddf.columns)) if 'is_rh_next' not in raw_ddf.columns: st.warning("⚠️ 注意:未找到目标变量 'is_rh_next'") else: st.info(f"目标变量类型: {raw_ddf['is_rh_next'].dtype}") except Exception as e: st.error(f"数据预览错误: {str(e)}") st.write("尝试显示前50行...") try: preview_data = raw_ddf.head(50) st.dataframe(preview_data) except: st.error("无法显示数据预览") if st.button("开始数据预处理", use_container_width=True): with st.spinner("正在进行数据预处理,请稍候..."): processed_ddf, feature_cols, means, numeric_cols_for_scaling, scaler = preprocess_data(raw_ddf) preprocessor_params = { 'means': means, 'numeric_cols_for_scaling': numeric_cols_for_scaling, 'scaler': scaler, 'feature_cols': feature_cols } joblib.dump(preprocessor_params, 'preprocessor_params.pkl') processed_ddf.to_csv('processed_data_*.csv', index=False) st.success("✅ 数据预处理完成!") # 显示处理后的数据统计 st.subheader("数据质量检查") with st.spinner("计算缺失值统计..."): try: null_counts = processed_ddf.isnull().sum().compute() st.write("缺失值统计:") st.dataframe(null_counts[null_counts > 0]) except: st.warning("缺失值计算失败") # 可视化关键特征分布 st.subheader("关键特征分布") try: sample_ddf = processed_ddf.sample(frac=0.1) sample_df = sample_ddf.compute() # 选择存在的列进行可视化 plot_cols = [] if 'AGE' in sample_df.columns: plot_cols.append('AGE') if 'ONLINE_DAY' in sample_df.columns: plot_cols.append('ONLINE_DAY') if 'PROM_AMT' in sample_df.columns: plot_cols.append('PROM_AMT') if 'DWN_VOL' in sample_df.columns: plot_cols.append('DWN_VOL') if len(plot_cols) >= 4: fig, axes = plt.subplots(2, 2, figsize=(12, 10)) for i, col 
in enumerate(plot_cols[:4]): sns.histplot(sample_df[col], ax=axes[i//2, i%2], kde=True) plt.tight_layout() st.pyplot(fig) else: st.warning("缺少足够的列进行可视化") except: st.error("关键特征分布可视化失败") # 目标变量分布 st.subheader("目标变量分布") if 'is_rh_next' in sample_df.columns: fig, ax = plt.subplots(figsize=(6, 4)) sns.countplot(x='is_rh_next', data=sample_df, ax=ax) ax.set_xlabel("是否转化 (0=未转化, 1=转化)") ax.set_ylabel("用户数量") ax.set_title("用户转化分布") st.pyplot(fig) else: st.warning("未找到目标变量 'is_rh_next'") # 特征与目标变量相关性 st.subheader("特征与转化的相关性") if 'is_rh_next' in sample_df.columns: with st.spinner("计算特征相关性..."): try: # 使用采样数据计算相关性 correlation = sample_df[feature_cols + ['is_rh_next']].corr()['is_rh_next'].sort_values(ascending=False) fig, ax = plt.subplots(figsize=(10, 6)) sns.barplot(x=correlation.values, y=correlation.index, ax=ax) ax.set_title("特征与转化的相关性") st.pyplot(fig) except: st.error("特征相关性计算失败") else: st.warning("未找到目标变量 'is_rh_next'") # 模型训练按钮 if st.button("开始模型训练", use_container_width=True): # 检查预处理文件是否存在 if not any(fname.startswith('processed_data_') for fname in os.listdir('.')): st.error("请先进行数据预处理") else: # 创建Spark会话 spark = create_spark_session() # 使用通配符读取所有预处理文件 spark_df = spark.read.csv('processed_data_*.csv', header=True, inferSchema=True) # 加载预处理参数 preprocessor_params = joblib.load('preprocessor_params.pkl') feature_cols = preprocessor_params['feature_cols'] # 训练模型 with st.spinner("正在训练模型,请耐心等待..."): results = train_models(spark_df, feature_cols) # 保存模型结果 joblib.dump(results, 'model_results.pkl') st.success("🎉 模型训练完成!") # 显示模型比较 st.subheader("模型性能对比") model_performance = pd.DataFrame({ '模型': ['逻辑回归', '决策树', '随机森林'], '准确率': [results['逻辑回归']['accuracy'], results['决策树']['accuracy'], results['随机森林']['accuracy']], 'AUC': [results['逻辑回归']['auc'], results['决策树']['auc'], results['随机森林']['auc']] }).sort_values('AUC', ascending=False) st.table(model_performance.style.format({ '准确率': '{:.2%}', 'AUC': '{:.4f}' })) # 最佳模型特征重要性 best_model_name = model_performance.iloc[0]['模型'] best_model = results[best_model_name]['model'].bestModel st.subheader(f"最佳模型 ({best_model_name}) 分析") if best_model_name in ['决策树', '随机森林']: feature_importances = results[best_model_name]['feature_importances'] importance_df = pd.DataFrame({ '特征': feature_cols, '重要性': feature_importances }).sort_values('重要性', ascending=False).head(10) fig, ax = plt.subplots(figsize=(10, 6)) sns.barplot(x='重要性', y='特征', data=importance_df, ax=ax) ax.set_title('Top 10 重要特征') st.pyplot(fig) # 显示最佳模型参数 st.subheader("最佳模型参数") params = results[best_model_name]['best_params'] param_table = pd.DataFrame({ '参数': [str(param.name) for param in params.keys()], '值': [str(value) for value in params.values()] }) st.table(param_table) except Exception as e: st.error(f"数据处理错误: {str(e)}") st.exception(e) else: st.markdown("<div class='model-box'><h4>模型分析</h4><p>查看已有模型的详细分析结果</极div>", unsafe_allow_html=True) if not os.path.exists('model_results.pkl'): st.info("ℹ️ 当前没有可用模型。请先进行模型训练以生成分析报告。") else: results = joblib.load('model_results.pkl') preprocessor_params = joblib.load('preprocessor_params.pkl') feature_cols = preprocessor_params['feature_cols'] model_choice = st.selectbox( "选择要分析的模型", ("逻辑回归", "决策树", "随机森林") ) # 显示模型基本信息 model_info = results[model_choice] st.markdown(f""" <div class="card"> <h3>{model_choice}</h3> <p><strong>AUC得分:</strong> {model_info['auc']:.4f}</p> <p><strong>准确率:</strong> {model_info['accuracy']:.2%}</p> </div> """, unsafe_allow_html=True) # 显示参数详情 with st.expander("模型参数详情", expanded=False): params = model_info['best_params'] param_table = 
pd.DataFrame({ '参数': [str(param.name) for param in params.keys()], '值': [str(value) for value in params.values()] }) st.table(param_table) # 特征重要性分析 if model_choice in ['决策树', '随机森林']: feature_importances = model_info['feature_importances'] importance_df = pd.DataFrame({ '特征': feature_cols, '重要性': feature_importances }).sort_values('重要性', ascending=False) st.subheader("特征重要性分析") top_features = importance_df.head(10) fig, ax = plt.subplots(figsize=(10, 6)) sns.barplot(x='重要性', y='特征', data=top_features, ax=ax) ax.set_title('Top 10 重要特征') st.pyplot(fig) fig, ax = plt.subplots(figsize=(10, 6)) sns.histplot(importance_df['重要性'], bins=20, ax=ax) ax.set_title('特征重要性分布') st.pyplot(fig) st.write("特征重要性详细数据:") st.dataframe(importance_df.style.background_gradient(subset=['重要性'], cmap='viridis')) # 模型比较 st.subheader("与其他模型的对比") model_performance = pd.DataFrame({ '模型': ['逻辑回归', '决策树', '随机森林'], '准确率': [results['逻辑回归']['accuracy'], results['决策树']['accuracy'], results['随机森林']['accuracy']], 'AUC': [results['逻辑回归']['auc'], results['决策树']['auc'], results['随机森林']['auc']] }).sort_values('AUC', ascending=False) fig, ax = plt.subplots(figsize=(10, 6)) model_performance.set_index('模型')[['AUC', '准确率']].plot(kind='bar', ax=ax) ax.set_title('模型性能对比') ax.set_ylabel('评分') plt.xticks(rotation=0) st.pyplot(fig) st.table(model_performance.style.format({ '准确率': '{:.2%}', 'AUC': '{:.4f}' }).apply(lambda x: ['background: lightgreen' if x.name == model_performance.index[0] else '' for _ in x])) # 页脚 st.markdown("—") st.markdown(""" <div style="text-align: center; color: #7f8c8d; font-size: 0.9rem; padding: 1rem;"> © 2023 单宽转融用户预测系统 | 2231030273 基于Streamlit和Spark开发 </div> """, unsafe_allow_html=True) 执行上述代码提示如下报错,给出修改后完整代码 数据处理错误: 'Series' object has no attribute 'compute' AttributeError: 'Series' object has no attribute 'compute' Traceback: File "D:\2035946879\Single_breadth_to_melt.py", line 481, in <module> processed_ddf, feature_cols, means, numeric_cols_for_scaling, scaler = preprocess_data(raw_ddf) ^^^^^^^^^^^^^^^^^^^^^^^^ File "D:\2035946879\Single_breadth_to_melt.py", line 204, in preprocess_data if is_numeric_column(processed_ddf[col]): ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "D:\2035946879\Single_breadth_to_melt.py", line 165, in is_numeric_column sample = series.head(1000).compute() ^^^^^^^^^^^^^^^^^^^^^^^^^ File "D:\Anaconda\Lib\site-packages\pandas\core\generic.py", line 6299, in __getattr__ return object.__getattribute__(self, name) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
07-01
import chardet import streamlit as st import pandas as pd import numpy as np import joblib import os import time import matplotlib.pyplot as plt import seaborn as sns from pyspark.sql import SparkSession from pyspark.ml.feature import VectorAssembler, StandardScaler from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier from pyspark.ml.evaluation import BinaryClassificationEvaluator from pyspark.ml.tuning import ParamGridBuilder, CrossValidator from sklearn.metrics import classification_report, confusion_matrix import warnings import dask.dataframe as dd from dask.diagnostics import ProgressBar from dask_ml.preprocessing import StandardScaler as DaskStandardScaler import tempfile import shutil import re warnings.filterwarnings(“ignore”) plt.rcParams[‘font.sans-serif’] = [‘SimHei’] plt.rcParams[‘axes.unicode_minus’] = False 页面设置 st.set_page_config( page_title=“单宽转融用户预测系统”, page_icon=“📶”, layout=“wide”, initial_sidebar_state=“expanded” ) 自定义CSS样式 st.markdown(“”" <style> .stApp { background: linear-gradient(135deg, #f5f7fa 0%, #e4edf5 100%); font-family: 'Helvetica Neue', Arial, sans-serif; } .header { background: linear-gradient(90deg, #2c3e50 0%, #4a6491 100%); color: white; padding: 1.5rem; border-radius: 0.75rem; box-shadow: 0 4px 12px rgba(0,0,0,0.1); margin-bottom: 2rem; } .card { background: white; border-radius: 0.75rem; padding: 1.5rem; margin-bottom: 1.5rem; box-shadow: 0 4px 12px rgba(0,0,0,0.08); transition: transform 0.3s ease; } .card:hover { transform: translateY(-5px); box-shadow: 0 6px 16px rgba(0,0,0,0.12); } .stButton button { background: linear-gradient(90deg, #3498db 0%, #1a5276 100%) !important; color: white !important; border: none !important; border-radius: 0.5rem; padding: 0.75rem 1.5rem; font-size: 1rem; font-weight: 600; transition: all 0.3s ease; width: 100%; } .stButton button:hover { transform: scale(1.05); box-shadow: 0 4px 8px rgba(52, 152, 219, 0.4); } .feature-box { background: linear-gradient(135deg, #e3f2fd 0%, #bbdefb 100%); border-radius: 0.75rem; padding: 1.5rem; margin-bottom: 1.5rem; } .result-box { background: linear-gradient(135deg, #e8f5e9 0%, #c8e6c9 100%); border-radius: 0.75rem; padding: 1.5rem; margin-top: 1.5rem; } .model-box { background: linear-gradient(135deg, #fff3e0 0%, #ffe0b2 100%); border-radius: 0.75rem; padding: 1.5rem; margin-top: 1.5rem; } .stProgress > div > div > div { background: linear-gradient(90deg, #2ecc71 0%, #27ae60 100%) !important; } .metric-card { background: white; border-radius: 0.75rem; padding: 1rem; text-align: center; box-shadow: 0 4px 8px rgba(0,0,0,0.06); } .metric-value { font-size: 1.8rem; font-weight: 700; color: #2c3e50; } .metric-label { font-size: 0.9rem; color: #7f8c8d; margin-top: 0.5rem; } .highlight { background: linear-gradient(90deg, #ffeb3b 0%, #fbc02d 100%); padding: 0.2rem 0.5rem; border-radius: 0.25rem; font-weight: 600; } .stDataFrame { border-radius: 0.75rem; box-shadow: 0 4px 8px rgba(0,0,0,0.06); } .risk-high { background-color: #ffcdd2 !important; color: #c62828 !important; font-weight: 700; } .risk-medium { background-color: #fff9c4 !important; color: #f57f17 !important; font-weight: 600; } .risk-low { background-color: #c8e6c9 !important; color: #388e3c !important; } </style> “”", unsafe_allow_html=True) def clean_numeric_string(value): “”“清理数值字符串中的非数字字符”“” if pd.isna(value): return np.nan try: # 尝试直接转换为浮点数 return float(value) except (ValueError, TypeError): # 移除非数字字符(除小数点和负号外) cleaned = re.sub(r’\d.-', ‘’, str(value)) try: return 
float(cleaned) if cleaned else np.nan except ValueError: return np.nan def is_numeric_column(series): “”“检测列是否可以转换为数值类型”“” try: # 尝试转换样本数据 sample = series.head(1000).compute() if isinstance(series, dd.Series) else series.head(1000) pd.to_numeric(sample, errors=‘raise’) return True except (ValueError, TypeError): return False def preprocess_data(ddf): “”“使用Dask进行大数据预处理”“” processed_ddf = ddf.copy() # 删除无意义特征 drop_cols = ['BIL_MONTH', 'ASSET_ROW_ID', 'CCUST_ROW_ID', 'BELONG_CITY', 'MKT_CHANNEL_NAME', 'MKT_CHANNEL_SUB_NAME', 'PREPARE_FLG', 'SERV_START_DT', 'COMB_STAT_NAME', 'FIBER_ACCESS_CATEGORY'] existing_cols = [col for col in drop_cols if col in processed_ddf.columns] processed_ddf = processed_ddf.drop(columns=existing_cols) # 初始数值列 initial_numeric_cols = processed_ddf.select_dtypes(include=[np.number]).columns.tolist() if 'is_rh_next' in initial_numeric_cols: initial_numeric_cols.remove('is_rh_next') # 潜在数值列 potential_numeric_cols = [ 'MAX_PRICE_COMPANY', 'MAX_PRICE_MODEL', 'MAX_PRICE_TERM_TYPE', 'MOBLE_4G_CNT_LV', 'MOBLE_CN极T_LV', 'OWE_AMT_LV', 'OWE_CNT_LV', 'PROM_INTEG_ID', 'TOUSU_CNT_LV' ] # 数值列集合 numeric_cols = initial_numeric_cols.copy() # 处理潜在数值列 for col in potential_numeric_cols: if col in processed_ddf.columns: # 检查列是否可转换为数值型 if is_numeric_column(processed_ddf[col]): try: # 尝试转换为数值型 processed_ddf[col] = processed_ddf[col].apply(clean_numeric_string, meta=(col, 'f8')) numeric_cols.append(col) except Exception as e: st.warning(f"列 {col} 转换为数值型失败: {str(e)},将视为分类特征") else: st.warning(f"列 {col} 包含非数值数据,将视为分类特征") # 处理缺失值 with ProgressBar(): # 计算数值列均值 means = processed_ddf[numeric_cols].mean().compute() # 填充数值列缺失值 for col in numeric_cols: processed_ddf[col] = processed_ddf[col].fillna(means[col]) # 处理非数值列缺失值 non_numeric_cols = [col for col in processed_ddf.columns if col not in numeric_cols and col != 'is_rh_next'] for col in non_numeric_cols: processed_ddf[col] = processed_ddf[col].fillna("Unknown") # 类型转换 for col in numeric_cols: if processed_ddf[col].dtype == 'float64': # 检查是否可以安全转换为整数 try: if processed_ddf[col].dropna().apply(lambda x: x == int(x)).all().compute(): processed_ddf[col] = processed_ddf[col].astype('int64') except: # 如果转换失败,保持浮点类型 pass # 二进制特征编码 binary_cols = ['IF_YHTS', 'is_kdts', 'is_itv_up', 'is_mobile_up', 'if_zzzw_up'] for col in binary_cols: if col in processed_ddf.columns: processed_ddf[col] = processed_ddf[col].map({'否': 0, '是': 1, 0: 0, 1: 1, 'Unknown': -1}, meta=(col, 'int64')) # 分类特征编码 if 'GENDER' in processed_ddf.columns: gender_mapping = {'男': 0, '女': 1, 'Unknown': -1} processed_ddf['GENDER'] = processed_ddf['GENDER'].map(gender_mapping, meta=('GENDER', 'int64')) if 'MKT_STAR_GRADE_NAME' in processed_ddf.columns: star_mapping = {'五星级': 5, '四星级': 4, '三星级': 3, '二星级': 2, '一星级': 1, 'Unknown': 0} processed_ddf['MKT_STAR_GRADE_NAME'] = processed极f['MKT_STAR_GRADE_NAME'].map(star_mapping, meta=('MKT_STAR_GRADE_NAME', 'int64')) # 特征工程 if 'PROM_AMT' in numeric_cols and 'STMT_AMT' in numeric_cols: processed_ddf['CONSUMPTION_RATIO'] = processed_ddf['PROM_AMT'] / (processed_ddf['STMT_AMT'] + 1) numeric_cols.append('CONSUMPTION_RATIO') if 'DWN_VOL' in numeric_cols and 'ONLINE_DAY' in numeric_cols: processed_ddf['TRAFFIC_DENSITY'] = processed_ddf['DWN_VOL'] / (processed_ddf['ONLINE_DAY'] + 1) numeric_cols.append('TRAFFIC_DENSITY') if 'TERM_CNT' in processed_ddf.columns: processed_ddf['HAS_TERMINAL'] = (processed_ddf['TERM_CNT'] > 0).astype('int64') numeric_cols.append('HAS_TERMINAL') # 标准化处理 scaler = DaskStandardScaler() numeric_cols_for_scaling = [col for col in 
numeric_cols if col != 'is_rh_next'] if numeric_cols_for_scaling: processed_ddf[numeric_cols_for_scaling] = scaler.fit_transform(processed_ddf[numeric_cols_for_scaling]) feature_cols = [col for col in processed_ddf.columns if col != 'is_rh_next'] return processed_ddf, feature_cols, means, numeric_cols_for_scaling, scaler def create_spark_session(): “”“创建或获取现有的Spark会话”“” return SparkSession.builder .appName(“SingleToMeltUserPrediction”) .config(“spark.sql.shuffle.partitions”, “8”) .config(“spark.driver.memory”, “8g”) .config(“spark.executor.memory”, “8g”) .getOrCreate() def train_models(spark_df, feature_cols): “”“使用Spark训练多个模型并评估性能”“” spark = create_spark_session() assembler = VectorAssembler(inputCols=feature_cols, outputCol=“rawFeatures”) assembled_df = assembler.transform(spark_df) scaler = StandardScaler(inputCol="rawFeatures", outputCol="features") scaler_model = scaler.fit(assembled_df) scaled_df = scaler_model.transform(assembled_df) train_df, test_df = scaled_df.randomSplit([0.8, 0.2], seed=42) # 定义模型和参数网格 models = { "逻辑回归": ( LogisticRegression(featuresCol="features", labelCol="is_rh_next"), ParamGridBuilder().addGrid(LogisticRegression.regParam, [0.01, 0.1]) .addGrid(LogisticRegression.elasticNetParam, [0.0, 0.5]) .build() ), "决策树": ( DecisionTreeClassifier(featuresCol="features", labelCol="is_rh_next"), ParamGridBuilder().addGrid(DecisionTreeClassifier.maxDepth, [5, 10]) .addGrid(DecisionTreeClassifier.minInstancesPerNode, [10, 20]) .build() ), "随机森林": ( RandomForestClassifier(featuresCol="features", labelCol="is_rh_next", numTrees=10), ParamGridBuilder().addGrid(RandomForestClassifier.numTrees, [10, 20]) .addGrid(RandomForestClassifier.maxDepth, [5, 10]) .build() ) } evaluator = BinaryClassificationEvaluator(labelCol="is_rh_next", metricName="areaUnderROC") results = {} for model_name, (model, param_grid) in models.items(): with st.spinner(f"正在训练{model_name}模型..."): cv = CrossValidator(estimator=model, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=3) cv_model = cv.fit(train_df) predictions = cv_model.transform(test_df) auc = evaluator.evaluate(predictions) accuracy = predictions.filter(predictions.is_rh_next == predictions.prediction).count() / test_df.count() results[model_name] = { "model": cv_model, "auc": auc, "accuracy": accuracy, "best_params": cv_model.bestModel._java_obj.parent().extractParamMap(), "feature_importances": getattr(cv_model.bestModel, "featureImportances", {}).toArray().tolist() if model_name != "逻辑回归" else None } return results 页面布局 st.markdown(“”" <div class="header"> <h1 style='text-align: center; margin: 0;'>单宽转融用户预测系统</h1> <p style='text-align: center; margin: 0.5rem 0 0; font-size: 1.1rem;'>基于大数据挖掘的精准营销分析平台</p> </div> """, unsafe_allow_html=True) col1, col2 = st.columns([1, 1.5]) with col1: st.markdown(“”" 📈 系统功能 用户转化预测 多模型对比分析 特征重要性分析 可视化数据洞察 “”", unsafe_allow_html=True) st.image(“https://images.unsplash.com/photo-1550751822256-00808c92fc8d?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=1200&q=80”, caption=“精准营销示意图”, use_column_width=True) with col2: option = st.radio(“”, [“🚀 训练新模型 - 使用新数据训练预测模型”, “🔍 模型分析 - 查看现有模型的分析结果”], index=0, label_visibility=“hidden”) if "训练新模型" in option: st.markdown("<div class='model-box'><h4>模型训练</h4><p>上传训练数据并训练新的预测模型</p></div>", unsafe_allow_html=True) train_file = st.file_uploader("上传训练数据 (CSV格式)", type=["csv"], accept_multiple_files=False) if train_file is not None: try: with tempfile.TemporaryDirectory() as tmpdir: tmp_path = os.path.join(tmpdir, 
"large_file.csv") with open(tmp_path, "wb") as f: f.write(train_file.getvalue()) def detect_encoding(file_path): with open(file_path, 'rb') as f: raw_data = f.read(10000) result = chardet.detect(raw_data) return result['encoding'] detected_encoding = detect_encoding(tmp_path) st.info(f"检测到文件编码: {detected_encoding}") chunksize = 256 * 1024 * 1024 na_values_list = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', '<NA>', 'N/A', 'NA', 'NULL', 'NaN', 'n/a', 'nan', 'null'] # 定义特殊列的数据类型 special_dtypes = { 'MAX_PRICE_COMPANY': 'object', 'MAX_PRICE_MODEL': 'object', 'MAX_PRICE_TERM_TYPE': 'object', 'MOBLE_4G_CNT_LV': 'object', 'MOBLE_CNT_LV': 'object', 'OWE_AMT_LV': 'object', 'OWE_CNT_LV': 'object', 'PROM_INTEG_ID': 'object', 'TOUSU_CNT_LV': 'object', 'is_rh_next': 'float64' } # 尝试读取文件 try: raw_ddf = dd.read_csv( tmp_path, blocksize=chunksize, dtype=special_dtypes, encoding=detected_encoding, na_values=na_values_list, assume_missing=True, low_memory=False ) except UnicodeDecodeError: st.warning("检测编码读取失败,尝试GB18030编码...") try: raw_ddf = dd.read_csv( tmp_path, blocksize=chunksize, dtype=special_dtypes, encoding='GB18030', na_values=na_values_list, assume_missing=True, low_memory=False ) except UnicodeDecodeError: st.warning("GB18030读取失败,尝试Latin-1编码...") raw_ddf = dd.read_csv( tmp_path, blocksize=chunksize, dtype=special_dtypes, encoding='latin-1', na_values=na_values_list, assume_missing=True, low_memory=False ) except Exception as e: st.error(f"读取文件时发生错误: {str(e)}") st.stop() with st.expander("数据预览", expanded=True): try: # 使用compute()获取前1000行 preview_data = raw_ddf.head(1000) st.dataframe(preview_data) col1, col2 = st.columns(2) try: total_rows = raw_ddf.shape[0].compute() col1.metric("总样本数", f"{total_rows:,}") except: col1.metric("总样本数", "计算中...") col2.metric("特征数量", len(raw_ddf.columns)) if 'is_rh_next' not in raw_ddf.columns: st.warning("⚠️ 注意:未找到目标变量 'is_rh_next'") else: st.info(f"目标变量类型: {raw_ddf['is_rh_next'].dtype}") except Exception as e: st.error(f"数据预览错误: {str(e)}") st.write("尝试显示前50行...") try: preview_data = raw_ddf.head(50) st.dataframe(preview_data) except: st.error("无法显示数据预览") if st.button("开始数据预处理", use_container_width=True): with st.spinner("正在进行数据预处理,请稍候..."): processed_ddf, feature_cols, means, numeric_cols_for_scaling, scaler = preprocess_data(raw_ddf) preprocessor_params = { 'means': means, 'numeric_cols_for_scaling': numeric_cols_for_scaling, 'scaler': scaler, 'feature_cols': feature_cols } joblib.dump(preprocessor_params, 'preprocessor_params.pkl') processed_ddf.to_csv('processed_data_*.csv', index=False) st.success("✅ 数据预处理完成!") # 显示处理后的数据统计 st.subheader("数据质量检查") with st.spinner("计算缺失值统计..."): try: null_counts = processed_ddf.isnull().sum().compute() st.write("缺失值统计:") st.dataframe(null_counts[null_counts > 0]) except: st.warning("缺失值计算失败") # 可视化关键特征分布 st.subheader("关键特征分布") try: sample_ddf = processed_ddf.sample(frac=0.1) sample_df = sample_ddf.compute() # 选择存在的列进行可视化 plot_cols = [] if 'AGE' in sample_df.columns: plot_cols.append('AGE') if 'ONLINE_DAY' in sample_df.columns: plot_cols.append('ONLINE_DAY') if 'PROM_AMT' in sample_df.columns: plot_cols.append('PROM_AMT') if 'DWN_VOL' in sample_df.columns: plot_cols.append('DWN_VOL') if len(plot_cols) >= 4: fig, axes = plt.subplots(2, 2, figsize=(12, 10)) for i, col in enumerate(plot_cols[:4]): sns.histplot(sample_df[col], ax=axes[i//2, i%2], kde=True) plt.tight_layout() st.pyplot(fig) else: st.warning("缺少足够的列进行可视化") except: st.error("关键特征分布可视化失败") # 目标变量分布 st.subheader("目标变量分布") if 
'is_rh_next' in sample_df.columns: fig, ax = plt.subplots(figsize=(6, 4)) sns.countplot(x='is_rh_next', data=sample_df, ax=ax) ax.set_xlabel("是否转化 (0=未转化, 1=转化)") ax.set_ylabel("用户数量") ax.set_title("用户转化分布") st.pyplot(fig) else: st.warning("未找到目标变量 'is_rh_next'") # 特征与目标变量相关性 st.subheader("特征与转化的相关性") if 'is_rh_next' in sample_df.columns: with st.spinner("计算特征相关性..."): try: # 使用采样数据计算相关性 correlation = sample_df[feature_cols + ['is_rh_next']].corr()['is_rh_next'].sort_values(ascending=False) fig, ax = plt.subplots(figsize=(10, 6)) sns.barplot(x=correlation.values, y=correlation.index, ax=ax) ax.set_title("特征与转化的相关性") st.pyplot(fig) except: st.error("特征相关性计算失败") else: st.warning("未找到目标变量 'is_rh_next'") # 模型训练按钮 if st.button("开始模型训练", use_container_width=True): # 检查预处理文件是否存在 if not any(fname.startswith('processed_data_') for fname in os.listdir('.')): st.error("请先进行数据预处理") else: # 创建Spark会话 spark = create_spark_session() # 使用通配符读取所有预处理文件 spark_df = spark.read.csv('processed_data_*.csv', header=True, inferSchema=True) # 加载预处理参数 preprocessor_params = joblib.load('preprocessor_params.pkl') feature_cols = preprocessor_params['feature_cols'] # 训练模型 with st.spinner("正在训练模型,请耐心等待..."): results = train_models(spark_df, feature_cols) # 保存模型结果 joblib.dump(results, 'model_results.pkl') st.success("🎉 模型训练完成!") # 显示模型比较 st.subheader("模型性能对比") model_performance = pd.DataFrame({ '模型': ['逻辑回归', '决策树', '随机森林'], '准确率': [results['逻辑回归']['accuracy'], results['决策树']['accuracy'], results['随机森林']['accuracy']], 'AUC': [results['逻辑回归']['auc'], results['决策树']['auc'], results['随机森林']['auc']] }).sort_values('AUC', ascending=False) st.table(model_performance.style.format({ '准确率': '{:.2%}', 'AUC': '{:.4f}' })) # 最佳模型特征重要性 best_model_name = model_performance.iloc[0]['模型'] best_model = results[best_model_name]['model'].bestModel st.subheader(f"最佳模型 ({best_model_name}) 分析") if best_model_name in ['决策树', '随机森林']: feature_importances = results[best_model_name]['feature_importances'] importance_df = pd.DataFrame({ '特征': feature_cols, '重要性': feature_importances }).sort_values('重要性', ascending=False).head(10) fig, ax = plt.subplots(figsize=(10, 6)) sns.barplot(x='重要性', y='特征', data=importance_df, ax=ax) ax.set_title('Top 10 重要特征') st.pyplot(fig) # 显示最佳模型参数 st.subheader("最佳模型参数") params = results[best_model_name]['best_params'] param_table = pd.DataFrame({ '参数': [str(param.name) for param in params.keys()], '值': [str(value) for value in params.values()] }) st.table(param_table) except Exception as e: st.error(f"数据处理错误: {str(e)}") st.exception(e) else: st.markdown("<div class='model-box'><h4>模型分析</h4><p>查看已有模型的详细分析结果</p></div>", unsafe_allow_html=True) if not os.path.exists('model_results.pkl'): st.info("ℹ️ 当前没有可用模型。请先进行模型训练以生成分析报告。") else: results = joblib.load('model_results.pkl') preprocessor_params = joblib.load('preprocessor_params.pkl') feature_cols = preprocessor_params['feature_cols'] model_choice = st.selectbox( "选择要分析的模型", ("逻辑回归", "决策树", "随机森林") ) # 显示模型基本信息 model_info = results[model_choice] st.markdown(f""" <div class="card"> <h3>{model_choice}</h3> <p><strong>AUC得分:</strong> {model_info['auc']:.4f}</p> <p><strong>准确率:</strong> {model_info['accuracy']:.2%}</p> </div> """, unsafe_allow_html=True) # 显示参数详情 with st.expander("模型参数详情", expanded=False): params = model_info['best_params'] param_table = pd.DataFrame({ '参数': [str(param.name) for param in params.keys()], '值': [str(value) for value in params.values()] }) st.table(param_table) # 特征重要性分析 if model_choice in ['决策树', '随机森林']: feature_importances = 
model_info['feature_importances'] importance_df = pd.DataFrame({ '特征': feature_cols, '重要性': feature_importances }).sort_values('重要性', ascending=False) st.subheader("特征重要性分析") top_features = importance_df.head(10) fig, ax = plt.subplots(f极size=(10, 6)) sns.barplot(x='重要性', y='特征', data=top_features, ax=ax) ax.set_title('Top 10 重要特征') st.pyplot(fig) fig, ax = plt.subplots(figsize=(10, 6)) sns.histplot(importance_df['重要性'], bins=20, ax=ax) ax.set_title('特征重要性分布') st.pyplot(fig) st.write("特征重要性详细数据:") st.dataframe(importance_df.style.background_gradient(subset=['重要性'], cmap='viridis')) # 模型比较 st.subheader("与其他模型的对比") model_performance = pd.DataFrame({ '模型': ['逻辑回归', '决策树', '随机森林'], '准确率': [results['逻辑回归']['accuracy'], results['决策树']['accuracy'], results['随机森林']['accuracy']], 'AUC': [results['逻辑回归']['auc'], results['决策树']['auc'], results['随机森林']['auc']] }).sort_values('AUC', ascending=False) fig, ax = plt.subplots(figsize=(10, 6)) model_performance.set_index('模型')[['AUC', '准确率']].plot(kind='bar', ax=ax) ax.set_title('模型性能对比') ax.set_ylabel('评分') plt.xticks(rotation=0) st.pyplot(fig) st.table(model_performance.style.format({ '准确率': '{:.2%}', 'AUC': '{:.4f}' }).apply(lambda x: ['background: lightgreen' if x.name == model_performance.index[0] else '' for _ in x])) 页脚 st.markdown(“—”) st.markdown(“”" <div style="text-align: center; color: #7f8c8d; font-size: 0.9rem; padding: 1rem;"> © 2023 单宽转融用户预测系统 | 2231030273 基于Streamlit和Spark开发 </div> """, unsafe_allow_html=True) 又出现如下错误数据处理错误: ‘Series’ object has no attribute ‘compute’ AttributeError: ‘Series’ object has no attribute ‘compute’ Traceback: File “D:\2035946879\Single_breadth_to_melt.py”, line 474, in processed_ddf, feature_cols, means, numeric_cols_for_scaling, scaler = preprocess_data(raw_ddf) ^^^^^^^^^^^^^^^^^^^^^^^^ File “D:\2035946879\Single_breadth_to_melt.py”, line 199, in preprocess_data if is_numeric_column(processed_ddf[col]): ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File “D:\2035946879\Single_breadth_to_melt.py”, line 163, in is_numeric_column sample = series.head(1000).compute() if isinstance(series, dd.Series) else series.head(1000) ^^^^^^^^^^^^^^^^^^^^^^^^^ File “D:\Anaconda\Lib\site-packages\pandas\core\generic.py”, line 6299, in getattr return object.getattribute(self, name) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 给出修改后完整代码
07-01
import numpy as np import matplotlib.pyplot as plt import pandas as pd import tkinter as tk from tkinter import ttk, filedialog, messagebox from PIL import Image, ImageDraw import cv2 import os import csv from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split from sklearn.svm import SVC from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.neural_network import MLPClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.naive_bayes import GaussianNB from sklearn.metrics import accuracy_score from sklearn.preprocessing import StandardScaler # 设置中文字体和负号显示 plt.rcParams["font.family"] = ["SimHei", "Microsoft YaHei"] plt.rcParams["axes.unicode_minus"] = False # 尝试导入XGBoost和LightGBM XGB_INSTALLED = False LGB_INSTALLED = False try: import xgboost as xgb XGB_INSTALLED = True except ImportError: print("警告: 未安装XGBoost库,无法使用XGBoost模型") try: import lightgbm as lgb LGB_INSTALLED = True except ImportError: print("警告: 未安装LightGBM库,无法使用LightGBM模型") # 定义模型元数据常量(优化参数) MODEL_METADATA = { 'svm': ('支持向量机(SVM)', SVC, StandardScaler, {'probability': True, 'random_state': 42}), 'dt': ('决策树(DT)', DecisionTreeClassifier, None, {'random_state': 42}), 'rf': ('随机森林(RF)', RandomForestClassifier, None, {'n_estimators': 100, 'random_state': 42}), 'mlp': ('多层感知机(MLP)', MLPClassifier, StandardScaler, {'hidden_layer_sizes': (100, 50), 'max_iter': 500, 'random_state': 42}), 'knn': ('K最近邻(KNN)', KNeighborsClassifier, StandardScaler, {'n_neighbors': 5, 'weights': 'distance'}), 'nb': ('高斯朴素贝叶斯(NB)', GaussianNB, None, {}), } # 添加可选模型 if XGB_INSTALLED: MODEL_METADATA['xgb'] = ('XGBoost(XGB)', xgb.XGBClassifier, None, {'objective': 'multi:softmax', 'random_state': 42}) if LGB_INSTALLED: MODEL_METADATA['lgb'] = ('LightGBM(LGB)', lgb.LGBMClassifier, None, { 'objective': 'multiclass', 'random_state': 42, 'num_class': 10, 'max_depth': 5, 'min_child_samples': 10, 'learning_rate': 0.1, 'force_col_wise': True }) class ModelFactory: @staticmethod def get_split_data(digits_dataset): """数据集划分""" X, y = digits_dataset.data, digits_dataset.target return train_test_split(X, y, test_size=0.3, random_state=42) @classmethod def create_model(cls, model_type): """创建模型和数据标准化器""" if model_type not in MODEL_METADATA: raise ValueError(f"未知模型类型: {model_type}") name, model_cls, scaler_cls, params = MODEL_METADATA[model_type] if not model_cls: raise ImportError(f"{name}模型依赖库未安装") model = model_cls(**params) scaler = scaler_cls() if scaler_cls else None return model, scaler @staticmethod def train_model(model, X_train, y_train, scaler=None, model_type=None): """训练模型""" if scaler: X_train = scaler.fit_transform(X_train) if model_type == 'lgb' and isinstance(X_train, np.ndarray): X_train = pd.DataFrame(X_train) model.fit(X_train, y_train) return model @staticmethod def evaluate_model(model, X_test, y_test, scaler=None, model_type=None): """评估模型""" if scaler: X_test = scaler.transform(X_test) if model_type == 'lgb' and isinstance(X_test, np.ndarray) and hasattr(model, 'feature_name_'): X_test = pd.DataFrame(X_test, columns=model.feature_name_) y_pred = model.predict(X_test) return accuracy_score(y_test, y_pred) @classmethod def train_and_evaluate(cls, model_type, X_train, y_train, X_test, y_test): """训练并评估模型""" try: model, scaler = cls.create_model(model_type) model = cls.train_model(model, X_train, y_train, scaler, model_type) accuracy = cls.evaluate_model(model, X_test, y_test, scaler, model_type) return model, scaler, accuracy except 
Exception as e: print(f"模型 {model_type} 训练/评估错误: {str(e)}") raise @classmethod def evaluate_all_models(cls, digits_dataset): """评估所有可用模型""" print("\n=== 模型评估 ===") X_train, X_test, y_train, y_test = cls.get_split_data(digits_dataset) results = [] for model_type in MODEL_METADATA: name = MODEL_METADATA[model_type][0] print(f"评估模型: {name} ({model_type})") if not MODEL_METADATA[model_type][1]: results.append({"模型名称": name, "准确率": "N/A"}) continue try: _, _, accuracy = cls.train_and_evaluate( model_type, X_train, y_train, X_test, y_test ) results.append({"模型名称": name, "准确率": f"{accuracy:.4f}"}) except Exception as e: results.append({"模型名称": name, "准确率": f"错误: {str(e)}"}) # 按准确率排序 results.sort( key=lambda x: float(x["准确率"]) if isinstance(x["准确率"], str) and x["准确率"].replace('.', '', 1).isdigit() else -1, reverse=True ) print(pd.DataFrame(results)) return results class HandwritingBoard: CANVAS_SIZE = 300 # 固定画布尺寸 BRUSH_SIZE = 12 # 画笔大小 def __init__(self, root, model_factory, digits): self.root = root self.root.title("手写数字识别系统") self.root.geometry("1000x700") # 增加窗口尺寸以容纳所有组件 self.model_factory = model_factory self.digits = digits self.model_cache = {} self.current_model = None self.scaler = None self.current_model_type = None self.has_drawn = False self.custom_data = [] self.drawing = False self.last_x = self.last_y = 0 # 自定义数据目录 self.data_dir = "custom_digits_data" os.makedirs(self.data_dir, exist_ok=True) # 初始化画布 self.image = Image.new("L", (self.CANVAS_SIZE, self.CANVAS_SIZE), 255) self.draw_obj = ImageDraw.Draw(self.image) self.create_widgets() self.init_default_model() def create_widgets(self): """使用grid布局管理器创建界面组件""" # 创建主框架 main_frame = tk.Frame(self.root) main_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10) # 使用grid布局管理器 # 第一行:模型选择区域 model_frame = tk.LabelFrame(main_frame, text="模型选择", font=("Arial", 10, "bold")) model_frame.grid(row=0, column=0, columnspan=2, sticky="ew", padx=5, pady=5) model_frame.grid_columnconfigure(1, weight=1) # 让模型标签可以扩展 tk.Label(model_frame, text="选择模型:", font=("Arial", 10)).grid(row=0, column=0, padx=5, pady=5, sticky="w") self.available_models = [] for model_type, (name, _, _, _) in MODEL_METADATA.items(): if MODEL_METADATA[model_type][1]: self.available_models.append((model_type, name)) self.model_var = tk.StringVar() self.model_combobox = ttk.Combobox( model_frame, textvariable=self.model_var, values=[name for _, name in self.available_models], state="readonly", width=25, font=("Arial", 10) ) self.model_combobox.current(0) self.model_combobox.bind("<<ComboboxSelected>>", self.on_model_select) self.model_combobox.grid(row=0, column=1, padx=5, pady=5, sticky="ew") self.model_label = tk.Label( model_frame, text="", font=("Arial", 10), relief=tk.SUNKEN, padx=5, pady=2 ) self.model_label.grid(row=0, column=2, padx=5, pady=5, sticky="ew") # 第二行:左侧绘图区域和右侧结果区域 # 左侧绘图区域 left_frame = tk.LabelFrame(main_frame, text="绘制区域", font=("Arial", 10, "bold")) left_frame.grid(row=1, column=0, padx=5, pady=5, sticky="nsew") self.canvas = tk.Canvas(left_frame, bg="white", width=self.CANVAS_SIZE, height=self.CANVAS_SIZE) self.canvas.pack(padx=10, pady=10) self.canvas.bind("<Button-1>", self.start_draw) self.canvas.bind("<B1-Motion>", self.draw) self.canvas.bind("<ButtonRelease-1>", self.stop_draw) # 添加绘制提示 self.canvas.create_text( self.CANVAS_SIZE / 2, self.CANVAS_SIZE / 2, text="绘制数字", fill="gray", font=("Arial", 16) ) # 绘图控制按钮 btn_frame = tk.Frame(left_frame) btn_frame.pack(fill=tk.X, pady=(0, 10)) tk.Button(btn_frame, text="识别", command=self.recognize, 
width=8).pack(side=tk.LEFT, padx=5) tk.Button(btn_frame, text="清除", command=self.clear_canvas, width=8).pack(side=tk.LEFT, padx=5) tk.Button(btn_frame, text="样本", command=self.show_samples, width=8).pack(side=tk.LEFT, padx=5) # 右侧结果区域 right_frame = tk.Frame(main_frame) right_frame.grid(row=1, column=1, padx=5, pady=5, sticky="nsew") # 识别结果 result_frame = tk.LabelFrame(right_frame, text="识别结果", font=("Arial", 10, "bold")) result_frame.pack(fill=tk.X, padx=5, pady=5) self.result_label = tk.Label( result_frame, text="请绘制数字", font=("Arial", 24), pady=10 ) self.result_label.pack() self.prob_label = tk.Label( result_frame, text="", font=("Arial", 12) ) self.prob_label.pack() # 置信度可视化 confidence_frame = tk.LabelFrame(right_frame, text="识别置信度", font=("Arial", 10, "bold")) confidence_frame.pack(fill=tk.X, padx=5, pady=5) self.confidence_canvas = tk.Canvas( confidence_frame, bg="white", height=50 ) self.confidence_canvas.pack(fill=tk.X, padx=10, pady=10) self.confidence_canvas.create_text( 150, 25, text="识别后显示置信度", fill="gray", font=("Arial", 10) ) # 候选数字 candidates_frame = tk.LabelFrame(right_frame, text="可能的数字", font=("Arial", 10, "bold")) candidates_frame.pack(fill=tk.X, padx=5, pady=5) columns = ("数字", "概率") self.candidates_tree = ttk.Treeview( candidates_frame, columns=columns, show="headings", height=4 ) for col in columns: self.candidates_tree.heading(col, text=col) self.candidates_tree.column(col, width=80, anchor=tk.CENTER) scrollbar = ttk.Scrollbar( candidates_frame, orient=tk.VERTICAL, command=self.candidates_tree.yview ) self.candidates_tree.configure(yscroll=scrollbar.set) self.candidates_tree.pack(side=tk.LEFT, fill=tk.BOTH, expand=True, padx=5, pady=5) scrollbar.pack(side=tk.RIGHT, fill=tk.Y, padx=5, pady=5) # 第三行:模型性能对比和训练集管理 # 模型性能对比 performance_frame = tk.LabelFrame(main_frame, text="模型性能对比", font=("Arial", 10, "bold")) performance_frame.grid(row=2, column=0, padx=5, pady=5, sticky="nsew") columns = ("模型名称", "准确率") self.performance_tree = ttk.Treeview( performance_frame, columns=columns, show="headings", height=8 ) for col in columns: self.performance_tree.heading(col, text=col) self.performance_tree.column(col, width=120, anchor=tk.CENTER) scrollbar = ttk.Scrollbar( performance_frame, orient=tk.VERTICAL, command=self.performance_tree.yview ) self.performance_tree.configure(yscroll=scrollbar.set) self.performance_tree.pack(side=tk.LEFT, fill=tk.BOTH, expand=True, padx=5, pady=5) scrollbar.pack(side=tk.RIGHT, fill=tk.Y, padx=5, pady=5) # 训练集管理 train_frame = tk.LabelFrame(main_frame, text="训练集管理", font=("Arial", 10, "bold")) train_frame.grid(row=2, column=1, padx=5, pady=5, sticky="nsew") # 使用grid布局训练集管理按钮 tk.Button( train_frame, text="保存为训练样本", command=self.save_as_training_sample, width=18, height=2 ).grid(row=0, column=0, padx=5, pady=5, sticky="ew") tk.Button( train_frame, text="保存全部训练集", command=self.save_all_training_data, width=18, height=2 ).grid(row=0, column=1, padx=5, pady=5, sticky="ew") tk.Button( train_frame, text="加载训练集", command=self.load_training_data, width=18, height=2 ).grid(row=1, column=0, padx=5, pady=5, sticky="ew") tk.Button( train_frame, text="性能图表", command=self.show_performance_chart, width=18, height=2 ).grid(row=1, column=1, padx=5, pady=5, sticky="ew") # 状态信息 self.status_var = tk.StringVar(value="就绪") status_bar = tk.Label( self.root, textvariable=self.status_var, bd=1, relief=tk.SUNKEN, anchor=tk.W, font=("Arial", 10) ) status_bar.pack(side=tk.BOTTOM, fill=tk.X) # 配置权重 main_frame.grid_columnconfigure(0, weight=1) main_frame.grid_columnconfigure(1, 