引言
在教育领域,预测学生的学习成绩一直是一个重要且具有挑战性的任务。随着机器学习技术的发展,我们现在可以构建更加精确的预测模型,帮助教育工作者提前识别可能需要额外支持的学生,并为个性化教学提供数据支持。本文将详细介绍如何使用Python和机器学习技术构建一个学生成绩预测系统。
项目概述
本项目旨在开发一个基于机器学习的系统,通过分析学生的历史数据(如出勤率、作业完成情况、课堂参与度等)来预测他们在未来考试中的表现。该系统不仅可以预测最终成绩,还能识别影响学生学习成绩的关键因素,为教师提供有价值的教学反馈。
技术栈
- Python 3.8+:主要编程语言
- Pandas & NumPy:数据处理和分析
- Scikit-learn:机器学习模型构建和评估
- Matplotlib & Seaborn:数据可视化
- Flask:Web应用后端
- SQLite/MongoDB:数据存储
- HTML/CSS/JavaScript:前端界面
系统架构
该系统由以下几个主要组件构成:
- 数据收集与预处理模块:负责从各种来源收集学生数据,并进行清洗、标准化和特征工程。
- 模型训练与评估模块:使用多种机器学习算法构建预测模型,并评估其性能。
- 预测服务模块:将训练好的模型部署为API服务,接收新数据并返回预测结果。
- 可视化与报告模块:生成直观的图表和报告,帮助教师理解预测结果和影响因素。
- Web应用界面:提供友好的用户界面,方便教师使用系统功能。
详细实现
1. 数据收集与预处理
首先,我们需要收集和准备用于训练模型的数据。以下是一个简单的数据预处理示例:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# 加载数据
def load_data(file_path):
    """Read the student dataset from a CSV file and return it as a DataFrame."""
    return pd.read_csv(file_path)
# Data preprocessing
def preprocess_data(data):
    """Clean and preprocess the raw student data.

    Steps:
      1. Fill missing values in numeric columns with the column mean.
         The mean is computed with numeric_only=True: calling
         data.mean() on a frame that still contains string columns
         raises a TypeError on pandas >= 2.0.
      2. One-hot encode the known categorical columns (drop_first=True).
      3. Standardize all features with StandardScaler.

    Parameters:
        data (pandas.DataFrame): raw data including a 'final_grade' column.

    Returns:
        tuple: (features_scaled ndarray, target Series, fitted scaler).
        The scaler is returned so the identical transform can be applied
        at prediction time.
    """
    # Fill numeric missing values with per-column means; categorical NaNs
    # are left untouched (get_dummies simply emits all-zero indicators).
    data = data.fillna(data.mean(numeric_only=True))
    # One-hot encode whichever categorical features are present.
    categorical_features = ['gender', 'parent_education', 'school']
    for feature in categorical_features:
        if feature in data.columns:
            data = pd.get_dummies(data, columns=[feature], drop_first=True)
    # Split off the target, then scale the remaining features.
    features = data.drop('final_grade', axis=1)
    target = data['final_grade']
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)
    return features_scaled, target, scaler
# Train/test split helper
def split_data(features, target, test_size=0.2, random_state=42):
    """Partition features/target into train and test sets.

    Thin wrapper over sklearn.model_selection.train_test_split; returns
    (X_train, X_test, y_train, y_test).
    """
    return train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
    )
2. 特征工程
特征工程是提高模型性能的关键步骤。我们可以创建新的特征来捕捉学生学习行为的不同方面:
def create_features(data):
    """Derive engineered features from the raw student columns.

    Each feature is added only when every column it depends on exists:
      - attendance_rate = classes_attended / total_classes
      - assignment_completion_rate = assignments_completed / total_assignments
      - study_efficiency = previous_grade / (study_time + 1); the +1 in the
        denominator guards against zero study time
      - balance_score = study_regularity - 0.5 * extracurricular_activities

    Returns the same DataFrame with the new columns attached.
    """
    present = set(data.columns)

    if {'classes_attended', 'total_classes'} <= present:
        data['attendance_rate'] = data['classes_attended'] / data['total_classes']

    if {'assignments_completed', 'total_assignments'} <= present:
        data['assignment_completion_rate'] = (
            data['assignments_completed'] / data['total_assignments']
        )

    if {'study_time', 'previous_grade'} <= present:
        # +1 avoids division by zero for students reporting no study time.
        data['study_efficiency'] = data['previous_grade'] / (data['study_time'] + 1)

    if {'study_regularity', 'extracurricular_activities'} <= present:
        data['balance_score'] = (
            data['study_regularity'] - 0.5 * data['extracurricular_activities']
        )

    return data
3. 模型训练与评估
我们可以尝试多种机器学习算法,并选择性能最佳的模型:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
def train_models(X_train, y_train):
    """Fit a fixed suite of regression models on the training data.

    Returns a dict mapping display name -> fitted estimator.
    """
    candidates = [
        ('Linear Regression', LinearRegression()),
        ('Ridge Regression', Ridge(alpha=1.0)),
        ('Lasso Regression', Lasso(alpha=0.1)),
        ('Random Forest', RandomForestRegressor(n_estimators=100, random_state=42)),
        ('Gradient Boosting', GradientBoostingRegressor(n_estimators=100, random_state=42)),
        ('SVR', SVR(kernel='rbf')),
    ]
    fitted = {}
    for label, estimator in candidates:
        estimator.fit(X_train, y_train)
        fitted[label] = estimator
    return fitted
def evaluate_models(models, X_test, y_test):
    """Score each fitted model on the held-out test set.

    Returns {model_name: {'MSE', 'RMSE', 'MAE', 'R2'}}.
    """
    scores = {}
    for label, estimator in models.items():
        predictions = estimator.predict(X_test)
        # RMSE is derived from MSE so both metrics agree exactly.
        mse = mean_squared_error(y_test, predictions)
        scores[label] = {
            'MSE': mse,
            'RMSE': np.sqrt(mse),
            'MAE': mean_absolute_error(y_test, predictions),
            'R2': r2_score(y_test, predictions),
        }
    return scores
def save_best_model(models, results, model_path):
    """Persist the model with the highest R2 score via joblib.

    Parameters:
        models (dict): name -> fitted estimator.
        results (dict): name -> metrics dict containing an 'R2' key.
        model_path (str): destination path for the serialized model.

    Returns:
        tuple: (best_model_name, best_model).
    """
    # Pick the entry whose R2 metric is largest.
    best_name = max(results, key=lambda name: results[name]['R2'])
    chosen = models[best_name]
    joblib.dump(chosen, model_path)
    return best_name, chosen
4. 特征重要性分析
了解哪些因素对学生成绩影响最大,对教育工作者非常有价值:
import matplotlib.pyplot as plt
import seaborn as sns
def analyze_feature_importance(model, feature_names):
    """Analyze and plot feature importance for a fitted model.

    Supports linear models (via the absolute value of coef_) and tree
    ensembles (via feature_importances_).

    Parameters:
        model: fitted estimator exposing coef_ or feature_importances_.
        feature_names: iterable of feature names aligned with the columns
            the model was trained on.

    Returns:
        (feature_importance_df, plt) on success, or None when the model
        exposes neither attribute.  NOTE: callers must handle both return
        shapes.
    """
    # Linear models
    if hasattr(model, 'coef_'):
        # ravel() flattens the 2-D coef_ produced by multi-output linear
        # models (which would otherwise break the DataFrame construction
        # below); for the usual 1-D case it is a no-op.
        importance = np.abs(np.ravel(model.coef_))
    # Tree-based models
    elif hasattr(model, 'feature_importances_'):
        importance = model.feature_importances_
    else:
        # Model type not supported for importance analysis.
        return None
    # Tabulate name/importance pairs.
    feature_importance = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importance
    })
    # Most important features first.
    feature_importance = feature_importance.sort_values('Importance', ascending=False)
    # Horizontal bar chart of the ten strongest features.
    plt.figure(figsize=(10, 6))
    sns.barplot(x='Importance', y='Feature', data=feature_importance.head(10))
    plt.title('Top 10 Most Important Features')
    plt.tight_layout()
    return feature_importance, plt
5. 预测服务API
使用Flask创建一个简单的API,使教师能够轻松获取预测结果:
from flask import Flask, request, jsonify
import joblib
import numpy as np

app = Flask(__name__)

# Load the trained model, the fitted scaler and the ordered feature-name
# list persisted by the training pipeline.
model = joblib.load('model/best_model.pkl')
scaler = joblib.load('model/scaler.pkl')
feature_names = joblib.load('model/feature_names.pkl')
@app.route('/predict', methods=['POST'])
def predict():
    """Accept one student's feature payload (JSON) and return the predicted grade.

    The JSON object must contain every name in feature_names.  Responds
    400 on a missing or non-numeric feature, otherwise JSON with
    predicted_grade and the echoed student_id.
    """
    data = request.json
    # Assemble the feature vector in the exact order the model was trained on.
    input_data = []
    for feature in feature_names:
        if feature not in data:
            return jsonify({'error': f'Missing feature: {feature}'}), 400
        try:
            # Coerce to float so a malformed value produces a clean 400
            # instead of an unhandled exception inside the scaler/model
            # (matches the behaviour of the extended web version).
            input_data.append(float(data[feature]))
        except (TypeError, ValueError):
            return jsonify({'error': f'Invalid value for feature: {feature}'}), 400
    # Shape (1, n_features) as scikit-learn expects for a single sample.
    input_array = np.array(input_data).reshape(1, -1)
    # Apply the training-time scaling, then predict.
    scaled_input = scaler.transform(input_array)
    prediction = model.predict(scaled_input)[0]
    return jsonify({
        'predicted_grade': float(prediction),
        'student_id': data.get('student_id', 'unknown')
    })
@app.route('/batch_predict', methods=['POST'])
def batch_predict():
    """Predict grades for a batch of students in a single model call.

    Expects a JSON list of student objects, each containing every name in
    feature_names.  Responds 400 on the first missing or invalid feature,
    otherwise a JSON list of {student_id, predicted_grade}.
    """
    data_batch = request.json
    rows = []
    student_ids = []
    for student_data in data_batch:
        # Build this student's feature vector in training order.
        row = []
        for feature in feature_names:
            if feature not in student_data:
                return jsonify({'error': f'Missing feature: {feature} for student {student_data.get("student_id", "unknown")}'}), 400
            try:
                # Coerce to float so malformed values give a clean 400.
                row.append(float(student_data[feature]))
            except (TypeError, ValueError):
                return jsonify({'error': f'Invalid value for feature: {feature} for student {student_data.get("student_id", "unknown")}'}), 400
        rows.append(row)
        student_ids.append(student_data.get('student_id', 'unknown'))
    if not rows:
        # Empty batch: nothing to predict.
        return jsonify([])
    # Scale and predict the whole batch at once instead of issuing one
    # transform/predict round-trip per student.
    scaled = scaler.transform(np.array(rows))
    predictions = model.predict(scaled)
    return jsonify([
        {'student_id': sid, 'predicted_grade': float(pred)}
        for sid, pred in zip(student_ids, predictions)
    ])
if __name__ == '__main__':
    # Development server only; use a production WSGI server for deployment.
    app.run(debug=True)
6. Web应用界面
为了使系统更加用户友好,我们可以创建一个简单的Web界面:
# app.py (extended version)
from flask import Flask, request, jsonify, render_template
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import io
import base64

app = Flask(__name__)

# Load the trained model and its companion artifacts (scaler + ordered
# feature-name list) produced by the training pipeline.
model = joblib.load('model/best_model.pkl')
scaler = joblib.load('model/scaler.pkl')
feature_names = joblib.load('model/feature_names.pkl')
@app.route('/')
def home():
    """Serve the landing page, exposing the model's feature list to the template."""
    context = {'features': feature_names}
    return render_template('index.html', **context)
@app.route('/predict', methods=['POST'])
def predict():
    """Handle a single-student prediction request.

    Accepts either a JSON body (API use) or an HTML form submission; the
    response format mirrors the request format (JSON vs rendered page).
    Responds 400 when a feature is missing or non-numeric.
    """
    if request.content_type == 'application/json':
        data = request.json
    else:
        data = request.form.to_dict()
    # Assemble the feature vector in the order the model was trained on.
    input_data = []
    for feature in feature_names:
        if feature in data:
            try:
                # Form values arrive as strings; coerce and reject junk.
                value = float(data[feature])
                input_data.append(value)
            except ValueError:
                return jsonify({'error': f'Invalid value for feature: {feature}'}), 400
        else:
            return jsonify({'error': f'Missing feature: {feature}'}), 400
    # Shape (1, n_features) as scikit-learn expects for one sample.
    input_array = np.array(input_data).reshape(1, -1)
    # Apply the same scaling used at training time.
    scaled_input = scaler.transform(input_array)
    # Predict the grade.
    prediction = model.predict(scaled_input)[0]
    # API request: return JSON.
    if request.content_type == 'application/json':
        return jsonify({
            'predicted_grade': float(prediction),
            'student_id': data.get('student_id', 'unknown')
        })
    # Form submission: render the result page instead.
    return render_template('result.html',
                           prediction=round(float(prediction), 2),
                           student_name=data.get('student_name', 'Student'),
                           features=data)
@app.route('/dashboard')
def dashboard():
    """Render the analytics dashboard as a single base64-embedded PNG.

    Builds four charts (grade distribution, attendance vs grade, study
    time vs grade, previous vs final grade) from the sample dataset and
    passes the encoded image to the dashboard template.

    NOTE(review): assumes data/student_data.csv already contains the
    derived attendance_rate column — confirm against the data pipeline.
    """
    # Load the sample dataset.
    data = pd.read_csv('data/student_data.csv')
    fig = plt.figure(figsize=(10, 6))
    # Grade distribution
    plt.subplot(2, 2, 1)
    sns.histplot(data['final_grade'], kde=True)
    plt.title('Grade Distribution')
    # Attendance vs grade
    plt.subplot(2, 2, 2)
    sns.scatterplot(x='attendance_rate', y='final_grade', data=data)
    plt.title('Attendance vs Grade')
    # Study time vs grade
    plt.subplot(2, 2, 3)
    sns.scatterplot(x='study_time', y='final_grade', data=data)
    plt.title('Study Time vs Grade')
    # Previous grade vs final grade
    plt.subplot(2, 2, 4)
    sns.scatterplot(x='previous_grade', y='final_grade', data=data)
    plt.title('Previous Grade vs Final Grade')
    plt.tight_layout()
    # Serialize the figure to base64 so it can be inlined in the template.
    buffer = io.BytesIO()
    plt.savefig(buffer, format='png')
    # Close the figure: without this every request leaks an open figure in
    # the long-running server process (matplotlib keeps figures alive
    # until explicitly closed).
    plt.close(fig)
    buffer.seek(0)
    image_png = buffer.getvalue()
    buffer.close()
    graph = base64.b64encode(image_png).decode('utf-8')
    return render_template('dashboard.html', graph=graph)
if __name__ == '__main__':
    # Development server only; use a production WSGI server for deployment.
    app.run(debug=True)
部署与使用
系统部署可以通过以下步骤完成:
-
准备环境:
# 创建虚拟环境
python -m venv venv

# 激活虚拟环境
# Windows
venv\Scripts\activate
# Linux/Mac
source venv/bin/activate

# 安装依赖
pip install -r requirements.txt
-
数据准备:
- 收集学生历史数据
- 运行数据预处理脚本
- 生成训练数据集
-
模型训练:
python train_model.py
-
启动Web应用:
python app.py
-
访问系统:
- 打开浏览器,访问 http://localhost:5000
- 使用界面上传学生数据或手动输入
- 查看预测结果和数据分析
项目结构
student_performance_prediction/
│
├── data/
│ ├── raw/ # 原始数据
│ ├── processed/ # 处理后的数据
│ └── student_data.csv # 示例数据
│
├── model/
│ ├── best_model.pkl # 训练好的模型
│ ├── scaler.pkl # 特征缩放器
│ └── feature_names.pkl # 特征名称列表
│
├── src/
│ ├── data_processing.py # 数据预处理模块
│ ├── feature_engineering.py # 特征工程模块
│ ├── model_training.py # 模型训练模块
│ └── visualization.py # 数据可视化模块
│
├── static/
│ ├── css/
│ │ └── style.css # 样式表
│ └── js/
│ └── app.js # 前端脚本
│
├── templates/
│ ├── index.html # 主页模板
│ ├── result.html # 结果页模板
│ └── dashboard.html # 仪表板模板
│
├── app.py # Flask应用主文件
├── train_model.py # 模型训练脚本
├── requirements.txt # 项目依赖
└── README.md # 项目说明
进阶功能
除了基本的成绩预测功能外,该系统还可以扩展以下高级功能:
-
早期预警系统:识别可能面临学业困难的学生,并自动发送预警通知。
-
个性化学习建议:基于预测模型,为每个学生生成个性化的学习建议。
-
教学策略优化:分析不同教学策略对学生成绩的影响,帮助教师优化教学方法。
-
长期学习轨迹分析:跟踪学生的长期学习轨迹,预测未来的学习表现。
-
多模态数据整合:整合课堂行为、情感状态等多模态数据,提高预测准确性。
结论
基于机器学习的学生成绩预测系统为教育工作者提供了一个强大的工具,帮助他们更好地了解学生的学习情况,并提前识别需要额外支持的学生。通过分析影响学生成绩的关键因素,该系统还能为教学改进提供数据支持,最终提高教育质量和学生的学习体验。
随着教育数据的不断积累和机器学习技术的持续发展,这类系统的预测准确性和实用性将进一步提高,为教育领域带来更多创新应用。
源代码
源代码以这个为准(https://download.csdn.net/download/exlink2012/90796070?spm=1001.2014.3001.5501)
Directory Content Summary
Source Directory: ./student_performance_prediction
Directory Structure
student_performance_prediction/
app.py
README.md
requirements.txt
data/
processed/
raw/
student_data.csv
model/
src/
data_processing.py
feature_engineering.py
model_training.py
model_training_enhanced.py
prediction_service.py
visualization.py
static/
css/
js/
templates/
base.html
batch_prediction.html
data_processing.html
feature_engineering.html
index.html
model_training.html
prediction.html
prediction_result.html
single_prediction.html
upload.html
visualization.html
File Contents
app.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Student performance prediction system - web application.

Provides a user-friendly interface so teachers can use the system's
features (upload, processing, training, prediction, visualization).
"""
import os
import pandas as pd
import numpy as np
import joblib
import json
from flask import Flask, render_template, request, jsonify, redirect, url_for, flash, send_file
import logging
from werkzeug.utils import secure_filename

# Project-local pipeline modules.
from src.data_processing import DataProcessor
from src.feature_engineering import FeatureEngineer
from src.model_training import ModelTrainer
from src.visualization import DataVisualizer
from src.prediction_service import init_prediction_service, prediction_bp

# Logging configuration.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Create the Flask application.
app = Flask(__name__)
# NOTE(review): hard-coded secret key; load from environment/config in production.
app.secret_key = 'student_performance_prediction_secret_key'

# Where uploaded data files are stored.
UPLOAD_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data', 'raw')
ALLOWED_EXTENSIONS = {'csv', 'xlsx', 'xls'}
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # cap uploads at 16 MB

# Make sure the upload directory exists.
if not os.path.exists(UPLOAD_FOLDER):
    os.makedirs(UPLOAD_FOLDER)

# Where trained models are stored.
MODEL_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'model')
if not os.path.exists(MODEL_FOLDER):
    os.makedirs(MODEL_FOLDER)

# Register the prediction blueprint/service against this app.
init_prediction_service(app, MODEL_FOLDER)

# Where processed datasets are stored.
PROCESSED_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data', 'processed')
if not os.path.exists(PROCESSED_FOLDER):
    os.makedirs(PROCESSED_FOLDER)

# Where generated charts are stored (served from /static/images).
VISUALIZATION_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'static', 'images')
if not os.path.exists(VISUALIZATION_FOLDER):
    os.makedirs(VISUALIZATION_FOLDER)
def allowed_file(filename):
    """Return True when the file name carries an allowed upload extension."""
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1].lower()
    return extension in ALLOWED_EXTENSIONS
@app.route('/')
def index():
    """Serve the application's landing page."""
    template_name = 'index.html'
    return render_template(template_name)
@app.route('/upload', methods=['GET', 'POST'])
def upload_file():
    """Upload a student-data file (csv/xlsx/xls).

    GET renders the upload form; POST validates the file, stores it under
    UPLOAD_FOLDER with a sanitized name and redirects to the
    data-processing page for that file.
    """
    if request.method == 'POST':
        # The form must actually contain a file part.
        if 'file' not in request.files:
            flash('没有选择文件')
            return redirect(request.url)
        file = request.files['file']
        # Browsers submit an empty filename when no file was chosen.
        if file.filename == '':
            flash('没有选择文件')
            return redirect(request.url)
        # Validate the extension, then save under a sanitized name.
        if file and allowed_file(file.filename):
            filename = secure_filename(file.filename)
            file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
            file.save(file_path)
            # Bug fix: the success message previously contained no
            # placeholder, so the stored file name was never shown.
            flash(f'文件 {filename} 上传成功')
            return redirect(url_for('data_processing', filename=filename))
        else:
            flash('不支持的文件类型')
            return redirect(request.url)
    return render_template('upload.html')
@app.route('/data_processing/<filename>', methods=['GET', 'POST'])
def data_processing(filename):
    """Data-processing page.

    GET: load the uploaded file and show a 10-row preview plus basic
    statistics (shape, missing-value counts, dtypes) so the user can
    assign column roles.
    POST: read the column selections from the form, persist them to
    processing_config.json, run the DataProcessor pipeline (clean,
    encode, scale), write the processed features/target to
    PROCESSED_FOLDER and redirect to the feature-engineering page.
    """
    file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    # Pipeline helper performing cleaning/encoding/scaling.
    processor = DataProcessor()
    # Load the data.
    try:
        data = processor.load_data(file_path)
        columns = data.columns.tolist()
        # First 10 rows for the preview table.
        preview_data = data.head(10).to_dict('records')
        # Summary statistics shown alongside the preview.
        data_info = {
            'shape': data.shape,
            'missing_values': data.isnull().sum().to_dict(),
            'dtypes': data.dtypes.astype(str).to_dict()
        }
        if request.method == 'POST':
            # Column roles chosen by the user in the form.
            target_column = request.form.get('target_column')
            categorical_columns = request.form.getlist('categorical_columns')
            numerical_columns = request.form.getlist('numerical_columns')
            drop_columns = request.form.getlist('drop_columns')
            # Persist the configuration so later stages can reuse it.
            processing_config = {
                'target_column': target_column,
                'categorical_columns': categorical_columns,
                'numerical_columns': numerical_columns,
                'drop_columns': drop_columns
            }
            config_path = os.path.join(PROCESSED_FOLDER, 'processing_config.json')
            with open(config_path, 'w') as f:
                json.dump(processing_config, f)
            # Configure the processing pipeline with the chosen roles.
            processor.set_columns(
                target_column=target_column,
                categorical_columns=categorical_columns,
                numerical_columns=numerical_columns,
                drop_columns=drop_columns
            )
            # Clean the data (missing values etc.).
            processor.clean_data(data)
            # One-hot encode categorical features.
            processor.encode_categorical_features()
            # Scale numerical features.
            processor.scale_numerical_features()
            # Retrieve the processed feature matrix and target vector.
            X, y = processor.get_features_and_target()
            # Persist the processed data for the next pipeline stages.
            X.to_csv(os.path.join(PROCESSED_FOLDER, 'processed_features.csv'), index=False)
            pd.DataFrame(y, columns=[target_column]).to_csv(os.path.join(PROCESSED_FOLDER, 'processed_target.csv'), index=False)
            flash('数据处理完成')
            return redirect(url_for('feature_engineering'))
        return render_template('data_processing.html',
                               filename=filename,
                               columns=columns,
                               preview_data=preview_data,
                               data_info=data_info)
    except Exception as e:
        # Any failure (bad file, bad configuration) sends the user back to
        # the upload page with the error message flashed.
        flash(f'数据处理出错: {str(e)}')
        return redirect(url_for('upload_file'))
@app.route('/feature_engineering', methods=['GET', 'POST'])
def feature_engineering():
    """Feature-engineering page.

    GET: list the processed feature columns so the user can choose
    interaction/polynomial/binning transforms and a selection strategy.
    POST: apply the chosen transforms via FeatureEngineer, optionally run
    feature selection, save engineered_features.csv plus the
    configuration, then redirect to model training.
    """
    # Both processed artifacts must exist before this stage.
    features_path = os.path.join(PROCESSED_FOLDER, 'processed_features.csv')
    target_path = os.path.join(PROCESSED_FOLDER, 'processed_target.csv')
    if not os.path.exists(features_path) or not os.path.exists(target_path):
        flash('请先完成数据处理')
        return redirect(url_for('upload_file'))
    # Load the processed data.
    X = pd.read_csv(features_path)
    y = pd.read_csv(target_path)
    # Feature names offered in the form.
    feature_columns = X.columns.tolist()
    if request.method == 'POST':
        # Helper that applies the individual transforms.
        engineer = FeatureEngineer(X)
        # User-selected transforms.
        interaction_features = request.form.getlist('interaction_features')
        polynomial_features = request.form.getlist('polynomial_features')
        binning_features = request.form.getlist('binning_features')
        feature_selection = request.form.get('feature_selection', 'none')
        n_features = int(request.form.get('n_features', 10))
        # Pair up the selected columns: each consecutive pair becomes one
        # interaction feature (an odd trailing column is ignored).
        if interaction_features and len(interaction_features) >= 2:
            for i in range(0, len(interaction_features), 2):
                if i + 1 < len(interaction_features):
                    engineer.create_interaction_features([
                        (interaction_features[i], interaction_features[i+1])
                    ])
        # Polynomial features.
        if polynomial_features:
            engineer.create_polynomial_features(polynomial_features)
        # Binning features.
        if binning_features:
            engineer.create_binning_features(binning_features)
        # Optional feature selection keeping the top n_features.
        if feature_selection != 'none':
            if feature_selection == 'correlation':
                engineer.select_features_by_correlation(y.iloc[:, 0], n_features)
            elif feature_selection == 'mutual_info':
                engineer.select_features_by_mutual_info(y.iloc[:, 0], n_features)
        # Collect and persist the engineered feature matrix.
        X_engineered = engineer.get_engineered_features()
        X_engineered.to_csv(os.path.join(PROCESSED_FOLDER, 'engineered_features.csv'), index=False)
        # Persist the configuration for reproducibility.
        engineering_config = {
            'interaction_features': interaction_features,
            'polynomial_features': polynomial_features,
            'binning_features': binning_features,
            'feature_selection': feature_selection,
            'n_features': n_features
        }
        config_path = os.path.join(PROCESSED_FOLDER, 'engineering_config.json')
        with open(config_path, 'w') as f:
            json.dump(engineering_config, f)
        flash('特征工程完成')
        return redirect(url_for('model_training'))
    return render_template('feature_engineering.html', feature_columns=feature_columns)
@app.route('/model_training', methods=['GET', 'POST'])
def model_training():
    """Model-training page.

    GET: list the supported estimators and training options.
    POST: load the (engineered or processed) features and target, train
    the selected models via ModelTrainer, persist the evaluation
    results, the best model, every trained model and the training
    configuration, generate the diagnostic charts, then redirect to the
    visualization page.
    """
    # Prefer engineered features; fall back to the plain processed ones.
    features_path = os.path.join(PROCESSED_FOLDER, 'engineered_features.csv')
    target_path = os.path.join(PROCESSED_FOLDER, 'processed_target.csv')
    if not os.path.exists(features_path):
        features_path = os.path.join(PROCESSED_FOLDER, 'processed_features.csv')
    if not os.path.exists(features_path) or not os.path.exists(target_path):
        flash('请先完成数据处理和特征工程')
        return redirect(url_for('upload_file'))
    # Estimators offered in the form.
    available_models = [
        'LinearRegression',
        'Ridge',
        'Lasso',
        'RandomForestRegressor',
        'GradientBoostingRegressor',
        'SVR'
    ]
    if request.method == 'POST':
        # Training options chosen by the user.
        selected_models = request.form.getlist('models')
        test_size = float(request.form.get('test_size', 0.2))
        random_state = int(request.form.get('random_state', 42))
        cv_folds = int(request.form.get('cv_folds', 5))
        # Load the training data.
        X = pd.read_csv(features_path)
        y = pd.read_csv(target_path).iloc[:, 0]
        # Configure the trainer.
        trainer = ModelTrainer(
            test_size=test_size,
            random_state=random_state,
            cv_folds=cv_folds
        )
        # Hand over the data.
        trainer.set_data(X, y)
        # Register each selected estimator.
        for model_name in selected_models:
            trainer.add_model(model_name)
        # Fit all registered models.
        trainer.train_models()
        # Score them.
        evaluation_results = trainer.evaluate_models()
        # Persist the evaluation metrics as JSON.
        evaluation_path = os.path.join(MODEL_FOLDER, 'evaluation_results.json')
        with open(evaluation_path, 'w') as f:
            # Convert numpy scalars to native floats for JSON serialization.
            results_dict = {}
            for model_name, metrics in evaluation_results.items():
                results_dict[model_name] = {k: float(v) for k, v in metrics.items()}
            json.dump(results_dict, f)
        # Persist the best model...
        best_model, best_score = trainer.get_best_model()
        joblib.dump(best_model, os.path.join(MODEL_FOLDER, 'best_model.pkl'))
        # ...and every trained model individually.
        for model_name, model in trainer.models.items():
            joblib.dump(model, os.path.join(MODEL_FOLDER, f'{model_name}.pkl'))
        # Persist the training configuration.
        training_config = {
            'selected_models': selected_models,
            'test_size': test_size,
            'random_state': random_state,
            'cv_folds': cv_folds,
            'best_model': trainer.best_model_name,
            'best_score': float(best_score)
        }
        config_path = os.path.join(MODEL_FOLDER, 'training_config.json')
        with open(config_path, 'w') as f:
            json.dump(training_config, f)
        # Generate diagnostic charts for the visualization page.
        visualizer = DataVisualizer(X, y)
        # Feature importances only exist for tree-based models.
        if hasattr(best_model, 'feature_importances_'):
            visualizer.plot_feature_importance(
                best_model,
                X.columns,
                os.path.join(VISUALIZATION_FOLDER, 'feature_importance.png')
            )
        # NOTE(review): the predictions below are made on the FULL dataset
        # (train + test), so this chart is optimistic compared to the
        # held-out evaluation metrics above.
        y_pred = best_model.predict(X)
        visualizer.plot_prediction_vs_actual(
            y,
            y_pred,
            os.path.join(VISUALIZATION_FOLDER, 'prediction_vs_actual.png')
        )
        # Residuals chart.
        visualizer.plot_residuals(
            y,
            y_pred,
            os.path.join(VISUALIZATION_FOLDER, 'residuals.png')
        )
        # Feature correlation matrix.
        visualizer.plot_correlation_matrix(
            X,
            os.path.join(VISUALIZATION_FOLDER, 'correlation_matrix.png')
        )
        # Target (grade) distribution.
        visualizer.plot_grade_distribution(
            y,
            os.path.join(VISUALIZATION_FOLDER, 'grade_distribution.png')
        )
        flash('模型训练完成')
        return redirect(url_for('visualization'))
    return render_template('model_training.html', available_models=available_models)
@app.route('/prediction', methods=['GET', 'POST'])
def prediction():
    """Single-student prediction page.

    GET: render a form with one input per feature column.
    POST: build a one-row DataFrame from the form values, run the saved
    best model on it and render the result page.
    """
    # A trained model must exist first.
    model_path = os.path.join(MODEL_FOLDER, 'best_model.pkl')
    if not os.path.exists(model_path):
        flash('请先训练模型')
        return redirect(url_for('model_training'))
    # Feature-engineering configuration (optional).
    engineering_config_path = os.path.join(PROCESSED_FOLDER, 'engineering_config.json')
    if os.path.exists(engineering_config_path):
        with open(engineering_config_path, 'r') as f:
            engineering_config = json.load(f)
    else:
        engineering_config = {}
    # Processing configuration (required).
    processing_config_path = os.path.join(PROCESSED_FOLDER, 'processing_config.json')
    if os.path.exists(processing_config_path):
        with open(processing_config_path, 'r') as f:
            processing_config = json.load(f)
    else:
        flash('请先完成数据处理')
        return redirect(url_for('upload_file'))
    # Training configuration (model name shown on the result page).
    # NOTE(review): training_config is referenced in the POST branch below
    # but is unbound when training_config.json is missing — only the
    # best_model.pkl existence check guards this indirectly; confirm.
    training_config_path = os.path.join(MODEL_FOLDER, 'training_config.json')
    if os.path.exists(training_config_path):
        with open(training_config_path, 'r') as f:
            training_config = json.load(f)
        available_models = training_config.get('selected_models', [])
    else:
        available_models = []
    # The model's expected input columns, preferring engineered features.
    features_path = os.path.join(PROCESSED_FOLDER, 'engineered_features.csv')
    if not os.path.exists(features_path):
        features_path = os.path.join(PROCESSED_FOLDER, 'processed_features.csv')
    X = pd.read_csv(features_path)
    feature_columns = X.columns.tolist()
    if request.method == 'POST':
        # Collect one value per expected feature from the form.
        # NOTE(review): form values are strings (missing fields become '');
        # scikit-learn will fail on empty/non-numeric strings — confirm the
        # form enforces numeric input.
        input_data = {}
        for column in feature_columns:
            input_data[column] = request.form.get(column, '')
        # One-row frame in the training column order.
        input_df = pd.DataFrame([input_data])
        # Load the persisted best model.
        model = joblib.load(model_path)
        # Predict the grade.
        prediction = model.predict(input_df)[0]
        # Render the result page.
        return render_template(
            'prediction_result.html',
            prediction=prediction,
            model_name=training_config.get('best_model', 'Unknown'),
            input_data=input_data
        )
    return render_template('prediction.html', feature_columns=feature_columns)
@app.route('/visualization')
def visualization():
    """Show the model evaluation metrics together with the generated charts."""
    # Training must have produced evaluation results first.
    evaluation_path = os.path.join(MODEL_FOLDER, 'evaluation_results.json')
    if not os.path.exists(evaluation_path):
        flash('请先训练模型')
        return redirect(url_for('model_training'))
    # Load the persisted metrics.
    with open(evaluation_path, 'r') as f:
        evaluation_results = json.load(f)
    # Static chart files written during training.
    chart_names = [
        'grade_distribution',
        'feature_importance',
        'prediction_vs_actual',
        'residuals',
        'correlation_matrix',
    ]
    image_paths = {
        name: url_for('static', filename=f'images/{name}.png')
        for name in chart_names
    }
    return render_template(
        'visualization.html',
        evaluation_results=evaluation_results,
        image_paths=image_paths
    )
@app.route('/batch_prediction', methods=['GET', 'POST'])
def batch_prediction():
    """Batch-prediction page.

    GET: render the batch upload form.
    POST: read the uploaded csv/xlsx file, run the prediction service on
    every row, optionally append confidence intervals and textual
    explanations, save the result file and redirect to its download URL.
    """
    # A trained model must exist first.
    model_path = os.path.join(MODEL_FOLDER, 'best_model.pkl')
    if not os.path.exists(model_path):
        flash('请先训练模型')
        return redirect(url_for('model_training'))
    # Training configuration (populates the model drop-down in the form).
    training_config_path = os.path.join(MODEL_FOLDER, 'training_config.json')
    if os.path.exists(training_config_path):
        with open(training_config_path, 'r') as f:
            training_config = json.load(f)
        available_models = training_config.get('selected_models', [])
    else:
        available_models = []
    if request.method == 'POST':
        # The form must contain a file part.
        if 'batch_file' not in request.files:
            flash('没有选择文件')
            return redirect(request.url)
        file = request.files['batch_file']
        # Browsers submit an empty filename when no file was chosen.
        if file.filename == '':
            flash('没有选择文件')
            return redirect(request.url)
        # Validate the extension, then store the upload.
        if file and allowed_file(file.filename):
            filename = secure_filename(file.filename)
            file_path = os.path.join(app.config['UPLOAD_FOLDER'], 'batch_' + filename)
            file.save(file_path)
            # Output options from the form.
            # NOTE(review): model_select is read but never used below —
            # predictions always come from the default prediction_service
            # model; confirm whether per-model selection was intended.
            model_select = request.form.get('model_select', 'best')
            output_format = request.form.get('output_format', 'csv')
            include_confidence = 'include_confidence' in request.form
            include_explanations = 'include_explanations' in request.form
            try:
                # Module-level singleton wrapping the persisted model.
                from src.prediction_service import prediction_service
                # Load the uploaded batch.
                if file_path.endswith('.csv'):
                    batch_data = pd.read_csv(file_path)
                else:
                    batch_data = pd.read_excel(file_path)
                # Remember an ID column if the data carries one.
                # NOTE(review): id_column is detected but not used further
                # in this handler.
                id_column = None
                for col in ['id', 'ID', 'student_id', 'student_ID', 'index']:
                    if col in batch_data.columns:
                        id_column = col
                        break
                # Run predictions for the whole batch.
                predictions = prediction_service.predict(batch_data)
                if predictions is None:
                    flash('预测失败,请检查输入数据格式是否正确')
                    return redirect(request.url)
                # Attach predictions to the original rows.
                batch_data['predicted_grade'] = predictions
                # Optional ±1.96·RMSE confidence band (normal approximation;
                # falls back to RMSE=5.0 when the model info lacks it).
                if include_confidence:
                    model_info = prediction_service.get_model_info()
                    rmse = model_info.get('performance', {}).get('RMSE', 5.0)
                    batch_data['confidence_lower'] = batch_data['predicted_grade'] - 1.96 * rmse
                    batch_data['confidence_upper'] = batch_data['predicted_grade'] + 1.96 * rmse
                # Optional rule-based textual explanation per row.  np.select
                # uses the FIRST matching condition, so the overlapping
                # thresholds act as ascending bands (<60, 60-70, 70-85, >=85).
                if include_explanations:
                    conditions = [
                        (batch_data['predicted_grade'] < 60, '需要额外辅导'),
                        (batch_data['predicted_grade'] < 70, '需要巩固基础'),
                        (batch_data['predicted_grade'] < 85, '良好表现'),
                        (batch_data['predicted_grade'] >= 85, '优秀表现')
                    ]
                    explanations = ['需要额外辅导', '需要巩固基础', '良好表现', '优秀表现']
                    batch_data['explanation'] = np.select([c[0] for c in conditions], explanations, default='未知')
                # Save the result under a timestamped name.
                result_filename = f'prediction_results_{pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")}'
                result_path = os.path.join(PROCESSED_FOLDER, result_filename)
                if output_format == 'csv':
                    batch_data.to_csv(result_path + '.csv', index=False)
                    result_file = result_filename + '.csv'
                else:
                    batch_data.to_excel(result_path + '.xlsx', index=False)
                    result_file = result_filename + '.xlsx'
                flash('批量预测完成')
                return redirect(url_for('download_results', filename=result_file))
            except Exception as e:
                logger.error(f"批量预测失败: {str(e)}")
                flash(f'批量预测失败: {str(e)}')
                return redirect(request.url)
    return render_template('batch_prediction.html', available_models=available_models)
@app.route('/download_template')
def download_template():
    """Build and send a header-only CSV template for batch prediction."""
    # The processed feature file defines which columns the template needs.
    features_path = os.path.join(PROCESSED_FOLDER, 'processed_features.csv')
    if not os.path.exists(features_path):
        flash('请先完成数据处理')
        return redirect(url_for('upload_file'))
    # Same columns as the processed data, zero rows.
    columns = pd.read_csv(features_path).columns
    template_path = os.path.join(PROCESSED_FOLDER, 'batch_prediction_template.csv')
    pd.DataFrame(columns=columns).to_csv(template_path, index=False)
    # Send the generated template to the client.
    return send_file(template_path, as_attachment=True)
@app.route('/download_results/<filename>')
def download_results(filename):
    """Send a previously generated prediction-results file to the client.

    Security: `filename` comes straight from the URL, so it is sanitized
    with secure_filename and the resolved path is verified to stay inside
    PROCESSED_FOLDER — the original code joined the raw value, allowing
    path traversal (e.g. ../../etc/passwd) to download arbitrary files.
    """
    safe_name = secure_filename(filename)
    file_path = os.path.join(PROCESSED_FOLDER, safe_name)
    # Belt-and-braces containment check after joining.
    if not os.path.abspath(file_path).startswith(os.path.abspath(PROCESSED_FOLDER) + os.sep):
        flash('文件不存在')
        return redirect(url_for('batch_prediction'))
    if not os.path.exists(file_path):
        flash('文件不存在')
        return redirect(url_for('batch_prediction'))
    return send_file(file_path, as_attachment=True)
@app.errorhandler(404)
def page_not_found(e):
    """Render the custom 404 page for unknown URLs."""
    body = render_template('404.html')
    return body, 404
@app.errorhandler(500)
def internal_server_error(e):
    """Render the custom 500 page for unhandled server errors."""
    body = render_template('500.html')
    return body, 500
if __name__ == '__main__':
    # Development settings: debug mode, bound on all interfaces.
    # NOTE(review): disable debug and restrict the host in production.
    app.run(debug=True, host='0.0.0.0', port=5000)
README.md
# 基于机器学习的学生成绩预测系统
这个项目实现了一个基于机器学习的学生成绩预测系统,通过分析学生的历史数据(如出勤率、作业完成情况、课堂参与度等)来预测他们在未来考试中的表现。
## 项目概述
本系统旨在帮助教育工作者提前识别可能需要额外支持的学生,并为个性化教学提供数据支持。系统不仅可以预测最终成绩,还能识别影响学生学习成绩的关键因素,为教师提供有价值的教学反馈。
## 功能特点
- 数据收集与预处理:清洗、标准化学生数据并进行特征工程
- 特征工程:创建新特征、特征选择和降维
- 模型训练与评估:支持多种机器学习算法,自动选择最佳模型
- 成绩预测:基于学生历史数据预测未来成绩
- 特征重要性分析:识别影响学生成绩的关键因素
- Web应用界面:提供友好的用户界面,方便教师使用系统功能
## 技术栈
- **Python 3.8+**:主要编程语言
- **Pandas & NumPy**:数据处理和分析
- **Scikit-learn**:机器学习模型构建和评估
- **Matplotlib & Seaborn**:数据可视化
- **Flask**:Web应用后端
- **SQLite/MongoDB**:数据存储
- **HTML/CSS/JavaScript**:前端界面
## 项目结构
student_performance_prediction/
│
├── data/
│ ├── raw/ # 原始数据
│ │ └── student_data.csv # 示例学生数据
│ └── processed/ # 处理后的数据
│
├── model/ # 训练好的模型
│
├── src/
│ ├── data_processing.py # 数据预处理模块
│ ├── feature_engineering.py # 特征工程模块
│ ├── model_training.py # 模型训练模块
│ └── visualization.py # 数据可视化模块
│
├── static/
│ ├── css/ # 样式表
│ └── js/ # 前端脚本
│
├── templates/ # HTML模板
│
├── app.py # Flask应用主文件
├── train_model.py # 模型训练脚本
├── requirements.txt # 项目依赖
└── README.md # 项目说明
## 安装与使用
### 环境准备
```bash
# 创建虚拟环境
python -m venv venv
# 激活虚拟环境
# Windows
venv\Scripts\activate
# Linux/Mac
source venv/bin/activate
# 安装依赖
pip install -r requirements.txt
数据准备
系统默认包含一个示例数据集,位于 data/raw/student_data.csv
。您也可以使用自己的数据集,只需确保数据格式与示例数据集兼容。
数据处理与特征工程
# 运行数据预处理
python src/data_processing.py
# 运行特征工程
python src/feature_engineering.py
模型训练
# 训练模型
python train_model.py
启动Web应用
# 启动Flask应用
python app.py
启动后,打开浏览器访问 http://localhost:5000 即可使用系统。
系统使用流程
- 数据上传:上传学生历史数据
- 数据预处理:系统自动清洗和标准化数据
- 特征工程:系统创建新特征并选择最重要的特征
- 模型训练:系统训练多个模型并选择最佳模型
- 成绩预测:输入学生信息,获取预测成绩
- 结果分析:查看预测结果和影响因素分析
贡献指南
欢迎对本项目进行贡献!您可以通过以下方式参与:
- 提交Bug报告或功能请求
- 提交代码改进或新功能
- 完善文档
许可证
本项目采用 MIT 许可证。详情请参阅 LICENSE 文件。
### requirements.txt
```text
numpy>=1.19.5
pandas>=1.3.0
scikit-learn>=0.24.2
matplotlib>=3.4.2
seaborn>=0.11.1
flask>=2.0.1
joblib>=1.0.1
pytest>=6.2.5
data\raw\student_data.csv
student_id,gender,age,parent_education,school,study_time,classes_attended,total_classes,assignments_completed,total_assignments,previous_grade,extracurricular_activities,study_regularity,final_grade
1001,M,18,bachelor,urban,3.5,42,45,18,20,75,2,4,78
1002,F,17,master,urban,4.2,44,45,20,20,82,1,5,85
1003,M,18,high_school,rural,2.8,38,45,15,20,68,3,3,70
1004,F,19,phd,urban,5.0,45,45,19,20,88,2,5,92
1005,M,17,bachelor,suburban,3.0,40,45,16,20,72,4,2,74
1006,F,18,master,urban,4.5,43,45,20,20,85,1,4,88
1007,M,18,high_school,rural,2.5,35,45,14,20,65,2,2,67
1008,F,17,bachelor,suburban,3.8,41,45,18,20,78,3,4,80
1009,M,19,phd,urban,4.7,44,45,19,20,86,1,5,90
1010,F,18,high_school,rural,2.2,32,45,13,20,62,4,2,64
1011,M,17,bachelor,suburban,3.2,39,45,17,20,74,2,3,76
1012,F,18,master,urban,4.0,42,45,19,20,80,1,4,83
1013,M,19,high_school,rural,2.0,30,45,12,20,60,5,1,62
1014,F,17,bachelor,urban,3.7,41,45,18,20,77,2,4,79
1015,M,18,phd,suburban,4.5,43,45,20,20,85,1,5,89
1016,F,18,high_school,rural,2.3,33,45,14,20,63,3,2,65
1017,M,17,bachelor,urban,3.5,40,45,17,20,75,2,3,77
1018,F,19,master,suburban,4.3,42,45,19,20,83,1,4,86
1019,M,18,high_school,rural,2.1,31,45,13,20,61,4,2,63
1020,F,17,phd,urban,4.8,44,45,20,20,87,1,5,91
1021,M,18,bachelor,suburban,3.3,38,45,16,20,73,3,3,75
1022,F,19,master,urban,4.1,42,45,18,20,81,2,4,84
1023,M,17,high_school,rural,1.9,29,45,12,20,59,5,1,61
1024,F,18,bachelor,suburban,3.6,40,45,17,20,76,2,4,78
1025,M,18,phd,urban,4.6,43,45,19,20,86,1,5,89
1026,F,17,high_school,rural,2.4,34,45,14,20,64,3,2,66
1027,M,19,bachelor,suburban,3.4,39,45,17,20,74,2,3,76
1028,F,18,master,urban,4.4,43,45,19,20,84,1,4,87
1029,M,17,high_school,rural,2.0,30,45,13,20,60,4,2,62
1030,F,18,phd,suburban,4.9,44,45,20,20,88,1,5,92
src\data_processing.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Data collection and preprocessing module.

Collects student data from various sources and performs cleaning,
standardization and feature engineering.
"""
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import logging

# Logging configuration.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
class DataProcessor:
    """
    Student data processor: loads the raw dataset and runs cleaning,
    categorical encoding, feature creation, scaling and train/test splitting.
    """
    def __init__(self, config=None):
        """
        Initialize the data processor.

        Args:
            config (dict, optional): configuration dictionary.
        """
        self.config = config or {}
        self.data = None
        self.features = None
        self.target = None
        # Columns expected in the raw dataset; methods below skip any
        # that are absent rather than failing.
        self.categorical_features = ['gender', 'parent_education', 'school']
        self.numerical_features = ['age', 'study_time', 'classes_attended', 'total_classes',
                                  'assignments_completed', 'total_assignments', 'previous_grade',
                                  'extracurricular_activities', 'study_regularity']
        self.target_column = 'final_grade'
        self.scaler = StandardScaler()
        # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and
        # removed in 1.4; try the new name first, fall back for old versions.
        try:
            self.encoder = OneHotEncoder(sparse_output=False, drop='first')
        except TypeError:  # scikit-learn < 1.2
            self.encoder = OneHotEncoder(sparse=False, drop='first')
        self.imputer = SimpleImputer(strategy='mean')
    def load_data(self, file_path):
        """
        Load the student dataset from a CSV, Excel or JSON file.

        Args:
            file_path (str): path to the data file.

        Returns:
            pandas.DataFrame: the loaded data.

        Raises:
            FileNotFoundError: if the file does not exist.
            ValueError: if the file extension is not supported.
        """
        logger.info(f"正在加载数据: {file_path}")
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"数据文件不存在: {file_path}")
        file_ext = os.path.splitext(file_path)[1].lower()
        if file_ext == '.csv':
            self.data = pd.read_csv(file_path)
        elif file_ext in ('.xlsx', '.xls'):
            self.data = pd.read_excel(file_path)
        elif file_ext == '.json':
            self.data = pd.read_json(file_path)
        else:
            raise ValueError(f"不支持的文件格式: {file_ext}")
        logger.info(f"成功加载数据,共 {len(self.data)} 条记录")
        return self.data
    def handle_missing_values(self):
        """
        Fill missing values: mean for numerical columns, mode for
        categorical columns.

        Returns:
            pandas.DataFrame: the data after imputation.
        """
        logger.info("正在处理缺失值")
        # Report which columns contain missing values.
        missing_values = self.data.isnull().sum()
        if missing_values.sum() > 0:
            logger.info(f"发现缺失值:\n{missing_values[missing_values > 0]}")
        # Mean-impute only the numerical columns actually present; indexing
        # with a missing column name would raise a KeyError.
        available_numerical = [f for f in self.numerical_features if f in self.data.columns]
        if available_numerical:
            self.data[available_numerical] = self.imputer.fit_transform(self.data[available_numerical])
        # Mode-impute categorical columns. Assign back instead of calling
        # fillna(inplace=True) on a column view, which is deprecated chained
        # assignment (and a no-op under pandas copy-on-write).
        for feature in self.categorical_features:
            if feature in self.data.columns:
                modes = self.data[feature].mode()
                if not modes.empty:  # mode() is empty for an all-NaN column
                    self.data[feature] = self.data[feature].fillna(modes[0])
        logger.info("缺失值处理完成")
        return self.data
    def encode_categorical_features(self):
        """
        One-hot encode the categorical features (first level dropped).

        Returns:
            pandas.DataFrame: the data with encoded columns appended.
        """
        logger.info("正在编码分类特征")
        # Only encode the categorical columns that actually exist.
        available_categorical = [f for f in self.categorical_features if f in self.data.columns]
        if available_categorical:
            categorical_data = self.data[available_categorical]
            encoded_data = self.encoder.fit_transform(categorical_data)
            # Build output column names, skipping each feature's first
            # category because the encoder uses drop='first'.
            encoded_feature_names = []
            for i, feature in enumerate(available_categorical):
                for category in self.encoder.categories_[i][1:]:
                    encoded_feature_names.append(f"{feature}_{category}")
            encoded_df = pd.DataFrame(encoded_data, columns=encoded_feature_names, index=self.data.index)
            # Replace the raw categorical columns with their encoded versions.
            self.data = pd.concat([self.data.drop(available_categorical, axis=1), encoded_df], axis=1)
            logger.info(f"分类特征编码完成,新增 {len(encoded_feature_names)} 个特征")
        else:
            logger.info("没有找到需要编码的分类特征")
        return self.data
    def create_features(self):
        """
        Derive new features (rates, efficiency and balance scores) from
        the raw columns, when the required source columns are present.

        Returns:
            pandas.DataFrame: the data with the derived columns added.
        """
        logger.info("正在创建新特征")
        # Attendance rate = classes attended / total classes.
        if 'classes_attended' in self.data.columns and 'total_classes' in self.data.columns:
            self.data['attendance_rate'] = self.data['classes_attended'] / self.data['total_classes']
            logger.info("已创建特征: attendance_rate")
        # Assignment completion rate.
        if 'assignments_completed' in self.data.columns and 'total_assignments' in self.data.columns:
            self.data['assignment_completion_rate'] = self.data['assignments_completed'] / self.data['total_assignments']
            logger.info("已创建特征: assignment_completion_rate")
        # Study efficiency = previous grade per unit of study time
        # (+1 in the denominator avoids division by zero).
        if 'study_time' in self.data.columns and 'previous_grade' in self.data.columns:
            self.data['study_efficiency'] = self.data['previous_grade'] / (self.data['study_time'] + 1)
            logger.info("已创建特征: study_efficiency")
        # Balance score between study regularity and extracurriculars.
        if 'study_regularity' in self.data.columns and 'extracurricular_activities' in self.data.columns:
            self.data['balance_score'] = self.data['study_regularity'] - 0.5 * self.data['extracurricular_activities']
            logger.info("已创建特征: balance_score")
        return self.data
    def scale_numerical_features(self):
        """
        Standardize all numeric columns (including derived ones) except
        the target column.

        Returns:
            pandas.DataFrame: the data with scaled numeric columns.
        """
        logger.info("正在标准化数值特征")
        # Collect every numeric column currently in the frame, so newly
        # created features are scaled too.
        current_numerical = [col for col in self.data.columns
                            if col != self.target_column and self.data[col].dtype in ['int64', 'float64']]
        if current_numerical:
            numerical_data = self.data[current_numerical]
            self.data[current_numerical] = self.scaler.fit_transform(numerical_data)
            logger.info(f"已标准化 {len(current_numerical)} 个数值特征")
        else:
            logger.info("没有找到需要标准化的数值特征")
        return self.data
    def prepare_features_target(self):
        """
        Split the processed frame into a feature matrix and target vector.

        Returns:
            tuple: (features DataFrame, target Series).

        Raises:
            ValueError: if the target column is missing.
        """
        logger.info("正在准备特征和目标变量")
        if self.target_column not in self.data.columns:
            raise ValueError(f"目标列 '{self.target_column}' 不在数据中")
        self.target = self.data[self.target_column]
        self.features = self.data.drop(self.target_column, axis=1)
        logger.info(f"特征准备完成,共 {self.features.shape[1]} 个特征")
        return self.features, self.target
    def split_train_test(self, test_size=0.2, random_state=42):
        """
        Split features/target into train and test sets.

        Args:
            test_size (float): fraction of rows used for the test set.
            random_state (int): random seed for reproducibility.

        Returns:
            tuple: (X_train, X_test, y_train, y_test).
        """
        logger.info(f"正在划分训练集和测试集,测试集比例: {test_size}")
        if self.features is None or self.target is None:
            self.prepare_features_target()
        X_train, X_test, y_train, y_test = train_test_split(
            self.features, self.target, test_size=test_size, random_state=random_state
        )
        logger.info(f"数据集划分完成,训练集: {X_train.shape[0]} 条记录,测试集: {X_test.shape[0]} 条记录")
        return X_train, X_test, y_train, y_test
    def save_processed_data(self, output_dir, prefix='processed'):
        """
        Persist the processed features, target, feature names and the
        fitted scaler to disk.

        Args:
            output_dir (str): directory to write into (created if needed).
            prefix (str): filename prefix for all outputs.

        Returns:
            tuple: (features file path, target file path).
        """
        logger.info(f"正在保存处理后的数据到: {output_dir}")
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        if self.features is None or self.target is None:
            self.prepare_features_target()
        # Features.
        features_file = os.path.join(output_dir, f"{prefix}_features.csv")
        self.features.to_csv(features_file, index=False)
        # Target.
        target_file = os.path.join(output_dir, f"{prefix}_target.csv")
        self.target.to_csv(target_file, index=False, header=True)
        # Feature names, one per line.
        feature_names_file = os.path.join(output_dir, f"{prefix}_feature_names.txt")
        with open(feature_names_file, 'w') as f:
            f.write('\n'.join(self.features.columns))
        # Fitted scaler, needed to transform new data at prediction time.
        import joblib
        scaler_file = os.path.join(output_dir, f"{prefix}_scaler.pkl")
        joblib.dump(self.scaler, scaler_file)
        logger.info(f"数据保存完成,特征文件: {features_file},目标文件: {target_file}")
        return features_file, target_file
    def process_data(self, file_path, output_dir=None, save=True):
        """
        Run the full pipeline: load, impute, encode, derive, scale,
        optionally save, then split.

        Args:
            file_path (str): path to the raw data file.
            output_dir (str, optional): directory for saved artifacts.
            save (bool): whether to persist the processed data.

        Returns:
            tuple: (X_train, X_test, y_train, y_test).
        """
        logger.info("开始数据处理流程")
        self.load_data(file_path)
        self.handle_missing_values()
        self.encode_categorical_features()
        self.create_features()
        self.scale_numerical_features()
        self.prepare_features_target()
        if save and output_dir:
            self.save_processed_data(output_dir)
        X_train, X_test, y_train, y_test = self.split_train_test()
        logger.info("数据处理流程完成")
        return X_train, X_test, y_train, y_test
def main():
    """Smoke-test the data-processing pipeline from the command line."""
    # Resolve <project root>/data relative to this file's location.
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    data_root = os.path.join(project_root, 'data')
    raw_path = os.path.join(data_root, 'raw', 'student_data.csv')
    out_dir = os.path.join(data_root, 'processed')
    # Run the full pipeline and keep the resulting train/test split.
    processor = DataProcessor()
    x_tr, x_te, y_tr, y_te = processor.process_data(raw_path, out_dir, save=True)
    # Report the resulting shapes.
    print("\n数据处理完成!")
    print(f"训练集特征形状: {x_tr.shape}")
    print(f"测试集特征形状: {x_te.shape}")
    print(f"训练集目标变量形状: {y_tr.shape}")
    print(f"测试集目标变量形状: {y_te.shape}")
    # List every feature column, numbered from 1.
    print("\n特征列表:")
    for idx, name in enumerate(x_tr.columns, start=1):
        print(f"{idx}. {name}")
if __name__ == "__main__":
    main()
src\feature_engineering.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
特征工程模块
负责创建和选择对学生成绩预测有价值的特征
"""
import os
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import logging
# 配置日志
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
class FeatureEngineer:
    """
    Feature engineering helper: creates, selects and visualizes features
    for the student-grade prediction models.
    """
    def __init__(self, config=None):
        """
        Initialize the feature engineer.

        Args:
            config (dict, optional): configuration dictionary.
        """
        self.config = config or {}
        self.features = None
        self.target = None
        self.selected_features = None
        self.feature_importances = None
    def load_data(self, features_path, target_path=None):
        """
        Load feature and (optionally) target data from CSV files.

        Args:
            features_path (str): path to the features CSV.
            target_path (str, optional): path to the target CSV.

        Returns:
            tuple: (features DataFrame, target Series or None).
        """
        logger.info(f"正在加载特征数据: {features_path}")
        self.features = pd.read_csv(features_path)
        if target_path:
            logger.info(f"正在加载目标数据: {target_path}")
            self.target = pd.read_csv(target_path)
            # Collapse a single-column DataFrame to a Series.
            if isinstance(self.target, pd.DataFrame) and self.target.shape[1] == 1:
                self.target = self.target.iloc[:, 0]
        logger.info(f"数据加载完成,特征形状: {self.features.shape}")
        if self.target is not None:
            logger.info(f"目标变量形状: {self.target.shape}")
        return self.features, self.target
    def create_interaction_features(self):
        """
        Add pairwise interaction features (product and ratio) for a fixed
        list of feature pairs, when both columns are present.

        Returns:
            pandas.DataFrame: features including the interaction columns.
        """
        logger.info("正在创建特征交互项")
        # NOTE(review): computed but unused below — the pairs are hard-coded.
        numerical_features = self.features.select_dtypes(include=['float64', 'int64']).columns.tolist()
        # Feature pairs to interact.
        interaction_pairs = [
            ('study_time', 'previous_grade'),
            ('attendance_rate', 'assignment_completion_rate'),
            ('study_efficiency', 'balance_score')
        ]
        for feature1, feature2 in interaction_pairs:
            if feature1 in self.features.columns and feature2 in self.features.columns:
                # Product interaction.
                interaction_name = f"{feature1}_x_{feature2}"
                self.features[interaction_name] = self.features[feature1] * self.features[feature2]
                logger.info(f"已创建交互特征: {interaction_name}")
                # Ratio interaction; the epsilon guards against division by zero.
                ratio_name = f"{feature1}_div_{feature2}"
                self.features[ratio_name] = self.features[feature1] / (self.features[feature2] + 1e-5)
                logger.info(f"已创建交互特征: {ratio_name}")
        logger.info(f"特征交互项创建完成,当前特征数量: {self.features.shape[1]}")
        return self.features
    def create_polynomial_features(self, degree=2):
        """
        Add power terms (x**2 .. x**degree) for a fixed set of candidate
        columns that exist in the data.

        Args:
            degree (int): highest power to generate.

        Returns:
            pandas.DataFrame: features including the polynomial columns.
        """
        logger.info(f"正在创建{degree}次多项式特征")
        # Candidate columns for polynomial expansion.
        poly_candidates = ['study_time', 'previous_grade', 'attendance_rate',
                          'assignment_completion_rate', 'study_efficiency']
        # Keep only the candidates actually present.
        poly_features = [f for f in poly_candidates if f in self.features.columns]
        for feature in poly_features:
            for d in range(2, degree + 1):
                poly_name = f"{feature}_pow{d}"
                self.features[poly_name] = self.features[feature] ** d
                logger.info(f"已创建多项式特征: {poly_name}")
        logger.info(f"多项式特征创建完成,当前特征数量: {self.features.shape[1]}")
        return self.features
    def create_binned_features(self, n_bins=5):
        """
        Add quantile-binned versions of a fixed set of candidate columns.

        Args:
            n_bins (int): number of quantile bins.

        Returns:
            pandas.DataFrame: features including the binned columns.
        """
        logger.info(f"正在创建分箱特征,分箱数量: {n_bins}")
        # Candidate columns for binning.
        bin_candidates = ['study_time', 'previous_grade', 'attendance_rate',
                         'assignment_completion_rate']
        # Keep only the candidates actually present.
        bin_features = [f for f in bin_candidates if f in self.features.columns]
        for feature in bin_features:
            bin_name = f"{feature}_bin"
            # labels=False yields integer bin codes; duplicates='drop'
            # tolerates repeated quantile edges.
            self.features[bin_name] = pd.qcut(
                self.features[feature],
                q=n_bins,
                labels=False,
                duplicates='drop'
            )
            logger.info(f"已创建分箱特征: {bin_name}")
        logger.info(f"分箱特征创建完成,当前特征数量: {self.features.shape[1]}")
        return self.features
    def select_features_correlation(self, threshold=0.1):
        """
        Keep features whose absolute Pearson correlation with the target
        exceeds the threshold.

        Args:
            threshold (float): minimum absolute correlation to keep.

        Returns:
            pandas.DataFrame: the selected features (or all features if
            no target is set).
        """
        logger.info(f"正在基于相关性选择特征,阈值: {threshold}")
        if self.target is None:
            logger.warning("目标变量未设置,无法计算相关性")
            return self.features
        # Absolute correlation of every feature with the target.
        # NOTE(review): np.corrcoef yields NaN for a constant column;
        # NaN fails the `> threshold` test, so such features are silently
        # dropped — confirm this is intended.
        corr = pd.DataFrame()
        corr['feature'] = self.features.columns
        corr['correlation'] = [abs(np.corrcoef(self.features[f], self.target)[0, 1])
                              for f in self.features.columns]
        # Strongest correlations first.
        corr = corr.sort_values('correlation', ascending=False)
        # Keep features above the threshold.
        selected_features = corr[corr['correlation'] > threshold]['feature'].tolist()
        logger.info(f"相关性选择完成,选择了 {len(selected_features)}/{self.features.shape[1]} 个特征")
        # Remember the scores for later visualization.
        self.feature_importances = corr
        self.selected_features = self.features[selected_features]
        return self.selected_features
    def select_features_mutual_info(self, k=10):
        """
        Keep the k features with the highest mutual information with the
        target.

        Args:
            k (int): number of features to keep (capped at the total).

        Returns:
            pandas.DataFrame: the selected features (or all features if
            no target is set).
        """
        logger.info(f"正在基于互信息选择特征,选择数量: {k}")
        if self.target is None:
            logger.warning("目标变量未设置,无法计算互信息")
            return self.features
        # Cap k at the number of available features.
        k = min(k, self.features.shape[1])
        selector = SelectKBest(mutual_info_regression, k=k)
        selector.fit(self.features, self.target)
        # Per-feature mutual-information scores.
        feature_scores = pd.DataFrame()
        feature_scores['feature'] = self.features.columns
        feature_scores['score'] = selector.scores_
        # Highest scores first.
        feature_scores = feature_scores.sort_values('score', ascending=False)
        # Take the top-k features by score.
        selected_features = feature_scores.head(k)['feature'].tolist()
        logger.info(f"互信息选择完成,选择了 {len(selected_features)}/{self.features.shape[1]} 个特征")
        # Remember the scores for later visualization.
        self.feature_importances = feature_scores
        self.selected_features = self.features[selected_features]
        return self.selected_features
    def apply_pca(self, n_components=0.95):
        """
        Reduce the feature space with PCA.

        Args:
            n_components (float or int): fraction of variance to retain
                (float) or number of components to keep (int).

        Returns:
            pandas.DataFrame: the principal-component features.
        """
        logger.info(f"正在应用PCA降维,n_components: {n_components}")
        pca = PCA(n_components=n_components)
        pca_result = pca.fit_transform(self.features)
        # Name the components PC1..PCk.
        if isinstance(n_components, int):
            columns = [f'PC{i+1}' for i in range(n_components)]
        else:
            columns = [f'PC{i+1}' for i in range(pca_result.shape[1])]
        # NOTE(review): this frame gets a fresh RangeIndex; alignment with
        # self.target then relies on positional order only.
        pca_df = pd.DataFrame(pca_result, columns=columns)
        logger.info(f"PCA降维完成,从 {self.features.shape[1]} 个特征降至 {pca_df.shape[1]} 个主成分")
        logger.info(f"解释方差比例: {pca.explained_variance_ratio_}")
        self.selected_features = pca_df
        return self.selected_features
    def visualize_feature_importance(self, top_n=10, output_dir=None):
        """
        Plot the top-N feature importances from the last selection run.

        Args:
            top_n (int): number of top features to show.
            output_dir (str, optional): directory to save the PNG into.

        Returns:
            matplotlib.figure.Figure: the figure, or None if no
            importances have been computed yet.
        """
        logger.info(f"正在可视化特征重要性,显示前 {top_n} 个特征")
        if self.feature_importances is None:
            logger.warning("特征重要性未计算,请先运行特征选择方法")
            return None
        # Cap top_n at the number of scored features.
        top_n = min(top_n, len(self.feature_importances))
        top_features = self.feature_importances.head(top_n)
        plt.figure(figsize=(10, 6))
        # The score column name depends on which selection method ran last.
        if 'correlation' in top_features.columns:
            value_col = 'correlation'
            title = '特征相关性'
        else:
            value_col = 'score'
            title = '特征重要性分数'
        sns.barplot(x=value_col, y='feature', data=top_features)
        plt.title(f'Top {top_n} {title}')
        plt.tight_layout()
        # Optionally persist the chart.
        if output_dir:
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            output_file = os.path.join(output_dir, 'feature_importance.png')
            plt.savefig(output_file)
            logger.info(f"特征重要性图表已保存至: {output_file}")
        return plt.gcf()
    def visualize_correlation_matrix(self, output_dir=None):
        """
        Plot a lower-triangle heatmap of the feature correlation matrix
        (selected features if available, otherwise all features).

        Args:
            output_dir (str, optional): directory to save the PNG into.

        Returns:
            matplotlib.figure.Figure: the figure.
        """
        logger.info("正在可视化特征相关性矩阵")
        # Prefer the selected subset when a selection has been made.
        if self.selected_features is not None:
            corr_matrix = self.selected_features.corr()
        else:
            corr_matrix = self.features.corr()
        plt.figure(figsize=(12, 10))
        # Mask the upper triangle — the matrix is symmetric.
        mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
        sns.heatmap(corr_matrix, mask=mask, annot=True, fmt=".2f", cmap='coolwarm',
                   square=True, linewidths=.5, cbar_kws={"shrink": .5})
        plt.title('特征相关性矩阵')
        plt.tight_layout()
        # Optionally persist the chart.
        if output_dir:
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            output_file = os.path.join(output_dir, 'correlation_matrix.png')
            plt.savefig(output_file)
            logger.info(f"相关性矩阵图表已保存至: {output_file}")
        return plt.gcf()
    def save_features(self, output_path):
        """
        Save the current feature set (selected if available, otherwise
        all features) to CSV.

        Args:
            output_path (str): destination file path.

        Returns:
            str: the destination file path.
        """
        logger.info(f"正在保存特征至: {output_path}")
        # Prefer the selected subset when a selection has been made.
        features_to_save = self.selected_features if self.selected_features is not None else self.features
        features_to_save.to_csv(output_path, index=False)
        logger.info(f"特征已保存,形状: {features_to_save.shape}")
        return output_path
    def process_features(self, features_path, target_path=None, output_dir=None):
        """
        Run the full feature-engineering pipeline: load, create
        interaction/polynomial/binned features, select by mutual
        information (when a target is available), visualize and save.

        Args:
            features_path (str): path to the features CSV.
            target_path (str, optional): path to the target CSV.
            output_dir (str, optional): directory for charts and output CSV.

        Returns:
            pandas.DataFrame: the final feature set.
        """
        logger.info("开始特征工程流程")
        self.load_data(features_path, target_path)
        self.create_interaction_features()
        self.create_polynomial_features(degree=2)
        self.create_binned_features(n_bins=5)
        # Feature selection requires a target.
        if self.target is not None:
            self.select_features_mutual_info(k=15)
        # Charts.
        if output_dir:
            self.visualize_feature_importance(output_dir=output_dir)
            self.visualize_correlation_matrix(output_dir=output_dir)
        # Persist the engineered features.
        if output_dir:
            output_path = os.path.join(output_dir, 'engineered_features.csv')
            self.save_features(output_path)
        logger.info("特征工程流程完成")
        return self.selected_features if self.selected_features is not None else self.features
def main():
    """Smoke-test the feature-engineering pipeline from the command line."""
    # Locate <project root>/data/processed relative to this file.
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    processed_dir = os.path.join(project_root, 'data', 'processed')
    features_path = os.path.join(processed_dir, 'processed_features.csv')
    target_path = os.path.join(processed_dir, 'processed_target.csv')
    output_dir = os.path.join(processed_dir, 'feature_engineering')
    # Make sure the output directory exists before writing anything.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # Run the full feature-engineering pipeline.
    result = FeatureEngineer().process_features(features_path, target_path, output_dir)
    # Report the outcome.
    print("\n特征工程完成!")
    print(f"处理后的特征形状: {result.shape}")
    print("\n处理后的特征列表:")
    for idx, name in enumerate(result.columns, start=1):
        print(f"{idx}. {name}")
if __name__ == "__main__":
    main()
src\model_training.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
模型训练与评估模块
使用多种机器学习算法构建预测模型,并评估其性能
"""
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV, cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns
import logging
# 配置日志
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
class ModelTrainer:
    """
    Trains a suite of regression models, evaluates them and keeps track
    of the best one (highest R2 on the evaluation set).
    """
    def __init__(self, config=None):
        """
        Initialize the model trainer.

        Args:
            config (dict, optional): configuration dictionary.
        """
        self.config = config or {}
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.models = {}          # name -> untrained estimator
        self.trained_models = {}  # name -> fitted estimator
        self.model_results = {}   # name -> metrics dict
        self.best_model_name = None
        self.best_model = None
        # Register the default model zoo.
        self._init_models()
    def _init_models(self):
        """Populate self.models with the default set of regressors."""
        self.models = {
            'Linear Regression': LinearRegression(),
            'Ridge Regression': Ridge(alpha=1.0),
            'Lasso Regression': Lasso(alpha=0.1),
            'Elastic Net': ElasticNet(alpha=0.1, l1_ratio=0.5),
            'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
            'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
            'SVR': SVR(kernel='rbf'),
            'K-Neighbors': KNeighborsRegressor(n_neighbors=5),
            'AdaBoost': AdaBoostRegressor(random_state=42),
            'MLP': MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
        }
    def load_data(self, X_train_path, y_train_path, X_test_path=None, y_test_path=None):
        """
        Load training (and optionally test) data from CSV files.

        Args:
            X_train_path (str): training features CSV.
            y_train_path (str): training target CSV.
            X_test_path (str, optional): test features CSV.
            y_test_path (str, optional): test target CSV.

        Returns:
            tuple: (X_train, X_test, y_train, y_test); test parts are
            None when no test paths were given.
        """
        logger.info(f"正在加载训练数据: {X_train_path}, {y_train_path}")
        self.X_train = pd.read_csv(X_train_path)
        # Collapse a single-column target DataFrame to a Series.
        y_train_data = pd.read_csv(y_train_path)
        if isinstance(y_train_data, pd.DataFrame) and y_train_data.shape[1] == 1:
            self.y_train = y_train_data.iloc[:, 0]
        else:
            self.y_train = y_train_data
        # Test data is optional.
        if X_test_path and y_test_path:
            logger.info(f"正在加载测试数据: {X_test_path}, {y_test_path}")
            self.X_test = pd.read_csv(X_test_path)
            y_test_data = pd.read_csv(y_test_path)
            if isinstance(y_test_data, pd.DataFrame) and y_test_data.shape[1] == 1:
                self.y_test = y_test_data.iloc[:, 0]
            else:
                self.y_test = y_test_data
        logger.info(f"数据加载完成,训练集形状: {self.X_train.shape}, {self.y_train.shape}")
        if self.X_test is not None:
            logger.info(f"测试集形状: {self.X_test.shape}, {self.y_test.shape}")
        return self.X_train, self.X_test, self.y_train, self.y_test
    def add_model(self, name, model):
        """
        Register an additional (or replacement) model.

        Args:
            name (str): model name.
            model: estimator object with fit/predict.

        Returns:
            dict: the updated model registry.
        """
        logger.info(f"添加模型: {name}")
        self.models[name] = model
        return self.models
    def remove_model(self, name):
        """
        Remove a model from the registry, if present.

        Args:
            name (str): model name.

        Returns:
            dict: the updated model registry.
        """
        if name in self.models:
            logger.info(f"移除模型: {name}")
            del self.models[name]
        else:
            logger.warning(f"模型 {name} 不存在")
        return self.models
    def train_models(self):
        """
        Fit every registered model on the training data. Failures are
        logged and skipped rather than aborting the run.

        Returns:
            dict: name -> fitted model.

        Raises:
            ValueError: if training data has not been loaded.
        """
        logger.info("开始训练模型")
        if self.X_train is None or self.y_train is None:
            raise ValueError("训练数据未加载,请先调用load_data方法")
        self.trained_models = {}
        for name, model in self.models.items():
            logger.info(f"正在训练模型: {name}")
            try:
                model.fit(self.X_train, self.y_train)
                self.trained_models[name] = model
                logger.info(f"模型 {name} 训练完成")
            except Exception as e:
                # Best-effort: one failing model must not stop the others.
                logger.error(f"模型 {name} 训练失败: {str(e)}")
        logger.info(f"所有模型训练完成,共 {len(self.trained_models)} 个模型")
        return self.trained_models
    def evaluate_models(self):
        """
        Evaluate every trained model (on test data when available,
        otherwise on training data) and pick the best by R2.

        Returns:
            dict: name -> {'MSE', 'RMSE', 'MAE', 'R2', 'predictions'}.
        """
        logger.info("开始评估模型")
        if not self.trained_models:
            logger.warning("没有训练好的模型,请先调用train_models方法")
            return {}
        # Fall back to the training data when no test set was loaded.
        if self.X_test is None or self.y_test is None:
            logger.warning("测试数据未加载,将使用训练数据进行评估")
            X_eval = self.X_train
            y_eval = self.y_train
        else:
            X_eval = self.X_test
            y_eval = self.y_test
        self.model_results = {}
        for name, model in self.trained_models.items():
            logger.info(f"正在评估模型: {name}")
            try:
                y_pred = model.predict(X_eval)
                mse = mean_squared_error(y_eval, y_pred)
                rmse = np.sqrt(mse)
                mae = mean_absolute_error(y_eval, y_pred)
                r2 = r2_score(y_eval, y_pred)
                self.model_results[name] = {
                    'MSE': mse,
                    'RMSE': rmse,
                    'MAE': mae,
                    'R2': r2,
                    'predictions': y_pred
                }
                logger.info(f"模型 {name} 评估完成: RMSE={rmse:.4f}, R2={r2:.4f}")
            except Exception as e:
                logger.error(f"模型 {name} 评估失败: {str(e)}")
        # Best model = highest R2.
        if self.model_results:
            self.best_model_name = max(self.model_results, key=lambda x: self.model_results[x]['R2'])
            self.best_model = self.trained_models[self.best_model_name]
            logger.info(f"最佳模型: {self.best_model_name}, R2={self.model_results[self.best_model_name]['R2']:.4f}")
        return self.model_results
    def cross_validate_models(self, cv=5, scoring='r2'):
        """
        Cross-validate every registered (untrained) model.

        Args:
            cv (int): number of folds.
            scoring (str): sklearn scoring name.

        Returns:
            dict: name -> {'mean_score', 'std_score', 'scores'}.

        Raises:
            ValueError: if training data has not been loaded.
        """
        logger.info(f"开始交叉验证,折数: {cv}, 评分标准: {scoring}")
        if self.X_train is None or self.y_train is None:
            raise ValueError("训练数据未加载,请先调用load_data方法")
        cv_results = {}
        for name, model in self.models.items():
            logger.info(f"正在对模型 {name} 进行交叉验证")
            try:
                scores = cross_val_score(model, self.X_train, self.y_train, cv=cv, scoring=scoring)
                cv_results[name] = {
                    'mean_score': scores.mean(),
                    'std_score': scores.std(),
                    'scores': scores
                }
                logger.info(f"模型 {name} 交叉验证完成: 平均分数={scores.mean():.4f}, 标准差={scores.std():.4f}")
            except Exception as e:
                logger.error(f"模型 {name} 交叉验证失败: {str(e)}")
        return cv_results
    def hyperparameter_tuning(self, model_name, param_grid, cv=5, scoring='r2'):
        """
        Grid-search hyperparameters for one model and replace the
        registered estimator with the tuned one.

        Args:
            model_name (str): model name in the registry.
            param_grid (dict): parameter grid for GridSearchCV.
            cv (int): number of folds.
            scoring (str): sklearn scoring name.

        Returns:
            object: the best estimator found.

        Raises:
            ValueError: if data is not loaded or the model is unknown.
        """
        logger.info(f"开始对模型 {model_name} 进行超参数调优")
        if self.X_train is None or self.y_train is None:
            raise ValueError("训练数据未加载,请先调用load_data方法")
        if model_name not in self.models:
            raise ValueError(f"模型 {model_name} 不存在")
        model = self.models[model_name]
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grid,
            cv=cv,
            scoring=scoring,
            n_jobs=-1
        )
        grid_search.fit(self.X_train, self.y_train)
        best_params = grid_search.best_params_
        best_score = grid_search.best_score_
        best_model = grid_search.best_estimator_
        logger.info(f"超参数调优完成,最佳参数: {best_params}, 最佳分数: {best_score:.4f}")
        # Future train_models() calls now use the tuned estimator.
        self.models[model_name] = best_model
        return best_model
    def analyze_feature_importance(self, model_name=None):
        """
        Extract per-feature importances (|coef_| for linear models,
        feature_importances_ for tree models).

        Args:
            model_name (str, optional): model to analyze; defaults to the
                best model found by evaluate_models.

        Returns:
            pandas.DataFrame: 'Feature'/'Importance' sorted descending,
            or None if unavailable.
        """
        logger.info("开始分析特征重要性")
        if not self.trained_models:
            logger.warning("没有训练好的模型,请先调用train_models方法")
            return None
        # Default to the best model.
        if model_name is None:
            if self.best_model_name is None:
                logger.warning("最佳模型未确定,请先调用evaluate_models方法")
                return None
            model_name = self.best_model_name
        if model_name not in self.trained_models:
            logger.warning(f"模型 {model_name} 不存在或未训练")
            return None
        model = self.trained_models[model_name]
        feature_names = self.X_train.columns
        importance = None
        # Linear models expose coefficients.
        if hasattr(model, 'coef_'):
            if len(model.coef_.shape) == 1:
                importance = np.abs(model.coef_)
            else:
                importance = np.abs(model.coef_[0])
        # Tree ensembles expose feature_importances_.
        elif hasattr(model, 'feature_importances_'):
            importance = model.feature_importances_
        else:
            logger.warning(f"模型 {model_name} 不支持特征重要性分析")
            return None
        feature_importance = pd.DataFrame({
            'Feature': feature_names,
            'Importance': importance
        })
        feature_importance = feature_importance.sort_values('Importance', ascending=False)
        logger.info("特征重要性分析完成")
        return feature_importance
    def visualize_feature_importance(self, model_name=None, top_n=10, output_dir=None):
        """
        Plot the top-N feature importances for a model.

        Args:
            model_name (str, optional): model to plot; defaults to the best.
            top_n (int): number of features to show.
            output_dir (str, optional): directory to save the PNG into.

        Returns:
            matplotlib.figure.Figure: the figure, or None if importances
            are unavailable.
        """
        logger.info(f"正在可视化特征重要性,显示前 {top_n} 个特征")
        feature_importance = self.analyze_feature_importance(model_name)
        if feature_importance is None:
            return None
        # Resolve the model name for the chart title.
        if model_name is None:
            model_name = self.best_model_name
        top_n = min(top_n, len(feature_importance))
        top_features = feature_importance.head(top_n)
        plt.figure(figsize=(10, 6))
        sns.barplot(x='Importance', y='Feature', data=top_features)
        plt.title(f'Top {top_n} 特征重要性 ({model_name})')
        plt.tight_layout()
        # Optionally persist the chart.
        if output_dir:
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            output_file = os.path.join(output_dir, f'feature_importance_{model_name.replace(" ", "_")}.png')
            plt.savefig(output_file)
            logger.info(f"特征重要性图表已保存至: {output_file}")
        return plt.gcf()
    def visualize_predictions(self, model_name=None, output_dir=None):
        """
        Scatter-plot true vs. predicted grades for a model, annotated
        with R2 and RMSE.

        Args:
            model_name (str, optional): model to plot; defaults to the best.
            output_dir (str, optional): directory to save the PNG into.

        Returns:
            matplotlib.figure.Figure: the figure, or None if results
            are unavailable.
        """
        logger.info("正在可视化预测结果")
        if not self.model_results:
            logger.warning("没有模型评估结果,请先调用evaluate_models方法")
            return None
        # Default to the best model.
        if model_name is None:
            if self.best_model_name is None:
                logger.warning("最佳模型未确定,请先调用evaluate_models方法")
                return None
            model_name = self.best_model_name
        if model_name not in self.model_results:
            logger.warning(f"模型 {model_name} 的评估结果不存在")
            return None
        # Ground truth matches whichever set evaluate_models used.
        if self.X_test is None or self.y_test is None:
            y_true = self.y_train
        else:
            y_true = self.y_test
        y_pred = self.model_results[model_name]['predictions']
        plt.figure(figsize=(10, 6))
        plt.scatter(y_true, y_pred, alpha=0.5)
        # Diagonal = perfect prediction.
        min_val = min(y_true.min(), y_pred.min())
        max_val = max(y_true.max(), y_pred.max())
        plt.plot([min_val, max_val], [min_val, max_val], 'r--')
        plt.xlabel('真实成绩')
        plt.ylabel('预测成绩')
        plt.title(f'真实成绩 vs 预测成绩 ({model_name})')
        # Annotate with the headline metrics.
        r2 = self.model_results[model_name]['R2']
        rmse = self.model_results[model_name]['RMSE']
        plt.annotate(f'R² = {r2:.4f}\nRMSE = {rmse:.4f}',
                    xy=(0.05, 0.95), xycoords='axes fraction',
                    bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8))
        plt.tight_layout()
        # Optionally persist the chart.
        if output_dir:
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            output_file = os.path.join(output_dir, f'predictions_{model_name.replace(" ", "_")}.png')
            plt.savefig(output_file)
            logger.info(f"预测结果图表已保存至: {output_file}")
        return plt.gcf()
    def visualize_model_comparison(self, metric='R2', output_dir=None):
        """
        Bar-chart all models by a single metric.

        Args:
            metric (str): one of 'R2', 'RMSE', 'MSE', 'MAE'.
            output_dir (str, optional): directory to save the PNG into.

        Returns:
            matplotlib.figure.Figure: the figure, or None if results
            are unavailable.
        """
        logger.info(f"正在可视化模型比较,指标: {metric}")
        if not self.model_results:
            logger.warning("没有模型评估结果,请先调用evaluate_models方法")
            return None
        # Collect the metric per model.
        models = []
        scores = []
        for name, result in self.model_results.items():
            if metric in result:
                models.append(name)
                scores.append(result[metric])
        df = pd.DataFrame({
            'Model': models,
            metric: scores
        })
        # Sort so the best model comes first (higher R2 is better;
        # the error metrics are better when lower).
        if metric == 'R2':
            df = df.sort_values(metric, ascending=False)
        else:
            df = df.sort_values(metric, ascending=True)
        plt.figure(figsize=(12, 6))
        ax = sns.barplot(x='Model', y=metric, data=df)
        # Annotate each bar with its value. Iterate the *sorted* frame so
        # labels line up with the bar order — the previous code enumerated
        # the unsorted `scores` list and attached values to the wrong bars.
        for i, v in enumerate(df[metric]):
            ax.text(i, v, f'{v:.4f}', ha='center', va='bottom' if metric == 'R2' else 'top')
        plt.title(f'模型比较 ({metric})')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        # Optionally persist the chart.
        if output_dir:
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            output_file = os.path.join(output_dir, f'model_comparison_{metric}.png')
            plt.savefig(output_file)
            logger.info(f"模型比较图表已保存至: {output_file}")
        return plt.gcf()
    def save_model(self, model_name=None, output_dir=None):
        """
        Persist a trained model (and the feature-name list) with joblib.

        Args:
            model_name (str, optional): model to save; defaults to the best.
            output_dir (str): directory to write into.

        Returns:
            str: path of the saved model file, or None on failure.
        """
        if output_dir is None:
            logger.warning("未指定输出目录,无法保存模型")
            return None
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        # Default to the best model.
        if model_name is None:
            if self.best_model_name is None:
                logger.warning("最佳模型未确定,请先调用evaluate_models方法")
                return None
            model_name = self.best_model_name
        if model_name not in self.trained_models:
            logger.warning(f"模型 {model_name} 不存在或未训练")
            return None
        model = self.trained_models[model_name]
        model_file = os.path.join(output_dir, f'{model_name.replace(" ", "_")}_model.pkl')
        joblib.dump(model, model_file)
        logger.info(f"模型已保存至: {model_file}")
        # The prediction service needs the exact feature order.
        feature_names_file = os.path.join(output_dir, 'feature_names.pkl')
        joblib.dump(list(self.X_train.columns), feature_names_file)
        logger.info(f"特征名称已保存至: {feature_names_file}")
        return model_file
    def save_results(self, output_dir):
        """
        Save the evaluation metrics of all models to a CSV, sorted by R2.

        Args:
            output_dir (str): directory to write into.

        Returns:
            str: path of the results CSV, or None if there are no results.
        """
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        if not self.model_results:
            logger.warning("没有模型评估结果,请先调用evaluate_models方法")
            return None
        # Flatten the metrics (predictions are excluded on purpose).
        results = []
        for name, result in self.model_results.items():
            results.append({
                'Model': name,
                'MSE': result['MSE'],
                'RMSE': result['RMSE'],
                'MAE': result['MAE'],
                'R2': result['R2']
            })
        df = pd.DataFrame(results)
        df = df.sort_values('R2', ascending=False)
        results_file = os.path.join(output_dir, 'model_evaluation_results.csv')
        df.to_csv(results_file, index=False)
        logger.info(f"评估结果已保存至: {results_file}")
        return results_file
    def train_and_evaluate(self, X_train_path, y_train_path, X_test_path=None, y_test_path=None, output_dir=None):
        """
        Run the full pipeline: load, train, evaluate, visualize and save.

        Args:
            X_train_path (str): training features CSV.
            y_train_path (str): training target CSV.
            X_test_path (str, optional): test features CSV.
            y_test_path (str, optional): test target CSV.
            output_dir (str, optional): directory for models, results and
                charts; nothing is written when omitted.

        Returns:
            tuple: (best model name, best model, evaluation results).
        """
        logger.info("开始模型训练和评估流程")
        self.load_data(X_train_path, y_train_path, X_test_path, y_test_path)
        self.train_models()
        self.evaluate_models()
        if output_dir:
            # Charts go into a dedicated subdirectory.
            vis_dir = os.path.join(output_dir, 'visualizations')
            if not os.path.exists(vis_dir):
                os.makedirs(vis_dir)
            self.visualize_feature_importance(output_dir=vis_dir)
            self.visualize_predictions(output_dir=vis_dir)
            self.visualize_model_comparison(metric='R2', output_dir=vis_dir)
            self.visualize_model_comparison(metric='RMSE', output_dir=vis_dir)
            self.save_model(output_dir=output_dir)
            self.save_results(output_dir)
        logger.info("模型训练和评估流程完成")
        return self.best_model_name, self.best_model, self.model_results
def main():
    """Smoke-test entry point: train and evaluate on the project data files."""
    # Resolve project-relative paths (this file is expected under src/).
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    processed_dir = os.path.join(project_root, 'data', 'processed')
    model_dir = os.path.join(project_root, 'model')
    # Prefer engineered features, falling back to the plain processed ones.
    features_file = os.path.join(processed_dir, 'feature_engineering', 'engineered_features.csv')
    if not os.path.exists(features_file):
        features_file = os.path.join(processed_dir, 'processed_features.csv')
    target_file = os.path.join(processed_dir, 'processed_target.csv')
    trainer = ModelTrainer()
    best_model_name, best_model, results = trainer.train_and_evaluate(
        features_file, target_file, output_dir=model_dir
    )
    # Report the winner, then all models ranked by R2.
    print(f"\n最佳模型: {best_model_name}")
    print(f"R2分数: {results[best_model_name]['R2']:.4f}")
    print(f"RMSE: {results[best_model_name]['RMSE']:.4f}")
    print("\n所有模型的R2分数:")
    ranked = sorted(results.items(), key=lambda item: item[1]['R2'], reverse=True)
    for name, result in ranked:
        print(f"{name}: {result['R2']:.4f}")


if __name__ == "__main__":
    main()
# ==== file: src\model_training_enhanced.py ====
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
增强版模型训练与评估模块
使用多种机器学习算法构建预测模型,并评估其性能
增加了更多高级功能和优化选项
"""
import os
import pandas as pd
import numpy as np
import joblib
import json
import time
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import logging
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, explained_variance_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score, KFold, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_regression, RFE
import warnings
from sklearn.base import clone
# NOTE(review): suppresses ALL warnings globally (including sklearn
# convergence warnings) — consider narrowing the filter in production.
warnings.filterwarnings('ignore')
# Module-wide logging configuration: INFO level, timestamped format.
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
class EnhancedModelTrainer:
"""
Enhanced model trainer: trains, evaluates and selects the best regression
model, with extra options (preprocessing, hyperparameter tuning, learning
curves, cross-validation and model export).
"""
def __init__(self, config=None):
"""
Initialize the trainer with empty state and register the default models.
Parameters:
config (dict, optional): configuration dictionary; any falsy value
(None, {}) is replaced with an empty dict.
"""
self.config = config or {}
# Raw train/test splits (pandas objects once load_data* has run).
self.X_train = None
self.X_test = None
self.y_train = None
self.y_test = None
# name -> unfitted estimator (filled by _init_models / hyperparameter_tuning).
self.models = {}
# name -> fitted estimator (filled by train_models).
self.trained_models = {}
# name -> metric dict (filled by evaluate_models).
self.model_results = {}
# Best model by R2, set by evaluate_models.
self.best_model_name = None
self.best_model = None
# Training feature column names, set by load_data*.
self.feature_names = None
# name -> fit wall-clock seconds.
self.training_time = {}
# Reserved for model pipelines; not populated in the visible code.
self.model_pipelines = {}
# Fitted ColumnTransformer, set by preprocess_data.
self.preprocessor = None
# Register the default model zoo.
self._init_models()
def _init_models(self):
    """Register the default model zoo with baseline hyper-parameters.

    Populates self.models with name -> unfitted estimator. Insertion order
    (linear, tree-based, other) is preserved.
    """
    seed = 42
    linear_family = {
        'Linear Regression': LinearRegression(),
        'Ridge Regression': Ridge(alpha=1.0),
        'Lasso Regression': Lasso(alpha=0.1),
        'Elastic Net': ElasticNet(alpha=0.1, l1_ratio=0.5),
    }
    tree_family = {
        'Decision Tree': DecisionTreeRegressor(max_depth=10, random_state=seed),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=seed),
        'Extra Trees': ExtraTreesRegressor(n_estimators=100, random_state=seed),
        'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=seed),
    }
    other_family = {
        'SVR': SVR(kernel='rbf', C=1.0, epsilon=0.1),
        'K-Neighbors': KNeighborsRegressor(n_neighbors=5),
        'AdaBoost': AdaBoostRegressor(random_state=seed),
        'MLP': MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000, random_state=seed),
    }
    self.models = {**linear_family, **tree_family, **other_family}
def load_data(self, X_train_path, y_train_path, X_test_path=None, y_test_path=None):
    """Load training (and optionally test) features/targets from CSV files.

    Parameters:
        X_train_path (str): training feature CSV path.
        y_train_path (str): training target CSV path.
        X_test_path (str, optional): test feature CSV path.
        y_test_path (str, optional): test target CSV path (both test paths
            must be given for the test set to be loaded).

    Returns:
        tuple: (X_train, X_test, y_train, y_test)
    """
    logger.info(f"正在加载训练数据: {X_train_path}, {y_train_path}")
    self.X_train = pd.read_csv(X_train_path)
    self.feature_names = self.X_train.columns.tolist()
    y_train_raw = pd.read_csv(y_train_path)
    # Collapse a single-column frame into a Series.
    self.y_train = (
        y_train_raw.iloc[:, 0]
        if isinstance(y_train_raw, pd.DataFrame) and y_train_raw.shape[1] == 1
        else y_train_raw
    )
    if X_test_path and y_test_path:
        logger.info(f"正在加载测试数据: {X_test_path}, {y_test_path}")
        self.X_test = pd.read_csv(X_test_path)
        y_test_raw = pd.read_csv(y_test_path)
        self.y_test = (
            y_test_raw.iloc[:, 0]
            if isinstance(y_test_raw, pd.DataFrame) and y_test_raw.shape[1] == 1
            else y_test_raw
        )
    logger.info(f"数据加载完成,训练集形状: {self.X_train.shape}, {self.y_train.shape}")
    if self.X_test is not None:
        logger.info(f"测试集形状: {self.X_test.shape}, {self.y_test.shape}")
    return self.X_train, self.X_test, self.y_train, self.y_test
def load_data_from_dataframe(self, X_train, y_train, X_test=None, y_test=None):
    """Load train (and optionally test) data directly from pandas objects.

    Parameters:
        X_train (DataFrame): training features.
        y_train (Series/DataFrame): training targets; a single-column frame
            is collapsed to a Series.
        X_test (DataFrame, optional): test features.
        y_test (Series/DataFrame, optional): test targets.

    Returns:
        tuple: (X_train, X_test, y_train, y_test)
    """
    logger.info("正在从DataFrame加载数据")
    self.X_train = X_train
    self.feature_names = self.X_train.columns.tolist()
    self.y_train = (
        y_train.iloc[:, 0]
        if isinstance(y_train, pd.DataFrame) and y_train.shape[1] == 1
        else y_train
    )
    # The test set is only stored when both pieces are supplied.
    if X_test is not None and y_test is not None:
        self.X_test = X_test
        self.y_test = (
            y_test.iloc[:, 0]
            if isinstance(y_test, pd.DataFrame) and y_test.shape[1] == 1
            else y_test
        )
    logger.info(f"数据加载完成,训练集形状: {self.X_train.shape}, {self.y_train.shape}")
    if self.X_test is not None:
        logger.info(f"测试集形状: {self.X_test.shape}, {self.y_test.shape}")
    return self.X_train, self.X_test, self.y_train, self.y_test
def preprocess_data(self, categorical_features=None, numerical_features=None, scaling='standard'):
    """Fit a preprocessing pipeline (scaling + one-hot encoding) on the data.

    Any feature list left as None is auto-detected from the training frame.
    BUGFIX: the original only auto-detected when BOTH lists were None, so a
    caller specifying just one side left the other side raw — unscaled
    numerics, or string columns that crash downstream estimators.

    Parameters:
        categorical_features (list, optional): categorical column names.
        numerical_features (list, optional): numerical column names.
        scaling (str): 'standard' or 'minmax'.

    Returns:
        tuple: (X_train_processed, X_test_processed); X_test_processed is
        None when no test set is loaded.

    Raises:
        ValueError: if training data has not been loaded.
    """
    logger.info(f"正在预处理数据,缩放方法: {scaling}")
    if self.X_train is None:
        raise ValueError("训练数据未加载,请先调用load_data方法")
    # Auto-detect whichever side the caller did not specify.
    if categorical_features is None:
        categorical_features = self.X_train.select_dtypes(include=['object', 'category']).columns.tolist()
    if numerical_features is None:
        # NOTE: only int64/float64 are detected (as in the original code);
        # 32-bit numeric columns pass through unscaled.
        numerical_features = self.X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
        # Never scale a column the caller explicitly declared categorical.
        numerical_features = [c for c in numerical_features if c not in categorical_features]
    logger.info(f"自动检测到 {len(categorical_features)} 个类别特征和 {len(numerical_features)} 个数值特征")
    preprocessor = self._create_preprocessor(categorical_features, numerical_features, scaling)
    # Fit on train, transform both splits with the same fitted state.
    X_train_processed = preprocessor.fit_transform(self.X_train)
    X_test_processed = None
    if self.X_test is not None:
        X_test_processed = preprocessor.transform(self.X_test)
    logger.info("数据预处理完成")
    self.preprocessor = preprocessor
    return X_train_processed, X_test_processed
def _create_preprocessor(self, categorical_features, numerical_features, scaling='standard'):
    """Build the ColumnTransformer: scale numerics, one-hot encode categoricals.

    Parameters:
        categorical_features (list): categorical column names.
        numerical_features (list): numerical column names.
        scaling (str): 'standard' or 'minmax'.

    Returns:
        ColumnTransformer: untouched columns are passed through unchanged.

    Raises:
        ValueError: on an unsupported scaling method.
    """
    from sklearn.preprocessing import OneHotEncoder
    scaler_classes = {'standard': StandardScaler, 'minmax': MinMaxScaler}
    if scaling not in scaler_classes:
        raise ValueError(f"不支持的缩放方法: {scaling}")
    transformers = []
    if numerical_features:
        transformers.append(('num', scaler_classes[scaling](), numerical_features))
    if categorical_features:
        # handle_unknown='ignore': unseen categories at transform time
        # become all-zero rows instead of raising.
        transformers.append(('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features))
    return ColumnTransformer(transformers, remainder='passthrough')
def train_models(self, selected_models=None):
    """Fit every registered model (or a requested subset) on the training set.

    Models whose fit() raises are logged and skipped; successful fits are
    recorded together with their wall-clock training time.

    Parameters:
        selected_models (list, optional): model names to train; None = all.

    Returns:
        dict: name -> fitted model.

    Raises:
        ValueError: if training data has not been loaded.
    """
    logger.info("开始训练模型")
    if self.X_train is None or self.y_train is None:
        raise ValueError("训练数据未加载,请先调用load_data方法")
    self.trained_models = {}
    self.training_time = {}
    # Resolve the training subset, warning about unknown names.
    if selected_models is None:
        models_to_train = self.models
    else:
        models_to_train = {}
        for name in selected_models:
            if name in self.models:
                models_to_train[name] = self.models[name]
            else:
                logger.warning(f"模型 {name} 不存在,将被跳过")
    for name, model in models_to_train.items():
        logger.info(f"正在训练模型: {name}")
        try:
            started = time.time()
            model.fit(self.X_train, self.y_train)
            elapsed = time.time() - started
            self.trained_models[name] = model
            self.training_time[name] = elapsed
            logger.info(f"模型 {name} 训练完成,耗时: {elapsed:.2f}秒")
        except Exception as e:
            logger.error(f"模型 {name} 训练失败: {str(e)}")
    logger.info(f"所有模型训练完成,共 {len(self.trained_models)} 个模型")
    return self.trained_models
def evaluate_models(self, additional_metrics=False):
    """Score every trained model on the test set (train set as fallback).

    Fills self.model_results with per-model metrics and selects the best
    model by R2 into self.best_model_name / self.best_model.

    Parameters:
        additional_metrics (bool): also compute EVS and error statistics.

    Returns:
        dict: name -> metric dict (empty when nothing was trained).
    """
    logger.info("开始评估模型")
    if not self.trained_models:
        logger.warning("没有训练好的模型,请先调用train_models方法")
        return {}
    # Fall back to the training data when no hold-out set was loaded.
    use_train = self.X_test is None or self.y_test is None
    if use_train:
        logger.warning("测试数据未加载,将使用训练数据进行评估")
    X_eval = self.X_train if use_train else self.X_test
    y_eval = self.y_train if use_train else self.y_test
    self.model_results = {}
    for name, model in self.trained_models.items():
        logger.info(f"正在评估模型: {name}")
        try:
            y_pred = model.predict(X_eval)
            mse = mean_squared_error(y_eval, y_pred)
            metrics = {
                'MSE': mse,
                'RMSE': np.sqrt(mse),
                'MAE': mean_absolute_error(y_eval, y_pred),
                'R2': r2_score(y_eval, y_pred),
                'predictions': y_pred,
                'training_time': self.training_time.get(name, 0),
            }
            if additional_metrics:
                residuals = y_eval - y_pred
                metrics['EVS'] = explained_variance_score(y_eval, y_pred)
                metrics['mean_error'] = np.mean(residuals)
                metrics['median_error'] = np.median(residuals)
                metrics['error_std'] = np.std(residuals)
            self.model_results[name] = metrics
            logger.info(f"模型 {name} 评估完成: RMSE={metrics['RMSE']:.4f}, R2={metrics['R2']:.4f}")
        except Exception as e:
            logger.error(f"模型 {name} 评估失败: {str(e)}")
    # Best model = highest R2 among successfully evaluated models.
    if self.model_results:
        self.best_model_name = max(self.model_results, key=lambda m: self.model_results[m]['R2'])
        self.best_model = self.trained_models[self.best_model_name]
        logger.info(f"最佳模型: {self.best_model_name}, R2={self.model_results[self.best_model_name]['R2']:.4f}")
    return self.model_results
def cross_validate_models(self, cv=5, scoring='r2', selected_models=None):
"""
Cross-validate models on the training set with a shuffled KFold.
Parameters:
cv (int): number of folds
scoring (str): scoring name -- NOTE(review): accepted but currently
ignored; the per-fold score below is always computed with r2_score.
Confirm whether other scorers were intended.
selected_models (list, optional): model names to validate; None = all
Returns:
dict: name -> {'mean_score', 'std_score', 'fold_scores', 'fold_predictions'}
Raises:
ValueError: if training data has not been loaded
"""
logger.info(f"开始交叉验证,折数: {cv}, 评分标准: {scoring}")
if self.X_train is None or self.y_train is None:
raise ValueError("训练数据未加载,请先调用load_data方法")
cv_results = {}
# Resolve which models to validate, warning about unknown names.
models_to_validate = {}
if selected_models is not None:
for name in selected_models:
if name in self.models:
models_to_validate[name] = self.models[name]
else:
logger.warning(f"模型 {name} 不存在,将被跳过")
else:
models_to_validate = self.models
for name, model in models_to_validate.items():
logger.info(f"正在对模型 {name} 进行交叉验证")
try:
# Shuffled KFold with a fixed seed for reproducibility.
kf = KFold(n_splits=cv, shuffle=True, random_state=42)
# Per-fold predictions and scores.
fold_predictions = []
fold_scores = []
# Train/evaluate on each fold; assumes X_train/y_train support .iloc
# (i.e. they are pandas objects, as produced by load_data*).
for fold, (train_idx, val_idx) in enumerate(kf.split(self.X_train)):
X_fold_train, X_fold_val = self.X_train.iloc[train_idx], self.X_train.iloc[val_idx]
y_fold_train, y_fold_val = self.y_train.iloc[train_idx], self.y_train.iloc[val_idx]
# Clone so the registered (unfitted) estimator is never mutated here.
model_clone = clone(model)
model_clone.fit(X_fold_train, y_fold_train)
# Predict and score the held-out fold.
y_fold_pred = model_clone.predict(X_fold_val)
fold_score = r2_score(y_fold_val, y_fold_pred)
# Accumulate fold results.
fold_predictions.append(y_fold_pred)
fold_scores.append(fold_score)
logger.debug(f"模型 {name}, 折 {fold+1}/{cv}, R2: {fold_score:.4f}")
# Mean and spread across folds.
mean_score = np.mean(fold_scores)
std_score = np.std(fold_scores)
cv_results[name] = {
'mean_score': mean_score,
'std_score': std_score,
'fold_scores': fold_scores,
'fold_predictions': fold_predictions
}
logger.info(f"模型 {name} 交叉验证完成: 平均R2={mean_score:.4f}, 标准差={std_score:.4f}")
except Exception as e:
logger.error(f"模型 {name} 交叉验证失败: {str(e)}")
return cv_results
def hyperparameter_tuning(self, model_name, param_grid, method='grid', cv=5, scoring='r2', n_iter=10):
"""
Tune one registered model with grid or randomized search; keep the winner.
Parameters:
model_name (str): name of a model registered in self.models.
param_grid (dict): parameter grid ('grid') or distributions ('random').
method (str): tuning method, 'grid' or 'random'.
cv (int): number of cross-validation folds.
scoring (str): sklearn scoring name.
n_iter (int): iteration count for randomized search.
Returns:
tuple: (best_model, best_params, best_score) — note: a 3-tuple; the
original docstring wrongly claimed only the model is returned.
Raises:
ValueError: missing training data, unknown model name, or
unsupported method.
"""
logger.info(f"开始对模型 {model_name} 进行超参数调优,方法: {method}")
if self.X_train is None or self.y_train is None:
raise ValueError("训练数据未加载,请先调用load_data方法")
if model_name not in self.models:
raise ValueError(f"模型 {model_name} 不存在")
model = self.models[model_name]
# Build the search object for the requested strategy.
if method == 'grid':
search = GridSearchCV(
estimator=model,
param_grid=param_grid,
cv=cv,
scoring=scoring,
n_jobs=-1,
verbose=1
)
elif method == 'random':
search = RandomizedSearchCV(
estimator=model,
param_distributions=param_grid,
n_iter=n_iter,
cv=cv,
scoring=scoring,
n_jobs=-1,
random_state=42,
verbose=1
)
else:
raise ValueError(f"不支持的调优方法: {method}")
# Run the search (fits cv x candidate-count models; can be slow).
search.fit(self.X_train, self.y_train)
# Extract the winner.
best_params = search.best_params_
best_score = search.best_score_
best_model = search.best_estimator_
logger.info(f"超参数调优完成,最佳参数: {best_params}, 最佳分数: {best_score:.4f}")
# Replace the registered model with the tuned (fitted) estimator.
self.models[model_name] = best_model
# Keep trained_models in sync if this model was already trained.
if model_name in self.trained_models:
self.trained_models[model_name] = best_model
return best_model, best_params, best_score
def analyze_feature_importance(self, model_name=None):
"""
Derive per-feature importance for one trained model.
Tries, in order: linear coefficients (absolute values), tree
feature_importances_, the final step of a Pipeline, then falls back to
sklearn permutation importance.
Parameters:
model_name (str, optional): model to analyze; None = best model.
Returns:
pandas.DataFrame: columns ['Feature', 'Importance'] sorted descending,
or None when no importance could be derived.
NOTE(review): feature names come from self.X_train.columns; if the model
was fitted on preprocessed/one-hot-encoded arrays the lengths may not
match and the DataFrame constructor will raise — confirm upstream usage.
"""
logger.info("开始分析特征重要性")
if not self.trained_models:
logger.warning("没有训练好的模型,请先调用train_models方法")
return None
# Resolve the model to analyze (default: best by R2).
if model_name is None:
if self.best_model_name is None:
logger.warning("最佳模型未确定,请先调用evaluate_models方法")
return None
model_name = self.best_model_name
if model_name not in self.trained_models:
logger.warning(f"模型 {model_name} 不存在或未训练")
return None
model = self.trained_models[model_name]
# Feature names taken from the raw training frame.
feature_names = self.X_train.columns.tolist()
# Importance vector, filled by whichever branch matches the model type.
importance = None
# Linear models: absolute coefficients (averaged over outputs if 2-D).
if hasattr(model, 'coef_'):
if len(model.coef_.shape) == 1:
importance = np.abs(model.coef_)
else:
importance = np.mean([np.abs(model.coef_[i]) for i in range(model.coef_.shape[0])], axis=0)
# Tree-based models.
elif hasattr(model, 'feature_importances_'):
importance = model.feature_importances_
# Pipelines: inspect the final step.
elif hasattr(model, 'steps') and hasattr(model.steps[-1][1], 'feature_importances_'):
importance = model.steps[-1][1].feature_importances_
elif hasattr(model, 'steps') and hasattr(model.steps[-1][1], 'coef_'):
if len(model.steps[-1][1].coef_.shape) == 1:
importance = np.abs(model.steps[-1][1].coef_)
else:
importance = np.mean([np.abs(model.steps[-1][1].coef_[i]) for i in range(model.steps[-1][1].coef_.shape[0])], axis=0)
else:
logger.warning(f"模型 {model_name} 不支持特征重要性分析")
# Model-agnostic fallback: permutation importance.
try:
from sklearn.inspection import permutation_importance
logger.info("尝试使用排列重要性分析")
# Prefer the held-out test set when available.
if self.X_test is not None and self.y_test is not None:
X_perm = self.X_test
y_perm = self.y_test
else:
X_perm = self.X_train
y_perm = self.y_train
# 10 shuffles per feature, fixed seed for reproducibility.
perm_importance = permutation_importance(model, X_perm, y_perm, n_repeats=10, random_state=42)
importance = perm_importance.importances_mean
except Exception as e:
logger.error(f"排列重要性分析失败: {str(e)}")
return None
# Assemble and sort the result table.
feature_importance = pd.DataFrame({
'Feature': feature_names,
'Importance': importance
})
feature_importance = feature_importance.sort_values('Importance', ascending=False)
logger.info("特征重要性分析完成")
return feature_importance
def visualize_feature_importance(self, model_name=None, top_n=10, output_dir=None):
"""
Horizontal bar chart of the top-N most important features for one model.
Parameters:
model_name (str, optional): model to plot; None = best model.
top_n (int): number of features shown (capped at the feature count).
output_dir (str, optional): directory the PNG is saved into.
Returns:
matplotlib.figure.Figure or None: the figure, or None when importance
analysis failed.
NOTE(review): Chinese labels/titles need a CJK-capable matplotlib font
to render — confirm the deployment environment.
"""
logger.info(f"正在可视化特征重要性,显示前 {top_n} 个特征")
# Compute the importance table (may be None on unsupported models).
feature_importance = self.analyze_feature_importance(model_name)
if feature_importance is None:
return None
# Resolve the model name for the chart title.
if model_name is None:
model_name = self.best_model_name
# Cap top_n at the number of available features.
top_n = min(top_n, len(feature_importance))
# Head of the (descending-sorted) table.
top_features = feature_importance.head(top_n)
# Draw the bar chart.
plt.figure(figsize=(12, 8))
ax = sns.barplot(x='Importance', y='Feature', data=top_features, palette='viridis')
# Numeric labels at the end of each bar.
for i, v in enumerate(top_features['Importance']):
ax.text(v, i, f'{v:.4f}', va='center')
plt.title(f'Top {top_n} 特征重要性 ({model_name})', fontsize=15)
plt.xlabel('重要性', fontsize=12)
plt.ylabel('特征', fontsize=12)
plt.tight_layout()
# Persist to disk when requested.
if output_dir:
if not os.path.exists(output_dir):
os.makedirs(output_dir)
output_file = os.path.join(output_dir, f'feature_importance_{model_name.replace(" ", "_")}.png')
plt.savefig(output_file, dpi=300, bbox_inches='tight')
logger.info(f"特征重要性图表已保存至: {output_file}")
return plt.gcf()
def visualize_predictions(self, model_name=None, output_dir=None):
"""
Plot predicted vs. true targets plus the error distribution for one model.
Parameters:
model_name (str, optional): model to plot; None = best model.
output_dir (str, optional): directory the PNG is saved into.
Returns:
matplotlib.figure.Figure or None: two-panel figure, or None when the
required evaluation results are missing.
"""
logger.info("正在可视化预测结果")
if not self.model_results:
logger.warning("没有模型评估结果,请先调用evaluate_models方法")
return None
# Resolve the model (default: best by R2).
if model_name is None:
if self.best_model_name is None:
logger.warning("最佳模型未确定,请先调用evaluate_models方法")
return None
model_name = self.best_model_name
if model_name not in self.model_results:
logger.warning(f"模型 {model_name} 的评估结果不存在")
return None
# Ground truth mirrors the fallback in evaluate_models: stored
# predictions were made on the test set when present, else the train set.
if self.X_test is None or self.y_test is None:
y_true = self.y_train
else:
y_true = self.y_test
y_pred = self.model_results[model_name]['predictions']
# Two panels: scatter (left) and error histogram (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 8))
# Scatter: true vs. predicted values.
ax1.scatter(y_true, y_pred, alpha=0.6, color='blue', edgecolor='k')
# Diagonal = perfect prediction.
min_val = min(y_true.min(), y_pred.min())
max_val = max(y_true.max(), y_pred.max())
ax1.plot([min_val, max_val], [min_val, max_val], 'r--', lw=2)
ax1.set_xlabel('真实成绩', fontsize=12)
ax1.set_ylabel('预测成绩', fontsize=12)
ax1.set_title(f'真实成绩 vs 预测成绩 ({model_name})', fontsize=15)
# Annotate headline metrics on the scatter panel.
r2 = self.model_results[model_name]['R2']
rmse = self.model_results[model_name]['RMSE']
mae = self.model_results[model_name]['MAE']
ax1.annotate(f'R² = {r2:.4f}\nRMSE = {rmse:.4f}\nMAE = {mae:.4f}',
xy=(0.05, 0.95), xycoords='axes fraction',
bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8),
fontsize=12)
# Error histogram with a zero-error reference line.
errors = y_true - y_pred
ax2.hist(errors, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
ax2.axvline(x=0, color='r', linestyle='--', linewidth=2)
ax2.set_xlabel('预测误差', fontsize=12)
ax2.set_ylabel('频率', fontsize=12)
ax2.set_title('预测误差分布', fontsize=15)
# Annotate error statistics on the histogram panel.
mean_error = np.mean(errors)
median_error = np.median(errors)
std_error = np.std(errors)
ax2.annotate(f'平均误差 = {mean_error:.4f}\n中位数误差 = {median_error:.4f}\n标准差 = {std_error:.4f}',
xy=(0.05, 0.95), xycoords='axes fraction',
bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8),
fontsize=12)
plt.tight_layout()
# Persist to disk when requested.
if output_dir:
if not os.path.exists(output_dir):
os.makedirs(output_dir)
output_file = os.path.join(output_dir, f'predictions_{model_name.replace(" ", "_")}.png')
plt.savefig(output_file, dpi=300, bbox_inches='tight')
logger.info(f"预测结果图表已保存至: {output_file}")
return fig
def visualize_model_comparison(self, metric='R2', output_dir=None, sort_ascending=None):
"""
Bar-chart comparison of all evaluated models on one metric, plus a
second panel with per-model training times.
Parameters:
metric (str): one of 'R2', 'RMSE', 'MSE', 'MAE'.
output_dir (str, optional): directory the PNG is saved into.
sort_ascending (bool, optional): sort direction; None = automatic
(descending for R2, ascending for the error metrics).
Returns:
matplotlib.figure.Figure or None: the figure, or None when there are
no evaluation results.
"""
logger.info(f"正在可视化模型比较,指标: {metric}")
if not self.model_results:
logger.warning("没有模型评估结果,请先调用evaluate_models方法")
return None
# Collect the metric (and training time) per model; models whose result
# dict lacks the metric are silently skipped.
models = []
scores = []
training_times = []
for name, result in self.model_results.items():
if metric in result:
models.append(name)
scores.append(result[metric])
training_times.append(result.get('training_time', 0))
# Tabulate for plotting.
df = pd.DataFrame({
'Model': models,
metric: scores,
'Training Time (s)': training_times
})
# Automatic sort direction: higher-is-better only for R2.
if sort_ascending is None:
sort_ascending = False if metric == 'R2' else True
df = df.sort_values(metric, ascending=sort_ascending)
# Two stacked panels: metric (top, 3x height) and training time (bottom).
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 12), gridspec_kw={'height_ratios': [3, 1]})
# Metric bars, one distinct viridis color per model.
colors = plt.cm.viridis(np.linspace(0, 1, len(df)))
bars = ax1.bar(df['Model'], df[metric], color=colors)
# Value labels just above each bar.
for bar, score in zip(bars, df[metric]):
height = bar.get_height()
ax1.text(bar.get_x() + bar.get_width()/2., height + (0.01 * max(df[metric])),
f'{score:.4f}', ha='center', va='bottom', fontsize=10)
ax1.set_title(f'模型比较 ({metric})', fontsize=16)
ax1.set_ylabel(metric, fontsize=14)
ax1.tick_params(axis='x', rotation=45, labelsize=12)
ax1.grid(axis='y', linestyle='--', alpha=0.7)
# Training-time bars (same color coding as above).
ax2.bar(df['Model'], df['Training Time (s)'], color=colors, alpha=0.7)
ax2.set_title('模型训练时间 (秒)', fontsize=14)
ax2.set_ylabel('时间 (秒)', fontsize=12)
ax2.tick_params(axis='x', rotation=45, labelsize=12)
ax2.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
# Persist to disk when requested.
if output_dir:
if not os.path.exists(output_dir):
os.makedirs(output_dir)
output_file = os.path.join(output_dir, f'model_comparison_{metric}.png')
plt.savefig(output_file, dpi=300, bbox_inches='tight')
logger.info(f"模型比较图表已保存至: {output_file}")
return fig
def save_model(self, model_name=None, output_dir=None):
    """Persist a trained model plus feature names, preprocessor and metadata.

    BUGFIX: the original indexed self.model_results[model_name]
    unconditionally, raising KeyError for a model that was trained but
    never evaluated, and passed numpy scalar metrics straight to
    json.dump (TypeError on newer numpy versions). Metrics are now
    optional and cast to plain floats.

    Parameters:
        model_name (str, optional): model to save; None = best model.
        output_dir (str): target directory (created if absent).

    Returns:
        str: saved model file path, or None when nothing could be saved.
    """
    if output_dir is None:
        logger.warning("未指定输出目录,无法保存模型")
        return None
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # Resolve the model (default: best by R2).
    if model_name is None:
        if self.best_model_name is None:
            logger.warning("最佳模型未确定,请先调用evaluate_models方法")
            return None
        model_name = self.best_model_name
    if model_name not in self.trained_models:
        logger.warning(f"模型 {model_name} 不存在或未训练")
        return None
    model = self.trained_models[model_name]
    # Persist the estimator itself.
    model_file = os.path.join(output_dir, f'{model_name.replace(" ", "_")}_model.pkl')
    joblib.dump(model, model_file)
    logger.info(f"模型已保存至: {model_file}")
    # Persist the training feature names (needed to align prediction input).
    feature_names_file = os.path.join(output_dir, 'feature_names.pkl')
    joblib.dump(list(self.X_train.columns), feature_names_file)
    logger.info(f"特征名称已保存至: {feature_names_file}")
    # Persist the fitted preprocessor when one exists.
    if getattr(self, 'preprocessor', None) is not None:
        preprocessor_file = os.path.join(output_dir, 'preprocessor.pkl')
        joblib.dump(self.preprocessor, preprocessor_file)
        logger.info(f"预处理器已保存至: {preprocessor_file}")
    # Metadata: only include metrics that actually exist, as JSON-safe floats.
    eval_metrics = self.model_results.get(model_name, {})
    performance = {
        metric: float(eval_metrics[metric])
        for metric in ('R2', 'RMSE', 'MAE', 'MSE')
        if metric in eval_metrics
    }
    metadata = {
        'model_name': model_name,
        'feature_count': len(self.X_train.columns),
        'training_samples': len(self.X_train),
        'creation_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'performance': performance,
        'training_time': self.training_time.get(model_name, 0)
    }
    metadata_file = os.path.join(output_dir, 'model_metadata.json')
    with open(metadata_file, 'w') as f:
        json.dump(metadata, f, indent=4)
    logger.info(f"模型元数据已保存至: {metadata_file}")
    return model_file
def save_results(self, output_dir):
    """Write all model evaluation metrics (incl. training time) to a CSV.

    Parameters:
        output_dir (str): target directory; created if it does not exist.

    Returns:
        str: CSV path, or None when there are no evaluation results.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    if not self.model_results:
        logger.warning("没有模型评估结果,请先调用evaluate_models方法")
        return None
    # Flatten per-model metric dicts into tabular rows.
    rows = [
        {
            'Model': model_name,
            'MSE': metrics['MSE'],
            'RMSE': metrics['RMSE'],
            'MAE': metrics['MAE'],
            'R2': metrics['R2'],
            'Training Time (s)': metrics.get('training_time', 0),
        }
        for model_name, metrics in self.model_results.items()
    ]
    # Best model (highest R2) on top.
    table = pd.DataFrame(rows).sort_values('R2', ascending=False)
    results_file = os.path.join(output_dir, 'model_evaluation_results.csv')
    table.to_csv(results_file, index=False)
    logger.info(f"评估结果已保存至: {results_file}")
    return results_file
@staticmethod
def load_model(model_path, feature_names_path=None, preprocessor_path=None):
"""
加载已保存的模型
参数:
model_path (str): 模型文件路径
feature_names_path (str, optional): 特征名称文件路径
preprocessor_path (str, optional): 预处理器文件路径
返回:
tuple: (模型, 特征名称列表, 预处理器)
"""
logger.info(f"正在加载模型: {model_path}")
# 加载模型
model = joblib.load(model_path)
# 加载特征名称(如果提供)
feature_names = None
if feature_names_path and os.path.exists(feature_names_path):
feature_names = joblib.load(feature_names_path)
logger.info(f"已加载特征名称,共 {len(feature_names)} 个特征")
# 加载预处理器(如果提供)
preprocessor = None
if preprocessor_path and os.path.exists(preprocessor_path):
preprocessor = joblib.load(preprocessor_path)
logger.info("已加载预处理器")
return model, feature_names, preprocessor
def predict(self, X, model_name=None):
    """Predict with a chosen trained model (defaults to the best one).

    Parameters:
        X (DataFrame): input features.
        model_name (str, optional): name of a trained model; None = best.

    Returns:
        array or None: predictions, or None when the model is unavailable
        or prediction raises.
    """
    logger.info("正在进行预测")
    # Fall back to the best model when no explicit name is given.
    if model_name is None:
        if self.best_model_name is None:
            logger.warning("最佳模型未确定,请先调用evaluate_models方法")
            return None
        model_name = self.best_model_name
    if model_name not in self.trained_models:
        logger.warning(f"模型 {model_name} 不存在或未训练")
        return None
    chosen = self.trained_models[model_name]
    try:
        output = chosen.predict(X)
    except Exception as e:
        logger.error(f"预测失败: {str(e)}")
        return None
    logger.info(f"预测完成,使用模型: {model_name}")
    return output
def plot_learning_curve(self, model_name=None, cv=5, train_sizes=None, output_dir=None):
    """Plot train/validation R² against training-set size for one model.

    Parameters:
        model_name (str, optional): model to analyze; None = best model.
        cv (int): number of cross-validation folds.
        train_sizes (array-like, optional): fractions of the training set to
            evaluate; defaults to np.linspace(0.1, 1.0, 10). FIX: the
            original signature used the numpy array itself as the default
            argument (built once at import time and shared across calls);
            replaced with the None-sentinel idiom — behavior is unchanged.
        output_dir (str, optional): directory the PNG is saved into.

    Returns:
        matplotlib.figure.Figure or None: the figure, or None on failure.
    """
    logger.info("正在绘制学习曲线")
    if train_sizes is None:
        train_sizes = np.linspace(0.1, 1.0, 10)
    # Resolve the model (default: best by R2).
    if model_name is None:
        if self.best_model_name is None:
            logger.warning("最佳模型未确定,请先调用evaluate_models方法")
            return None
        model_name = self.best_model_name
    if model_name not in self.models:
        logger.warning(f"模型 {model_name} 不存在")
        return None
    model = self.models[model_name]
    try:
        # learning_curve refits the estimator for every size/fold pair.
        train_sizes, train_scores, test_scores = learning_curve(
            model, self.X_train, self.y_train, cv=cv, train_sizes=train_sizes,
            scoring='r2', n_jobs=-1
        )
        # Mean and spread over folds at each training size.
        train_mean = np.mean(train_scores, axis=1)
        train_std = np.std(train_scores, axis=1)
        test_mean = np.mean(test_scores, axis=1)
        test_std = np.std(test_scores, axis=1)
        plt.figure(figsize=(10, 6))
        # Train vs. validation score curves.
        plt.plot(train_sizes, train_mean, 'o-', color='r', label='训练集分数')
        plt.plot(train_sizes, test_mean, 'o-', color='g', label='验证集分数')
        # ±1 standard-deviation bands.
        plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color='r')
        plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.1, color='g')
        plt.xlabel('训练样本数', fontsize=12)
        plt.ylabel('R²分数', fontsize=12)
        plt.title(f'学习曲线 ({model_name})', fontsize=15)
        plt.legend(loc='best', fontsize=12)
        plt.grid(True, linestyle='--', alpha=0.7)
        # Persist to disk when requested.
        if output_dir:
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            output_file = os.path.join(output_dir, f'learning_curve_{model_name.replace(" ", "_")}.png')
            plt.savefig(output_file, dpi=300, bbox_inches='tight')
            logger.info(f"学习曲线图表已保存至: {output_file}")
        return plt.gcf()
    except Exception as e:
        logger.error(f"绘制学习曲线失败: {str(e)}")
        return None
def train_and_evaluate(self, X_train_path, y_train_path, X_test_path=None, y_test_path=None,
                       output_dir=None, selected_models=None, preprocess=True,
                       categorical_features=None, numerical_features=None, scaling='standard'):
    """Full pipeline: load → preprocess → train → evaluate → visualize → persist.

    BUGFIX: the original called preprocess_data() but discarded its return
    value, so every model was still fit on the RAW features — the scaling
    was never applied and raw string columns would crash the estimators.
    The transformed matrices are now written back to self.X_train /
    self.X_test (re-wrapped as DataFrames so downstream .iloc / .columns
    access keeps working).

    Parameters:
        X_train_path (str): training feature CSV path.
        y_train_path (str): training target CSV path.
        X_test_path (str, optional): test feature CSV path.
        y_test_path (str, optional): test target CSV path.
        output_dir (str, optional): where models/plots/results are written.
        selected_models (list, optional): subset of model names to train.
        preprocess (bool): whether to scale/encode features first.
        categorical_features (list, optional): categorical column names.
        numerical_features (list, optional): numerical column names.
        scaling (str): 'standard' or 'minmax'.

    Returns:
        tuple: (best model name, best model, evaluation results dict)
    """
    logger.info("开始模型训练和评估流程")
    self.load_data(X_train_path, y_train_path, X_test_path, y_test_path)
    if preprocess:
        X_train_proc, X_test_proc = self.preprocess_data(categorical_features, numerical_features, scaling)
        # Feed the TRANSFORMED data to training/evaluation (see BUGFIX above).
        self.X_train = self._to_frame(X_train_proc, self.X_train.index)
        self.feature_names = self.X_train.columns.tolist()
        if X_test_proc is not None:
            self.X_test = self._to_frame(X_test_proc, self.X_test.index)
    self.train_models(selected_models)
    self.evaluate_models(additional_metrics=True)
    if output_dir:
        vis_dir = os.path.join(output_dir, 'visualizations')
        if not os.path.exists(vis_dir):
            os.makedirs(vis_dir)
        self.visualize_feature_importance(output_dir=vis_dir)
        self.visualize_predictions(output_dir=vis_dir)
        self.visualize_model_comparison(metric='R2', output_dir=vis_dir)
        self.visualize_model_comparison(metric='RMSE', output_dir=vis_dir)
        self.plot_learning_curve(output_dir=vis_dir)
        self.save_model(output_dir=output_dir)
        self.save_results(output_dir)
    logger.info("模型训练和评估流程完成")
    return self.best_model_name, self.best_model, self.model_results


def _to_frame(self, matrix, index):
    """Wrap a transformed (possibly sparse) matrix back into a DataFrame,
    using the fitted preprocessor's output names when available."""
    if hasattr(matrix, 'toarray'):  # sparse output from OneHotEncoder
        matrix = matrix.toarray()
    try:
        columns = list(self.preprocessor.get_feature_names_out())
    except Exception:
        # Older sklearn without get_feature_names_out: synthesize names.
        columns = [f'feature_{i}' for i in range(matrix.shape[1])]
    return pd.DataFrame(matrix, columns=columns, index=index)
def main():
    """Smoke-test entry point for the enhanced trainer."""
    # Resolve project-relative paths (this file is expected under src/).
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    processed_dir = os.path.join(project_root, 'data', 'processed')
    model_dir = os.path.join(project_root, 'model')
    # Prefer engineered features; fall back to the plain processed ones.
    features_file = os.path.join(processed_dir, 'feature_engineering', 'engineered_features.csv')
    if not os.path.exists(features_file):
        features_file = os.path.join(processed_dir, 'processed_features.csv')
    target_file = os.path.join(processed_dir, 'processed_target.csv')
    trainer = EnhancedModelTrainer()
    best_model_name, best_model, results = trainer.train_and_evaluate(
        features_file, target_file, output_dir=model_dir
    )
    # Report the winner, then all models ranked by R2.
    print(f"\n最佳模型: {best_model_name}")
    print(f"R2分数: {results[best_model_name]['R2']:.4f}")
    print(f"RMSE: {results[best_model_name]['RMSE']:.4f}")
    print("\n所有模型的R2分数:")
    for name, result in sorted(results.items(), key=lambda item: item[1]['R2'], reverse=True):
        print(f"{name}: {result['R2']:.4f}")


if __name__ == "__main__":
    main()