import sys
import os
import pandas as pd
from PySide6.QtWidgets import (
QApplication, QMainWindow, QWidget, QVBoxLayout, QHBoxLayout,
QPushButton, QLabel, QFileDialog, QMessageBox, QTableWidget,
QTableWidgetItem, QComboBox, QLineEdit, QCheckBox,
QDialog, QListWidget, QListWidgetItem, QFormLayout,
QGroupBox, QTabWidget, QSplitter, QHeaderView, QGridLayout,
QVBoxLayout, QLabel
)
from PySide6.QtCore import Qt
from matplotlib.backends.backend_qtagg import FigureCanvasQTAgg as FigureCanvas
from matplotlib.figure import Figure
import numpy as np
from sklearn.linear_model import LinearRegression
import re
# 模拟用户数据库(实际项目应使用文件或数据库持久化存储)
user_db = {
"qweer": "051212" # 示例账号
}
class LoginWindow(QDialog):
"""登录与注册窗口"""
def __init__(self):
super().__init__()
self.setWindowTitle("登录")
self.resize(300, 180)
layout = QVBoxLayout(self)
# 用户名输入
self.username_input = QLineEdit()
layout.addWidget(QLabel("用户名:"))
layout.addWidget(self.username_input)
# 密码输入
self.password_input = QLineEdit()
self.password_input.setEchoMode(QLineEdit.Password)
layout.addWidget(QLabel("密码:"))
layout.addWidget(self.password_input)
# 登录按钮
login_btn = QPushButton("登录")
login_btn.clicked.connect(self.login)
layout.addWidget(login_btn)
# 注册按钮
register_btn = QPushButton("注册")
register_btn.clicked.connect(self.register)
layout.addWidget(register_btn)
def login(self):
username = self.username_input.text()
password = self.password_input.text()
if not username or not password:
QMessageBox.warning(self, "错误", "请输入用户名和密码!")
return
if username in user_db and user_db[username] == password:
self.accept() # 关闭登录窗口并继续执行
else:
QMessageBox.warning(self, "错误", "用户名或密码错误!")
def register(self):
username = self.username_input.text()
password = self.password_input.text()
if not username or not password:
QMessageBox.warning(self, "错误", "请输入用户名和密码!")
return
if username in user_db:
QMessageBox.warning(self, "错误", "用户名已存在!")
return
user_db[username] = password
QMessageBox.information(self, "成功", "注册成功!请登录")
class DataCleanerApp(QMainWindow):
"""高级数据清洗与分析工具主窗口"""
def __init__(self):
super().__init__()
self.setWindowTitle("高级数据清洗与分析工具")
self.setGeometry(100, 100, 1200, 800)
self.df = None # 原始数据
self.processed_df = None # 处理后的数据
self.file_path = None
self.init_ui()
def init_ui(self):
"""初始化用户界面"""
# 创建中央部件和布局
central_widget = QWidget()
self.setCentralWidget(central_widget)
main_layout = QHBoxLayout(central_widget)
# 创建左侧控制面板
control_panel = QWidget()
control_panel.setMaximumWidth(300)
control_layout = QVBoxLayout(control_panel)
# 数据导入区域
import_group = QGroupBox("数据导入")
import_layout = QVBoxLayout(import_group)
self.import_btn = QPushButton("导入数据")
self.import_btn.clicked.connect(self.import_data)
import_layout.addWidget(self.import_btn)
self.file_path_label = QLabel("未选择文件")
self.file_path_label.setWordWrap(True)
import_layout.addWidget(self.file_path_label)
control_layout.addWidget(import_group)
# 数据清洗区域
clean_group = QGroupBox("数据清洗")
clean_layout = QVBoxLayout(clean_group)
self.missing_values_btn = QPushButton("处理缺失值")
self.missing_values_btn.clicked.connect(self.handle_missing_values)
clean_layout.addWidget(self.missing_values_btn)
self.outliers_btn = QPushButton("识别并处理异常值")
self.outliers_btn.clicked.connect(self.identify_outliers)
clean_layout.addWidget(self.outliers_btn)
self.remove_duplicates_btn = QPushButton("删除重复行")
self.remove_duplicates_btn.clicked.connect(self.remove_duplicates)
clean_layout.addWidget(self.remove_duplicates_btn)
self.convert_type_btn = QPushButton("转换列类型为数值")
self.convert_type_btn.clicked.connect(self.convert_column_to_numeric)
clean_layout.addWidget(self.convert_type_btn)
control_layout.addWidget(clean_group)
# 数据操作区域
operation_group = QGroupBox("数据操作")
operation_layout = QVBoxLayout(operation_group)
self.sort_btn = QPushButton("按列排序")
self.sort_btn.clicked.connect(self.sort_data)
operation_layout.addWidget(self.sort_btn)
self.filter_btn = QPushButton("按条件筛选")
self.filter_btn.clicked.connect(self.filter_data)
operation_layout.addWidget(self.filter_btn)
self.export_btn = QPushButton("导出数据")
self.export_btn.clicked.connect(self.export_data)
operation_layout.addWidget(self.export_btn)
self.plot_btn = QPushButton("绘制图表")
self.plot_btn.clicked.connect(self.plot_data)
operation_layout.addWidget(self.plot_btn)
control_layout.addWidget(operation_group)
control_layout.addStretch()
# 右侧数据显示区域
display_splitter = QSplitter(Qt.Vertical)
# 数据预览表
self.table_widget = QTableWidget()
self.table_widget.setEditTriggers(QTableWidget.NoEditTriggers)
self.table_widget.horizontalHeader().setSectionResizeMode(QHeaderView.Stretch)
self.table_widget.verticalHeader().setVisible(False)
# 数据信息面板
self.info_widget = QLabel("数据信息将显示在这里")
self.info_widget.setWordWrap(True)
self.info_widget.setStyleSheet("background-color: #f0f0f0; padding: 10px;")
display_splitter.addWidget(self.table_widget)
display_splitter.addWidget(self.info_widget)
display_splitter.setSizes([500, 200])
# 添加到主布局
main_layout.addWidget(control_panel)
main_layout.addWidget(display_splitter, 1)
# 创建状态栏
self.statusBar().showMessage("就绪")
def import_data(self):
"""导入数据文件"""
file_path, _ = QFileDialog.getOpenFileName(
self, "选择数据文件", "", "CSV文件 (*.csv);;Excel文件 (*.xlsx *.xls)"
)
if not file_path:
return
self.file_path = file_path
self.file_path_label.setText(os.path.basename(file_path))
self.statusBar().showMessage(f"正在导入文件: {file_path}")
try:
if file_path.endswith(('.xlsx', '.xls')):
self.df = pd.read_excel(file_path)
else:
self.df = pd.read_csv(file_path)
self.processed_df = self.df.copy()
self.update_table_display()
self.update_data_info()
self.statusBar().showMessage(f"文件导入成功: {file_path}")
except Exception as e:
QMessageBox.critical(self, "导入错误", f"无法导入文件: {str(e)}")
self.statusBar().showMessage("导入失败")
def update_table_display(self, df=None):
"""更新表格显示"""
if df is None:
df = self.processed_df
if df is None or df.empty:
self.table_widget.setRowCount(0)
self.table_widget.setColumnCount(0)
return
self.table_widget.setRowCount(min(1000, len(df)))
self.table_widget.setColumnCount(len(df.columns))
self.table_widget.setHorizontalHeaderLabels(df.columns)
for row_idx in range(min(1000, len(df))):
for col_idx, col_name in enumerate(df.columns):
value = df.iloc[row_idx, col_idx]
if pd.isna(value):
item = QTableWidgetItem("nan")
item.setForeground(Qt.red)
else:
item = QTableWidgetItem(str(value))
self.table_widget.setItem(row_idx, col_idx, item)
if row_idx % 100 == 0:
QApplication.processEvents()
def update_data_info(self, df=None):
"""更新数据信息显示"""
if df is None:
df = self.processed_df
if df is None or df.empty:
self.info_widget.setText("没有数据")
return
info_text = f"数据基本信息:\n"
info_text += f"- 行数: {len(df)}\n"
info_text += f"- 列数: {len(df.columns)}\n"
info_text += f"- 列名: {', '.join(df.columns)}\n\n"
info_text += "列信息:\n"
for col in df.columns:
dtype = str(df[col].dtype)
missing = df[col].isna().sum()
percent_missing = missing / len(df) * 100
info_text += f"- {col}: {dtype}, 缺失值: {missing} ({percent_missing:.1f}%)\n"
self.info_widget.setText(info_text)
def handle_missing_values(self):
"""处理缺失值"""
if self.processed_df is None or self.processed_df.empty:
QMessageBox.warning(self, "操作失败", "没有数据可处理")
return
dialog = QDialog(self)
dialog.setWindowTitle("处理缺失值")
layout = QVBoxLayout(dialog)
list_widget = QListWidget()
for col in self.processed_df.columns:
if self.processed_df[col].isna().sum() > 0:
item = QListWidgetItem(col)
item.setCheckState(Qt.Checked)
list_widget.addItem(item)
if list_widget.count() == 0:
QMessageBox.warning(self, "操作失败", "没有需要处理的缺失值")
return
layout.addWidget(QLabel("选择要处理缺失值的列:"))
layout.addWidget(list_widget)
method_combo = QComboBox()
method_combo.addItems(["填充均值", "填充中位数", "填充固定值", "删除行"])
layout.addWidget(QLabel("选择处理方法:"))
layout.addWidget(method_combo)
value_input = QLineEdit()
value_input.setPlaceholderText("填充值(仅在选择“填充固定值”时有效)")
layout.addWidget(value_input)
button_layout = QHBoxLayout()
ok_button = QPushButton("确定")
cancel_button = QPushButton("取消")
button_layout.addWidget(ok_button)
button_layout.addWidget(cancel_button)
layout.addLayout(button_layout)
ok_button.clicked.connect(dialog.accept)
cancel_button.clicked.connect(dialog.reject)
if dialog.exec() != QDialog.Accepted:
return
selected_cols = [list_widget.item(i).text() for i in range(list_widget.count()) if list_widget.item(i).checkState() == Qt.Checked]
method = method_combo.currentText()
fill_value = value_input.text()
if not selected_cols:
QMessageBox.warning(self, "操作失败", "未选择任何列")
return
try:
for col in selected_cols:
if method == "填充均值":
self.processed_df[col].fillna(self.processed_df[col].mean(), inplace=True)
elif method == "填充中位数":
self.processed_df[col].fillna(self.processed_df[col].median(), inplace=True)
elif method == "填充固定值":
self.processed_df[col].fillna(fill_value, inplace=True)
elif method == "删除行":
self.processed_df.dropna(subset=[col], inplace=True)
self.update_table_display()
self.update_data_info()
self.statusBar().showMessage("缺失值处理完成")
except Exception as e:
QMessageBox.critical(self, "处理错误", f"处理缺失值时出错: {str(e)}")
def identify_outliers(self):
"""识别并处理异常值"""
pass # 此处可扩展,略
def remove_duplicates(self):
"""删除重复行"""
if self.processed_df is None or self.processed_df.empty:
QMessageBox.warning(self, "操作失败", "没有数据可处理")
return
try:
self.processed_df.drop_duplicates(inplace=True)
self.update_table_display()
self.update_data_info()
self.statusBar().showMessage("重复行已删除")
except Exception as e:
QMessageBox.critical(self, "错误", f"删除重复行时出错: {str(e)}")
def convert_column_to_numeric(self):
"""转换列类型为数值型,并尝试清理非数字字符"""
dialog = QDialog(self)
dialog.setWindowTitle("选择要转换的列")
layout = QVBoxLayout(dialog)
list_widget = QListWidget()
for col in self.processed_df.columns:
if pd.api.types.is_object_dtype(self.processed_df[col]):
item = QListWidgetItem(col)
item.setCheckState(Qt.Checked)
list_widget.addItem(item)
layout.addWidget(QLabel("选择要转换为数值类型的列:"))
layout.addWidget(list_widget)
button_layout = QHBoxLayout()
ok_button = QPushButton("确定")
cancel_button = QPushButton("取消")
button_layout.addWidget(ok_button)
button_layout.addWidget(cancel_button)
layout.addLayout(button_layout)
ok_button.clicked.connect(dialog.accept)
cancel_button.clicked.connect(dialog.reject)
if dialog.exec() != QDialog.Accepted:
return
selected_cols = [list_widget.item(i).text() for i in range(list_widget.count()) if list_widget.item(i).checkState() == Qt.Checked]
if not selected_cols:
QMessageBox.warning(self, "操作失败", "未选择任何列")
return
try:
def extract_number(text):
# 使用正则表达式提取字符串中的第一个数字(支持整数和小数)
match = re.search(r'[-+]?\d*\.?\d+', str(text))
return float(match.group()) if match else np.nan
for col in selected_cols:
# 先提取数字,再转换为数值类型
self.processed_df[col] = self.processed_df[col].apply(extract_number)
self.update_table_display()
self.update_data_info()
self.statusBar().showMessage("列类型转换完成(已清理非数字字符)")
except Exception as e:
QMessageBox.critical(self, "错误", f"转换列类型时出错: {str(e)}")
def sort_data(self):
"""按列排序"""
dialog = QDialog(self)
dialog.setWindowTitle("选择排序列")
layout = QVBoxLayout(dialog)
combo_box = QComboBox()
combo_box.addItems(self.processed_df.columns.tolist())
layout.addWidget(QLabel("选择排序列:"))
layout.addWidget(combo_box)
ascending_checkbox = QCheckBox("升序")
ascending_checkbox.setChecked(True)
layout.addWidget(ascending_checkbox)
button_layout = QHBoxLayout()
ok_button = QPushButton("确定")
cancel_button = QPushButton("取消")
button_layout.addWidget(ok_button)
button_layout.addWidget(cancel_button)
layout.addLayout(button_layout)
ok_button.clicked.connect(dialog.accept)
cancel_button.clicked.connect(dialog.reject)
if dialog.exec() != QDialog.Accepted:
return
column = combo_box.currentText()
ascending = ascending_checkbox.isChecked()
try:
self.processed_df.sort_values(by=column, ascending=ascending, inplace=True)
self.update_table_display()
self.update_data_info()
self.statusBar().showMessage("数据排序完成")
except Exception as e:
QMessageBox.critical(self, "错误", f"排序时出错: {str(e)}")
def filter_data(self):
"""按条件筛选数据"""
dialog = QDialog(self)
dialog.setWindowTitle("筛选数据")
layout = QVBoxLayout(dialog)
col_combo = QComboBox()
col_combo.addItems(self.processed_df.columns.tolist())
layout.addWidget(QLabel("选择列:"))
layout.addWidget(col_combo)
condition_combo = QComboBox()
condition_combo.addItems(["等于", "不等于", "大于", "小于", "包含"])
layout.addWidget(QLabel("选择条件:"))
layout.addWidget(condition_combo)
value_input = QLineEdit()
layout.addWidget(QLabel("输入值:"))
layout.addWidget(value_input)
button_layout = QHBoxLayout()
ok_button = QPushButton("确定")
cancel_button = QPushButton("取消")
button_layout.addWidget(ok_button)
button_layout.addWidget(cancel_button)
layout.addLayout(button_layout)
ok_button.clicked.connect(dialog.accept)
cancel_button.clicked.connect(dialog.reject)
if dialog.exec() != QDialog.Accepted:
return
column = col_combo.currentText()
condition = condition_combo.currentText()
value = value_input.text()
try:
if condition == "等于":
filtered_df = self.processed_df[self.processed_df[column] == value]
elif condition == "不等于":
filtered_df = self.processed_df[self.processed_df[column] != value]
elif condition == "大于":
filtered_df = self.processed_df[self.processed_df[column] > float(value)]
elif condition == "小于":
filtered_df = self.processed_df[self.processed_df[column] < float(value)]
elif condition == "包含":
filtered_df = self.processed_df[self.processed_df[column].str.contains(value, case=False)]
self.update_table_display(filtered_df)
self.statusBar().showMessage("数据筛选完成")
except Exception as e:
QMessageBox.critical(self, "错误", f"筛选数据时出错: {str(e)}")
def export_data(self):
"""导出数据"""
if self.processed_df is None or self.processed_df.empty:
QMessageBox.warning(self, "操作失败", "没有数据可导出")
return
file_path, ext = QFileDialog.getSaveFileName(
self, "保存文件", "", "CSV文件 (*.csv);;Excel文件 (*.xlsx)"
)
if not file_path:
return
try:
if ext.endswith("*.csv"):
self.processed_df.to_csv(file_path, index=False)
elif ext.endswith("*.xlsx"):
self.processed_df.to_excel(file_path, index=False)
QMessageBox.information(self, "成功", "数据已成功导出!")
except Exception as e:
QMessageBox.critical(self, "错误", f"导出数据时出错: {str(e)}")
def plot_data(self):
"""选择绘图类型并绘制图表,并添加图表总结与预测"""
if self.processed_df is None or self.processed_df.empty:
QMessageBox.warning(self, "操作失败", "没有可绘制的数据")
return
dialog = QDialog(self)
dialog.setWindowTitle("选择图表类型和参数")
layout = QVBoxLayout(dialog)
# 图表类型选择
chart_type_combo = QComboBox()
chart_type_combo.addItems(["柱状图", "折线图", "散点图", "直方图", "箱型图"])
layout.addWidget(QLabel("选择图表类型:"))
layout.addWidget(chart_type_combo)
# X轴选择
x_combo = QComboBox()
x_combo.addItems(self.processed_df.columns.tolist())
layout.addWidget(QLabel("选择X轴列:"))
layout.addWidget(x_combo)
# Y轴选择(有些图表不需要Y轴)
y_combo = QComboBox()
y_combo.addItems(["无"] + self.processed_df.columns.tolist())
layout.addWidget(QLabel("选择Y轴列 (如适用):"))
layout.addWidget(y_combo)
# 是否分组(仅适用于箱型图)
group_combo = QComboBox()
group_combo.addItems(["无"] + self.processed_df.columns.tolist())
layout.addWidget(QLabel("选择分组列 (如箱型图):"))
layout.addWidget(group_combo)
# 预测开关
predict_checkbox = QCheckBox("启用趋势预测(线性回归)")
layout.addWidget(predict_checkbox)
button_layout = QHBoxLayout()
ok_button = QPushButton("确定")
cancel_button = QPushButton("取消")
button_layout.addWidget(ok_button)
button_layout.addWidget(cancel_button)
layout.addLayout(button_layout)
ok_button.clicked.connect(dialog.accept)
cancel_button.clicked.connect(dialog.reject)
if dialog.exec() != QDialog.Accepted:
return
chart_type = chart_type_combo.currentText()
x_col = x_combo.currentText()
y_col = y_combo.currentText() if y_combo.currentText() != "无" else None
group_col = group_combo.currentText() if group_combo.currentText() != "无" else None
enable_predict = predict_checkbox.isChecked()
try:
# 创建新的窗口显示图表
plot_window = QDialog(self)
plot_window.setWindowTitle(f"{chart_type} - {x_col}" + (f" vs {y_col}" if y_col else ""))
plot_layout = QVBoxLayout(plot_window)
fig = Figure(figsize=(8, 6))
canvas = FigureCanvas(fig)
ax = fig.add_subplot(111)
summary_text = ""
if chart_type == "柱状图":
if y_col:
data = self.processed_df.groupby(x_col)[y_col].mean()
data.plot(kind='bar', ax=ax)
max_row = data.idxmax()
min_row = data.idxmin()
summary_text += f"柱状图总结:\n最大值出现在 '{max_row}' ({data.loc[max_row]:.2f})\n最小值出现在 '{min_row}' ({data.loc[min_row]:.2f})"
else:
data = self.processed_df[x_col].value_counts()
data.plot(kind='bar', ax=ax)
max_row = data.idxmax()
summary_text += f"柱状图总结:\n出现频率最高的是 '{max_row}' ({data.loc[max_row]} 次)"
elif chart_type == "折线图":
if y_col:
ax.plot(self.processed_df[x_col], self.processed_df[y_col])
trend = np.polyfit(self.processed_df[x_col], self.processed_df[y_col], deg=1)
if enable_predict:
future_x = np.linspace(self.processed_df[x_col].min(), self.processed_df[x_col].max() + 10, len(self.processed_df) + 10)
ax.plot(future_x, np.polyval(trend, future_x), 'r--', label="预测趋势")
ax.legend()
summary_text += f"折线图总结:\n趋势斜率为 {trend[0]:.2f},{'上升' if trend[0] > 0 else '下降'}趋势"
else:
self.processed_df[x_col].plot(ax=ax)
summary_text += "折线图总结: 数据未指定Y轴,展示默认序列"
elif chart_type == "散点图":
if y_col:
ax.scatter(self.processed_df[x_col], self.processed_df[y_col])
if enable_predict:
model = LinearRegression()
X = self.processed_df[[x_col]]
y = self.processed_df[y_col]
model.fit(X, y)
future_x = np.linspace(X.min()[0], X.max()[0] + 10, 100).reshape(-1, 1)
ax.plot(future_x, model.predict(future_x), color='red', linestyle='--', label='预测线')
ax.legend()
summary_text += f"散点图总结:\n线性回归R²: {model.score(X, y):.2f}, 斜率: {model.coef_[0]:.2f}"
else:
summary_text += "散点图总结: 数据分布随机,未启用预测"
elif chart_type == "直方图":
counts, bins, _ = ax.hist(self.processed_df[x_col].dropna(), bins=20)
mean = self.processed_df[x_col].mean()
std = self.processed_df[x_col].std()
summary_text += f"直方图总结:\n均值: {mean:.2f}, 标准差: {std:.2f}\n最大频段区间: [{bins[np.argmax(counts)]:.2f}, {bins[np.argmax(counts)+1]:.2f}]"
elif chart_type == "箱型图":
if group_col and y_col:
self.processed_df.boxplot(column=y_col, by=group_col, ax=ax)
summary_text += f"箱型图总结:\n按 '{group_col}' 分组比较 '{y_col}' 的分布"
elif y_col:
ax.boxplot(self.processed_df[y_col].dropna())
summary_text += f"箱型图总结:\n展示 '{y_col}' 的分布情况,识别异常值"
ax.set_title(f"{chart_type}: {x_col}" + (f" vs {y_col}" if y_col else ""))
ax.grid(True)
plot_layout.addWidget(canvas)
# 添加总结信息
summary_label = QLabel(summary_text)
summary_label.setStyleSheet("background-color: #f9f9f9; padding: 10px;")
summary_label.setWordWrap(True)
plot_layout.addWidget(summary_label)
plot_window.setLayout(plot_layout)
plot_window.resize(800, 600)
plot_window.exec()
except Exception as e:
QMessageBox.critical(self, "错误", f"绘制图表时出错: {str(e)}")
if __name__ == "__main__":
app = QApplication(sys.argv)
# 显示登录窗口
login_window = LoginWindow()
if login_window.exec() != QDialog.Accepted:
sys.exit() # 如果未登录成功,则退出程序
# 登录成功后显示主窗口
window = DataCleanerApp()
window.show()
sys.exit(app.exec())
改变操作界面布局