data.apply(pd.value_counts).fillna(0)用法

这段内容展示了如何利用Python的Pandas库对数据框中的各列进行频数统计,并用fillna()方法将缺失值填充为0。具体操作是通过data.apply(pd.value_counts)计算每列的频数,然后使用fillna(0)填充缺失值。

利用python进行数据分析

data.apply(pd.value_counts).fillna(0)的意思

import pandas as pd 
from pandas import Series, DataFrame

data = DataFrame({'Qu1': [1, 3, 4, 3, 4],
                  'Qu2': [2, 3, 1, 2, 3],
                  'Qu3': [1, 5, 2, 4, 4]})

result = data.apply(pd.value_counts).fillna(0)  

In [26]:data
Out[26]:
Qu1 Qu2 Qu3
0 1 2 1
1 3 3 5
2 4 1 2
3 3 2 4
4 4 3 4

In [27]:result
Out[28]:
Qu1 Qu2 Qu3
1 1 1 1
2 0 2 1
3 2 2 0
4 2 0 2
5 0 0 1
#首先计算Qu1列中的频数,行索引为原来data的所以
data.value_counts('Qu1')
Out[21]: 
Qu1
3    2
4    2
1    1
dtype: int64

pd.value_counts(data.Qu1).reindex(range(1, 6))
Out[22]: 
1    1.0
2    NaN
3    2.0
4    2.0
5    NaN
Name: Qu1, dtype: float64

#用0填充缺失值
pd.value_counts(data.Qu1).reindex(range(1, 6)).fillna(0)
Out[23]: 
1    1.0
2    0.0
3    2.0
4    2.0
5    0.0
Name: Qu1, dtype: float64
import sys import os import pandas as pd from PySide6.QtWidgets import ( QApplication, QMainWindow, QWidget, QVBoxLayout, QHBoxLayout, QPushButton, QLabel, QFileDialog, QMessageBox, QTableWidget, QTableWidgetItem, QComboBox, QLineEdit, QCheckBox, QDialog, QListWidget, QListWidgetItem, QFormLayout, QGroupBox, QTabWidget, QSplitter, QHeaderView, QGridLayout, QVBoxLayout, QLabel ) from PySide6.QtCore import Qt from matplotlib.backends.backend_qtagg import FigureCanvasQTAgg as FigureCanvas from matplotlib.figure import Figure import numpy as np from sklearn.linear_model import LinearRegression import re # 模拟用户数据库(实际项目应使用文件或数据库持久化存储) user_db = { "qweer": "051212" # 示例账号 } class LoginWindow(QDialog): """登录与注册窗口""" def __init__(self): super().__init__() self.setWindowTitle("登录") self.resize(300, 180) layout = QVBoxLayout(self) # 用户名输入 self.username_input = QLineEdit() layout.addWidget(QLabel("用户名:")) layout.addWidget(self.username_input) # 密码输入 self.password_input = QLineEdit() self.password_input.setEchoMode(QLineEdit.Password) layout.addWidget(QLabel("密码:")) layout.addWidget(self.password_input) # 登录按钮 login_btn = QPushButton("登录") login_btn.clicked.connect(self.login) layout.addWidget(login_btn) # 注册按钮 register_btn = QPushButton("注册") register_btn.clicked.connect(self.register) layout.addWidget(register_btn) def login(self): username = self.username_input.text() password = self.password_input.text() if not username or not password: QMessageBox.warning(self, "错误", "请输入用户名和密码!") return if username in user_db and user_db[username] == password: self.accept() # 关闭登录窗口并继续执行 else: QMessageBox.warning(self, "错误", "用户名或密码错误!") def register(self): username = self.username_input.text() password = self.password_input.text() if not username or not password: QMessageBox.warning(self, "错误", "请输入用户名和密码!") return if username in user_db: QMessageBox.warning(self, "错误", "用户名已存在!") return user_db[username] = password QMessageBox.information(self, "成功", "注册成功!请登录") class DataCleanerApp(QMainWindow): """高级数据清洗与分析工具主窗口""" def __init__(self): super().__init__() self.setWindowTitle("高级数据清洗与分析工具") self.setGeometry(100, 100, 1200, 800) self.df = None # 原始数据 self.processed_df = None # 处理后的数据 self.file_path = None self.init_ui() def init_ui(self): """初始化用户界面""" # 创建中央部件和布局 central_widget = QWidget() self.setCentralWidget(central_widget) main_layout = QHBoxLayout(central_widget) # 创建左侧控制面板 control_panel = QWidget() control_panel.setMaximumWidth(300) control_layout = QVBoxLayout(control_panel) # 数据导入区域 import_group = QGroupBox("数据导入") import_layout = QVBoxLayout(import_group) self.import_btn = QPushButton("导入数据") self.import_btn.clicked.connect(self.import_data) import_layout.addWidget(self.import_btn) self.file_path_label = QLabel("未选择文件") self.file_path_label.setWordWrap(True) import_layout.addWidget(self.file_path_label) control_layout.addWidget(import_group) # 数据清洗区域 clean_group = QGroupBox("数据清洗") clean_layout = QVBoxLayout(clean_group) self.missing_values_btn = QPushButton("处理缺失值") self.missing_values_btn.clicked.connect(self.handle_missing_values) clean_layout.addWidget(self.missing_values_btn) self.outliers_btn = QPushButton("识别并处理异常值") self.outliers_btn.clicked.connect(self.identify_outliers) clean_layout.addWidget(self.outliers_btn) self.remove_duplicates_btn = QPushButton("删除重复行") self.remove_duplicates_btn.clicked.connect(self.remove_duplicates) clean_layout.addWidget(self.remove_duplicates_btn) self.convert_type_btn = QPushButton("转换列类型为数值") self.convert_type_btn.clicked.connect(self.convert_column_to_numeric) clean_layout.addWidget(self.convert_type_btn) control_layout.addWidget(clean_group) # 数据操作区域 operation_group = QGroupBox("数据操作") operation_layout = QVBoxLayout(operation_group) self.sort_btn = QPushButton("按列排序") self.sort_btn.clicked.connect(self.sort_data) operation_layout.addWidget(self.sort_btn) self.filter_btn = QPushButton("按条件筛选") self.filter_btn.clicked.connect(self.filter_data) operation_layout.addWidget(self.filter_btn) self.export_btn = QPushButton("导出数据") self.export_btn.clicked.connect(self.export_data) operation_layout.addWidget(self.export_btn) self.plot_btn = QPushButton("绘制图表") self.plot_btn.clicked.connect(self.plot_data) operation_layout.addWidget(self.plot_btn) control_layout.addWidget(operation_group) control_layout.addStretch() # 右侧数据显示区域 display_splitter = QSplitter(Qt.Vertical) # 数据预览表 self.table_widget = QTableWidget() self.table_widget.setEditTriggers(QTableWidget.NoEditTriggers) self.table_widget.horizontalHeader().setSectionResizeMode(QHeaderView.Stretch) self.table_widget.verticalHeader().setVisible(False) # 数据信息面板 self.info_widget = QLabel("数据信息将显示在这里") self.info_widget.setWordWrap(True) self.info_widget.setStyleSheet("background-color: #f0f0f0; padding: 10px;") display_splitter.addWidget(self.table_widget) display_splitter.addWidget(self.info_widget) display_splitter.setSizes([500, 200]) # 添加到主布局 main_layout.addWidget(control_panel) main_layout.addWidget(display_splitter, 1) # 创建状态栏 self.statusBar().showMessage("就绪") def import_data(self): """导入数据文件""" file_path, _ = QFileDialog.getOpenFileName( self, "选择数据文件", "", "CSV文件 (*.csv);;Excel文件 (*.xlsx *.xls)" ) if not file_path: return self.file_path = file_path self.file_path_label.setText(os.path.basename(file_path)) self.statusBar().showMessage(f"正在导入文件: {file_path}") try: if file_path.endswith(('.xlsx', '.xls')): self.df = pd.read_excel(file_path) else: self.df = pd.read_csv(file_path) self.processed_df = self.df.copy() self.update_table_display() self.update_data_info() self.statusBar().showMessage(f"文件导入成功: {file_path}") except Exception as e: QMessageBox.critical(self, "导入错误", f"无法导入文件: {str(e)}") self.statusBar().showMessage("导入失败") def update_table_display(self, df=None): """更新表格显示""" if df is None: df = self.processed_df if df is None or df.empty: self.table_widget.setRowCount(0) self.table_widget.setColumnCount(0) return self.table_widget.setRowCount(min(1000, len(df))) self.table_widget.setColumnCount(len(df.columns)) self.table_widget.setHorizontalHeaderLabels(df.columns) for row_idx in range(min(1000, len(df))): for col_idx, col_name in enumerate(df.columns): value = df.iloc[row_idx, col_idx] if pd.isna(value): item = QTableWidgetItem("nan") item.setForeground(Qt.red) else: item = QTableWidgetItem(str(value)) self.table_widget.setItem(row_idx, col_idx, item) if row_idx % 100 == 0: QApplication.processEvents() def update_data_info(self, df=None): """更新数据信息显示""" if df is None: df = self.processed_df if df is None or df.empty: self.info_widget.setText("没有数据") return info_text = f"数据基本信息:\n" info_text += f"- 行数: {len(df)}\n" info_text += f"- 列数: {len(df.columns)}\n" info_text += f"- 列名: {', '.join(df.columns)}\n\n" info_text += "列信息:\n" for col in df.columns: dtype = str(df[col].dtype) missing = df[col].isna().sum() percent_missing = missing / len(df) * 100 info_text += f"- {col}: {dtype}, 缺失值: {missing} ({percent_missing:.1f}%)\n" self.info_widget.setText(info_text) def handle_missing_values(self): """处理缺失值""" if self.processed_df is None or self.processed_df.empty: QMessageBox.warning(self, "操作失败", "没有数据可处理") return dialog = QDialog(self) dialog.setWindowTitle("处理缺失值") layout = QVBoxLayout(dialog) list_widget = QListWidget() for col in self.processed_df.columns: if self.processed_df[col].isna().sum() > 0: item = QListWidgetItem(col) item.setCheckState(Qt.Checked) list_widget.addItem(item) if list_widget.count() == 0: QMessageBox.warning(self, "操作失败", "没有需要处理的缺失值") return layout.addWidget(QLabel("选择要处理缺失值的列:")) layout.addWidget(list_widget) method_combo = QComboBox() method_combo.addItems(["填充均值", "填充中位数", "填充固定值", "删除行"]) layout.addWidget(QLabel("选择处理方法:")) layout.addWidget(method_combo) value_input = QLineEdit() value_input.setPlaceholderText("填充值(仅在选择“填充固定值”时有效)") layout.addWidget(value_input) button_layout = QHBoxLayout() ok_button = QPushButton("确定") cancel_button = QPushButton("取消") button_layout.addWidget(ok_button) button_layout.addWidget(cancel_button) layout.addLayout(button_layout) ok_button.clicked.connect(dialog.accept) cancel_button.clicked.connect(dialog.reject) if dialog.exec() != QDialog.Accepted: return selected_cols = [list_widget.item(i).text() for i in range(list_widget.count()) if list_widget.item(i).checkState() == Qt.Checked] method = method_combo.currentText() fill_value = value_input.text() if not selected_cols: QMessageBox.warning(self, "操作失败", "未选择任何列") return try: for col in selected_cols: if method == "填充均值": self.processed_df[col].fillna(self.processed_df[col].mean(), inplace=True) elif method == "填充中位数": self.processed_df[col].fillna(self.processed_df[col].median(), inplace=True) elif method == "填充固定值": self.processed_df[col].fillna(fill_value, inplace=True) elif method == "删除行": self.processed_df.dropna(subset=[col], inplace=True) self.update_table_display() self.update_data_info() self.statusBar().showMessage("缺失值处理完成") except Exception as e: QMessageBox.critical(self, "处理错误", f"处理缺失值时出错: {str(e)}") def identify_outliers(self): """识别并处理异常值""" pass # 此处可扩展,略 def remove_duplicates(self): """删除重复行""" if self.processed_df is None or self.processed_df.empty: QMessageBox.warning(self, "操作失败", "没有数据可处理") return try: self.processed_df.drop_duplicates(inplace=True) self.update_table_display() self.update_data_info() self.statusBar().showMessage("重复行已删除") except Exception as e: QMessageBox.critical(self, "错误", f"删除重复行时出错: {str(e)}") def convert_column_to_numeric(self): """转换列类型为数值型,并尝试清理非数字字符""" dialog = QDialog(self) dialog.setWindowTitle("选择要转换的列") layout = QVBoxLayout(dialog) list_widget = QListWidget() for col in self.processed_df.columns: if pd.api.types.is_object_dtype(self.processed_df[col]): item = QListWidgetItem(col) item.setCheckState(Qt.Checked) list_widget.addItem(item) layout.addWidget(QLabel("选择要转换为数值类型的列:")) layout.addWidget(list_widget) button_layout = QHBoxLayout() ok_button = QPushButton("确定") cancel_button = QPushButton("取消") button_layout.addWidget(ok_button) button_layout.addWidget(cancel_button) layout.addLayout(button_layout) ok_button.clicked.connect(dialog.accept) cancel_button.clicked.connect(dialog.reject) if dialog.exec() != QDialog.Accepted: return selected_cols = [list_widget.item(i).text() for i in range(list_widget.count()) if list_widget.item(i).checkState() == Qt.Checked] if not selected_cols: QMessageBox.warning(self, "操作失败", "未选择任何列") return try: def extract_number(text): # 使用正则表达式提取字符串中的第一个数字(支持整数和小数) match = re.search(r'[-+]?\d*\.?\d+', str(text)) return float(match.group()) if match else np.nan for col in selected_cols: # 先提取数字,再转换为数值类型 self.processed_df[col] = self.processed_df[col].apply(extract_number) self.update_table_display() self.update_data_info() self.statusBar().showMessage("列类型转换完成(已清理非数字字符)") except Exception as e: QMessageBox.critical(self, "错误", f"转换列类型时出错: {str(e)}") def sort_data(self): """按列排序""" dialog = QDialog(self) dialog.setWindowTitle("选择排序列") layout = QVBoxLayout(dialog) combo_box = QComboBox() combo_box.addItems(self.processed_df.columns.tolist()) layout.addWidget(QLabel("选择排序列:")) layout.addWidget(combo_box) ascending_checkbox = QCheckBox("升序") ascending_checkbox.setChecked(True) layout.addWidget(ascending_checkbox) button_layout = QHBoxLayout() ok_button = QPushButton("确定") cancel_button = QPushButton("取消") button_layout.addWidget(ok_button) button_layout.addWidget(cancel_button) layout.addLayout(button_layout) ok_button.clicked.connect(dialog.accept) cancel_button.clicked.connect(dialog.reject) if dialog.exec() != QDialog.Accepted: return column = combo_box.currentText() ascending = ascending_checkbox.isChecked() try: self.processed_df.sort_values(by=column, ascending=ascending, inplace=True) self.update_table_display() self.update_data_info() self.statusBar().showMessage("数据排序完成") except Exception as e: QMessageBox.critical(self, "错误", f"排序时出错: {str(e)}") def filter_data(self): """按条件筛选数据""" dialog = QDialog(self) dialog.setWindowTitle("筛选数据") layout = QVBoxLayout(dialog) col_combo = QComboBox() col_combo.addItems(self.processed_df.columns.tolist()) layout.addWidget(QLabel("选择列:")) layout.addWidget(col_combo) condition_combo = QComboBox() condition_combo.addItems(["等于", "不等于", "大于", "小于", "包含"]) layout.addWidget(QLabel("选择条件:")) layout.addWidget(condition_combo) value_input = QLineEdit() layout.addWidget(QLabel("输入值:")) layout.addWidget(value_input) button_layout = QHBoxLayout() ok_button = QPushButton("确定") cancel_button = QPushButton("取消") button_layout.addWidget(ok_button) button_layout.addWidget(cancel_button) layout.addLayout(button_layout) ok_button.clicked.connect(dialog.accept) cancel_button.clicked.connect(dialog.reject) if dialog.exec() != QDialog.Accepted: return column = col_combo.currentText() condition = condition_combo.currentText() value = value_input.text() try: if condition == "等于": filtered_df = self.processed_df[self.processed_df[column] == value] elif condition == "不等于": filtered_df = self.processed_df[self.processed_df[column] != value] elif condition == "大于": filtered_df = self.processed_df[self.processed_df[column] > float(value)] elif condition == "小于": filtered_df = self.processed_df[self.processed_df[column] < float(value)] elif condition == "包含": filtered_df = self.processed_df[self.processed_df[column].str.contains(value, case=False)] self.update_table_display(filtered_df) self.statusBar().showMessage("数据筛选完成") except Exception as e: QMessageBox.critical(self, "错误", f"筛选数据时出错: {str(e)}") def export_data(self): """导出数据""" if self.processed_df is None or self.processed_df.empty: QMessageBox.warning(self, "操作失败", "没有数据可导出") return file_path, ext = QFileDialog.getSaveFileName( self, "保存文件", "", "CSV文件 (*.csv);;Excel文件 (*.xlsx)" ) if not file_path: return try: if ext.endswith("*.csv"): self.processed_df.to_csv(file_path, index=False) elif ext.endswith("*.xlsx"): self.processed_df.to_excel(file_path, index=False) QMessageBox.information(self, "成功", "数据已成功导出!") except Exception as e: QMessageBox.critical(self, "错误", f"导出数据时出错: {str(e)}") def plot_data(self): """选择绘图类型并绘制图表,并添加图表总结与预测""" if self.processed_df is None or self.processed_df.empty: QMessageBox.warning(self, "操作失败", "没有可绘制的数据") return dialog = QDialog(self) dialog.setWindowTitle("选择图表类型和参数") layout = QVBoxLayout(dialog) # 图表类型选择 chart_type_combo = QComboBox() chart_type_combo.addItems(["柱状图", "折线图", "散点图", "直方图", "箱型图"]) layout.addWidget(QLabel("选择图表类型:")) layout.addWidget(chart_type_combo) # X轴选择 x_combo = QComboBox() x_combo.addItems(self.processed_df.columns.tolist()) layout.addWidget(QLabel("选择X轴列:")) layout.addWidget(x_combo) # Y轴选择(有些图表不需要Y轴) y_combo = QComboBox() y_combo.addItems(["无"] + self.processed_df.columns.tolist()) layout.addWidget(QLabel("选择Y轴列 (如适用):")) layout.addWidget(y_combo) # 是否分组(仅适用于箱型图) group_combo = QComboBox() group_combo.addItems(["无"] + self.processed_df.columns.tolist()) layout.addWidget(QLabel("选择分组列 (如箱型图):")) layout.addWidget(group_combo) # 预测开关 predict_checkbox = QCheckBox("启用趋势预测(线性回归)") layout.addWidget(predict_checkbox) button_layout = QHBoxLayout() ok_button = QPushButton("确定") cancel_button = QPushButton("取消") button_layout.addWidget(ok_button) button_layout.addWidget(cancel_button) layout.addLayout(button_layout) ok_button.clicked.connect(dialog.accept) cancel_button.clicked.connect(dialog.reject) if dialog.exec() != QDialog.Accepted: return chart_type = chart_type_combo.currentText() x_col = x_combo.currentText() y_col = y_combo.currentText() if y_combo.currentText() != "无" else None group_col = group_combo.currentText() if group_combo.currentText() != "无" else None enable_predict = predict_checkbox.isChecked() try: # 创建新的窗口显示图表 plot_window = QDialog(self) plot_window.setWindowTitle(f"{chart_type} - {x_col}" + (f" vs {y_col}" if y_col else "")) plot_layout = QVBoxLayout(plot_window) fig = Figure(figsize=(8, 6)) canvas = FigureCanvas(fig) ax = fig.add_subplot(111) summary_text = "" if chart_type == "柱状图": if y_col: data = self.processed_df.groupby(x_col)[y_col].mean() data.plot(kind='bar', ax=ax) max_row = data.idxmax() min_row = data.idxmin() summary_text += f"柱状图总结:\n最大值出现在 '{max_row}' ({data.loc[max_row]:.2f})\n最小值出现在 '{min_row}' ({data.loc[min_row]:.2f})" else: data = self.processed_df[x_col].value_counts() data.plot(kind='bar', ax=ax) max_row = data.idxmax() summary_text += f"柱状图总结:\n出现频率最高的是 '{max_row}' ({data.loc[max_row]} 次)" elif chart_type == "折线图": if y_col: ax.plot(self.processed_df[x_col], self.processed_df[y_col]) trend = np.polyfit(self.processed_df[x_col], self.processed_df[y_col], deg=1) if enable_predict: future_x = np.linspace(self.processed_df[x_col].min(), self.processed_df[x_col].max() + 10, len(self.processed_df) + 10) ax.plot(future_x, np.polyval(trend, future_x), 'r--', label="预测趋势") ax.legend() summary_text += f"折线图总结:\n趋势斜率为 {trend[0]:.2f},{'上升' if trend[0] > 0 else '下降'}趋势" else: self.processed_df[x_col].plot(ax=ax) summary_text += "折线图总结: 数据未指定Y轴,展示默认序列" elif chart_type == "散点图": if y_col: ax.scatter(self.processed_df[x_col], self.processed_df[y_col]) if enable_predict: model = LinearRegression() X = self.processed_df[[x_col]] y = self.processed_df[y_col] model.fit(X, y) future_x = np.linspace(X.min()[0], X.max()[0] + 10, 100).reshape(-1, 1) ax.plot(future_x, model.predict(future_x), color='red', linestyle='--', label='预测线') ax.legend() summary_text += f"散点图总结:\n线性回归R²: {model.score(X, y):.2f}, 斜率: {model.coef_[0]:.2f}" else: summary_text += "散点图总结: 数据分布随机,未启用预测" elif chart_type == "直方图": counts, bins, _ = ax.hist(self.processed_df[x_col].dropna(), bins=20) mean = self.processed_df[x_col].mean() std = self.processed_df[x_col].std() summary_text += f"直方图总结:\n均值: {mean:.2f}, 标准差: {std:.2f}\n最大频段区间: [{bins[np.argmax(counts)]:.2f}, {bins[np.argmax(counts)+1]:.2f}]" elif chart_type == "箱型图": if group_col and y_col: self.processed_df.boxplot(column=y_col, by=group_col, ax=ax) summary_text += f"箱型图总结:\n按 '{group_col}' 分组比较 '{y_col}' 的分布" elif y_col: ax.boxplot(self.processed_df[y_col].dropna()) summary_text += f"箱型图总结:\n展示 '{y_col}' 的分布情况,识别异常值" ax.set_title(f"{chart_type}: {x_col}" + (f" vs {y_col}" if y_col else "")) ax.grid(True) plot_layout.addWidget(canvas) # 添加总结信息 summary_label = QLabel(summary_text) summary_label.setStyleSheet("background-color: #f9f9f9; padding: 10px;") summary_label.setWordWrap(True) plot_layout.addWidget(summary_label) plot_window.setLayout(plot_layout) plot_window.resize(800, 600) plot_window.exec() except Exception as e: QMessageBox.critical(self, "错误", f"绘制图表时出错: {str(e)}") if __name__ == "__main__": app = QApplication(sys.argv) # 显示登录窗口 login_window = LoginWindow() if login_window.exec() != QDialog.Accepted: sys.exit() # 如果未登录成功,则退出程序 # 登录成功后显示主窗口 window = DataCleanerApp() window.show() sys.exit(app.exec()) 改变操作界面布局
07-17
评论 1
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值