前言
在本教程中,我们将使用Python编写一个功能强大的桌面应用程序,用于数据清洗、压缩、合并excel文件。该程序基于PySide6库和其他辅助库进行开发,提供了直观的用户界面和易于使用的功能。下面我们将对代码进行分块介绍。(底部附项目完整代码)
导入所需库和模块
在程序的开头,我们导入了各种必要的库和模块,包括在数据处理中使用的bag模块、图形界面相关的PySide6库、文件处理相关的zipfile、xlsxwriter和openpyxl,以及其他辅助库
创建数据清洗类(DataCleaning)
DataCleaning类是我们应用程序的核心组件之一,用于实现数据清洗功能。它包含了选择文件、保存文件、开始清洗和清洗附加选项等功能。具体而言,该类包括以下方法:
- select_file:用于选择要清洗的文件
- save_file:用于选择保存清洗结果的文件
- run:开始执行数据清洗操作,根据用户选择的附加选项进行不同的清洗动
创建压缩、解压类(CompressionDecompression)
CompressionDecompression类用于实现文件压缩和解压功能。它包括选择文件、保存文件、压缩文件和解压文件等方法。具体而言,该类包括以下方法:
- select_file:用于选择要压缩或解压的文件
- save_file_1:用于选择保存压缩或解压结果的文件
- on_button_clicked:用于执行解压文件操作
- on_button_clicked1:用于执行压缩文件操作
创建合并表格类(SelectDirectory)
SelectDirectory类实现了合并表格的功能。它包括选择文件夹、选择保存文件和运行合并操作等方法。具体而言,该类包括以下方法:
- choose_folder:用于选择要合并的表格文件所在的文件夹
- save_file_1:用于选择保存合并结果的文件
- run:执行合并表格操作
创建主布局类(Layout)
Layout类是整个应用程序的主布局类,用于显示各个功能的按钮和界面。它包含了数据清洗、压缩、合并和关键词抽取四个子界面,并提供了切换子界面的功能
程序入口
在程序的最后,我们创建了一个QApplication对象,实例化了Layout类,并显示应用程序的主窗口。
通过以上分块介绍,我们已经了解了整个应用程序的结构和各个功能的实现方式
功能图示
完整代码
#!/usr/bin/env python3
# coding:utf-8
import re
import sys
import time
import os
from datetime import datetime
import bag
from PySide6.QtGui import QFont, QPainter, QPen, QPalette, QColor, QIntValidator
from PySide6.QtWidgets import QApplication, QMainWindow, QLabel, QPushButton, QFrame, QFileDialog, QLineEdit, \
QProgressBar, QMessageBox, QStackedWidget, QVBoxLayout, QTextBrowser
from PySide6.QtCore import Qt, QJsonDocument
from tqdm import tqdm
import zipfile
import xlsxwriter
import openpyxl
import emoji
import json
session = bag.session.create_session()
def judge(file_path): # 加载数据
try:
suffix = file_path.split('.')[-1]
suffix_dic = {'xlsx': bag.Bag.read_excel, 'csv': bag.Bag.read_csv, 'json': bag.Bag.read_json}
data = suffix_dic.get(suffix)(file_path)
return data
except AttributeError:
print('未选择文件')
exit()
def set_text_browser_background_color(text_browser, color): # 设置背景颜色
palette = text_browser.palette()
palette.setColor(QPalette.Base, color)
text_browser.setPalette(palette)
def set_text_browser_foreground_color(text_browser, color): # 设置字体颜色
palette = text_browser.palette()
palette.setColor(QPalette.Text, color)
text_browser.setPalette(palette)
def unzip_file(zip_path, extract_path):
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
zip_ref.extractall(extract_path)
# noinspection PyTypeChecker
class SeparatorLine(QFrame):
def paintEvent(self, event):
painter = QPainter(self)
painter.setPen(QPen(self.palette().color(QPalette.Dark).darker(150), 2))
painter.drawLine(self.width() / 2, 0, self.width() / 2, self.height())
# noinspection PyTypeChecker
class SeparatorLine1(QFrame):
def paintEvent(self, event):
painter = QPainter(self)
painter.setPen(QPen(self.palette().color(QPalette.Dark).darker(150), 2))
painter.drawLine(0, self.height() / 2, self.width(), self.height() / 2)
class Layout(QMainWindow):
def __init__(self):
super().__init__()
self.data_cleaning = None
self.Compression = None
self.file_directory = None
self.keywords = None
self.ppt_order = None
# 定义标题与窗口大小
self.setWindowTitle("数据清洗")
self.setGeometry(100, 100, 1280, 720)
self.stacked_widget = QStackedWidget(self)
self.setCentralWidget(self.stacked_widget)
self.label = QLabel(datetime.now().strftime("%Y-%m-%d"), self)
self.label.setGeometry(10, 5, 130, 50)
self.label.setFont(QFont("等线", 18)) # 设置字体和字号
self.clean_data = QPushButton('数据清洗', self)
self.clean_data.setFont(QFont('等线', 11))
self.clean_data.setGeometry(155, 10, 130, 30)
self.clean_data.clicked.connect(self.show_data_cleaning)
self.zip_data = QPushButton('压缩/解压', self)
self.zip_data.setFont(QFont('等线', 11))
self.zip_data.setGeometry(300, 10, 130, 30)
self.zip_data.clicked.connect(self.show_zip_page)
self.zip_data = QPushButton('合并excel', self)
self.zip_data.setFont(QFont('等线', 11))
self.zip_data.setGeometry(445, 10, 130, 30)
self.zip_data.clicked.connect(self.select_directory)
self.line = SeparatorLine(self)
self.line.setGeometry(150, 0, 2, 10000) # 添加垂直分割线
self.line_horizontal = SeparatorLine1(self)
# noinspection PyTypeChecker
self.line_horizontal.setGeometry(150, self.height() / 8, 10000, 2) # 添加水平分割线
def show_data_cleaning(self):
if not self.data_cleaning:
self.data_cleaning = DataCleaning()
self.stacked_widget.addWidget(self.data_cleaning)
self.stacked_widget.setCurrentWidget(self.data_cleaning)
def show_zip_page(self):
if not self.Compression:
self.Compression = CompressionDecompression()
self.stacked_widget.addWidget(self.Compression)
self.stacked_widget.setCurrentWidget(self.Compression)
def select_directory(self):
if not self.file_directory:
self.file_directory = SelectDirectory()
self.stacked_widget.addWidget(self.file_directory)
self.stacked_widget.setCurrentWidget(self.file_directory)
# 清洗
class DataCleaning(Layout):
def __init__(self):
super().__init__()
self.save_path = None
self.file_path = None
self.layout = QVBoxLayout()
self.progress_bar = QProgressBar(self)
self.progress_bar.setGeometry(155, 235, 668, 28)
self.progress_bar.setVisible(False)
self.button_select = QPushButton("打开", self)
self.button_select.setFont(QFont('等线', 11))
self.button_select.setGeometry(10, 60, 130, 30)
self.button_select.clicked.connect(self.select_file)
self.button_save = QPushButton("另存为", self)
self.button_save.setFont(QFont('等线', 11))
self.button_save.setGeometry(10, 95, 130, 30)
self.button_save.clicked.connect(self.save_file)
self.button_run = QPushButton("开始清洗", self)
self.button_run.setFont(QFont('等线', 11))
self.button_run.setGeometry(10, 130, 130, 30)
self.button_run.clicked.connect(self.run)
self.button = QPushButton("清洗附加选项", self)
self.button.setFont(QFont('等线', 11))
self.button.setGeometry(155, 60, 130, 30)
self.button.clicked.connect(self.button_clicked)
self.label_1_1 = QLabel(self)
self.label_1_1.setGeometry(290, 50, 700, 50)
self.label_1_1.setFont(QFont("等线", 14)) # 设置字体和字号
self.label_1_1.setVisible(False)
self.input_text = QLineEdit(self) # 创建一个输入框
self.input_text.setGeometry(155, 125, 130, 30)
self.input_text.setVisible(False)
self.input_text.setValidator(QIntValidator())
self.label_info = QLabel("清洗指定列", self) # 创建一个标签
self.label_info.setFont(QFont('等线', 11))
self.label_info.setGeometry(155, 95, 130, 30) # 设置标签位置和大小
self.label_info.setVisible(False) # 初始时隐藏标签
self.label_info_1 = QLabel("根据指定列去重", self)
self.label_info_1.setFont(QFont('等线', 11))
self.label_info_1.setGeometry(155, 155, 130, 30)
self.label_info_1.setVisible(False)
self.input_text_1 = QLineEdit(self)
self.input_text_1.setGeometry(155, 185, 130, 30)
self.input_text_1.setVisible(False)
self.input_text_1.setValidator(QIntValidator())
# 指示灯
self.red_light = QLabel(self)
self.red_light.setGeometry(10, 170, 20, 20)
self.red_light.setStyleSheet("background-color: red")
self.red_light.setVisible(False)
self.green_light = QLabel(self)
self.green_light.setGeometry(35, 170, 20, 20)
self.green_light.setStyleSheet("background-color: green")
self.green_light.setVisible(False)
self.line_horizontal_1 = SeparatorLine1(self)
self.line_horizontal_1.setGeometry(150, 225, 10000, 2)
self.line_horizontal_1.setVisible(False)
def button_clicked(self):
self.label_1_1.setText('(温馨提示,下方的输入框只能输入数字,以下参数非必须参数,可不填)')
self.input_text.setVisible(not self.input_text.isVisible()) # 切换输入框的可见性
self.label_info.setVisible(self.input_text.isVisible())
self.input_text_1.setVisible(not self.input_text_1.isVisible()) # 切换输入框的可见性
self.label_info_1.setVisible(self.input_text_1.isVisible())
self.label_1_1.setVisible(not self.label_1_1.isVisible())
self.label_1_1.setVisible(self.label_1_1.isVisible())
def select_file(self): # 选择文件
file_dialog = QFileDialog()
file_path, _ = file_dialog.getOpenFileName(self, "选择文件", "", "All Files (*)")
if file_path:
self.file_path = file_path
def save_file(self): # 保存文件
file_dialog = QFileDialog()
file_dialog.setWindowTitle("另存为") # 修改对话框标题
file_dialog.setNameFilters(["Text Files (*.txt)", "CSV Files (*.csv)",
"Excel Files (*.xlsx *.xls)", "JSON Files (*.json *.jsonl)"])
file_dialog.selectNameFilter("Excel Files (*.xlsx")
if file_dialog.exec():
self.save_path = file_dialog.selectedFiles()[0]
if os.path.exists(self.save_path):
# noinspection PyUnresolvedReferences
result = QMessageBox.warning(self, "警告", "文件已存在,是否覆盖?",
QMessageBox.Yes | QMessageBox.No)
# noinspection PyUnresolvedReferences
if result == QMessageBox.Yes:
bag.Bag.save_excel([], self.save_path)
else:
self.save_path = ''
else:
bag.Bag.save_excel([], self.save_path)
def run(self): # 开始清洗
resp = judge(self.file_path)
total = len(resp)
def clean_1(ls, col_number, col_name):
self.red_light.setVisible(True)
result = []
for i, info in enumerate(tqdm(ls)):
mid = []
for value in info[col_number].split('\n'):
if re.sub(r'\s', '', value):
text = value.lstrip(',.?!;:,。?;:')
text1 = text.strip()
text2 = emoji.replace_emoji(text1, replace='<emoji>')
mid.append(text2)
else:
pass
info[col_number] = '\n'.join(mid)
result.append(info)
# 更新进度条的值
progress = int((i + 1) / total * 100)
self.progress_bar.setValue(progress)
QApplication.processEvents() # 刷新界面
sign = []
new_result = [item for item in result if item[col_name] not in sign and not sign.append(item[col_name])]
self.red_light.setVisible(False)
self.green_light.setVisible(True)
if bool(self.save_path):
bag.Bag.save_excel(new_result, self.save_path)
else:
self.save_file()
bag.Bag.save_excel(new_result, self.save_path)
time.sleep(1)
self.green_light.setVisible(False)
def clean_2(ls, col_number):
self.red_light.setVisible(True)
result = []
for i, info in enumerate(tqdm(ls)):
mid = []
for value in info[col_number].split('\n'):
if re.sub(r'\s', '', value):
text = value.lstrip(',.?!;:,。?;:')
text1 = text.strip()
text2 = emoji.replace_emoji(text1, replace='<emoji>')
mid.append(text2)
else:
pass
info[col_number] = '\n'.join(mid)
result.append(info)
progress = int((i + 1) / total * 100)
self.progress_bar.setValue(progress)
QApplication.processEvents() # 刷新界面
self.red_light.setVisible(False)
self.green_light.setVisible(True)
if bool(self.save_path):
bag.Bag.save_excel(result, self.save_path)
else:
self.save_file()
bag.Bag.save_excel(result, self.save_path)
time.sleep(1)
self.green_light.setVisible(False)
def clean_3(ls, col_name):
self.red_light.setVisible(True)
result = []
for i, info in enumerate(tqdm(ls)):
mid1 = []
for info1 in info:
mid = []
for value in info1.split('\n'):
if re.sub(r'\s', '', value):
text = value.lstrip(',.?!;:,。?;:')
text1 = text.strip()
text2 = emoji.replace_emoji(text1, replace='<emoji>')
mid.append(text2)
else:
pass
mid1.append('\n'.join(mid))
result.append(mid1)
progress = int((i + 1) / total * 100)
self.progress_bar.setValue(progress)
QApplication.processEvents() # 刷新界面
sign = []
new_result = [item for item in result if item[col_name] not in sign and not sign.append(item[col_name])]
self.red_light.setVisible(False)
self.green_light.setVisible(True)
if bool(self.save_path):
bag.Bag.save_excel(new_result, self.save_path)
else:
self.save_file()
bag.Bag.save_excel(new_result, self.save_path)
time.sleep(1)
self.green_light.setVisible(False)
def clean_4(ls):
self.red_light.setVisible(True)
result = []
for i, info in enumerate(tqdm(ls)):
mid1 = []
for info1 in info:
mid = []
for value in info1.split('\n'):
if re.sub(r'\s', '', value):
text = value.lstrip(',.?!;:,。?;:')
text1 = text.strip()
text2 = emoji.replace_emoji(text1, replace='<emoji>')
mid.append(text2)
else:
pass
mid1.append('\n'.join(mid))
result.append(mid1)
progress = int((i + 1) / total * 100)
self.progress_bar.setValue(progress)
QApplication.processEvents() # 刷新界面
self.red_light.setVisible(False)
self.green_light.setVisible(True)
if bool(self.save_path):
bag.Bag.save_excel(result, self.save_path)
else:
self.save_file()
bag.Bag.save_excel(result, self.save_path)
time.sleep(1)
self.green_light.setVisible(False)
additional_options = self.input_text.text() # 获取输入框的文本内容
additional_options_1 = self.input_text_1.text()
col_num = additional_options
tag_name = additional_options_1
"""判断是否夹带清洗附加条件"""
target = bool(col_num)
target_1 = bool(tag_name)
# 进度条
self.progress_bar.setVisible(True)
self.line_horizontal_1.setVisible(True)
# 处理异常
try:
if target and target_1: # 清洗指定列以及根据某某标签去重
clean_1(resp, int(col_num)-1, int(tag_name)-1)
elif target and not target_1: # 清洗子指定列,不去重
clean_2(resp, int(col_num)-1)
elif not target and target_1: # 清洗全文再根据某某列去重
clean_3(resp, int(tag_name)-1)
else: # 清洗全文,不去重
clean_4(resp)
QMessageBox.information(self, "完成", "数据清洗完成")
except Exception as e:
error_message = str(e)
QMessageBox.critical(self, "错误", error_message)
# 压缩、解压
class CompressionDecompression(Layout):
def __init__(self):
super().__init__()
self.layout = QVBoxLayout()
self.file_path = None
self.button_compress = QPushButton("压缩文件", self)
self.button_compress.setFont(QFont('等线', 11))
self.button_compress.setGeometry(10, 60, 130, 30)
self.button_compress.clicked.connect(self.on_button_clicked)
self.button_extract = QPushButton("解压文件", self)
self.button_extract.setFont(QFont('等线', 11))
self.button_extract.setGeometry(10, 95, 130, 30)
self.button_extract.clicked.connect(self.on_button_clicked1)
def on_button_clicked(self):
selected_file = bool(self.file_path)
if selected_file:
extract_path = QFileDialog.getExistingDirectory(self, "另存为")
if bool(extract_path):
unzip_file(selected_file, extract_path)
QMessageBox.information(self, 'success', '解压成功!')
else:
QMessageBox.critical(self, 'error', '解压失败')
else:
self.select_file()
extract_path = QFileDialog.getExistingDirectory(self, "另存为")
if bool(extract_path):
unzip_file(self.file_path, extract_path)
QMessageBox.information(self, 'success', '解压成功!')
else:
QMessageBox.critical(self, 'error', '解压失败')
def on_button_clicked1(self):
selected_file = bool(self.file_path)
if selected_file:
extract_path = QFileDialog.getExistingDirectory(window, "选择文件夹", options=QFileDialog.ShowDirsOnly)
if bool(extract_path):
self.compress_to_zip(selected_file, extract_path)
QMessageBox.information(self, 'success', '压缩成功!')
else:
QMessageBox.critical(self, 'error', '压缩失败')
else:
self.select_folder()
extract_path = QFileDialog.getExistingDirectory(window, "选择文件夹", options=QFileDialog.ShowDirsOnly)
if bool(extract_path):
self.compress_to_zip(self.file_path, extract_path)
QMessageBox.critical(self, 'success', '压缩成功!')
else:
QMessageBox.critical(self, 'error', '压缩失败')
def select_file(self): # 选择文件
file_dialog = QFileDialog()
file_path, _ = file_dialog.getOpenFileName(self, "选择文件", "", "All Files (*)")
if file_path:
self.file_path = file_path
def select_folder(self):
file_path, _ = QFileDialog.getExistingDirectory(window, "选择文件夹", options=QFileDialog.ShowDirsOnly)
if file_path:
self.file_path = file_path
def compress_to_zip(self, file_path, zip_path):
with zipfile.ZipFile(zip_path, 'w') as zipf:
zipf.write(file_path, arcname='compressed_file.txt')
# 合并表格
# noinspection PyMethodMayBeStatic
class SelectDirectory(Layout):
def __init__(self):
super().__init__()
self.layout = QVBoxLayout()
self.button_extract = QPushButton("选择合并文件路径", self)
self.button_extract.setFont(QFont('等线', 11))
self.button_extract.setGeometry(10, 60, 130, 30)
self.button_extract.clicked.connect(self.choose_folder)
self.progress_bar = QProgressBar(self)
self.progress_bar.setGeometry(155, 100, 668, 28)
self.progress_bar.setVisible(False)
self.red_light = QLabel(self)
self.red_light.setGeometry(10, 100, 20, 20)
self.red_light.setStyleSheet("background-color: red")
self.red_light.setVisible(False)
self.green_light = QLabel(self)
self.green_light.setGeometry(35, 100, 20, 20)
self.green_light.setStyleSheet("background-color: green")
self.green_light.setVisible(False)
def choose_folder(self):
# noinspection PyUnresolvedReferences
self.red_light.setVisible(True)
self.progress_bar.setVisible(True)
folder = QFileDialog.getExistingDirectory(window, "选择文件夹", options=QFileDialog.ShowDirsOnly)
save_path = folder+'\\'+'合并结果.xlsx'
# 清空前一次合并结果
if os.path.isfile(save_path):
os.remove(save_path)
else:
pass
file_name = os.listdir(folder)
consolidated_number = file_name # 合并索引,后期需要可优化
total = len(consolidated_number)
workbook = xlsxwriter.Workbook(save_path, options={'strings_to_urls': False})
worksheet = workbook.add_worksheet('Sheet1')
count_1 = 0
n = 0
for i, _ in enumerate(tqdm(consolidated_number)):
path = folder + '\\' + _.replace('\n', '')
book = openpyxl.load_workbook(path)
sheet = book['Sheet1']
rows = sheet.max_row
columns = sheet.max_column
a = 1
if n == 0:
a = 0
for k in range(a, rows):
for j in range(columns):
worksheet.write(n, j, str(sheet.cell(k + 1, j + 1).value))
n = n + 1
count_1 += 1
book.close()
# 更新进度条的值
progress = int((i + 1) / total * 100)
self.progress_bar.setValue(progress)
QApplication.processEvents() # 刷新界面
workbook.close()
self.red_light.setVisible(False)
self.green_light.setVisible(True)
QMessageBox.information(self, "success", f'合并完成,合并文件路径:{folder}/合并文件.xlsx')
time.sleep(3)
self.green_light.setVisible(False)
@staticmethod
def save_excel(_ls, path, _sheet_name='Sheet1', batch_size=10000):
_book = xlsxwriter.Workbook(path, options={'strings_to_urls': False})
_sheet = _book.add_worksheet(_sheet_name)
batch_size = batch_size
for i in range(0, len(_ls), batch_size):
batch_data = _ls[i:i + batch_size]
for _i in range(len(batch_data)):
for __i in range(len(batch_data[_i])):
_sheet.write(_i + i, __i, json.dumps(batch_data[_i][__i], ensure_ascii=False, indent=2))
_book.close()
if __name__ == "__main__":
app = QApplication(sys.argv)
window = Layout()
window.show()
sys.exit(app.exec())
最后,如果你觉得本教程对你有所帮助,不妨点赞并关注我的CSDN账号。我会持续为大家带来更多有趣且实用的教程和资源。谢谢大家的支持!