抓取CSV文件指定内容到Excel表格中

两只阳
已于 2024-04-28 19:25:33 修改
阅读量90
点赞数 4
文章标签： excel python windows
于 2024-04-11 13:32:54 首次发布
本文链接：https://blog.csdn.net/weixin_41803650/article/details/137635976
版权
import pandas as pd  
from openpyxl import load_workbook  
import os
import openpyxl 
from openpyxl.drawing.image import Image
import sys
from openpyxl.styles import Font, Border, Side  
from openpyxl.styles import Alignment  
from openpyxl.styles import PatternFill  
import random 
import glob
import natsort

SHEET_NAMES = ['BK11', 'P', 'H', 'W', 'T', 'HL', 'J', 'Z', 'C', 'R', 'DB', 'LV']
BK_MAP = {'B0B2ZTZZ3F': 'BK11', 'B0C32WC8S2': 'W', 'B0BRPMS42P': 'H', 'B0C8MCJX7M': 'J',
       'B0C32W5HR7': 'HL', 'B0C8MJD479': 'Z', 'B0BJ6W8XGL': 'P', 'B0C32YF1RK': 'T', 'B0CNK95SS7': 'C'}

# 取出文件夹中的csv文件，按数字顺序排序，并给出开始的时间。
# 遍历解析csv文件，得到每个工作表中的关键数值。
# 为了方便Excel表格的数据写入，那就按工作表来存放数值。
# 每个工作表，如BK11中，对每个值，存放：时间，流量(会话次数)，销量(已订购商品数量)
class ParseCsvs2Excel():
    def __init__(self, csv_folder='./csv_folder', pattern=True) -> None:
        self._get_input()
        self.csv_folder = csv_folder
        self.bk11_s = []
        self.p_s = []
        self.h_s = []
        self.w_s = []
        self.t_s = []
        self.hl_s = []
        self.j_s = []
        self.z_s = []
        self.c_s = []
        self.r_s = []
        self.db_s = []

        self.pattern = pattern
        self._set_color()
                
        # 获取当前工作目录  
        current_dir = os.getcwd()  

        # 使用glob模块搜索当前目录下的所有.xlsx文件  
        xlsx_files = glob.glob(os.path.join(current_dir, '*.xlsx'))  
        # 打印找到的文件, 一般只有1个，先取1个吧  
        if len(xlsx_files) > 1: 
            print("存在多个xlsx格式的文件, 请小芳芳保留1个, 不然跑不通")
            sys.exit(-1)
        self.xlsx_path = xlsx_files[0]                  
        self.workbook = openpyxl.load_workbook(self.xlsx_path)  

        # 在当前文件夹中创建new_xlsw文件夹，用于存放新生成的xlsw表格数据。
        self.save_xlsw_dir = os.path.join(current_dir, 'new_xlsw')
        os.makedirs(self.save_xlsw_dir, exist_ok=True)
         
        
    def _set_color(self):
        # 定义“红橙黄绿青蓝紫”对应的RGB颜色值  
        colors = {  
            '红': "E60000",  # 暗红色，降低了亮度  
            '橙': "FFA500",  # 橙色保持不变，因为它通常已经比较柔和  
            '黄': "FFFF99",  # 淡黄色，提高了亮度但降低了饱和度  
            '绿': "00B300",  # 深绿色，降低了亮度  
            '青': "00E6E6",  # 淡青色，提高了亮度但降低了饱和度  
            '蓝': "0066FF",  # 深蓝色，降低了亮度  
            '紫': "9966CC",  # 淡紫色，提高了亮度并稍微改变了色调  
        }
        
        # 随机选择一个颜色  
        random_color = random.choice(list(colors.values()))          
        # 创建填充样式  
        self.fill = PatternFill(start_color=random_color, end_color=random_color, fill_type="solid") 
        # 创建边框样式  
        self.border = Border(left=Side(border_style='thin', color='000000'),  
                        right=Side(border_style='thin', color='000000'),  
                        top=Side(border_style='thin', color='000000'),  
                        bottom=Side(border_style='thin', color='000000'))


    # 1先解析csv的指定内容到BK11中去 2解析更多内容，解析到子表格 3解析多个csv
    def main(self):# 20240305  
        files = os.listdir(self.csv_folder)
        if len(files) > 10:
            print(f'小芳芳: CSV文件超过10个, 现在可以分析了')
        # 筛选出.csv文件，并排序好。
        csv_s = []
        for file in files:
            if file.endswith('.csv'): 
                csv_s.append(file)           
        # new_csv_s = sorted(csv_s)
        # 使用natsort进行排序  
        new_csv_s = natsort.natsorted(csv_s)        
        new_csv_s = [new_csv_s[-1]] + new_csv_s[:-1]

        for file in new_csv_s:
            csv_path = os.path.join(self.csv_folder, file)
            # 读取CSV文件  
            print(f'Read csv path: {csv_path}')
            self.parse_csv(csv_path, self.start_date)
            self.start_date += 1
        self.deal_multi_sheet_excel()

    def _get_input(self):
        date_input = input("请输入日期(如: 20240306)（格式：YYYYMMDD）：\n")  
        try:  
            print('输入长度:',len(date_input))
            if (len(date_input) >= 8):
                # 尝试将输入的日期转换为整数  
                self.start_date = int(date_input[0:8]) 
                print(f'输入的参数为:{self.start_date}') 
            else:
                print("小芳芳, 输入长度必须为8, 如: 20240306。否则无法运行。")
                return
        except ValueError:  
            # 如果转换失败，打印错误信息并退出  
            print("输入的日期格式不正确，请使用YYYYMMDD的格式输入。")  
            return          

    def parse_csv(self, csv_file, date):
        df = pd.read_csv(csv_file) # df: DataFrame
        print('df index:', df.index)
        print(df.iloc[0,1])
        asin_lines = df['（子）ASIN']
        session_count = df['会话次数 – 总计']
        ordered_count = df['已订购商品数量']
        for i, asin in enumerate(asin_lines):
            if asin in BK_MAP:
                bk_name = BK_MAP[asin] 
                bk = []
                bk.append(date)
                session = str(session_count.iloc[i])
                print(f'session: {session}')
                # 原本是用于将表格中的去除。这里加个判断。
                if ',' in session:
                    session = session.replace(',', '')
                bk.append(int(session))
                bk.append(int(ordered_count.iloc[i]))
                print(f'{bk_name}, value: {bk}') 

                if bk_name == 'BK11': # 
                    self.bk11_s.append(bk)
                elif bk_name == 'P':
                    self.p_s.append(bk)
                elif bk_name == 'H':
                    self.h_s.append(bk)
                elif bk_name == 'W':
                    self.w_s.append(bk)
                elif bk_name == 'T':
                    self.t_s.append(bk)
                elif bk_name == 'HL':                
                    self.hl_s.append(bk)
                elif bk_name == 'J':    
                    self.j_s.append(bk)
                elif bk_name == 'Z':
                    self.z_s.append(bk)
                elif bk_name == 'C':
                    self.c_s.append(bk)
                elif bk_name == 'R':
                    self.r_s.append(bk)
                elif bk_name == 'DB':
                    self.db_s.append(bk)                

    def deal_multi_sheet_excel(self):  
        # self.workbook = openpyxl.load_workbook(self.xlsx_path)  
        # 遍历所有工作表  
        last_time = ''
        for sheet_name in self.workbook.sheetnames:  
            sheet = self.workbook[sheet_name]  
            print(f"工作表: {sheet_name}")  
            if sheet_name in SHEET_NAMES: #不处理“关键词”，“竞品分析表”等表格。 
                cell2_time_index, cell2_sales_index, cell3_flow_index, cell3_convert_index, cell2_remark_index = self.find_target_index(sheet)
                # 存储了如BK11多天的数据，[[时间，流量，销量], []]
                sheet_infos = self.get_sheet_infos(sheet_name)
                for sheet_info in sheet_infos:
                    sheet_time = str(sheet_info[0]) # 20240314
                    last_time = sheet_time
                    sheet_year = int(sheet_time[0:4])
                    sheet_month = int(sheet_time[4:6])
                    sheet_day = int(sheet_time[6:8])

                    sheet_flow = sheet_info[1]
                    sheet_sale = sheet_info[2]
                    # 遍历工作表的行（从第四行开始，即数据行）  
                    for index, row in enumerate(sheet.iter_rows(min_row=4, values_only=False)):  # 使用values_only=False以获取单元格对象  
                        time_cell = row[cell2_time_index]  # 时间所在索引  
                        cell_time = time_cell.value  
                        if cell_time is None: break # 空的值，就不继续遍历了 
                        year, month, day = cell_time.year, cell_time.month, cell_time.day 
                        # 读取所在列的格式，仅读取一次。字体，线条，对齐
                        if index == 0:
                            flow_font = row[cell3_flow_index].font  
                            sales_font = row[cell2_sales_index].font  
                            convert_font = row[cell3_convert_index].font
                            target_border = self.get_cell_border(row[cell3_flow_index]) #
                            font_alignment = row[cell3_flow_index].alignment
                            sales_alignment = row[cell2_sales_index].alignment
                            convert_alignment = row[cell3_convert_index].alignment
                        # 按照时间，找到所在行，然后修改其中的“流量，销量，转化率”的值。
                        if year==sheet_year and month==sheet_month and day==sheet_day:  # 检查时间是否匹配
                            if self.pattern: self.fill_color(sheet, index+4, cell2_remark_index+2) # 给所在行上色，要额外加4行                                                      
                            flow_cell = row[cell3_flow_index]  
                            sales_cell = row[cell2_sales_index]  
                            convert_cell = row[cell3_convert_index]  
                            print(f"时间:{year}-{month}-{day}, 修改前--流量:{flow_cell.value}, 销量:{sales_cell.value}, 转化率:{convert_cell.value}")  
                            # 修改流量和销量的值  
                            flow_cell.value = sheet_flow                              
                            sales_cell.value = sheet_sale
                            convert_cell.value = round(sales_cell.value/flow_cell.value, 4)
                            print(f"修改后-流量:{flow_cell.value}, 销量:{sales_cell.value}, 转化率:{convert_cell.value}")  
                            # 格式刷
                            flow_cell.font = Font(name=flow_font.name, size=flow_font.size, bold=flow_font.bold, italic=flow_font.italic, color=flow_font.color)            
                            sales_cell.font = Font(name=sales_font.name, size=sales_font.size, bold=sales_font.bold, italic=sales_font.italic, color=sales_font.color)            
                            convert_cell.font = Font(name=convert_font.name, size=convert_font.size, bold=convert_font.bold, italic=convert_font.italic, color=convert_font.color)            
                            flow_cell.border = target_border
                            sales_cell.border = target_border
                            convert_cell.border = target_border
                            flow_cell.alignment = Alignment(horizontal=font_alignment.horizontal,  
                                                            vertical=font_alignment.vertical,  
                                                            text_rotation=font_alignment.text_rotation,  
                                                            wrap_text=font_alignment.wrap_text,  
                                                            shrink_to_fit=font_alignment.shrink_to_fit,  
                                                            indent=font_alignment.indent) 
                            sales_cell.alignment = Alignment(horizontal=sales_alignment.horizontal,  
                                                            vertical=sales_alignment.vertical,  
                                                            text_rotation=sales_alignment.text_rotation,  
                                                            wrap_text=sales_alignment.wrap_text,  
                                                            shrink_to_fit=sales_alignment.shrink_to_fit,  
                                                            indent=sales_alignment.indent)
                            convert_cell.alignment = Alignment(horizontal=convert_alignment.horizontal,  
                                                            vertical=convert_alignment.vertical,  
                                                            text_rotation=convert_alignment.text_rotation,  
                                                            wrap_text=convert_alignment.wrap_text,  
                                                            shrink_to_fit=convert_alignment.shrink_to_fit,  
                                                            indent=convert_alignment.indent)                                                        
                            
                        # else:  
                        #     print(f"{sheet_name}工作表, 单元格时间:{year}-{month}-{day}, 未找到所有需要的列。")  
          
        # 保存修改后的工作簿 
        file_name = os.path.basename(self.xlsx_path) 
        save_path = os.path.join(self.save_xlsw_dir, file_name)
        save_path = f'{save_path[:-5]}_{last_time}.xlsx'
        self.workbook.save(save_path)
        print(f'小芳芳, 新的Excel文件名及保存地址: {save_path}')                 
        
    def fill_color(self, sheet, row, last_col):
        # # 定义要填充的颜色  
        # fill_color = "FFFF00"  # 黄色，这里是RGB的十六进制表示，但没有alpha通道，所以实际上是BGR顺序  
        # fill = PatternFill(start_color=fill_color, end_color=fill_color, fill_type="solid")  
  
        # 给第5行的前20列上颜色  
        for col in range(1, last_col):  # 在Excel中列是从1开始的，所以范围是1到20  
            cell = sheet.cell(row, column=col)  
            cell.fill = self.fill      
            cell.border = self.border  # 应用边框样式                

    def get_cell_border(self, source_cell):
        source_border = source_cell.border          
        # 复制边框可能会稍微复杂一些，因为您需要复制每条边的属性  
        top = Side(style=source_border.top.style, color=source_border.top.color)  
        right = Side(style=source_border.right.style, color=source_border.right.color)  
        bottom = Side(style=source_border.bottom.style, color=source_border.bottom.color)  
        left = Side(style=source_border.left.style, color=source_border.left.color)  
        
        target_border = Border(top=top, right=right, bottom=bottom, left=left)
        return target_border     

    def find_target_index(self, sheet):
        cell2_time_index = -1
        cell2_sales_index = -1
        cell3_flow_index = -1
        cell3_convert_index = -1
        cell2_remark_index = -1            
        for i,cell in enumerate(sheet[2]):
            if cell.value == '时间':
                print(f'时间列索引: {cell.value}, index: {i}')
                cell2_time_index = i
            if cell.value == '销量':
                print(f'销量列索引: {cell.value}, index: {i}')
                cell2_sales_index = i
            if cell.value == '备注':
                print(f'备注列索引: {cell.value}, index: {i}')
                cell2_remark_index = i
        for j,cell in enumerate(sheet[3]):
            if cell.value == '流量':
                print(f'流量列索引: {cell.value}, index: {j}')
                cell3_flow_index = j    
            if cell.value == '转化率':
                cell3_convert_index = j
        if cell2_time_index<0 or cell2_sales_index<0 or cell3_flow_index<0 or cell3_convert_index<0:
            print(f'异常, 有需要的列索引未找到: {cell2_time_index}, {cell2_sales_index}, {cell3_flow_index}, {cell3_convert_index}')
            sys.exit(-1)  
        return cell2_time_index, cell2_sales_index, cell3_flow_index, cell3_convert_index, cell2_remark_index   

    def get_sheet_infos(self, sheet_name):
            sheet_informations = []
            if sheet_name == 'BK11': # ['BK11', 'P', 'H', 'W', 'T', 'HL', 'J', 'Z', 'C', 'R', 'DB', 'LV']
                sheet_informations = self.bk11_s
            elif sheet_name == 'P':
                sheet_informations = self.p_s
            elif sheet_name == 'H':
                sheet_informations = self.h_s
            elif sheet_name == 'W':
                sheet_informations = self.w_s
            elif sheet_name == 'T':
                sheet_informations = self.t_s
            elif sheet_name == 'HL':
                sheet_informations = self.hl_s
            elif sheet_name == 'J':
                sheet_informations = self.j_s
            elif sheet_name == 'Z':
                sheet_informations = self.z_s
            elif sheet_name == 'C':
                sheet_informations = self.c_s
            elif sheet_name == 'R':
                sheet_informations = self.r_s
            elif sheet_name == 'DB':
                sheet_informations = self.db_s
            return sheet_informations



if __name__ == '__main__':
    parseCsvs = ParseCsvs2Excel(pattern=False)
    parseCsvs.main()