pandas合并excel

最新推荐文章于 2024-08-21 14:05:40 发布

python_tty

最新推荐文章于 2024-08-21 14:05:40 发布

阅读量994

点赞数 2

分类专栏： pandas 文章标签： pandas excel

本文链接：https://blog.csdn.net/python_tty/article/details/102398443

版权

pandas 专栏收录该内容

3 篇文章 0 订阅

订阅专栏

工作中有个需求需要把100多个excel文件合并成一个，并按照指定的列排序，查了查，用pandas处理做好。
备注：111个excel文件，总共有200多万条数据，101M数据
代码：

import os
import sys
import pandas as pd
import datetime

BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(BASE_DIR)

EXECL_DIRS = ["xiaoying", "xiaoshu"]     # xlsx文件所在的目录
EXECL_FIELS_MAP = {"xueqian": "学前包含图片的题目", "xiaoyu": "小语包含图片的题目", "xiaoying": "小英包含图片的题目", "xiaoshu": "小数包含图片的题目"}


def combine_execl_files():
    """
    组合execl
    :return: 
    """
    def deal_url(val):
        if not val.startswith('"'):
            val = '"'+val+'"'
        return val

    for item in EXECL_DIRS:
        dfs = []
        print("begin to deal %s" % item)
        start_time = datetime.datetime.now()
        path_dir = os.path.join(BASE_DIR, 'tools/%s/'% item)
        files = os.listdir(path_dir)
        for file in files:
            file_path = os.path.join(path_dir, file)
            try:
                df = pd.read_excel(file_path)
                if not df.empty:
                    df['图片url'] = df['图片url'].apply(deal_url)
                    dfs.append(df)
            except Exception as exc:
                print("error file_path: %s     error:%s" % (file_path, str(exc)))

        print("total df number: %s" % len(dfs))
        new_df = pd.concat(dfs, ignore_index=True)
        new_df = new_df.sort_values(by=['图片url'])      # 按某一列进行排序
        print("total rows: %s   total cost time:%s " % (len(new_df), datetime.datetime.now()-start_time))

        df_cnt = len(new_df)
        step = 500000          # 数据超过200万条，故分批处理
        numbers = int(df_cnt/step) + 1
        for i in range(1, numbers+1):
            start = (i-1) * step
            end = i * step
            print("start:%s   end:%s " % (start, end))
            sub_df = new_df[start:end]
            a_file_name = os.path.join(BASE_DIR, 'tools/%s_%s.xlsx' % (EXECL_FIELS_MAP.get(item), i))
            sub_df.to_excel(a_file_name, index=False, encoding='utf8')
            print("write data to excel success !!!")


if __name__ == "__main__":
    combine_execl_files()