（已完成）使用pandas实现批量excel导入，合并为csv导出（含配套数据）

最新推荐文章于 2024-05-15 13:58:42 发布

Turambar_Zheng

最新推荐文章于 2024-05-15 13:58:42 发布

阅读量1.4k

点赞数 2

分类专栏： Python 文章标签： python pandas excel csv

本文链接：https://blog.csdn.net/Turambar_Zheng/article/details/121675183

版权

Python 专栏收录该内容

3 篇文章 0 订阅

订阅专栏

Task：将压缩包中的excel文件合并为一个csv并导出

思路：

使用os.listdir获取指定目录下的excel文件名
pandas读取excel文件，将读取到的多个dataframe合并为一个dataframe（用到多线程）
dataframe导出为csv

数据：配套数据下载链接

代码：

import pandas as pd
import os
import threading
from concurrent.futures import ThreadPoolExecutor

lock = threading.Lock()


def get_file_list(path):
    file_list = os.listdir(path)
    lis = []
    for i in file_list:
        if i.split('.')[1] == 'xls' or i.split('.')[1] == 'xlsx':
            lis.append(path+i)
    print(f'total files:{len(lis)}')
    return lis


def get_excel(file):
    column = ['ZIPCODE', 'AGI_STUB', 'N1', 'MARS2', 'PREP', 'N2', 'NUMDEP', 'A00100', 'N00200', 'A00200', 'N00300',
              'A00300', 'N00600', 'A00600', 'N00650', 'A00650', 'N00900', 'A00900', 'SCHF', 'N01000', 'A01000',
              'N01400',
              'A01400', 'N01700', 'A01700', 'N02300', 'A02300', 'N02500', 'A02500', 'N03300', 'A03300', 'N04470',
              'A04470',
              'N18425', 'A18425', 'N18450', 'A18450', 'N18500', 'A18500', 'N18300', 'A18300', 'N19300', 'A19300',
              'N19700',
              'A19700', 'N04800', 'A04800', 'N07100', 'A07100', 'N07220', 'A07220', 'N07180', 'A07180', 'N07260',
              'A07260',
              'N59660', 'A59660', 'N59720', 'A59720', 'N11070', 'A11070', 'N09600', 'A09600', 'N06500', 'A06500',
              'N10300',
              'A10300', 'N11901', 'A11901', 'N11902', 'A11902']
    object_colunmns = ['STATEFIPS', 'STATE', 'ZIPCODE']
    with lock:
        df = pd.read_excel(file)
        df2 = df.iloc[5:-14]
        df2.columns = column
        df2 = df2.dropna(subset=['AGI_STUB'])
        df2 = df2.drop(df2[df2['AGI_STUB'] == 'Total'].index)
        df2['AGI_STUB'].replace(['$1 under $25,000', '$25,000 under $50,000', '$50,000 under $75,000',
                                '$75,000 under $100,000', '$100,000 under $200,000', '$200,000 or more'],
                               range(1, 7), inplace=True)
        df2.insert(0, 'STATEFIPS', file.rsplit("zp")[-1][0:2])
        df2.insert(1, 'STATE', df.columns[0][0:2])
        for i in set(df2.columns) - set(object_colunmns):
            df2[i] = df2[i].astype('int')
    return df2


def work(in_path):
    dfs = pd.DataFrame()
    with ThreadPoolExecutor() as pool:
        results = pool.map(get_excel, get_file_list(in_path))
        for result in results:
            dfs = dfs.append(result)
    return dfs


def to_allagi(df, out_path):
    df.to_csv(f'{out_path}allagi.csv', index= False)


def to_allnoagi(df, out_path):
    df = df.groupby(['STATEFIPS', 'STATE', 'ZIPCODE']).sum()
    df.to_csv(f'{out_path}allnoagi.csv')


if __name__ == '__main__':
    # 需要处理的文件路径
    in_path = r'C:/Users/Administrator/work/file/'
    # 需要导出的文件路径
    out_path = r'C:/Users/Administrator/work/'
    data = work(in_path)
    to_allagi(data, out_path)
    to_allnoagi(data, out_path)
    print('Finish')