(已完成)使用pandas实现批量excel导入,合并为csv导出(含配套数据)

Task:将压缩包中的excel文件合并为一个csv并导出

思路:

  1. 使用os.listdir获取指定目录下的excel文件名
  2. pandas读取excel文件,将读取到的多个dataframe合并为一个dataframe(用到多线程)
  3. dataframe导出为csv

数据:配套数据下载链接

代码:

import pandas as pd
import os
import threading
from concurrent.futures import ThreadPoolExecutor

lock = threading.Lock()


def get_file_list(path):
    file_list = os.listdir(path)
    lis = []
    for i in file_list:
        if i.split('.')[1] == 'xls' or i.split('.')[1] == 'xlsx':
            lis.append(path+i)
    print(f'total files:{len(lis)}')
    return lis


def get_excel(file):
    column = ['ZIPCODE', 'AGI_STUB', 'N1', 'MARS2', 'PREP', 'N2', 'NUMDEP', 'A00100', 'N00200', 'A00200', 'N00300',
              'A00300', 'N00600', 'A00600', 'N00650', 'A00650', 'N00900', 'A00900', 'SCHF', 'N01000', 'A01000',
              'N01400',
              'A01400', 'N01700', 'A01700', 'N02300', 'A02300', 'N02500', 'A02500', 'N03300', 'A03300', 'N04470',
              'A04470',
              'N18425', 'A18425', 'N18450', 'A18450', 'N18500', 'A18500', 'N18300', 'A18300', 'N19300', 'A19300',
              'N19700',
              'A19700', 'N04800', 'A04800', 'N07100', 'A07100', 'N07220', 'A07220', 'N07180', 'A07180', 'N07260',
              'A07260',
              'N59660', 'A59660', 'N59720', 'A59720', 'N11070', 'A11070', 'N09600', 'A09600', 'N06500', 'A06500',
              'N10300',
              'A10300', 'N11901', 'A11901', 'N11902', 'A11902']
    object_colunmns = ['STATEFIPS', 'STATE', 'ZIPCODE']
    with lock:
        df = pd.read_excel(file)
        df2 = df.iloc[5:-14]
        df2.columns = column
        df2 = df2.dropna(subset=['AGI_STUB'])
        df2 = df2.drop(df2[df2['AGI_STUB'] == 'Total'].index)
        df2['AGI_STUB'].replace(['$1 under $25,000', '$25,000 under $50,000', '$50,000 under $75,000',
                                '$75,000 under $100,000', '$100,000 under $200,000', '$200,000 or more'],
                               range(1, 7), inplace=True)
        df2.insert(0, 'STATEFIPS', file.rsplit("zp")[-1][0:2])
        df2.insert(1, 'STATE', df.columns[0][0:2])
        for i in set(df2.columns) - set(object_colunmns):
            df2[i] = df2[i].astype('int')
    return df2


def work(in_path):
    dfs = pd.DataFrame()
    with ThreadPoolExecutor() as pool:
        results = pool.map(get_excel, get_file_list(in_path))
        for result in results:
            dfs = dfs.append(result)
    return dfs


def to_allagi(df, out_path):
    df.to_csv(f'{out_path}allagi.csv', index= False)


def to_allnoagi(df, out_path):
    df = df.groupby(['STATEFIPS', 'STATE', 'ZIPCODE']).sum()
    df.to_csv(f'{out_path}allnoagi.csv')


if __name__ == '__main__':
    # 需要处理的文件路径
    in_path = r'C:/Users/Administrator/work/file/'
    # 需要导出的文件路径
    out_path = r'C:/Users/Administrator/work/'
    data = work(in_path)
    to_allagi(data, out_path)
    to_allnoagi(data, out_path)
    print('Finish')

  • 2
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Turambar_Zheng

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值