Task:将压缩包中的excel文件合并为一个csv并导出
思路:
- 使用os.listdir获取指定目录下的excel文件名
- pandas读取excel文件,将读取到的多个dataframe合并为一个dataframe(用到多线程)
- dataframe导出为csv
![](https://i-blog.csdnimg.cn/blog_migrate/d1fbf7ef55e573032ce6731a7dc521ea.png)
代码:
import pandas as pd
import os
import threading
from concurrent.futures import ThreadPoolExecutor
lock = threading.Lock()
def get_file_list(path):
file_list = os.listdir(path)
lis = []
for i in file_list:
if i.split('.')[1] == 'xls' or i.split('.')[1] == 'xlsx':
lis.append(path+i)
print(f'total files:{len(lis)}')
return lis
def get_excel(file):
column = ['ZIPCODE', 'AGI_STUB', 'N1', 'MARS2', 'PREP', 'N2', 'NUMDEP', 'A00100', 'N00200', 'A00200', 'N00300',
'A00300', 'N00600', 'A00600', 'N00650', 'A00650', 'N00900', 'A00900', 'SCHF', 'N01000', 'A01000',
'N01400',
'A01400', 'N01700', 'A01700', 'N02300', 'A02300', 'N02500', 'A02500', 'N03300', 'A03300', 'N04470',
'A04470',
'N18425', 'A18425', 'N18450', 'A18450', 'N18500', 'A18500', 'N18300', 'A18300', 'N19300', 'A19300',
'N19700',
'A19700', 'N04800', 'A04800', 'N07100', 'A07100', 'N07220', 'A07220', 'N07180', 'A07180', 'N07260',
'A07260',
'N59660', 'A59660', 'N59720', 'A59720', 'N11070', 'A11070', 'N09600', 'A09600', 'N06500', 'A06500',
'N10300',
'A10300', 'N11901', 'A11901', 'N11902', 'A11902']
object_colunmns = ['STATEFIPS', 'STATE', 'ZIPCODE']
with lock:
df = pd.read_excel(file)
df2 = df.iloc[5:-14]
df2.columns = column
df2 = df2.dropna(subset=['AGI_STUB'])
df2 = df2.drop(df2[df2['AGI_STUB'] == 'Total'].index)
df2['AGI_STUB'].replace(['$1 under $25,000', '$25,000 under $50,000', '$50,000 under $75,000',
'$75,000 under $100,000', '$100,000 under $200,000', '$200,000 or more'],
range(1, 7), inplace=True)
df2.insert(0, 'STATEFIPS', file.rsplit("zp")[-1][0:2])
df2.insert(1, 'STATE', df.columns[0][0:2])
for i in set(df2.columns) - set(object_colunmns):
df2[i] = df2[i].astype('int')
return df2
def work(in_path):
dfs = pd.DataFrame()
with ThreadPoolExecutor() as pool:
results = pool.map(get_excel, get_file_list(in_path))
for result in results:
dfs = dfs.append(result)
return dfs
def to_allagi(df, out_path):
df.to_csv(f'{out_path}allagi.csv', index= False)
def to_allnoagi(df, out_path):
df = df.groupby(['STATEFIPS', 'STATE', 'ZIPCODE']).sum()
df.to_csv(f'{out_path}allnoagi.csv')
if __name__ == '__main__':
# 需要处理的文件路径
in_path = r'C:/Users/Administrator/work/file/'
# 需要导出的文件路径
out_path = r'C:/Users/Administrator/work/'
data = work(in_path)
to_allagi(data, out_path)
to_allnoagi(data, out_path)
print('Finish')