题记:之前写的拆分excel是一个表一个表格输入,这样不方便,遇到多表时耗费的时间长。现在改进是写入文件夹,然后用os.walk遍历存储所有excel表再进行字符串的截取,从而实现批量操作excel表。
import pandas as pd
from time import *
import os
begin_time = time() #程序运行时间
files = []
file_path = r'C:\Users\user\Desktop\test\PO0028'
excel_data_num = 2000 # 拆分每个excel里的数据量
def SequenceNums(sta, end):
numbers = []
for i in range(sta, end + 1):
numbers.append(i)
return numbers
for dirpath,dirnames,filenames in os.walk(file_path):
for file in filenames:
if file.endswith('.xlsx') or file.endswith('.xls'):
files.append(dirpath + '\\' + file)
for f in files:
p1 = ''
str1 = f.split("\\")
for i in range(str1.__len__() - 1):
p1 += str1[i] + '\\'
dirPath = p1 + '\\'
dirPath2 = f[:f.index(".xls")]+'\\\\'
path = f #excel路径
orgName = pd.read_excel(path)
SerialNumber_list = list(orgName['ID'].drop_duplicates())
excel_alldata_num = SerialNumber_list.__len__() # excel的总数据量
if excel_alldata_num >excel_data_num:
BarCode_list = list(orgName['BarCode'].drop_duplicates())
os.mkdir(dirPath2)
BarCode = BarCode_list[0]
print(SerialNumber_list)
print(SerialNumber_list[0])
excel_num = excel_alldata_num // excel_data_num
if excel_alldata_num % excel_data_num != 0:
excel_num += 1
print("生成总数量:"+str(excel_num))
#pd.read_excel(path, skiprows = 0, nrows=80)表示不跳过任何行,取前80行的数据;
#pd.read_excel(path, skiprows = 80, nrows=20)表示跳过前80行,取剩下20行的数据。
#pd.read_excel(path, skiprows = 0, nrows=100)表示不跳过,读取全部数据。
for i in range(excel_num):
#for j in range(excel_data_num*i,excel_data_num*(i+1)-1):
print(i)
if i == 0:
tempdata = pd.read_excel(path, dtype='object',skiprows=0, nrows=excel_data_num)
writer = pd.ExcelWriter(dirPath2 + str(BarCode) + '-' + str(SerialNumber_list[0]) + '-' + str(SerialNumber_list[excel_data_num]-1) + '.xlsx')
elif i != excel_num-1:
tempdata = pd.read_excel(path,dtype='object',skiprows=SequenceNums(1,excel_data_num*i),nrows=excel_data_num)
writer = pd.ExcelWriter(dirPath2 + str(BarCode) + '-' + str(SerialNumber_list[excel_data_num*i]) + '-' + str(SerialNumber_list[excel_data_num*i+excel_data_num-1])+ '.xlsx')
else:
tempdata = pd.read_excel(path,dtype='object',skiprows=SequenceNums(1,excel_data_num*i), nrows=excel_alldata_num-1)
writer = pd.ExcelWriter(dirPath2 + str(BarCode) + '-'+ str(SerialNumber_list[excel_data_num*i]) + '-' + str(SerialNumber_list[excel_alldata_num-1]) + '.xlsx')
tempdata.to_excel(writer, index=False)
writer.save()
writer.close()
end_time = time() #程序运行结束时间
run_time = end_time - begin_time
print('程序运行总时间:',run_time)