def get_rows(file_name):
    """Return the number of lines in the UTF-8 text file ``file_name``.

    The file is streamed line by line, so it is never loaded into memory
    as a whole. An empty file yields 0; a final line without a trailing
    newline still counts as one line.

    Args:
        file_name: Path of the text file to count.

    Returns:
        The number of lines in the file as an int.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # The context manager closes the handle deterministically instead of
    # leaving an anonymous file object for the garbage collector.
    with open(file_name, 'r', encoding='utf-8') as source:
        return sum(1 for _ in source)
def split_file(file_name, lines_per_file=100000):
    """Split a UTF-8 text file into several sub-files of near-equal size.

    The source is read twice: once (via ``get_rows``) to count its lines
    and once to copy them. The number of sub-files is
    ``count // lines_per_file + 1`` and the lines are spread evenly, so
    sub-file ``i`` ends at source line ``count * i // split_num``.

    Each sub-file is named ``<stem>_<i>.log`` with ``i`` starting at 1,
    where ``<stem>`` is the part of ``file_name`` before the first
    occurrence of ``'.txt'``. Progress information is printed to stdout.

    Args:
        file_name: Path of the text file to split.
        lines_per_file: Line count that triggers one more sub-file;
            defaults to 100000, the value that was previously hard-coded.

    Raises:
        ValueError: If ``lines_per_file`` is smaller than 1.
    """
    if lines_per_file < 1:
        raise ValueError('lines_per_file must be a positive integer')

    count = get_rows(file_name)  # total number of lines in the source
    print('count: %s' % count)
    split_num = count // lines_per_file + 1  # number of sub-files
    print('split_num: %s' % split_num)
    # Source line number at which each sub-file ends.
    nums = [(count * i // split_num) for i in range(1, split_num + 1)]
    print('nums:%s' % nums)

    # A set gives O(1) boundary checks for every line that is read.
    boundaries = set(nums)
    base_name = file_name.split('.txt')[0]  # loop-invariant name stem
    i = 0  # index of the most recently written sub-file
    data_list = []  # lines buffered for the current sub-file
    with open(file_name, 'r', encoding='utf-8') as f1:
        for current_line, line in enumerate(f1, start=1):
            data_list.append(line)
            if current_line not in boundaries:
                continue
            # Boundary reached: flush the buffer to the next sub-file.
            print('len(data_list): %s' % len(data_list))
            i += 1
            save_file_name = base_name + '_' + str(i) + '.log'
            print('save_file_name: %s' % save_file_name)
            with open(save_file_name, 'w', encoding='utf-8') as f2:
                f2.writelines(data_list)
            data_list = []  # start collecting lines for the next sub-file
# Script entry: split the sample file 'test.txt' (path relative to the
# current working directory) into sub-files.
split_file(file_name='test.txt')
# Python3大文件分割成多个子文件
# 最新推荐文章于 2023-12-23 22:06:29 发布