大文件分割成很多个zip,每个不超过500MB
文档:大文件分割成.note
链接:http://note.youdao.com/noteshare?id=b5ebb2bae4ba292c55a557f087b8a389&sub=2CD0525FC1D2411D98F61D0EC502281A
添加链接描述
import os
import zipfile
def split_file_into_zip(file_path, output_dir, max_size=500):
    """Split one file into numbered zip archives of at most ``max_size`` MiB each.

    Every archive holds exactly one member, named after the source file, that
    contains one consecutive slice of the source bytes.  Archives are written
    to ``output_dir`` as ``<stem>_<NNNN><ext>.zip`` where ``NNNN`` is the
    1-based part number zero-padded to four digits.

    Args:
        file_path: Path of the file to split.
        output_dir: Directory that receives the archives; created if missing.
        max_size: Upper bound, in MiB, of uncompressed data stored per archive.

    Raises:
        ValueError: If ``max_size`` does not yield a chunk of at least one byte.
    """
    # Chunk size in bytes.  Reject non-positive values up front: zero used to
    # fail with ZeroDivisionError and negatives silently produced no output.
    chunk_size = int(max_size * 1024 * 1024)
    if chunk_size <= 0:
        raise ValueError(f"max_size must give a positive chunk size, got {max_size!r}")

    # Split the source name so the part number can sit before the extension.
    file_name = os.path.basename(file_path)
    file_name_base, file_ext = os.path.splitext(file_name)

    # Make sure the destination directory exists.
    os.makedirs(output_dir, exist_ok=True)
    print("创建存放分割后压缩文件的文件夹",output_dir)

    with open(file_path, 'rb') as f:
        # Read chunk after chunk until EOF; an empty source yields no archives.
        chunks = iter(lambda: f.read(chunk_size), b'')
        for part_number, data in enumerate(chunks, start=1):
            split_file_name = f'{file_name_base}_{str(part_number).zfill(4)}{file_ext}.zip'
            zip_file_path = os.path.join(output_dir, split_file_name)
            # The context manager closes the archive even if writing fails.
            with zipfile.ZipFile(zip_file_path, 'w', zipfile.ZIP_DEFLATED) as zip_file:
                zip_file.writestr(file_name, data)
            print(zip_file_path,"done ")
    print("文件分割完成!",output_dir)
import os
import zipfile
def unzip_files(zip_dir, output_dir):
    """Extract every ``.zip`` archive found directly in ``zip_dir`` into ``output_dir``.

    Archives are processed in sorted name order.  ``extractall`` overwrites
    existing files, so when several archives contain a member with the same
    name the copy from the last archive in that order is the one that remains.

    Args:
        zip_dir: Directory that is scanned (non-recursively) for ``.zip`` files.
        output_dir: Directory that receives the extracted members; created if
            missing.
    """
    print("zip_dir",zip_dir)
    print("output_dir",output_dir)

    # Collect the archives: regular files only, in a deterministic order
    # (os.listdir returns entries in arbitrary order).
    zip_files = sorted(
        name
        for name in os.listdir(zip_dir)
        if name.endswith('.zip') and os.path.isfile(os.path.join(zip_dir, name))
    )

    # Make sure the extraction target exists.
    os.makedirs(output_dir, exist_ok=True)

    for zip_file_name in zip_files:
        zip_file_path = os.path.join(zip_dir, zip_file_name)
        print("now ",zip_file_path)
        with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
            zip_file.extractall(output_dir)
    print("文件解压完成!",output_dir)
# 示例用法
# unzip_files('path/to/your/zip/files', 'path/to/output/directory')
import os
import shutil
def add_zip_extension(directory):
    """Append ``.zip`` to every regular file in ``directory`` that lacks it.

    Sub-directories are left alone.  The extension check ignores case, so a
    file already named ``X.ZIP`` is not renamed.  A file is skipped, with a
    message, when ``<name>.zip`` already exists, so no existing file is
    replaced by the rename.

    Args:
        directory: Directory whose direct entries are examined.
    """
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        if not os.path.isfile(filepath) or filename.lower().endswith('.zip'):
            continue
        new_filepath = filepath + '.zip'
        if os.path.exists(new_filepath):
            # Renaming onto an existing path could silently replace it.
            print("skip, target already exists:", new_filepath)
            continue
        shutil.move(filepath, new_filepath)
# 指定目录路径
# directory_path = '/path/to/your/directory'
# 调用函数
# add_zip_extension(rf"D:\project\python\llama\llama-2-7b\parts")
def zip_do(file_path=None, out_dir=None, max_size=500):
    """Split one large file into zip parts using ``split_file_into_zip``.

    Called without arguments it behaves as before: it splits the hard-coded
    GGML model file into ``<model dir>/parts/<file name>/``.

    Args:
        file_path: File to split.  Defaults to the hard-coded model path below.
        out_dir: Directory for the parts.  Defaults to a ``parts/<file name>``
            directory next to ``file_path``.
        max_size: Maximum uncompressed size of each part in MiB, forwarded to
            ``split_file_into_zip``.
    """
    if file_path is None:
        file_path = rf"E:\addd\model\j05025\model\TheBloke\Llama-2-13B-chat-GGML\llama-2-13b-chat.ggmlv3.q2_K.bin"
    if out_dir is None:
        # Keep the parts of each source file in their own sub-directory.
        out_dir = os.path.join(
            os.path.dirname(file_path), "parts", os.path.basename(file_path)
        )
    os.makedirs(out_dir, exist_ok=True)
    split_file_into_zip(file_path, out_dir, max_size=max_size)
# D:\project\python\llama\zip_big_file_to_parts.py
# python zip_big_file_to_parts.py
# unzip_files(rf"D:\project\python\llama\llama-2-7b\parts",rf"D:\project\python\llama\llama-2-7b\whole")
import os
import zipfile
# file_names = ['consolidated.00_1.pth.zip'
# , 'consolidated.00_10.pth.zip', 'consolidated.00_11.pth.zip'
# , 'consolidated.00_12.pth.zip', 'consolidated.00_13.pth.zip'
# , 'consolidated.00_14.pth.zip','consolidated.00_2.pth.zip'
# , 'consolidated.00_21.pth.zip']
def sort_file_names(file_names):
    """Return a new list of part names ordered by their numeric part index.

    The index is the integer found between the last ``_`` in the name and the
    first ``.`` that follows it, e.g. ``consolidated.00_12.pth.zip`` -> 12.
    The input sequence is not modified.
    """
    def _part_index(name):
        # Text after the final underscore, e.g. "12.pth.zip".
        tail = name.rsplit('_', 1)[-1]
        # Digits before the first dot of that tail.
        return int(tail.partition('.')[0])

    return sorted(file_names, key=_part_index)
def merge_files(zip_dir, output_file,do_merge=False):
    """Concatenate the first member of every zip part in ``zip_dir`` into one file.

    The ``.zip`` files in ``zip_dir`` are ordered with ``sort_file_names`` and
    the ordered list is printed.  Nothing is written unless ``do_merge`` is
    true, so the default call is a dry run that only shows the merge order.

    Args:
        zip_dir: Directory containing the zip parts.
        output_file: Path of the merged file; overwritten if it exists.
        do_merge: When true, actually write ``output_file``.
    """
    # Local import so this function does not rely on where the file imports shutil.
    import shutil

    print("zip_dir",zip_dir)
    # Collect the parts and put them in numeric part order.
    zip_files = [f for f in os.listdir(zip_dir) if f.endswith('.zip')]
    len_zip_files = len(zip_files)
    zip_files = sort_file_names(zip_files)
    print(zip_files)

    if not do_merge:
        return

    with open(output_file, 'wb') as merged_file:
        for position, zip_file_name in enumerate(zip_files, start=1):
            zip_file_path = os.path.join(zip_dir, zip_file_name)
            # Progress as current/total.
            print(f"now({position}/{len_zip_files}) ",zip_file_path)
            with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
                # Each part is expected to hold its data in the first member.
                file_name = zip_file.namelist()[0]
                # Stream the member instead of decompressing it fully into
                # memory before writing.
                with zip_file.open(file_name) as part:
                    shutil.copyfileobj(part, merged_file, 16 * 1024 * 1024)
    print("文件合成完成!",output_file)
# md5sum -c checklist.chk
# md5sum -c checklist_whoe.chk
# 示例用法
def do_merge(idx=0,base_model_dir="/j05025/llama/llama-2-13b",parts_dir_name="parts"):
    """Merge the zip parts of shard ``idx`` back into a single ``.pth`` file.

    Parts are read from ``<base_model_dir>/<parts_dir_name>/parts_<idx>`` and
    the result is written to ``<base_model_dir>/consolidated.0<idx>.pth`` by
    calling ``merge_files`` with ``do_merge=True``.
    """
    merged_path = "{}/consolidated.0{}.pth".format(base_model_dir, idx)
    source_dir = "{}/{}/parts_{}".format(base_model_dir, parts_dir_name, idx)
    merge_files(source_dir, merged_path, do_merge=True)
# python /j05025/llama/zip_big_file_to_parts.py
# zip_do()
# do_merge(idx=1)
# Module-level call: executing (or importing) this file runs zip_do().
zip_do()
"""
Lenovo@DESKTOP-QD78231 MINGW64 /d/project/python/llama/llama-2-7b/whole (main)
$ md5sum -c checklist_whoe.chk
consolidated.00_whole_2.pth: OK
md5sum: params.json: No such file or directory
params.json: FAILED open or read
md5sum: WARNING: 1 listed file could not be read
Lenovo@DESKTOP-QD78231 MINGW64 /d/project/python/llama/llama-2-7b/whole (main)
$
"""
"""
cd /j05025/llama/llama-2-7b
md5sum -c checklist-whoe.chk
/j05025/llama/llama-2-7b/checklist-whoe.chk
(base) root@9gk32co08roud-0:/j05025/llama# cd /j05025/llama/llama-2-7b
(base) root@9gk32co08roud-0:/j05025/llama/llama-2-7b# md5sum -c checklist-whoe.chk
consolidated.00_whole.pth: OK
params.json: OK
"""
"""
cd /j05025/llama
python zip_big_file_to_parts.py
"""
# /j05025/llama/zip_big_file_to_parts.py
# cd /j05025/llama
# python zip_big_file_to_parts.py