在项目中需要将大量 Markdown 文件清洗,并转化为如下的预训练 JSON 格式:
[
{"text": "document"},
{"text": "document"}
]
清洗markdown文件:
import re
import os
import glob
import shutil
# 清洗文本的函数
def clean_markdown(content):
    """Strip LaTeX fragments and markdown links, then collapse whitespace.

    Removal order matters: image links are stripped before plain
    hyperlinks so the leading ``!`` form is consumed first.

    NOTE(review): collapsing ALL whitespace (including newlines) into
    single spaces erases paragraph boundaries, so blank-line paragraph
    splitting no longer works on the cleaned output — confirm intended.
    """
    removal_patterns = (
        r'\$.*?\$',          # inline $...$ math
        r'\\\w+',            # backslash commands such as \alpha
        r'\[\[.*?\]\]',      # [[wiki-style]] links
        r'!\[.*?\]\(.*?\)',  # image links ![alt](url)
        r'\[.*?\]\(.*?\)',   # hyperlinks [text](url)
    )
    for pattern in removal_patterns:
        content = re.sub(pattern, '', content)
    # Squash every whitespace run to one space and trim the ends.
    return re.sub(r'\s+', ' ', content).strip()
# 清洗单个文件并保存到新位置的函数
def clean_and_save_file(old_file_path, new_directory):
    """Read one markdown file, clean it, and write the result to new_directory.

    The output file keeps the original basename; the destination
    directory is created on demand.
    """
    with open(old_file_path, 'r', encoding='utf-8') as src:
        raw_text = src.read()
    cleaned_text = clean_markdown(raw_text)
    # Make sure the destination exists before writing into it.
    os.makedirs(new_directory, exist_ok=True)
    target_path = os.path.join(new_directory, os.path.basename(old_file_path))
    with open(target_path, 'w', encoding='utf-8') as dst:
        dst.write(cleaned_text)
    print(f"文件清洗已保存: {target_path}")
# --- Script entry: clean every markdown file in the source directory ---
# Directory holding the raw markdown files.
source_directory_path = '/home/yunfei/outsdg15_smni-md'
# Destination directory for the cleaned copies.
new_directory_path = 'clean_MD_dir'
# Create the destination up front (exist_ok makes the pre-check redundant).
os.makedirs(new_directory_path, exist_ok=True)
# Clean each *.md file found directly under the source directory.
for old_file_path in glob.glob(os.path.join(source_directory_path, '*.md')):
    clean_and_save_file(old_file_path, new_directory_path)
print(f"所有的markdown文件清洗完成并保存在:{new_directory_path}")
清洗完成之后,需要将 Markdown 按完整的句子切块,转化成预训练所需的 JSON 格式:
import os
import json
import nltk
def read_markdown_chunks_by_sentence(file_path, chunk_size=512):
    """Yield chunks of whole sentences, each at most ~chunk_size characters.

    Sentences come from nltk.sent_tokenize (requires the 'punkt'
    tokenizer data to be available). A single sentence longer than
    chunk_size is emitted as its own chunk rather than being split.

    Args:
        file_path: path of the UTF-8 markdown file to read.
        chunk_size: soft upper bound on chunk length, in characters.

    Yields:
        str: non-empty chunks built from one or more sentences.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    sentences = nltk.sent_tokenize(text)
    chunk = ""
    for sentence in sentences:
        if len(chunk) + len(sentence) <= chunk_size:
            chunk += sentence + " "
        else:
            # Fix: the original yielded here unconditionally, emitting an
            # empty string when the very first sentence already exceeded
            # chunk_size — that produced {"text": ""} records downstream.
            if chunk.strip():
                yield chunk.strip()
            chunk = sentence + " "  # start a new chunk
    if chunk.strip():  # flush the trailing chunk, if any
        yield chunk.strip()
def read_markdown_chunks_by_paragraph(file_path):
    """Yield the file's text one blank-line-separated paragraph at a time.

    Runs of more than one blank line produce empty-string entries, which
    are yielded as-is (stripped) — callers see them too.
    """
    with open(file_path, 'r', encoding='utf-8') as fh:
        raw = fh.read()
    # Paragraphs are delimited by a blank line ("\n\n").
    for block in raw.split("\n\n"):
        yield block.strip()
def process_markdown_files_to_json(input_dir, output_file, method="sentence", chunk_size=512):
    """Chunk every markdown file in input_dir and dump pre-training JSON.

    Output format: [{"text": chunk}, ...] written with ensure_ascii=False.

    Args:
        input_dir: directory scanned (non-recursively) for *.md files.
        output_file: path of the JSON file to write.
        method: "sentence" (size-bounded sentence chunks) or "paragraph";
            any other value silently contributes no chunks for the file.
        chunk_size: character budget per chunk, used by the sentence method.
    """
    data = []  # accumulated {"text": ...} records across all files
    for filename in os.listdir(input_dir):
        if filename.endswith('.md'):  # only process markdown files
            file_path = os.path.join(input_dir, filename)
            # Fix: the original f-string had no placeholder and always
            # printed a literal "(unknown)" instead of the current file.
            print(f"处理文件: {filename}")
            if method == "sentence":
                # Size-bounded chunks of whole sentences.
                for chunk in read_markdown_chunks_by_sentence(file_path, chunk_size=chunk_size):
                    data.append({"text": chunk})
            elif method == "paragraph":
                # One record per blank-line-separated paragraph.
                for paragraph in read_markdown_chunks_by_paragraph(file_path):
                    data.append({"text": paragraph})
    # Write the combined records as a single JSON array.
    with open(output_file, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, ensure_ascii=False, indent=2)
    print(f"已将处理后的数据保存到 {output_file}")
# --- Script entry: convert the cleaned markdown into pre-training JSON ---
# Directory containing the cleaned markdown files.
input_directory = '/home/yunfei/Qwen/clean_MD'
# Destination JSON file for the pre-training records.
output_json_file = '/home/yunfei/LLaMA-Factory/data/clean_MD.json'
# Default method ("sentence") is used; only the chunk budget is passed.
process_markdown_files_to_json(input_directory, output_json_file, chunk_size=512)