在项目中需要将大量 Markdown 文件清洗,并转化为如下的预训练 JSON 格式:
[
{"text": "document"},
{"text": "document"}
]
清洗markdown文件:
import re
import os
import glob
import shutil
# 清洗文本的函数
def clean_markdown(content):
    """Strip LaTeX fragments and markdown links, then collapse whitespace.

    Removal order matters: image links are stripped before plain
    hyperlinks so the leading ``!`` form is consumed first.

    NOTE(review): collapsing ALL whitespace (including newlines) into
    single spaces erases paragraph boundaries, so blank-line paragraph
    splitting no longer works on the cleaned output — confirm intended.
    """
    removal_patterns = (
        r'\$.*?\$',          # inline $...$ math
        r'\\\w+',            # backslash commands such as \alpha
        r'\[\[.*?\]\]',      # [[wiki-style]] links
        r'!\[.*?\]\(.*?\)',  # image links ![alt](url)
        r'\[.*?\]\(.*?\)',   # hyperlinks [text](url)
    )
    for pattern in removal_patterns:
        content = re.sub(pattern, '', content)
    # Squash every whitespace run to one space and trim the ends.
    return re.sub(r'\s+', ' ', content).strip()
# 清洗单个文件并保存到新位置的函数
def clean_and_save_file(old_file_path, new_directory):
    """Read one markdown file, clean it, and write the result to new_directory.

    The output file keeps the original basename; the destination
    directory is created on demand.
    """
    with open(old_file_path, 'r', encoding='utf-8') as src:
        raw_text = src.read()
    cleaned_text = clean_markdown(raw_text)
    # Make sure the destination exists before writing into it.
    os.makedirs(new_directory, exist_ok=True)
    target_path = os.path.join(new_directory, os.path.basename(old_file_path))
    with open(target_path, 'w', encoding='utf-8') as dst:
        dst.write(cleaned_text)
    print(f"文件清洗已保存: {target_path}")
# --- Script entry: clean every markdown file in the source directory ---
# Directory holding the raw markdown files.
source_directory_path = '/home/yunfei/outsdg15_smni-md'
# Destination directory for the cleaned copies.
new_directory_path = 'clean_MD_dir'
# Create the destination up front (exist_ok makes the pre-check redundant).
os.makedirs(new_directory_path, exist_ok=True)
# Clean each *.md file found directly under the source directory.
for old_file_path in glob.glob(os.path.join(source_directory_path, '*.md')):
    clean_and_save_file(old_file_path, new_directory_path)
print(f"所有的markdown文件清洗完成并保存在:{new_directory_path}")
清洗完成之后,需要将 Markdown 按完整的句子切块,转化成预训练所需的 JSON 格式:
import os
import json
import nltk
def read_markdown_chunks_by_sentence(file_path, chunk_size=512):
    """Yield chunks of whole sentences, each at most ~chunk_size characters.

    Sentences come from nltk.sent_tokenize (requires the 'punkt'
    tokenizer data to be available). A single sentence longer than
    chunk_size is emitted as its own chunk rather than being split.

    Args:
        file_path: path of the UTF-8 markdown file to read.
        chunk_size: soft upper bound on chunk length, in characters.

    Yields:
        str: non-empty chunks built from one or more sentences.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    sentences = nltk.sent_tokenize(text)
    chunk = ""
    for sentence in sentences:
        if len(chunk) + len(sentence) <= chunk_size:
            chunk += sentence + " "
        else:
            # Fix: the original yielded here unconditionally, emitting an
            # empty string when the very first sentence already exceeded
            # chunk_size — that produced {"text": ""} records downstream.
            if chunk.strip():
                yield chunk.strip()
            chunk = sentence + " "  # start a new chunk
    if chunk.strip():  # flush the trailing chunk, if any
        yield chunk.strip()
def read_markdown_chunks_by_paragraph(file_path):
    """Yield the file's text one blank-line-separated paragraph at a time.

    Runs of more than one blank line produce empty-string entries, which
    are yielded as-is (stripped) — callers see them too.
    """
    with open(file_path, 'r', encoding='utf-8') as fh:
        raw = fh.read()
    # Paragraphs are delimited by a blank line ("\n\n").
    for block in raw.split("\n\n"):
        yield block.strip()
def process_markdown_files_to_json(input_dir, output_file, method="sentence", chunk_size=512):
    """Chunk every markdown file in input_dir and dump pre-training JSON.

    Output format: [{"text": chunk}, ...] written with ensure_ascii=False.

    Args:
        input_dir: directory scanned (non-recursively) for *.md files.
        output_file: path of the JSON file to write.
        method: "sentence" (size-bounded sentence chunks) or "paragraph";
            any other value silently contributes no chunks for the file.
        chunk_size: character budget per chunk, used by the sentence method.
    """
    data = []  # accumulated {"text": ...} records across all files
    for filename in os.listdir(input_dir):
        if filename.endswith('.md'):  # only process markdown files
            file_path = os.path.join(input_dir, filename)
            # Fix: the original f-string had no placeholder and always
            # printed a literal "(unknown)" instead of the current file.
            print(f"处理文件: {filename}")
            if method == "sentence":
                # Size-bounded chunks of whole sentences.
                for chunk in read_markdown_chunks_by_sentence(file_path, chunk_size=chunk_size):
                    data.append({"text": chunk})
            elif method == "paragraph":
                # One record per blank-line-separated paragraph.
                for paragraph in read_markdown_chunks_by_paragraph(file_path):
                    data.append({"text": paragraph})
    # Write the combined records as a single JSON array.
    with open(output_file, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, ensure_ascii=False, indent=2)
    print(f"已将处理后的数据保存到 {output_file}")
# --- Script entry: convert the cleaned markdown into pre-training JSON ---
# Directory containing the cleaned markdown files.
input_directory = '/home/yunfei/Qwen/clean_MD'
# Destination JSON file for the pre-training records.
output_json_file = '/home/yunfei/LLaMA-Factory/data/clean_MD.json'
# Default method ("sentence") is used; only the chunk budget is passed.
process_markdown_files_to_json(input_directory, output_json_file, chunk_size=512)