通过 Python 将指定目录下的 Markdown(.md)文件转换为 JSON 或 TXT 文件
1.markdown 转txt
```python
import os
import re
import markdown
import html2text
def traverse_dir_files(root_dir, ext=None, is_sorted=True):
    """
    Recursively list the files under a directory.

    Hidden files (names starting with ``.``) are skipped.

    :param root_dir: directory to walk
    :param ext: suffix filter; either a single string such as ``'.md'`` or an
        iterable of suffixes. A falsy value disables filtering.
    :param is_sorted: when true, the returned paths are sorted
    :return: list of matching file paths (an empty list when nothing matches)
    """
    # A bare string must be wrapped: tuple('.md') is ('.', 'm', 'd'), which
    # would match every file name ending in '.', 'm' or 'd'.
    if not ext:
        suffixes = None
    elif isinstance(ext, str):
        suffixes = (ext,)
    else:
        suffixes = tuple(ext)

    paths_list = []
    for parent, _, file_names in os.walk(root_dir):
        for name in file_names:
            if name.startswith('.'):  # skip hidden files
                continue
            if suffixes is not None and not name.endswith(suffixes):
                continue
            paths_list.append(os.path.join(parent, name))

    if not paths_list:  # nothing found: return the same type as the normal case
        return paths_list
    if is_sorted:
        paths_list.sort()
    print(paths_list)
    return paths_list
def remove_code_blocks(text):
    """
    Remove every span delimited by a pair of triple backticks from ``text``.

    The match is non-greedy and spans newlines; an opening fence without a
    closing fence is left untouched.

    :param text: input string
    :return: ``text`` with the fenced spans (fences included) removed
    """
    return re.sub(r'```(.*?)```', '', text, flags=re.DOTALL)
def md_to_txt2(md_file, txt_file):
    """
    Convert one Markdown file to plain text.

    The Markdown is rendered to HTML with ``markdown`` and then turned into
    plain text with ``html2text`` (links ignored). The result is written to
    ``<txt_file>/<base name of md_file>.txt``.

    :param md_file: path of the source Markdown file
    :param txt_file: output directory (despite the name, this is a directory,
        not a file); it is created if it does not exist
    """
    # Strip only a trailing '.md'; str.replace would also remove '.md'
    # occurring in the middle of the file name.
    title = os.path.basename(md_file)
    if title.endswith('.md'):
        title = title[:-len('.md')]
    title = title.strip()

    # Read the Markdown source.
    with open(md_file, 'r', encoding='utf-8') as f:
        markdown_text = f.read()

    # Markdown -> HTML.
    html_content = markdown.markdown(markdown_text)

    # HTML -> plain text.
    text_converter = html2text.HTML2Text()
    text_converter.ignore_links = True
    plain_text = text_converter.handle(html_content)

    # Create the output directory itself (not just its parent), otherwise the
    # open() below fails when the directory does not exist yet.
    os.makedirs(txt_file, exist_ok=True)
    with open(os.path.join(txt_file, title + '.txt'), 'w', encoding='utf-8') as f:
        f.write(plain_text)
    print(f"转换完成:{md_file}")
def readlist(path, txt_dir):
    """
    Convert every ``.md`` file found under ``path`` to a text file in ``txt_dir``.

    A failure on one file is reported on stdout and does not stop the batch.

    :param path: root directory searched for Markdown files
    :param txt_dir: directory that receives the generated ``.txt`` files
    """
    # The suffix is passed as a one-element tuple so it is treated as a whole
    # suffix even if the callee applies tuple() to the argument.
    path_list = traverse_dir_files(root_dir=path, ext=('.md',))
    for path_str in path_list:
        try:
            md_to_txt2(path_str, txt_dir)
        except Exception as e:
            # Best-effort batch: report and continue. The f-string avoids a
            # second error inside the handler if the item is not a str.
            print(f"{path_str}---------error-----------")
            print(e)
if __name__ == '__main__':
    md_file = 'C:\\Users\\MuMu\\Desktop\\md'  # directory containing the Markdown files
    txt_dir = 'C:\\Users\\MuMu\\Desktop\\TXT'  # directory that receives the converted text files
    readlist(md_file, txt_dir)
```

2.markdown 转json

```python
import os
import re
import json
import requests
import base64
# Script configuration.
# Root directory that is scanned for .md files.
md_directory = 'C:\\Users\\MuMu\\Desktop\\md'
# Root directory under which the generated .json files are written.
json_directory = 'C:\\Users\\MuMu\\Desktop\\TXT'
def parse_markdown_file(file_path):
    """
    Parse one Markdown file into a list of per-page results.

    The file content is split into pages on a fixed line of dashes. For each
    page the following items are collected, in this order: titles, numbered
    list entries, ``<table>`` blocks, remaining text, and image data.

    :param file_path: path of the Markdown file (read as UTF-8)
    :return: list of ``{"page_num": "Page_NNN", "result": [...]}`` dicts, where
        each result entry is a dict with ``"type"`` and ``"text"`` keys for the
        items built here
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Compiled once; it does not depend on the page being processed.
    table_pattern = re.compile(r'<table>.*?</table>', re.DOTALL)

    result = []
    # Pages are assumed to be separated by this exact run of dashes.
    pages = content.split('------------------------------------------------------------------')
    for idx, page in enumerate(pages):
        page_num = f"Page_{idx + 1:03d}"
        page_result = []

        # Titles: text following '#' or '##' and whitespace.
        for title in re.findall(r'##?\s+(.+)', page):
            page_result.append({
                "type": "title",
                "text": title.strip()
            })

        # Numbered list entries ("1. item") at the start of a line.
        for lst in re.findall(r'^\d+\.\s+(.+)', page, re.MULTILINE):
            page_result.append({
                "type": "list",
                "text": lst.strip()
            })

        # Tables: search the current page only. Searching the whole file
        # content would attach every table of the document to every page.
        for table in table_pattern.findall(page):
            page_result.append({
                "type": "table",
                "text": table.strip()
            })

        # Plain text: drop image URLs, then keep what lies between titles.
        # NOTE(review): remove_image_urls and extract_image_data are not
        # defined in the visible part of this file - confirm they are provided.
        page_no_url = remove_image_urls(page)
        for text in re.split(r'##?\s+.+', page_no_url):
            cleaned_text = text.strip()
            if cleaned_text:  # ignore blank fragments
                page_result.append({
                    "type": "text",
                    "text": cleaned_text
                })

        # Image information for this page.
        page_result.extend(extract_image_data(page))

        result.append({
            "page_num": page_num,
            "result": page_result
        })
    return result
def save_json(data, output_file):
    """
    Write ``data`` to ``output_file`` as UTF-8 JSON.

    Non-ASCII characters are written as-is and the output is indented by four
    spaces. Missing parent directories are created.

    :param data: JSON-serialisable object
    :param output_file: destination file path
    """
    # dirname is '' for a bare file name, and os.makedirs('') raises.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
def process_md_file(md_file, json_root, md_root=None):
    """
    Parse one Markdown file and write the result as a JSON file.

    The JSON file is placed under ``json_root`` at the same relative location
    the Markdown file has under ``md_root``, with the extension replaced by
    ``.json``.

    :param md_file: path of the Markdown file
    :param json_root: root directory for the generated JSON files
    :param md_root: root directory the relative path is computed against;
        defaults to the module-level ``md_directory``
    """
    if md_root is None:
        md_root = md_directory

    result = {
        "code": 1,
        "message": "操作成功",
        "file_name": os.path.basename(md_file),
        "result": parse_markdown_file(md_file)
    }

    # Mirror the source tree layout below json_root.
    relative_path = os.path.relpath(md_file, md_root)
    json_file = os.path.join(json_root, os.path.splitext(relative_path)[0] + ".json")

    save_json(result, json_file)


def process_md_files(directory, json_root):
    """
    Convert every ``.md`` file under ``directory`` into a JSON file.

    :param directory: root directory walked recursively for ``.md`` files
    :param json_root: root directory for the generated JSON files
    """
    for root, _, files in os.walk(directory):
        for file_name in files:
            if file_name.endswith(".md"):
                md_file = os.path.join(root, file_name)
                # Pass the walked directory so relative paths are computed
                # against it rather than against the global default.
                process_md_file(md_file, json_root, directory)
# Convert all .md files under md_directory and write the .json files
# below json_directory, then report completion.
process_md_files(directory=md_directory, json_root=json_directory)
print("解析完成,所有结果已保存到对应的json文件中")
```

2.1.编辑脚本,修改下面内容

```python
md_directory = '/opt/markdown/md'  # 指定.md文件目录
json_directory = '/data/json/'  # 指定生成.json文件的根目录
```
2.2.创建导出json目录
mkdir -p /data/json/
2.3.执行脚本
python process_md_to_json.py