'''正文'''
import os
import re
import json
def find_first_chapter_title(lines, patterns1, patterns2, file_id, file_name, max_lines=58):
    """Locate the first chapter-title line near the top of a novel text.

    Scans the first ``max_lines`` lines against the primary patterns
    (``patterns1``); only when none of them matches anywhere in the window
    does it fall back to the looser secondary patterns (``patterns2``).
    The original had two copy-pasted scan loops and a comment claiming a
    50-line window while actually slicing 58; the window is now an explicit
    parameter defaulting to the original behavior (58).

    Args:
        lines: raw text lines of the file (with trailing newlines).
        patterns1: primary regex patterns, tried first over the whole window.
        patterns2: fallback regex patterns.
        file_id: numeric id recorded in the result record.
        file_name: file name recorded in the result record.
        max_lines: number of leading lines to scan (default 58).

    Returns:
        ``{"id", "file_name", "first_title", "index"}`` for the first
        matching (stripped) line, or ``None`` when nothing matches.
    """
    head = lines[:max_lines]
    for patterns in (patterns1, patterns2):
        for idx, raw in enumerate(head):
            stripped = raw.strip()
            for pattern in patterns:
                if re.match(pattern, stripped):
                    return {"id": file_id, "file_name": file_name,
                            "first_title": stripped, "index": idx}
    return None
def process_txt_file(file_path, output_data):
    """Read one txt file and append its first-chapter-title record, if any.

    The record id is derived from the current length of ``output_data``.
    """
    with open(file_path, 'r', encoding='utf-8') as fh:
        content_lines = fh.readlines()
    # Primary patterns: classic Chinese chapter headings plus
    # "Volume"/"Chapter" style western headings and prologue markers.
    primary = [
        r'(第[零一二三四五六七八九十百千万]+[卷回集章]).*?\s*(.*?)\s*$',
        r'(第?\d+[章节回卷集]).*?\s*(.*?)\s*$',
        r'(Volume\s*Ⅰ{0,3}Ⅱ{0,3}Ⅲ{0,3}Ⅳ{0,3}Ⅴ{0,3}Ⅵ{0,3}Ⅶ{0,3}Ⅷ{0,3}Ⅸ{0,3}Ⅹ{0,3}Ⅺ{0,3}Ⅻ{0,3}).*?\s*(.*?)\s*$',
        r'(Chapter\d+).*?\s*(.*?)\s*$',
        r'(第|章)(?:[零一二三四五六七八九十百千万\d\s]+)\s+(.*?)$',
        r'楔子\s*(.*?)\s*$',
        r'序章\s*(.*?)\s*$',
        r'序\s*(.*?)\s*$',
    ]
    # Looser fallback patterns, used only when nothing above matched.
    fallback = [
        r'^[0-9].*?\s*(.*?)\s*$',
        r'第[0-9].*?\s*(.*?)\s*$',
        r'^☆\s*(.*?)\s*$',
        r'^(.*?[零一二三四五六七八九十百千万]+章).*?\s*(.*?)\s*$',
    ]
    record = find_first_chapter_title(
        content_lines, primary, fallback,
        len(output_data) + 1, os.path.basename(file_path))
    if record is not None:
        output_data.append(record)
def process_folder(folder_path, output_file_path):
    """Scan every .txt under folder_path and dump title records to JSON."""
    records = []
    for root, _dirs, names in os.walk(folder_path):
        for name in names:
            if name.endswith('.txt'):
                process_txt_file(os.path.join(root, name), records)
    # Persist all collected records as pretty-printed UTF-8 JSON.
    with open(output_file_path, 'w', encoding='utf-8') as sink:
        json.dump(records, sink, ensure_ascii=False, indent=4)
# Script entry for the body-start detection pass.
# NOTE(review): folder_path is empty — fill in the corpus folder before
# running; os.walk('') yields nothing, so the JSON would come out empty.
folder_path = ''
output_file_path = 'first_title326.json'
process_folder(folder_path, output_file_path)
##############################################################################
import os
import json
def load_json_data(json_file):
    """Parse a UTF-8 JSON file and return the decoded object."""
    with open(json_file, encoding='utf-8') as handle:
        return json.load(handle)
def save_text_to_file(text, output_folder, file_name):
    """Write text to output_folder/file_name, UTF-8 encoded."""
    target = os.path.join(output_folder, file_name)
    with open(target, 'w', encoding='utf-8') as sink:
        sink.write(text)
def extract_text_from_json(json_file, input_folder, output_folder):
    """Trim each listed file so it starts at its detected first title.

    For every record in ``json_file`` (file_name + line index), keep the
    title line itself and everything after it, and save the result under
    the same name in ``output_folder``.
    """
    for record in load_json_data(json_file):
        name = record['file_name']
        start = record['index']
        source = os.path.join(input_folder, name)
        with open(source, 'r', encoding='utf-8') as handle:
            all_lines = handle.readlines()
        # Keep from the title line onward.
        save_text_to_file(''.join(all_lines[start:]), output_folder, name)
# Script entry: cut every file down to start at its detected first title.
json_file = 'first_title326.json'  # report produced by the pass above
input_folder = ''   # NOTE(review): set to the raw-text folder before running
output_folder = ''  # NOTE(review): set first — os.makedirs('') raises FileNotFoundError
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
extract_text_from_json(json_file, input_folder, output_folder)
'''结尾'''
import os
import json
import re
def find_last_chapter_title(lines, patterns1, patterns2, file_id, file_name):
    """Locate an end-of-book marker line within the last 30 lines.

    ``patterns1`` (explicit ending phrases) are tried first, but a match is
    accepted only when the line contains neither "第" nor "章" (so chapter
    headings are not mistaken for endings).  If nothing qualifies, the
    looser ``patterns2`` (publisher boilerplate) are tried.

    Bug fixed versus the original: the absolute index was computed as
    ``idx + len(lines) - 30`` even for files shorter than 30 lines, which
    produced wrong (shifted-negative) indices; the window start is now
    clamped with ``max(..., 0)``.

    Returns:
        A record dict ``{"id", "file_name", "last_title", "index"}``
        always; when nothing matches, ``last_title`` is None and
        ``index`` is -1 (preserving the original contract).
    """
    last_title = None
    last_title_index = -1
    # Absolute offset of the scanned window inside the whole file.
    window_start = max(len(lines) - 30, 0)
    tail = lines[window_start:]
    for idx, raw in enumerate(tail):
        text = raw.strip()
        for pattern in patterns1:
            if re.match(pattern, text) and "第" not in text and "章" not in text:
                last_title = text
                last_title_index = window_start + idx
                break
        if last_title:
            break
    if not last_title:
        for idx, raw in enumerate(tail):
            text = raw.strip()
            for pattern in patterns2:
                if re.match(pattern, text):
                    last_title = text
                    last_title_index = window_start + idx
                    break
            if last_title:
                break
    return {"id": file_id, "file_name": file_name,
            "last_title": last_title, "index": last_title_index}
def process_txt_file(file_path, output_data):
    """Append the end-of-book marker record for one txt file."""
    with open(file_path, 'r', encoding='utf-8') as fh:
        text_lines = fh.readlines()
    # Explicit "the story ends here" phrases.
    ending_markers = [
        r'.*(正文完|正文结束|完结|结局|剧终|全书终|全书完|[\((]完[\))]).*'
    ]
    # Publisher / scan-group boilerplate that also signals the end.
    boilerplate = [
        r'书香门第.*?整理.*?$',
        r'.*?本作品来自互联网.*?$',
        r'.*?版权归.*?$',
        r'.*?本书由.*?$',
        r'.*?本图书由.*?$',
        r'作者有话.*?$',
    ]
    record = find_last_chapter_title(
        text_lines, ending_markers, boilerplate,
        len(output_data) + 1, os.path.basename(file_path))
    if record:
        output_data.append(record)
def process_folder(folder_path, output_file_path):
    """Collect end-of-book records for all .txt files and write them as JSON."""
    records = []
    for root, _dirs, names in os.walk(folder_path):
        for name in names:
            if name.endswith('.txt'):
                process_txt_file(os.path.join(root, name), records)
    with open(output_file_path, 'w', encoding='utf-8') as sink:
        json.dump(records, sink, ensure_ascii=False, indent=4)
# Script entry for the ending-marker detection pass.
# NOTE(review): folder_path must be set to the trimmed-text folder first.
folder_path = ''
output_file_path = '结尾326.json'
process_folder(folder_path, output_file_path)
#############################################
import os
import json
def load_json_data(json_file):
    """Load and return the JSON payload stored at json_file (UTF-8)."""
    with open(json_file, 'r', encoding='utf-8') as fp:
        payload = json.load(fp)
    return payload
def save_text_to_file(text, output_folder, file_name):
    """Persist text under output_folder/file_name using UTF-8."""
    destination = os.path.join(output_folder, file_name)
    with open(destination, 'w', encoding='utf-8') as out:
        out.write(text)
def extract_text_from_json(json_file, input_folder, output_folder):
    """Truncate each listed file at its detected end-of-book marker.

    Reads the records produced by the ending-detection pass (file_name plus
    the marker's line index) and writes the text preceding the marker into
    output_folder under the same file name.
    """
    data = load_json_data(json_file)
    for item in data:
        file_name = item['file_name']
        index = item['index']
        # Read the original file content.
        file_path = os.path.join(input_folder, file_name)
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        # Keep the text before the marker line.
        # NOTE(review): lines[:index-1] also drops the line immediately
        # BEFORE the marker, and when index is -1 (no marker was found) it
        # silently cuts the last two lines of the file — confirm both
        # effects are intended (lines[:index] would keep everything up to,
        # but excluding, the marker line).
        text_lines = lines[:index-1]
        # Save the truncated text to the new file.
        save_text_to_file(''.join(text_lines), output_folder, file_name)
        # If truncation produced nothing, also leave an empty
        # "empty_<name>" sentinel file (the truncated file itself was
        # already written above with empty content).
        if not text_lines:
            empty_file_path = os.path.join(output_folder, f'empty_{file_name}')
            open(empty_file_path, 'w', encoding='utf-8').close()
# Script entry: truncate every file at its detected ending marker.
json_file = '结尾326.json'
input_folder = ''   # NOTE(review): set to the trimmed-text folder before running
output_folder = ''  # NOTE(review): set first — os.makedirs('') raises FileNotFoundError
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
extract_text_from_json(json_file, input_folder, output_folder)
'''清洗'''
import os
import re
import json
import string
def contains_keywords(line):
    """Return True if the line contains any HTML/entity artefact marker.

    Used to detect lines polluted with HTML-encoded residue.  The original
    keyword list repeated '&#' twice and contained a '<' + backslash + 'p>'
    entry (presumably a typo for '</p>'); both were redundant because '<'
    and '&#' already match any line containing them, so deduplicating
    leaves the accepted set of lines exactly the same.
    """
    markers = ('&#', 'html', '"', '>', '<')
    return any(marker in line for marker in markers)
def contains_keywords1(line):
    """Return True if the line contains any promotional/meta keyword.

    The keywords cover author notes, update announcements, site ads,
    monetization notices and similar non-story boilerplate.  The list is
    kept verbatim (it intentionally mirrors the curated original, even
    where entries repeat — repeats cannot change the membership test).
    """
    keywords = [
        '求关注', '求收藏', '未完待续', '免费内容', '付费内容', '网站链接', '网址链接', 'continue', '更新最快最全的网站','好几天没更新','好几天没见到更新了','版权归作者或出版社',
        'ps', '未完待续', 'www' , '本章未完待续','定时更新','继续更新','推荐和收藏','更新到','出版社','作者','新文','最后一篇','最近超忙','请大家原谅',
        '作者有话', '拉票', 'Ps', '红包', '更新','上架感言','书友','日更','diǎn','几更','这本书','存稿','以上所有内容','求票票','求首订','剧情需要',
        '红包','更新', 'www', 'cn','纯属虚构','如有雷同','恢复更新','百度','搜索','作者','求花花','付费','未完','待续','新书发布','求关注','大家多多支持','开新文',
        '求收藏', '未完待续', '免费内容', '付费内容', '网站链接', '网址链接', 'continue', r'更新最快最全的网站', '好几天没更新', '好几天没见到更新了', '版权归作者或出版社','PS',
    ]
    return any(keyword in line for keyword in keywords)
def count_characters(txt_path):
    """Count alphanumeric and whitespace characters in a UTF-8 text file
    (i.e. everything except punctuation/symbols)."""
    with open(txt_path, 'r', encoding='utf-8') as handle:
        content = handle.read()
    return sum(1 for ch in content if ch.isalnum() or ch.isspace())
def count_letters(text):
    """Count characters whose lowercase form is an ASCII letter.

    Note the test is deliberately ``ch.lower() in ascii_lowercase`` (as in
    the original) rather than ``isascii() and isalpha()`` — a few Unicode
    characters lowercase to an ASCII letter and must still be counted.
    """
    return sum(1 for ch in text if ch.lower() in string.ascii_lowercase)
def process_html_encoding(input_folder, output_folder):
    """Split txt files into HTML-polluted (reported) and clean (copied).

    Files containing HTML/entity artefacts (per contains_keywords) are NOT
    copied; their offending lines are collected into
    '处理的html编码.json' inside output_folder.  Clean files are copied
    through unchanged.
    """
    polluted_report = []
    os.makedirs(output_folder, exist_ok=True)
    for filename in os.listdir(input_folder):
        if not filename.endswith('.txt'):
            continue
        txt_path = os.path.join(input_folder, filename)
        with open(txt_path, 'r', encoding='utf-8') as handle:
            all_lines = handle.readlines()
        bad_lines = [ln.strip() for ln in all_lines if contains_keywords(ln)]
        if bad_lines:
            polluted_report.append({
                'txt_file_name': filename,
                'array_length': len(bad_lines),
                'array_elements': bad_lines,
            })
        else:
            # Clean file: copy it through verbatim.
            with open(os.path.join(output_folder, filename), 'w',
                      encoding='utf-8') as sink:
                sink.write(''.join(all_lines))
    # Write the pollution report next to the copied files.
    report_path = os.path.join(output_folder, '处理的html编码.json')
    with open(report_path, 'w', encoding='utf-8') as json_sink:
        json.dump(polluted_report, json_sink, ensure_ascii=False, indent=4)
def copy_large_files(input_folder, output_folder, threshold=5000):
    """Copy the .txt files whose countable character total (per
    count_characters) exceeds ``threshold`` into output_folder."""
    os.makedirs(output_folder, exist_ok=True)
    for filename in os.listdir(input_folder):
        if not filename.endswith('.txt'):
            continue
        source = os.path.join(input_folder, filename)
        if count_characters(source) > threshold:
            with open(source, 'r', encoding='utf-8') as src:
                body = src.read()
            with open(os.path.join(output_folder, filename), 'w',
                      encoding='utf-8') as dst:
                dst.write(body)
def process_keywords(input_folder, output_folder):
    """Strip promotional/meta lines from each txt file.

    Lines flagged by contains_keywords1() are blanked out of the text —
    unless they look like chapter headings, which are kept — and the
    removed lines are collected into a '处理关键词.json' report in
    output_folder.
    """
    result_keywords = []
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    for filename in os.listdir(input_folder):
        if filename.endswith('.txt'):
            txt_path = os.path.join(input_folder, filename)
            output_file_path = os.path.join(output_folder, filename)
            array_elements = []  # stripped lines that were removed
            with open(txt_path, 'r', encoding='utf-8') as file:
                lines = file.readlines()
                content_list = []  # lines kept for the cleaned output
                for line in lines:
                    if contains_keywords1(line):
                        # Chapter-heading shapes that must survive even
                        # when they contain a flagged keyword.
                        patterns = [
                            r'(第[零一二三四五六七八九十百千万]+[卷回集篇节章]).*?\s*(.*?)\s*$',
                            r'(第?\d+[章节回卷集篇节]).*?\s*(.*?)\s*$',
                        ]
                        for pattern in patterns:
                            match = re.match(pattern, line.strip())
                            if match:
                                print(line.strip())
                                content_list.append(line)
                            else:
                                # NOTE(review): this else runs once per
                                # NON-matching pattern, so a flagged line
                                # matching neither pattern is recorded and
                                # blanked TWICE (both lists get duplicate
                                # entries).  A for/else was probably
                                # intended — confirm before trusting the
                                # report's counts or the doubled blanks.
                                array_elements.append(line.strip())
                                # Keep the line's whitespace skeleton only.
                                content_list.append(line.replace(line.strip(), ""))
                    else:
                        content_list.append(line)
            with open(output_file_path, 'w', encoding='utf-8') as output_file:
                output_file.write(''.join(content_list))
            if array_elements:
                result_keywords.append({
                    'txt_file_name': filename,
                    'array_length': len(array_elements),
                    'array_elements': array_elements
                })
    # Write the removed-lines report next to the cleaned files.
    output_file_keywords = os.path.join(output_folder, '处理关键词.json')
    with open(output_file_keywords, 'w', encoding='utf-8') as json_file:
        json.dump(result_keywords, json_file, ensure_ascii=False, indent=4)
def process_letter_count(input_folder, output_folder):
    """Keep only files with at most 25 ASCII letters; log counts to JSON.

    Files over the limit are dropped; kept files are copied through and
    listed (with their letter count) in '英文字母数量.json'.
    """
    kept = []
    os.makedirs(output_folder, exist_ok=True)
    for filename in os.listdir(input_folder):
        if not filename.endswith('.txt'):
            continue
        with open(os.path.join(input_folder, filename), 'r',
                  encoding='utf-8') as handle:
            body = handle.read()
        n_letters = count_letters(body)
        if n_letters <= 25:
            with open(os.path.join(output_folder, filename), 'w',
                      encoding='utf-8') as sink:
                sink.write(body)
            kept.append({
                'txt_file_name': filename,
                'letters_count': n_letters,
            })
    with open(os.path.join(output_folder, '英文字母数量.json'), 'w',
              encoding='utf-8') as json_sink:
        json.dump(kept, json_sink, ensure_ascii=False, indent=4)
def process_symbol(input_folder, output_folder):
    """Drop lines containing no alphanumeric character at all.

    Punctuation-only lines and blank lines are removed; every other line
    is copied through unchanged into output_folder.
    """
    os.makedirs(output_folder, exist_ok=True)
    for filename in os.listdir(input_folder):
        if not filename.endswith('.txt'):
            continue
        src_path = os.path.join(input_folder, filename)
        dst_path = os.path.join(output_folder, filename)
        with open(src_path, 'r', encoding='utf-8') as src, \
                open(dst_path, 'w', encoding='utf-8') as dst:
            # Keep a line iff it has at least one letter/digit (CJK counts).
            kept = [ln for ln in src if any(c.isalnum() for c in ln)]
            dst.writelines(kept)
def process_blank_line(input_folder, output_folder):
    """Collapse runs of consecutive blank lines down to a single one."""
    os.makedirs(output_folder, exist_ok=True)
    for filename in os.listdir(input_folder):
        if not filename.endswith('.txt'):
            continue
        with open(os.path.join(input_folder, filename), 'r',
                  encoding='utf-8') as src:
            raw = src.readlines()
        kept = []
        previous_blank = False
        for ln in raw:
            blank = not ln.strip()
            if blank and previous_blank:
                continue  # swallow the repeated blank line
            kept.append(ln)
            previous_blank = blank
        with open(os.path.join(output_folder, filename), 'w',
                  encoding='utf-8') as dst:
            dst.writelines(kept)
def process_japanese_emojis(input_folder, output_folder):
    """Remove lines containing known kaomoji (text emoticons).

    Every .txt file in input_folder is copied to output_folder with any
    line that contains one of the kaomoji below dropped entirely.

    Note: the set literal below contains duplicate entries; duplicates
    collapse in a set, so this is harmless and kept as written.
    """
    japanese_emojis = {
        '(๑•̀ㅂ•́)و✧', 'ヽ(´▽`)/', '(。・ω・。)ノ♡', '(*˘︶˘*).。.:*♡', 'ლ(´ڡ`ლ)', '(ノ*>∀<)ノ♡', '(≧◡≦)', '(*´∀`)~♥',
        'o(* ̄▽ ̄*)ブ', '(>_<)', '⊙▂⊙', '(* ̄︿ ̄)', ' ̄□ ̄||', 'ʕ͡ຈ͡ຈʔ', '(⁄ ⁄•⁄ω⁄•⁄ ⁄)', '(*´ω`*)', '(≡ω≡.)',
        '(o・ω・o)', '( ´◡‿ゝ◡`)', '(╯✧▽✧)╯', '(✯◡✯)', '(๑˃̵ᴗ˂̵)و', '(≧∇≦)/', '(⁄ ⁄>⁄ ▽ ⁄<⁄ ⁄)', '(〃°ω°〃)', '(人•͈ᴗ•͈)',
        '(⁄ ⁄^⁄ᗨ⁄^⁄ ⁄)', '(╯✧∇✧)╯', '(*  ̄3)(ε ̄ *)', '(´∩。• ᵕ •。∩`)', '(⌒▽⌒)', '(°◡°♡)', '( ̄ω ̄)', '(^▽^)', '(¬‿¬)',
        '(/≧▽≦)/', '(=^・ω・^)y=', '(゚ω゚)', '(≧ω≦)', '(^▽^)', '(*≧ω≦)', '(´。• ω •。`)', '(*°▽°*)', '(o^∇^o)', '(o´□`o)',
        '(^ω^)', '(^ω^)', '(〃ω〃)', '(⌒ω⌒)', '(o・ω・o)', '(o_ _)ノ彡☆', '(@^◡^)', '(@´ー`@)', '(*´▽`*)', '( ´ ▽ ` )',
        '( ̄▽ ̄)', '(*¯︶¯*)', '( ̄□ ̄」)', '( ̄▽ ̄)/♫•*¨*•.¸¸♪', 'ε(´סּ︵סּ`)з', '(´• ω •`)ノ', '(っ˘ω˘ς )', '(。♥‿♥。)',
        '(。◕‿◕。)', '(つ≧▽≦)つ', '(●´ω`●)', '(づ。◕‿‿◕。)づ', '(づ ̄ ³ ̄)づ', '(≡^∇^≡)', '(つ✧ω✧)つ', '(=ↀωↀ=)', '(●⌒∇⌒●)',
        '(◕‿◕)', '(◕ᴗ◕✿)', '(*^▽^)/', '( ̄ε ̄@)', '(๑╹ω╹๑)', '(*´・д・)ノ', '(╯°Д°)╯︵/(.□ . )', '(ノ´▽`)ノ♪', '(ノ゚▽゚)ノ',
        '(*^▽^*)', '(o´∀`)o', '( ̄□ ̄)', '(o・ω・o)', '( ´◡‿ゝ◡`)', '(o^∇^o)', '( ´ ▽ ` )', '(✯◡✯)', '(つ✧ω✧)つ', '(●´ω`●)',
        '(つ≧▽≦)つ', '(◕ᴗ◕✿)', '(o・ω・o)', '(。•̀ᴗ-)✧', '(゚ω゚)', '( ´ ▽ ` )', '(=^・ω・^)y=', '(^ω^)', '(つ✧ω✧)つ',
        '(o^▽^o)', '(o´□`o)', '(o_ _)ノ彡☆', '(っ˘ω˘ς )', '(。♥‿♥。)', '(つ≧▽≦)つ', '(●´ω`●)', '(◕‿◕)', '(^▽^)',
        '(o^∇^o)', '(=^・ω・^)y=', '( ̄ω ̄)', '(o´∀`)b', '( ´ ▽ ` )', '(^▽^)', '(●ˇ∀ˇ●)', '(つ≧▽≦)つ', '(o・ω・o)',
        '(^◡^ )', '(´∀`)♡', '(●ˇ∀ˇ●)', '(^◡^ )', '( ^ω^ )', '( ̄▽ ̄)ノ', '(▰˘︹˘▰)', '( ˘▽˘)っ♨', '(๑˃̵ᴗ˂̵)', '( ´ ω ` )',
        '(●ˇ∀ˇ●)', '( ´ ▽ ` )ノ', '(o^∇^o)', '(o・ω・o)', '(▰˘︹˘▰)', '(^◡^ )', '( ^ω^ )', '(▰˘︹˘▰)', '( ˘▽˘)っ♨', '(^◡^ )',
        '(=^-ω-^=)', '(≧∇≦)', '( ˘▽˘)っ♨', '( •ω•ฅ).。.:*♡', '(o´ω`o)', '(๑╹ω╹๑)', '( •ω•ฅ).。.:*♡', '(๑•ω•́ฅ)',
        '(๑•̀ㅂ•́)و✧'}
    # Create the output folder on first use.
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    # Walk every .txt file in the input folder.
    for filename in os.listdir(input_folder):
        if filename.endswith('.txt'):
            input_path = os.path.join(input_folder, filename)
            output_path = os.path.join(output_folder, filename)
            # Read the original content and drop kaomoji-bearing lines.
            with open(input_path, 'r', encoding='utf-8') as input_file:
                lines = input_file.readlines()
                modified_lines = [line for line in lines if not any(emoji in line for emoji in japanese_emojis)]
            # Write the cleaned text to the new file.
            with open(output_path, 'w', encoding='utf-8') as output_file:
                output_file.writelines(modified_lines)
def replace_symbols(input_folder, output_folder):
    """Convert ASCII punctuation to full-width Chinese punctuation in
    every txt file of input_folder, writing results to output_folder.

    Fixes versus the original:
    * '...' is now replaced BEFORE '.', so ellipses actually become '……'.
      Previously the single-dot rule (earlier in the dict) consumed all
      dots first and the ellipsis mapping could never match.
    * Quotes are replaced plainly.  The original prepended a literal
      backslash to the replacement quote, leaking stray backslashes into
      the output text — str.replace needs no escaping.

    Note: both opening and closing ASCII quotes map to the OPENING Chinese
    quote, as in the original mapping (no open/close pairing is attempted).
    """
    symbol_mapping = {
        '...': '……',  # must run before '.', see docstring
        '.': '。',
        ',': ',',
        ';': ';',
        ':': ':',
        '?': '?',
        '!': '!',
        '"': '“',
        "'": '‘',
        '(': '(',
        ')': ')',
        '[': '【',
        ']': '】',
        '{': '{',
        '}': '}',
    }
    os.makedirs(output_folder, exist_ok=True)
    # Longest keys first so multi-character keys win over their prefixes.
    ordered_keys = sorted(symbol_mapping, key=len, reverse=True)
    for filename in os.listdir(input_folder):
        if not filename.endswith('.txt'):
            continue
        src_path = os.path.join(input_folder, filename)
        with open(src_path, 'r', encoding='utf-8') as src:
            text = src.read()
        for key in ordered_keys:
            text = text.replace(key, symbol_mapping[key])
        with open(os.path.join(output_folder, filename), 'w',
                  encoding='utf-8') as dst:
            dst.write(text)
def process_colon_lines(input_folder, output_folder):
    """Blank out long runs of colon-bearing lines.

    Within a run of consecutive lines containing a colon (e.g. pasted chat
    logs or character sheets), the first three lines are kept and every
    further line is replaced by a single blank line.
    """
    # Create the output folder if needed.
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    # Walk every .txt file in the input folder.
    for filename in os.listdir(input_folder):
        if filename.endswith('.txt'):
            input_file_path = os.path.join(input_folder, filename)
            output_file_path = os.path.join(output_folder, filename)
            # Process the text and write it to the new file.
            with open(input_file_path, 'r', encoding='utf-8') as input_file:
                lines = input_file.readlines()
                processed_lines = []
                count_consecutive_colon = 0  # length of the current colon-line run
                for line in lines:
                    # NOTE(review): only the ASCII colon ':' is tested here,
                    # although the original comment claimed both Chinese
                    # ':' and English ':' were handled — confirm which was
                    # intended before relying on this for Chinese dialogue.
                    if ':' in line :
                        count_consecutive_colon += 1
                        if count_consecutive_colon > 3:
                            # 4th+ consecutive colon line: replace with a blank line.
                            processed_lines.append('\n')
                            continue
                    else:
                        count_consecutive_colon = 0  # run broken; reset counter
                    processed_lines.append(line)
            # Write out the processed lines.
            with open(output_file_path, 'w', encoding='utf-8') as output_file:
                output_file.writelines(processed_lines)
def extract_non_chinese(input_folder, output_folder):
    """Copy through only files that are essentially pure Chinese text.

    A file qualifies when it contains at most 5 ASCII words and no
    HTML-style entities at all; qualifying files are copied verbatim.
    Per-file errors are printed and skipped (best-effort, as before).
    """
    os.makedirs(output_folder, exist_ok=True)
    word_re = re.compile(r'\b[a-zA-Z]+\b')
    entity_re = re.compile(r'&#[0-9]+;|&[a-zA-Z]+;')
    for filename in os.listdir(input_folder):
        if not filename.endswith('.txt'):
            continue
        try:
            with open(os.path.join(input_folder, filename), 'r',
                      encoding='utf-8') as src:
                body = src.read()
            if len(word_re.findall(body)) <= 5 and not entity_re.findall(body):
                with open(os.path.join(output_folder, filename), 'w',
                          encoding='utf-8') as dst:
                    dst.write(body)
        except Exception as exc:  # mirror original best-effort behaviour
            print(exc)
def main_process(input_folder_yuanshi, output_folder_final):
    """Run the full cleaning pipeline, each step reading the previous
    step's output folder and writing a new intermediate folder.

    NOTE(review): the final os.rename fails when output_folder_final
    already exists (always on Windows; when non-empty on POSIX) — confirm
    reruns are expected to start from a clean working directory.
    """
    # Step 1: report/strip files polluted with HTML-encoded residue.
    output_folder_html = '处理编码后文件'
    process_html_encoding(input_folder_yuanshi, output_folder_html)
    # Step 2: keep only files longer than 5000 countable characters.
    # NOTE(review): the folder name '小于5000字符' says "under 5000", but
    # copy_large_files keeps files ABOVE the threshold — name is misleading.
    output_folder_html_5000 = '小于5000字符'
    copy_large_files(output_folder_html, output_folder_html_5000, threshold=5000)
    # Step 3: blank out promotional/meta keyword lines.
    output_folder_out320 = '处理关键词后的文件'
    process_keywords(output_folder_html_5000, output_folder_out320)
    # Step 4: squash long runs of colon-bearing lines.
    output_folder_colon_lines = '去除控制面板的文件'
    process_colon_lines(output_folder_out320, output_folder_colon_lines)
    # Step 5: drop kaomoji (text-emoticon) lines.
    output_folder_emojis = '处理颜文字后的文件'
    process_japanese_emojis(output_folder_colon_lines, output_folder_emojis)
    # Step 6: convert ASCII punctuation to full-width Chinese punctuation.
    output_folder_symbols = '处理英转中标点后的文件'
    replace_symbols(output_folder_emojis, output_folder_symbols)
    # Step 7: keep only files with at most 25 ASCII letters.
    output_folder_zimu = '字母小于25的文件'
    process_letter_count(output_folder_symbols, output_folder_zimu)
    # Step 8: drop punctuation-only lines.
    output_folder_symbol = '去除一行只标点符号的文件'
    process_symbol(output_folder_zimu, output_folder_symbol)
    # Step 9: collapse consecutive blank lines.
    output_folder_blank_line = '去除连续的空白行的文件'
    process_blank_line(output_folder_symbol, output_folder_blank_line)
    # Step 10: re-apply the 5000-character size filter after cleaning.
    output_folder_html_5000_2 = '小于5000字符_2'
    copy_large_files(output_folder_blank_line, output_folder_html_5000_2, threshold=5000)
    # Step 11: keep essentially-pure-Chinese files only.
    output_folder_non_chinese = '非中文文本'
    extract_non_chinese(output_folder_html_5000_2, output_folder_non_chinese)
    # Rename the last step's folder to the requested final folder name.
    os.rename(output_folder_non_chinese, output_folder_final)
# Initial input folder (output of the ending-trim pass).
input_folder_yuanshi = '结尾'
# Folder that will hold the final cleaned corpus.
output_folder_final = '最终处理结果'
# Run the whole cleaning pipeline.
main_process(input_folder_yuanshi, output_folder_final)
'''
分章节写入json
'''
import json
import re
import os
# Chapter-heading regexes, compiled ONCE at import time.  The original
# rebuilt this 12-pattern list (and re-parsed every pattern) inside the
# per-line loop of every file — pure loop-invariant work.
_CHAPTER_PATTERNS = [re.compile(p) for p in (
    # Type 1: headings starting with "第" (Chinese chapter numbering).
    r'^(.*?第[零一二三四五六七八九十百千万]+[卷回集篇节章]).*?\s*(.*?)\s*$',
    r'^(.*?第?\d+[章节回卷集篇节]).*?\s*(.*?)\s*$',
    # Type 2: "Volume" headings with Roman numerals.
    r'^(.*?Volume\s*Ⅰ{0,3}Ⅱ{0,3}Ⅲ{0,3}Ⅳ{0,3}Ⅴ{0,3}Ⅵ{0,3}Ⅶ{0,3}Ⅷ{0,3}Ⅸ{0,3}Ⅹ{0,3}Ⅺ{0,3}Ⅻ{0,3}).*?\s*(.*?)\s*$',
    # Type 3: "Chapter" headings.
    r'^(.*?Chapter\d+).*?\s*(.*?)\s*$',
    # Type 4: other heading shapes (numbered, starred, prologue markers).
    r'^[0-9].*?\s*(.*?)\s*$',
    r'^☆\s*(.*?)\s*$',
    r'^楔子\s*(.*?)\s*$',
    r'^序章\s*(.*?)\s*$',
    r'^序\s*(.*?)\s*$',
    r'^(.*?[零一二三四五六七八九十百千万]+章).*?\s*(.*?)\s*$',
    r'^(?:第|章)(?:[零一二三四五六七八九十百千万\d\s]+)\s+(.*?)$',
    r'第[0-9].*?\s*(.*?)\s*$',
)]


def parse_txt_to_json(file_path):
    """Split one novel txt file into chapters.

    A (stripped) line counts as a chapter title when it matches one of
    _CHAPTER_PATTERNS and is shorter than 25 characters.  A chapter's
    content is everything between its title line (exclusive) and the next
    title line (exclusive of that title).

    Returns:
        list of {"chapter_id", "chapter_name", "content"} dicts, with
        chapter_id counting from 1 in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as fh:
        lines = fh.readlines()
    # Pass 1: collect (matched title text, line number) pairs.
    title_list = []
    for index, raw in enumerate(lines):
        line = raw.strip()
        for regex in _CHAPTER_PATTERNS:
            match = regex.match(line)
            if match:
                if len(line) < 25:  # long lines are prose, not headings
                    title_list.append((match.group(), index))
                break  # first matching pattern wins, as in the original
    # Pass 2: slice content between consecutive titles.
    chapters = []
    for idx, (title, pos) in enumerate(title_list):
        next_pos = title_list[idx + 1][1] if idx + 1 < len(title_list) else len(lines)
        chapters.append({
            "chapter_id": idx + 1,
            "chapter_name": title.strip(),
            "content": ''.join(lines[pos + 1:next_pos]),
        })
    return chapters
def process_folder(folder_path, output_file_path):
    """Parse every file under folder_path into chapters and dump one flat
    JSON list of chapter records.

    book_id advances per file (even when a file yields no chapters);
    chapter_id restarts at 1 for each book and counts only non-empty
    chapters.  Whitespace (spaces and newlines) is stripped from content,
    and chapters that end up empty are skipped.
    """
    all_records = []
    category_id = 1  # single fixed category for this corpus
    book_id = 1
    for root, _dirs, names in os.walk(folder_path):
        for name in names:
            chapters = parse_txt_to_json(os.path.join(root, name))
            chapter_id = 1
            for chapter in chapters:
                body = chapter["content"].replace(" ", "").replace("\n", "")
                if not body:
                    continue  # skip empty chapters
                all_records.append({
                    "category_id": category_id,
                    "book_id": book_id,
                    "chapter_id": chapter_id,
                    "book_name": os.path.splitext(name)[0],  # file name as book name
                    "chapter_name": chapter['chapter_name'],
                    "content": body,
                })
                chapter_id += 1
            book_id += 1
    with open(output_file_path, 'w', encoding='utf-8') as sink:
        json.dump(all_records, sink, ensure_ascii=False, indent=4)
# Script entry: chapter-split the cleaned corpus into one JSON file.
folder_path = '最终处理结果'
output_file_path = '最终处理结果.json'
process_folder(folder_path, output_file_path)