import os
import re
import json
def _load_sensitive_words(sensitive_file_path):
    """Return the sensitive words from a one-word-per-line UTF-8 file.

    Blank lines are dropped and duplicates removed. The result is ordered
    longest word first (ties broken alphabetically) so that masking is
    deterministic and a long word is masked before any shorter word it
    contains.
    """
    # 'utf-8-sig' also accepts plain UTF-8; it only strips a leading BOM,
    # which would otherwise become part of the first word.
    with open(sensitive_file_path, 'r', encoding='utf-8-sig') as word_file:
        words = {line.strip() for line in word_file if line.strip()}
    return sorted(words, key=lambda w: (-len(w), w))


def _mask_word(text, word, pattern, context_chars):
    """Mask every match of *pattern* in *text* with asterisks.

    Returns ``(masked_text, snippets)`` where *snippets* holds, for each
    occurrence, the matched text plus up to *context_chars* characters on
    either side, never crossing a line break.
    """
    snippets = []
    for match in pattern.finditer(text):
        start, end = match.span()
        # Slice the context instead of matching it with the regex, so that
        # occurrences close to each other are all reported.
        before = text[max(0, start - context_chars):start].rpartition('\n')[2]
        after = text[end:end + context_chars].partition('\n')[0]
        snippets.append(before + match.group(0) + after)
    if snippets:
        text = pattern.sub('*' * len(word), text)
    return text, snippets


def process_sensitive(input_folder, sensitive_file_path, output_folder, context_chars=5):
    """Mask sensitive words in text files and log every replacement.

    Every ``*.txt`` file directly inside *input_folder* is read as UTF-8,
    each case-insensitive occurrence of a sensitive word is replaced by
    asterisks of the word's length, and the result is written under the
    same file name to *output_folder*. A ``replace_info.json`` file listing
    all replacements is written to *output_folder* as well.

    Args:
        input_folder: Directory containing the ``.txt`` files to clean.
        sensitive_file_path: UTF-8 text file with one sensitive word per line.
        output_folder: Directory for the cleaned files and the JSON report;
            created if missing. May be the same as *input_folder*.
        context_chars: Maximum number of characters of same-line context
            recorded on each side of a match (default 5).

    Returns:
        None. Results are written to disk.

    Raises:
        OSError: If a folder or file cannot be read or written.
        UnicodeDecodeError: If an input file is not valid UTF-8.

    Each report entry has the keys ``file_name``, ``sensitive_word`` (the
    word as listed in the word file) and ``txt_content`` (the text actually
    matched, with its surrounding context).
    """
    os.makedirs(output_folder, exist_ok=True)

    # The word list does not depend on the input file: load and compile once.
    compiled_words = [
        (word, re.compile(re.escape(word), re.IGNORECASE))
        for word in _load_sensitive_words(sensitive_file_path)
    ]

    replace_info = []  # one entry per masked occurrence
    # Sorted so the report order does not depend on directory listing order.
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith('.txt'):
            continue
        input_file_path = os.path.join(input_folder, filename)
        output_file_path = os.path.join(output_folder, filename)

        # Read fully before opening the output: opening with 'w' truncates,
        # which would destroy the source when both folders are the same.
        with open(input_file_path, 'r', encoding='utf-8') as input_file:
            txt_content = input_file.read()

        for word, pattern in compiled_words:
            txt_content, snippets = _mask_word(txt_content, word, pattern, context_chars)
            replace_info.extend(
                {
                    'file_name': filename,
                    'sensitive_word': word,
                    'txt_content': snippet,
                }
                for snippet in snippets
            )

        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(txt_content)

    # Persist the replacement report next to the cleaned files.
    replace_info_file = os.path.join(output_folder, 'replace_info.json')
    with open(replace_info_file, 'w', encoding='utf-8') as json_file:
        json.dump(replace_info, json_file, ensure_ascii=False, indent=4)
# Absolute Windows paths used when this file is run directly as a script.
output_sensitive = 'D:\\2024work\\wangzhe\\清洗过程\\处理敏感词后的文件'
sensitive_file_path = 'D:\\2024work\\wangzhe\\清洗小说\\敏感词\\sensitive_words.txt'

if __name__ == '__main__':
    # Run the batch job only on direct execution, so that importing this
    # module does not start rewriting files on disk.
    process_sensitive("D:\\2024work\\wangzhe\\清洗过程\\处理关键词后的文件", sensitive_file_path, output_sensitive)
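# Sensitive-word lists can be taken from the following project:
# GitHub - 57ing/Sensitive-word (https://github.com/57ing/Sensitive-word):
# a fairly comprehensive collection of sensitive words, subdivided into
# violence/terrorism, reactionary, livelihood, pornography, corruption and
# other word lists.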