import re

import jieba
import jieba.posseg as pseg

# Structured-PII patterns, compiled once and applied in this order (most
# specific first).  Order matters: the e-mail pattern must see the address
# before any other rule has turned part of it into '*', and the very broad
# employee_id pattern must run last.
_PII_PATTERNS = tuple(
    (label, re.compile(pattern, flags=re.IGNORECASE))
    for label, pattern in (
        # No \b anchors: in Python 3 CJK characters count as \w, so \b never
        # matches between a Chinese character and an adjacent ASCII letter,
        # which made e-mails embedded in Chinese text unmatchable.
        ("email", r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"),
        # 18-character ID number: 17 digits plus a digit or X check character.
        # (The previous pattern was a single character class and masked every
        # digit and every letter "x" in the text.)
        ("id_card", r"(?<!\d)\d{17}[\dXx](?!\d)"),
        # 11-digit number starting with 1 followed by 3-9.
        ("phone", r"(?<!\d)1[3-9]\d{9}(?!\d)"),
        # 5-11 digit number not starting with 0 and not followed by ".com".
        ("qq", r"(?<!\d)[1-9][0-9]{4,10}(?!\d)(?!\.com)"),
        # Any run of 4-50 ASCII letters/digits (case-insensitive).  NOTE: this
        # also masks ordinary words and numbers of that length.
        ("employee_id", r"[a-z0-9]{4,50}"),
    )
)


def replace_with_asterisks(match):
    """Return a string of '*' as long as the text matched by *match*."""
    return '*' * len(match.group())


def remove_personal_info(text):
    """Mask personal information in *text* with '*' characters.

    Two passes are made, and every masked span keeps its original length:

    1. Regex pass: e-mail addresses, ID-card numbers, phone numbers,
       QQ numbers and employee IDs (see ``_PII_PATTERNS``).
    2. Segmentation pass: the text is tokenised with jieba's part-of-speech
       tagger and every token tagged 'nr' (person name) is masked.

    The regex pass runs first so that an address such as ``12345@qq.com`` is
    still intact when the e-mail pattern sees it.  Previously name masking
    and the employee_id pattern ran before the e-mail pattern and replaced
    the local part with '*', which left ``@qq.com`` unmasked.

    Args:
        text: The input string to sanitise.

    Returns:
        A string of the same length as *text* with the detected spans
        replaced by asterisks.
    """
    masked = text
    for _label, pattern in _PII_PATTERNS:
        masked = pattern.sub(replace_with_asterisks, masked)

    # 'nr' is the part-of-speech flag used for person names.
    return ''.join(
        '*' * len(word) if flag == 'nr' else word
        for word, flag in pseg.cut(masked)
    )


def main():
    """Read input.txt, mask it, print the result and write filtered_text.txt."""
    with open('input.txt', 'r', encoding='utf-8') as f:
        text_with_info = f.read()

    filtered_text = remove_personal_info(text_with_info)
    print(filtered_text)

    with open('filtered_text.txt', 'w', encoding='utf-8') as f:
        f.write(filtered_text)


if __name__ == "__main__":
    main()
# 存在的问题:邮箱地址可能在正则匹配之前就被分词或其他规则拆开、部分替换,导致"@qq.com"这一部分无法被替换。