import re

# Example patterns for the kinds of private data to mask. They are
# deliberately simple and should be tuned to the data being processed.
# The TLD class is [A-Za-z]; a '|' inside a character class is a literal
# pipe, not alternation.
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
# 10 to 13 consecutive digits.
phone_pattern = r'\b\d{10,13}\b'
# Adjust as needed; this is only an example. As written it matches any run
# of ASCII letters and spaces, not just names.
name_pattern = r'\b[A-Za-z ]+\b'
# 15 digits, optionally followed by two digits and a final digit or 'X'.
id_pattern = r'\b\d{15}(\d{2}[0-9X])?\b'
# Work IDs: 5 to 10 characters, upper/lower-case ASCII letters and digits.
workid_pattern = r'\b[A-Za-z0-9]{5,10}\b'
# A non-zero leading digit followed by at least four more digits.
qq_pattern = r'\b[1-9][0-9]{4,}\b'

privacy_patterns = [
    email_pattern,
    phone_pattern,
    name_pattern,
    id_pattern,
    workid_pattern,
    qq_pattern,
]


def _mask_match(match):
    """Return a run of '*' as long as the whole matched text."""
    return '*' * len(match.group(0))


def mask_privacy(text, patterns):
    """Mask every match of each pattern in *text* with asterisks.

    Patterns are applied one after another, in the given order, so a later
    pattern sees the text already masked by the earlier ones.

    Args:
        text: The string to scan.
        patterns: An iterable of regular-expression strings.

    Returns:
        A copy of *text* in which each matched span is replaced by the same
        number of '*' characters.
    """
    for pattern in patterns:
        # re.sub replaces exactly the spans the pattern matched. Using
        # group(0) in the callback masks the whole match even when the
        # pattern contains capture groups (re.findall would return only
        # the group), and text outside the matched spans is never touched
        # (str.replace would mask every equal substring).
        text = re.sub(pattern, _mask_match, text)
    return text


def main():
    """Read input.txt, mask private data, and write the result to output.txt."""
    with open("input.txt", "rt", encoding="utf-8") as input_file:
        text = input_file.read()

    masked_text = mask_privacy(text, privacy_patterns)

    with open("output.txt", "wt", encoding="utf-8") as output_file:
        output_file.write(masked_text)


# Only touch the files when run as a script, not when imported.
if __name__ == "__main__":
    main()
问题:输入"姓名:张三"时,"张三"没有被替换掉。
# Name pattern with two alternatives:
#   1. One or more ASCII-letter words separated by single whitespace
#      characters. This still matches any run of English words.
#   2. A run of 2-4 Han characters that directly follows the label "姓名"
#      plus an ASCII or full-width colon, and is not followed by another
#      Han character.
# A bare r'\b[\u4e00-\u9fa5]{2,4}\b' matches ANY standalone 2-4 character
# Han run (including the label "姓名" itself), so the Chinese branch is
# anchored to the label. The lookbehind is fixed-width, as Python's re
# module requires. Unlabelled Chinese names are not matched.
name_pattern = (
    r'\b[A-Za-z]+(?:\s[A-Za-z]+)*\b'
    r'|(?<=姓名[::])[\u4e00-\u9fa5]{2,4}(?![\u4e00-\u9fa5])'
)
这里把中文姓名限制为 2-4 个汉字,但其他不是姓名的中文内容也被匹配并替换掉了。