# Text-cleaning demo: converts Traditional Chinese to Simplified Chinese, then
# strips HTML tags, emoji, URLs and fenced code blocks from an inline sample.
import re
from functools import lru_cache

# The patterns below are compiled once at import time instead of on every call.

# Code-point ranges treated as emoji by remove_emoji().
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE,
)

# NOTE: inside the character class, `$-_` is a *range* ('$' through '_'), so it
# also admits characters such as '/', ':', '?', '=', '<' and '>'.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)

# Non-greedy, so each ```...``` pair is matched separately.
_CODE_PATTERN = re.compile(r'```[\s\S]*?```')


def remove_html_tags(text: str) -> str:
    """Return the text content of *text* with HTML markup stripped.

    The input is parsed with BeautifulSoup's ``html.parser`` backend and the
    result of ``get_text()`` on the parsed tree is returned.
    """
    # Imported lazily so the regex-only helpers in this module stay usable
    # when the third-party bs4 package is not installed.
    from bs4 import BeautifulSoup

    return BeautifulSoup(text, 'html.parser').get_text()


def remove_emoji(text: str) -> str:
    """Return *text* with every run of characters in the emoji ranges removed."""
    return _EMOJI_PATTERN.sub('', text)


def remove_url(text: str) -> str:
    """Return *text* with every ``http://`` / ``https://`` URL removed."""
    return _URL_PATTERN.sub('', text)


def remove_code(text: str) -> str:
    """Return *text* with every triple-backtick fenced block removed.

    A fence without a closing ``` is left untouched.
    """
    return _CODE_PATTERN.sub('', text)


@lru_cache(maxsize=None)
def _get_t2s_converter():
    """Build the OpenCC 't2s.json' converter once and reuse it afterwards."""
    # Imported lazily for the same reason as bs4 above.
    import opencc

    return opencc.OpenCC('t2s.json')


def traditional_to_simplified(text: str) -> str:
    """Convert *text* using OpenCC's ``t2s.json`` configuration."""
    return _get_t2s_converter().convert(text)


def main() -> None:
    """Run the cleaning pipeline on a built-in sample and print the result."""
    # The sample's whitespace is kept exactly as found in the source.
    example_text = (
        "您好,這裡是一個測試文本。我們將檢查和清理樣式 "
        "<h1>HTML header</h1> <div>在這裡</div> "
        "<p>https://www.example.com 是網址</p> "
        "😊👍 同时保留 Markdown 格式 查看代码: "
        '``` def example(): print("Hello, world!") ```'
    )

    cleaned_text = traditional_to_simplified(example_text)
    cleaned_text = remove_html_tags(cleaned_text)
    cleaned_text = remove_emoji(cleaned_text)
    cleaned_text = remove_url(cleaned_text)
    cleaned_text = remove_code(cleaned_text)
    print(cleaned_text)


if __name__ == '__main__':
    main()
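# 方法二:下面是从 txt 文件导入文本的版本 (Method 2: the variant below reads its input from a .txt file and writes the cleaned text to another file)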
# File-based text cleaner: reads a UTF-8 text file, collapses blank lines,
# converts Traditional Chinese to Simplified Chinese, strips HTML tags, emoji,
# URLs and fenced code blocks, and writes the result to another file.
import re
from functools import lru_cache

# The patterns below are compiled once at import time instead of on every call.

# Code-point ranges treated as emoji by remove_emoji().
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE,
)

# NOTE: inside the character class, `$-_` is a *range* ('$' through '_'), so it
# also admits characters such as '/', ':', '?', '=', '<' and '>'.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)

# Non-greedy on purpose: re.sub() already replaces *every* match, whereas a
# greedy `*` would span from the first fence to the last one and delete all
# the ordinary text lying between separate code blocks.
_CODE_PATTERN = re.compile(r'```[\s\S]*?```')

# Runs of consecutive newlines, collapsed to one by main().
_NEWLINES_PATTERN = re.compile(r'\n+')


def remove_html_tags(text: str) -> str:
    """Return the text content of *text* with HTML markup stripped.

    The input is parsed with BeautifulSoup's ``html.parser`` backend and the
    result of ``get_text()`` on the parsed tree is returned.
    """
    # Imported lazily so the regex-only helpers in this module stay usable
    # when the third-party bs4 package is not installed.
    from bs4 import BeautifulSoup

    return BeautifulSoup(text, 'html.parser').get_text()


def remove_emoji(text: str) -> str:
    """Return *text* with every run of characters in the emoji ranges removed."""
    return _EMOJI_PATTERN.sub('', text)


def remove_url(text: str) -> str:
    """Return *text* with every ``http://`` / ``https://`` URL removed."""
    return _URL_PATTERN.sub('', text)


def remove_code(text: str) -> str:
    """Return *text* with every triple-backtick fenced block removed.

    Each fenced block is removed individually; text between two blocks is
    preserved. A fence without a closing ``` is left untouched.
    """
    return _CODE_PATTERN.sub('', text)


@lru_cache(maxsize=None)
def _get_t2s_converter():
    """Build the OpenCC 't2s.json' converter once and reuse it afterwards."""
    # Imported lazily for the same reason as bs4 above.
    import opencc

    return opencc.OpenCC('t2s.json')


def traditional_to_simplified(text: str) -> str:
    """Convert *text* using OpenCC's ``t2s.json`` configuration."""
    return _get_t2s_converter().convert(text)


def read_text_from_file(file_path: str) -> str:
    """Return the entire content of *file_path*, decoded as UTF-8."""
    with open(file_path, "r", encoding='utf-8') as text_file:
        return text_file.read()


def write_text_to_file(file_path: str, text: str) -> None:
    """Write *text* to *file_path* as UTF-8, replacing any existing content."""
    with open(file_path, "w", encoding='utf-8') as text_file:
        text_file.write(text)


def main(
    input_file_path: str = "input123.txt",
    output_file_path: str = "output123.txt",
) -> None:
    """Clean the text in *input_file_path* and write it to *output_file_path*.

    The defaults are the paths the script originally hard-coded; pass other
    paths to process different files.
    """
    text = read_text_from_file(input_file_path)

    # Replace multiple newline characters with a single newline.
    text = _NEWLINES_PATTERN.sub('\n', text)

    cleaned_text = traditional_to_simplified(text)
    cleaned_text = remove_html_tags(cleaned_text)
    cleaned_text = remove_emoji(cleaned_text)
    cleaned_text = remove_url(cleaned_text)
    cleaned_text = remove_code(cleaned_text)

    write_text_to_file(output_file_path, cleaned_text)


if __name__ == '__main__':
    main()