Python 清除文本中 HTML 标签、代码片段的代码

boy687687

已于 2023-11-16 00:24:27 修改

阅读量218

点赞数

文章标签： python html 开发语言

于 2023-11-15 23:23:18 首次发布

本文链接：https://blog.csdn.net/boy687687/article/details/134431670

版权

文章介绍了如何使用 BeautifulSoup 等 Python 库和正则表达式去除 HTML 标签、表情符号、URL 和代码块，同时演示了繁体中文到简体中文的转换，以及处理文件输入和输出的过程。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

import re
import opencc
from bs4 import BeautifulSoup

def remove_html_tags(text):
    """Return the text content of ``text`` with HTML markup stripped.

    The input is parsed with BeautifulSoup using the ``html.parser`` backend
    and the result of ``get_text()`` on the parsed document is returned.
    """
    return BeautifulSoup(text, 'html.parser').get_text()

def remove_emoji(text):
    """Return ``text`` with emoji characters removed.

    Characters from the following Unicode blocks are deleted: Emoticons,
    Miscellaneous Symbols and Pictographs, Transport and Map Symbols,
    Regional Indicator symbols (flags), Supplemental Symbols and Pictographs,
    and Symbols and Pictographs Extended-A. The last two blocks were missing
    from the earlier pattern, so emoji such as U+1F914 were left in place.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without the matched emoji.
    """
    emoji_chars = (
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
        "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    )
    # A zero-width joiner (U+200D) is consumed only when it sits between two
    # emoji, so joined sequences vanish as a whole while a ZWJ used elsewhere
    # in the text is left untouched.
    emoji_pattern = re.compile(
        "[" + emoji_chars + "]+(?:\u200D[" + emoji_chars + "]+)*"
    )
    return emoji_pattern.sub('', text)

def remove_url(text):
    """Return ``text`` with every ``http://`` / ``https://`` URL removed.

    A URL is the scheme followed by a run of allowed characters. Note that
    ``$-_`` inside the character class is an ASCII *range* (``$`` through
    ``_``); it is what lets ``/ : ? = ;`` and similar punctuation match.
    ``#`` and ``~`` lie outside that range and are now listed explicitly, so
    fragments (``page#section``) and paths such as ``/~user`` are removed
    together with the rest of the URL instead of being left behind.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` with the matched URLs deleted.
    """
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|[#~]'
        r'|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    return url_pattern.sub('', text)

def remove_code(text):
    """Return ``text`` with every triple-backtick fenced block removed.

    The match is non-greedy, so each opening fence pairs with the nearest
    closing fence and the text between separate blocks is kept. ``[\\s\\S]``
    lets the block body span multiple lines.
    """
    return re.sub(r'```[\s\S]*?```', '', text)

def traditional_to_simplified(text):
    """Convert ``text`` with an OpenCC converter built from ``t2s.json``.

    A new converter is created on each call and the result of its
    ``convert`` method is returned.
    """
    return opencc.OpenCC('t2s.json').convert(text)

if __name__ == '__main__':
    # Demo input mixing Traditional Chinese, HTML, a URL, emoji and a fenced
    # code block.
    example_text = """您好，這裡是一個測試文本。我們將檢查和清理樣式
    <h1>HTML header</h1>
    <div>在這裡</div>
    <p>https://www.example.com 是網址</p>
    😊👍
    同时保留 Markdown 格式
    查看代码:
    ```
    def example():
        print("Hello, world!")
    ```"""

    # The cleaning steps run in this order; each one receives the output of
    # the previous step.
    pipeline = (
        traditional_to_simplified,
        remove_html_tags,
        remove_emoji,
        remove_url,
        remove_code,
    )
    cleaned_text = example_text
    for step in pipeline:
        cleaned_text = step(cleaned_text)

    print(cleaned_text)

方法二：从 txt 文件读取文本，清理后写入输出文件

import re
import opencc
from bs4 import BeautifulSoup


def remove_html_tags(text):
    """Strip HTML markup from ``text`` and return what ``get_text()`` yields.

    Parsing is done by BeautifulSoup with the ``html.parser`` backend.
    """
    document = BeautifulSoup(text, 'html.parser')
    plain_text = document.get_text()
    return plain_text


def remove_emoji(text):
    """Return ``text`` with emoji characters removed.

    Characters from the following Unicode blocks are deleted: Emoticons,
    Miscellaneous Symbols and Pictographs, Transport and Map Symbols,
    Regional Indicator symbols (flags), Supplemental Symbols and Pictographs,
    and Symbols and Pictographs Extended-A. The last two blocks were missing
    from the earlier pattern, so emoji such as U+1F914 were left in place.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without the matched emoji.
    """
    emoji_chars = (
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
        "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    )
    # A zero-width joiner (U+200D) is consumed only when it sits between two
    # emoji, so joined sequences vanish as a whole while a ZWJ used elsewhere
    # in the text is left untouched.
    emoji_pattern = re.compile(
        "[" + emoji_chars + "]+(?:\u200D[" + emoji_chars + "]+)*"
    )
    return emoji_pattern.sub('', text)


def remove_url(text):
    """Return ``text`` with every ``http://`` / ``https://`` URL removed.

    A URL is the scheme followed by a run of allowed characters. Note that
    ``$-_`` inside the character class is an ASCII *range* (``$`` through
    ``_``); it is what lets ``/ : ? = ;`` and similar punctuation match.
    ``#`` and ``~`` lie outside that range and are now listed explicitly, so
    fragments (``page#section``) and paths such as ``/~user`` are removed
    together with the rest of the URL instead of being left behind.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` with the matched URLs deleted.
    """
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|[#~]'
        r'|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    return url_pattern.sub('', text)


def remove_code(text):
    """Return ``text`` with every triple-backtick fenced block removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` in which each fenced block, fences included, has
        been deleted. Text outside the fences is preserved.
    """
    # The quantifier must stay non-greedy (``*?``): a greedy ``*`` would match
    # from the first fence to the LAST fence and swallow all prose between
    # separate code blocks. ``sub`` already replaces every occurrence.
    code_pattern = re.compile(r'```[\s\S]*?```')
    return code_pattern.sub('', text)


def traditional_to_simplified(text):
    """Run ``text`` through an OpenCC converter configured with ``t2s.json``.

    The converter is instantiated on every call; its ``convert`` result is
    returned unchanged.
    """
    t2s_converter = opencc.OpenCC('t2s.json')
    simplified = t2s_converter.convert(text)
    return simplified


def read_text_from_file(file_path):
    """Read the file at ``file_path`` as UTF-8 text and return all of it."""
    with open(file_path, mode="r", encoding='utf-8') as source:
        contents = source.read()
    return contents


def write_text_to_file(file_path, text):
    """Write ``text`` to ``file_path`` as UTF-8.

    The file is opened in ``"w"`` mode, so existing content is replaced.
    """
    with open(file_path, mode="w", encoding='utf-8') as sink:
        sink.write(text)


if __name__ == '__main__':
    input_file_path = "input123.txt"  # replace with your input file path
    output_file_path = "output123.txt"  # replace with your output file path

    # Load the input and squeeze each run of consecutive newlines into one
    # before any cleaning step runs.
    text = re.sub(r'\n+', '\n', read_text_from_file(input_file_path))

    # The cleaning steps run in this order; each one receives the output of
    # the previous step.
    cleaned_text = text
    for step in (
        traditional_to_simplified,
        remove_html_tags,
        remove_emoji,
        remove_url,
        remove_code,
    ):
        cleaned_text = step(cleaned_text)

    write_text_to_file(output_file_path, cleaned_text)