Python 清除文本中的 HTML 标签、代码片段等内容的代码

import re
import opencc
from bs4 import BeautifulSoup

def remove_html_tags(text):
    """Return the plain-text content of *text* with HTML markup stripped.

    The input is parsed with BeautifulSoup's ``html.parser`` backend and the
    text of the resulting document is returned via ``get_text()``.
    """
    return BeautifulSoup(text, 'html.parser').get_text()

def remove_emoji(text):
    """Return *text* with emoji characters removed.

    Besides the emoticon, pictograph, transport and flag blocks, this also
    strips the newer "Supplemental Symbols and Pictographs"
    (U+1F900-U+1F9FF) and "Symbols and Pictographs Extended-A"
    (U+1FA70-U+1FAFF) blocks.  A variation selector (U+FE0F) or zero-width
    joiner (U+200D) that directly follows a removed emoji is removed with it,
    so multi-part emoji sequences do not leave invisible characters behind.
    Joiners and selectors that are not attached to an emoji are untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string with the matched emoji removed; every other character
        is left unchanged.
    """
    emoji_pattern = re.compile(
        "(?:["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs (incl. skin tones)
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
        "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
        "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
        "]"
        "[\uFE0F\u200D]*"  # selector/joiner glued to the emoji just matched
        ")+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub('', text)

def remove_url(text):
    """Return *text* with ``http://`` and ``https://`` URLs removed.

    A URL is the scheme followed by one or more characters that RFC 3986
    allows in a URI (unreserved and reserved characters plus ``%``).  The
    character class is spelled out explicitly: the previous ``[$-_@.&+]``
    class contained an accidental ``$``..``_`` range that matched ``<``,
    ``>``, ``\\`` and ``^`` while missing ``#`` and ``~``, so fragments were
    left behind and neighbouring angle brackets were swallowed.

    Args:
        text: The string to clean.

    Returns:
        A new string with every matched URL replaced by the empty string.
        Whitespace surrounding a URL is not touched.
    """
    url_pattern = re.compile(
        r"https?://[A-Za-z0-9\-._~:/?#\[\]@!$&'()*+,;=%]+"
    )
    return url_pattern.sub('', text)

def remove_code(text):
    """Return *text* with every triple-backtick fenced block removed.

    The match is non-greedy, so each opening fence pairs with the nearest
    closing fence and text between separate blocks is kept.  A fence that is
    never closed is left as-is.
    """
    return re.sub(r'```[\s\S]*?```', '', text)

def traditional_to_simplified(text):
    """Convert *text* with an OpenCC converter built from ``t2s.json``.

    A fresh converter is created on every call and its ``convert`` result is
    returned.
    """
    return opencc.OpenCC('t2s.json').convert(text)

if __name__ == '__main__':
    # Sample input mixing Traditional Chinese, HTML, a URL, emoji and a
    # fenced code block.
    example_text = """您好,這裡是一個測試文本。我們將檢查和清理樣式
    <h1>HTML header</h1>
    <div>在這裡</div>
    <p>https://www.example.com 是網址</p>
    😊👍
    同时保留 Markdown 格式
    查看代码:
    ```
    def example():
        print("Hello, world!")
    ```"""

    # Run the cleaning steps in order, feeding each result into the next.
    cleaned_text = example_text
    for step in (
        traditional_to_simplified,
        remove_html_tags,
        remove_emoji,
        remove_url,
        remove_code,
    ):
        cleaned_text = step(cleaned_text)

    print(cleaned_text)

方法二:从 txt 文件读取文本,清理后写入新的 txt 文件

import re
import opencc
from bs4 import BeautifulSoup


def remove_html_tags(text):
    """Strip HTML markup from *text* and return only its text content.

    Parsing uses BeautifulSoup with the ``html.parser`` backend; the return
    value is whatever ``get_text()`` yields for the parsed document.
    """
    parsed = BeautifulSoup(text, 'html.parser')
    plain_text = parsed.get_text()
    return plain_text


def remove_emoji(text):
    """Return *text* with emoji characters removed.

    Besides the emoticon, pictograph, transport and flag blocks, this also
    strips the newer "Supplemental Symbols and Pictographs"
    (U+1F900-U+1F9FF) and "Symbols and Pictographs Extended-A"
    (U+1FA70-U+1FAFF) blocks.  A variation selector (U+FE0F) or zero-width
    joiner (U+200D) that directly follows a removed emoji is removed with it,
    so multi-part emoji sequences do not leave invisible characters behind.
    Joiners and selectors that are not attached to an emoji are untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string with the matched emoji removed; every other character
        is left unchanged.
    """
    emoji_pattern = re.compile(
        "(?:["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs (incl. skin tones)
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
        "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
        "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
        "]"
        "[\uFE0F\u200D]*"  # selector/joiner glued to the emoji just matched
        ")+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub('', text)


def remove_url(text):
    """Return *text* with ``http://`` and ``https://`` URLs removed.

    A URL is the scheme followed by one or more characters that RFC 3986
    allows in a URI (unreserved and reserved characters plus ``%``).  The
    character class is spelled out explicitly: the previous ``[$-_@.&+]``
    class contained an accidental ``$``..``_`` range that matched ``<``,
    ``>``, ``\\`` and ``^`` while missing ``#`` and ``~``, so fragments were
    left behind and neighbouring angle brackets were swallowed.

    Args:
        text: The string to clean.

    Returns:
        A new string with every matched URL replaced by the empty string.
        Whitespace surrounding a URL is not touched.
    """
    url_pattern = re.compile(
        r"https?://[A-Za-z0-9\-._~:/?#\[\]@!$&'()*+,;=%]+"
    )
    return url_pattern.sub('', text)


def remove_code(text):
    """Return *text* with every triple-backtick fenced block removed.

    The match is deliberately non-greedy (``*?``): each opening fence pairs
    with the nearest closing fence.  A greedy ``*`` would instead span from
    the first fence to the last one in the text and delete all ordinary text
    lying between two separate code blocks.  ``re.sub`` already replaces
    every occurrence, so greediness is not needed to catch multiple blocks.

    Args:
        text: The string to clean.

    Returns:
        A new string without the fenced blocks.  A fence that is never
        closed is left as-is.
    """
    code_pattern = re.compile(r'```[\s\S]*?```')
    return code_pattern.sub('', text)


def traditional_to_simplified(text):
    """Run *text* through an OpenCC converter configured with ``t2s.json``.

    The converter is constructed anew on each call; the converted string it
    produces is returned.
    """
    t2s_converter = opencc.OpenCC('t2s.json')
    simplified = t2s_converter.convert(text)
    return simplified


def read_text_from_file(file_path):
    """Read the file at *file_path* as UTF-8 text and return its contents."""
    with open(file_path, mode="r", encoding='utf-8') as source:
        contents = source.read()
    return contents


def write_text_to_file(file_path, text):
    """Write *text* to *file_path* as UTF-8, replacing any existing content."""
    with open(file_path, mode="w", encoding='utf-8') as sink:
        sink.write(text)


if __name__ == '__main__':
    # Locations of the raw input and the cleaned output.
    input_file_path = "input123.txt"  # replace with your input file path
    output_file_path = "output123.txt"  # replace with your output file path

    # Load the input and squeeze every run of newlines down to a single one.
    text = re.sub(r'\n+', '\n', read_text_from_file(input_file_path))

    # Run the cleaning steps in order, feeding each result into the next.
    cleaned_text = text
    for step in (
        traditional_to_simplified,
        remove_html_tags,
        remove_emoji,
        remove_url,
        remove_code,
    ):
        cleaned_text = step(cleaned_text)

    write_text_to_file(output_file_path, cleaned_text)
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值