Python 拆分句子、去除标点符号等并分词

最新推荐文章于 2024-05-18 11:20:08 发布

小炫y

最新推荐文章于 2024-05-18 11:20:08 发布

阅读量786

点赞数 7

文章标签： python

本文链接：https://blog.csdn.net/weixin_44740756/article/details/135362584

版权

本文介绍了如何使用 Python 的正则表达式按中文标点拆分句子并按指定长度分批，以及如何结合 nltk 库和 jieba 分词工具清洗文本（去除 HTML 标签、链接、特殊字符和停用词）并分词，以得到干净的中文文本数据。

摘要由CSDN通过智能技术生成

import re

def split_text_into_batches(text, max_tokens_per_batch):
    """Group the sentences of ``text`` into batches of bounded length.

    The text is split right after each Chinese sentence terminator
    (。！？). The sentences are then packed greedily, in order, into
    batches; sentences inside a batch are joined by a single space. A batch
    is closed as soon as adding the next sentence would exceed the limit.

    Args:
        text: The text to split.
        max_tokens_per_batch: Maximum batch length. Despite the name, the
            length is measured in characters (``len``), separators
            included, not in tokenizer tokens.

    Returns:
        A list of non-empty batch strings made of whole sentences. A single
        sentence longer than the limit is never cut: it becomes a batch of
        its own and may exceed ``max_tokens_per_batch``.
    """
    # Zero-width split point after each terminator, so the terminator stays
    # attached to the sentence it ends.
    sentence_splitter = re.compile(r'(?<=[。！？])')

    sentences = [
        part.strip() for part in sentence_splitter.split(text) if part.strip()
    ]

    batches = []
    current = []     # sentences of the batch being built
    current_len = 0  # always equal to len(' '.join(current))

    for sentence in sentences:
        separator = 1 if current else 0
        if current_len + separator + len(sentence) <= max_tokens_per_batch:
            current.append(sentence)
            current_len += separator + len(sentence)
            continue

        # The sentence does not fit. Flush the batch as-is: it only holds
        # complete sentences, so nothing (in particular the closing
        # punctuation) must be trimmed, and an empty batch is never emitted.
        if current:
            batches.append(' '.join(current))
        current = [sentence]
        current_len = len(sentence)

    if current:
        batches.append(' '.join(current))

    return batches

# Demo: split a sample text into batches of at most 20 characters.
# The sample text is an empty placeholder; put the text to split here.
max_tokens_per_batch = 20
text = ""

batches = split_text_into_batches(
    text=text,
    max_tokens_per_batch=max_tokens_per_batch,
)
print("Batches:", batches)

import re
import nltk
import jieba
nltk.download('punkt')
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
def clean_html_tags(text):
    """Remove HTML-style tags (``<...>``) from ``text``.

    Everything from a ``<`` up to the next ``>`` is deleted, including tags
    whose attributes are wrapped over several lines. Only the tag markup is
    removed; the text between tags (e.g. the content of ``<script>``) is
    kept. A stray ``<`` followed later by a ``>`` is treated as a tag too.

    Args:
        text: The raw text, possibly containing markup.

    Returns:
        The text with all tags removed.
    """
    # ``[^>]*`` rather than ``.*?``: ``.`` does not match newlines, so the
    # latter leaves multi-line tags in place.
    return re.sub(r'<[^>]*>', '', text)

# An http(s) URL: the scheme followed by characters that are legal in a URL
# (RFC 3986 unreserved + reserved characters and '%'). Restricting the match
# to this ASCII set makes it stop at CJK text or full-width punctuation that
# directly follows a link.
_URL_PATTERN = re.compile(r"https?://[A-Za-z0-9\-._~:/?#\[\]@!$&'()*+,;=%]+")


def remove_links(text):
    """Remove ``http://`` and ``https://`` links from ``text``.

    Chinese text is usually written without spaces around URLs, so the match
    ends at the first character that cannot be part of a URL instead of at
    the next whitespace. Words that merely start with "http" are left alone.

    Args:
        text: The text to clean.

    Returns:
        The text with every matched URL deleted (surrounding whitespace is
        left untouched).
    """
    return _URL_PATTERN.sub('', text)

def remove_special_characters(text):
    """Remove punctuation from ``text``, ASCII and non-ASCII alike.

    A character is dropped when it is in ``string.punctuation`` or when its
    Unicode general category is a punctuation category (``P*``), which
    covers Chinese punctuation such as ，。！？、《》“”…. Non-ASCII symbols
    (categories ``S*``, e.g. emoji or ``～``) are kept.

    Args:
        text: The text to clean.

    Returns:
        The text without punctuation characters; whitespace is unchanged.
    """
    # Local import so the function does not depend on a file-level import.
    import unicodedata

    # string.punctuation also contains ASCII symbols ($, +, <, =, ...) that
    # are not in a ``P*`` category, so both checks are needed.
    ascii_punctuation = set(string.punctuation)
    return ''.join(
        char
        for char in text
        if char not in ascii_punctuation
        and not unicodedata.category(char).startswith('P')
    )

def remove_extra_whitespace(text):
    """Collapse every run of whitespace in ``text`` into one space.

    Leading and trailing whitespace is removed as well.
    """
    # split() without arguments cuts on any whitespace run and yields no
    # empty fields, so re-joining normalises the spacing.
    words = text.split()
    return ' '.join(words)

def remove_stopwords(text):
    """Remove English stop words from ``text``.

    The text is tokenized with NLTK's ``word_tokenize`` and every token whose
    lower-cased form is in NLTK's English stop word list is dropped. Only
    English stop words are filtered; Chinese words pass through unchanged.

    Args:
        text: The text to filter.

    Returns:
        The remaining tokens joined by single spaces.
    """
    try:
        stop_words = set(stopwords.words('english'))
    except LookupError:
        # NLTK raises LookupError when the corpus is not installed locally.
        # Only 'punkt' is downloaded at import time, so fetch the stop word
        # corpus on first use instead of failing.
        nltk.download('stopwords')
        stop_words = set(stopwords.words('english'))

    word_tokens = word_tokenize(text)
    kept = [word for word in word_tokens if word.lower() not in stop_words]
    return ' '.join(kept)

def clean_chinese_text(text):
    """Clean ``text`` and return it as space-separated jieba tokens.

    The steps run in a fixed order: strip HTML tags, remove links, remove
    special characters, normalise whitespace, drop English stop words, then
    segment with jieba. Links are removed before special characters because
    stripping punctuation first would break the URLs apart.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text, with tokens separated by single spaces.
    """
    cleaned_text = clean_html_tags(text)
    cleaned_text = remove_links(cleaned_text)
    cleaned_text = remove_special_characters(cleaned_text)
    cleaned_text = remove_extra_whitespace(cleaned_text)
    cleaned_text = remove_stopwords(cleaned_text)

    # jieba returns whitespace between words as tokens of their own. Joining
    # those with ' ' would turn every single space into three, so
    # whitespace-only tokens are dropped before joining.
    word_list = [word for word in jieba.lcut(cleaned_text) if word.strip()]

    return ' '.join(word_list)

# Demo: run the cleaning pipeline on a sample text and print the result.
# The sample text is an empty placeholder; put the text to clean here.
input_text = ""

cleaned_text = clean_chinese_text(text=input_text)
print(cleaned_text)