import re
from collections import Counter

import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Download NLTK stopword, tokenizer, and WordNet data
# (newer NLTK releases may additionally require nltk.download('punkt_tab'))
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Initialize the lemmatizer and the English stopword list
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Custom keywords to filter out.
# Note: tokens are lowercased single words, so entries containing spaces or
# uppercase letters (e.g. "high spped rail", "Jakarta-Bandung High-Speed Railway")
# will never match; hyphenated forms are instead caught by "jakartabandung"
# and "highspeed" after punctuation removal.
custom_stop_words = set([
    "indonesia", "highspeed", "jakartabandung", "year", "railway", "rail",
    "jakarta", "bandung", "hsr", "high spped rail", "high spped railway",
    "lcm", "lao", "also", "said", "mr", "Jakarta-Bandung High-Speed Railway"
])

# Read the Excel file
file_path = "Indonesia news overall.xlsx"
df = pd.read_excel(file_path)

# Merge the title ("标题") and content ("内容") columns
df["merged_text"] = df["标题"].astype(str) + " " + df["内容"].astype(str)

# Join all rows into one text block
all_text = " ".join(df["merged_text"])

# Text-cleaning function
def clean_text(text):
    # Strip punctuation with a regular expression
    text = re.sub(r'[^\w\s]', '', text)
    # Remove modal verbs
    text = re.sub(r'\b(?:would|might|should|will|shall|can|could|may|must)\b', '', text)
    # Lowercase and tokenize
    words = word_tokenize(text.lower())
    # Lemmatize, filtering out stopwords, custom keywords, digits,
    # and words of two characters or fewer
    words = [
        lemmatizer.lemmatize(word)
        for word in words
        if word not in stop_words
        and word not in custom_stop_words
        and not word.isdigit()
        and len(word) > 2
    ]
    return words

# Clean the entire text block
cleaned_all_text = clean_text(all_text)

# Count term frequencies (TF)
word_freq = Counter(cleaned_all_text)

# Sort the frequency dictionary by TF value in descending order
sorted_word_freq = dict(sorted(word_freq.items(), key=lambda item: item[1], reverse=True))

# Build a DataFrame from the sorted frequencies
tf_df = pd.DataFrame(list(sorted_word_freq.items()), columns=["Word", "TF"])

# Save the DataFrame as a CSV file
output_csv_path = "indonesia_tf_result.csv"
tf_df.to_csv(output_csv_path, index=False)

# Report where the results were saved
print(f"TF Results saved to {output_csv_path}")