import re
from collections import Counter

import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Download NLTK stopword, tokenizer, and WordNet data
# (newer NLTK releases may additionally require nltk.download('punkt_tab'))
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Initialize the lemmatizer and the English stopword list
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Custom keywords to filter out.
# Note: tokens are lowercased single words, so entries containing spaces or
# uppercase letters (e.g. "high spped rail", "Jakarta-Bandung High-Speed Railway")
# will never match; hyphenated forms are instead caught by "jakartabandung"
# and "highspeed" after punctuation removal.
custom_stop_words = set([
    "indonesia", "highspeed", "jakartabandung", "year", "railway", "rail",
    "jakarta", "bandung", "hsr", "high spped rail", "high spped railway",
    "lcm", "lao", "also", "said", "mr", "Jakarta-Bandung High-Speed Railway"
])

# Read the Excel file
file_path = "Indonesia news overall.xlsx"
df = pd.read_excel(file_path)

# Merge the title ("标题") and content ("内容") columns
df["merged_text"] = df["标题"].astype(str) + " " + df["内容"].astype(str)

# Join all rows into one text block
all_text = " ".join(df["merged_text"])

# Text-cleaning function
def clean_text(text):
    # Strip punctuation with a regular expression
    text = re.sub(r'[^\w\s]', '', text)
    # Remove modal verbs
    text = re.sub(r'\b(?:would|might|should|will|shall|can|could|may|must)\b', '', text)
    # Lowercase and tokenize
    words = word_tokenize(text.lower())
    # Lemmatize, filtering out stopwords, custom keywords, digits,
    # and words of two characters or fewer
    words = [
        lemmatizer.lemmatize(word)
        for word in words
        if word not in stop_words
        and word not in custom_stop_words
        and not word.isdigit()
        and len(word) > 2
    ]
    return words

# Clean the entire text block
cleaned_all_text = clean_text(all_text)

# Count term frequencies (TF)
word_freq = Counter(cleaned_all_text)

# Sort the frequency dictionary by TF value in descending order
sorted_word_freq = dict(sorted(word_freq.items(), key=lambda item: item[1], reverse=True))

# Build a DataFrame from the sorted frequencies
tf_df = pd.DataFrame(list(sorted_word_freq.items()), columns=["Word", "TF"])

# Save the DataFrame as a CSV file
output_csv_path = "indonesia_tf_result.csv"
tf_df.to_csv(output_csv_path, index=False)

# Report where the results were saved
print(f"TF Results saved to {output_csv_path}")