import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
# Read the Excel file containing the news articles.
file_path = 'Laos news overall.xlsx'
sheet_name = 'Sheet1'  # change to the actual worksheet name
data = pd.read_excel(file_path, sheet_name=sheet_name)

# Replace NaN in the content column with an empty string.
# NOTE: `data['内容'].fillna('', inplace=True)` is chained assignment — under
# pandas copy-on-write it mutates a temporary and the DataFrame is left
# unchanged.  Assign the result back explicitly instead.
data['内容'] = data['内容'].fillna('')

# Download the NLTK resources used below: 'stopwords' for the stop-word list
# and 'punkt' for nltk.word_tokenize (without 'punkt', tokenization raises
# LookupError at runtime).
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
# Clean and tokenize text with NLTK.
def preprocess_text(text):
    """Strip punctuation, lower-case, tokenize, and drop English stop words.

    Parameters
    ----------
    text : str
        Raw article content.  Non-string cells (numbers, dates read from
        Excel) are coerced with ``str()`` instead of raising TypeError.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if not isinstance(text, str):
        # Excel columns may contain numeric/date cells; coerce defensively.
        text = str(text)
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = text.lower()                  # normalize case
    tokens = nltk.word_tokenize(text)    # requires the 'punkt' resource
    # Drop English stop words (module-level set built from NLTK's list).
    return " ".join(word for word in tokens if word not in stop_words)
# Run every article through the cleaning pipeline, storing the result in a
# new column, then persist the whole table as CSV.
data['cleaned_content'] = data['内容'].map(preprocess_text)
data.to_csv('news_cleaned_tokens.csv', index=False)