import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from transformers import BertTokenizer

# Read the Excel file.
file_path = 'Laos news overall.xlsx'
sheet_name = 'Sheet1'  # change to the actual worksheet name
data = pd.read_excel(file_path, sheet_name=sheet_name)

# Replace NaN in the content column with an empty string so every later
# step receives a str. (Assign back instead of `fillna(..., inplace=True)`
# on the column slice, which pandas warns about as chained assignment.)
data['内容'] = data['内容'].fillna('')

# NLTK resources: the stopword list, plus the 'punkt' tokenizer models that
# nltk.word_tokenize depends on (without this download the script raises
# LookupError on a machine that has never fetched punkt).
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Clean one document with NLTK.

    Strips punctuation, lowercases, tokenizes, removes English stopwords,
    and returns the surviving tokens re-joined into a single space-separated
    string.
    """
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = text.lower()                  # lowercase
    tokens = nltk.word_tokenize(text)    # tokenize
    tokens = [word for word in tokens if word not in stop_words]  # drop stopwords
    return " ".join(tokens)


data['cleaned_content'] = data['内容'].apply(preprocess_text)

# Tokenize with BERT. Each `cleaned_content` cell is already a plain string,
# so it is handed to the tokenizer directly. The original code did
# `" ".join(tokens)` on that string first, which joins every CHARACTER with
# a space ("hello" -> "h e l l o") and corrupts the BERT input.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
data['bert_tokens'] = data['cleaned_content'].apply(
    lambda text: " ".join(bert_tokenizer.tokenize(text))
)

# Save the tokenization results as CSV.
data.to_csv('news_bert_tokens.csv', index=False)
文本清洗:NLTK 与 BERT 分词相互冲突的问题记录
最新推荐文章于 2024-07-22 21:21:48 发布