# Import the required libraries:
import re
import nltk
import xlrd
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import json
from tqdm import tqdm
# The text comes from an .xlsx file; read it out first:
# Open the spreadsheet and pull the tweet texts: sheet 0, column index 4
# (the fifth column), skipping the header row.
# NOTE(review): xlrd >= 2.0 removed .xlsx support — this requires xlrd < 2.0
# (or a different reader); confirm the installed version.
workbook = xlrd.open_workbook("tweets_2.xlsx")
worksheet = workbook.sheet_by_index(0)
# tweets: list of cell values (strings) from column 4, header dropped.
tweets = worksheet.col_values(4)[1:]
# Build the stop-word set:
# English stop words as a set, for O(1) membership tests in the loop below.
stop = {word for word in stopwords.words('english')}
# Tokenize the text and normalize the words:
# Clean, tokenize, de-stopword and stem every tweet; all resulting tokens are
# accumulated into the flat list `filter_tweets` for frequency counting below.
filter_tweets = []
fdists = []  # NOTE(review): never populated anywhere visible — kept only for compatibility
ps = PorterStemmer()
for tweet in tqdm(tweets):
    # Drop everything except letters, digits and whitespace.
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', tweet)
    # Lowercase BEFORE the stopword test: stopwords.words('english') is
    # all-lowercase, so capitalized stopwords ("The", "And") would otherwise
    # survive. Bare split() (instead of split(' ')) also discards the
    # empty-string tokens that runs of whitespace used to produce, which
    # previously leaked into the frequency counts.
    tokens = [word for word in cleaned.lower().split() if word not in stop]
    # Reduce each remaining word to its Porter stem.
    stemmed = [ps.stem(word) for word in tokens]
    filter_tweets += stemmed
# Count word frequencies and sort from most to least frequent:
# Tally every token, then list (word, count) pairs most-frequent first.
# FreqDist subclasses collections.Counter, so most_common() with no argument
# yields exactly the items sorted by count in descending order (stable ties).
fdist = FreqDist(filter_tweets)
sorted_fdist = fdist.most_common()