## 对word进行去掉punctuation、叠字 &等处理defpreprocess_word(word):# Remove punctuation
word = word.strip('\'"?!,.():;')# Convert more than 2 letter repetitions to 2 letter# funnnnny --> funny
word = re.sub(r'(.)\1+', r'\1\1', word)# Remove - & '
word = re.sub(r'(-|\')','', word)return word
##判断是否为无效worddefis_valid_word(word):# Check if word begins with an alphabetreturn(re.search(r'^[a-zA-Z][a-z0-9A-Z\._]*$', word)isnotNone)
##对tweet进行 小写、去除url @ 、# 等处理defclean_text(tweet):
processed_tweet =[]# Convert to lower case
tweet = tweet.lower()# Replaces URLs with the word URL
tweet = re.sub(r'((www\.[\S]+)|(https?://[\S]+))',' ', tweet)# Replace @handle with the word USER_MENTION
tweet = re.sub(r'@[\S]+','USER_MENTION', tweet)# Replaces #hashtag with hashtag
tweet = re.sub(r'#(\S+)',' ', tweet)# Remove RT (retweet)
tweet = re.sub(r'\brt\b',' ', tweet)# Replace 2+ dots with space
tweet = re.sub(r'\.{2,}',' ', tweet)# Strip space, " and ' from tweet
tweet = tweet.strip(' "\'')# Replace emojis with either EMO_POS or EMO_NEG
tweet = handle_emojis(tweet)# Replace multiple spaces with a single space
tweet = re.sub(r'\s+',' ', tweet)
words = tweet.lower().split()
words =[w for w in words if w notin stopwords]for word in words:
word = preprocess_word(word)if is_valid_word(word):
processed_tweet.append(word)return processed_tweet
data['clean_review']=data.Tweet.apply(clean_text)
data.to_csv(handel_file, index=False)
from nltk.tag import pos_tag
sentences=data.clean_review.tolist()# print(sentences)
words=[]for sent in sentences:for word in sent:
words.append(word)# print(words)
word_tag=pos_tag(words)print(word_tag)