# 统计词频 (Word-frequency statistics)
# 英文文本词频统计 (Word-frequency counting for English text)
import re
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Read the input text
def get_data(file_path) -> str:
    """Read a text file and return its contents without surrounding whitespace.

    Args:
        file_path: Path of the file to read; passed straight to ``open``.

    Returns:
        The decoded file contents with leading and trailing whitespace
        stripped. An empty or whitespace-only file yields ``""``.
    """
    # errors='ignore' drops bytes that are not valid UTF-8 instead of raising
    # UnicodeDecodeError, so a partially corrupt file can still be read.
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        text = file.read().strip()
    return text
# Expand English contractions
def replace_abbreviations(text):
text = text.lower().replace("it's", "it is").replace("i'm", "i am").replace("he's", "he is").replace("she's", "she is")\
.replace("we're", "we are").replace("they're", "they are").replace("you're", "you are").replace("that's", "that is")\
.replace("this's", "this is").replace("can't", "can not").replace("don't", "do not").replace("doesn't", "does not")\
.replace("we've", "we have").replace("i've", " i have").replace("isn't", "is not").replace("won't", "will not")\
.replace("hasn't", "has not").replace("wasn't", "was not").replace("weren't", "were not").replace("let's", "let us")\
.replace("didn't", "did not").replace("hadn't", "had not").replace("waht's", "what is").replace("couldn't", "could not")\
.replace("you'll", "you will").re