该项目我是在谷歌的Colab平台完成的。首先,要先查看数据
import os

# Switch into the project directory on the mounted Google Drive (Colab).
os.chdir("drive/Colab Notebooks/NLP/Rotten Tomatoes movie review")

import pandas as pd

# The Rotten Tomatoes review data is tab-separated (.tsv).
train = pd.read_csv("train.tsv", sep='\t')
test = pd.read_csv("test.tsv", sep='\t')
print(train.head(5))

# NOTE(review): the original text fused these two imports into the invalid
# line `import refrom nltk.corpus` / `import stopwords`; reconstructed here.
import re
from nltk.corpus import stopwords
# Define the cleaning function.
def review_to_words(raw_review):
    """Clean one raw review string for bag-of-words processing.

    Keeps only ASCII letters, lowercases, splits on whitespace, and removes
    English stopwords.

    Args:
        raw_review: the original review text.

    Returns:
        The cleaned review as a single space-joined string.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', raw_review)  # keep letters only
    words = letters_only.lower().split()                 # lowercase + whitespace split
    # NOTE: the stopword set is rebuilt on every call; loading it lazily here
    # (rather than at module level) is deliberate — nltk.download('stopwords')
    # runs after this definition, so the corpus may not exist at import time.
    stops = set(stopwords.words('english'))
    meaningful_words = [w for w in words if w not in stops]
    return " ".join(meaningful_words)
# --- Data cleaning ---
import nltk
nltk.download('stopwords')  # stopword corpus must exist before review_to_words runs

num_reviews = train["Phrase"].size
clean_train_reviews = []
for i in range(num_reviews):
    # Progress indicator every 10,000 reviews.
    if (i + 1) % 10000 == 0:
        print("Review %d of %d\n" % (i + 1, num_reviews))
    clean_train_reviews.append(review_to_words(train["Phrase"][i]))

# Inspect the first cleaned review.
print(clean_train_reviews[0])
# 'series escapades demonstrating adage good goose also good gander occasionally amuses none amounts much story'
# --- Tokenization and vocabulary construction ---
nltk.download('punkt')
tokenizer = nltk.word_tokenize

# Tokenize each cleaned review ONCE and keep the per-review token lists,
# preserving the shape of the original data (one list per review).
# (The original code tokenized every review twice — once for the flat word
# list and once for word_df — this does the same work in a single pass.)
word_df = [tokenizer(review) for review in clean_train_reviews]

# Flat list of all tokens across all reviews.
word = []
for tokens in word_df:
    word += tokens

# Vocabulary: the unique tokens. len(word_set) = 14992
word_set = list(set(word))
# 下面是关键的一步,将所有的词汇转换为one-hot向量
for i in range(len(word_df)):
for w in word_df[i]:
if w in word_set:
word_df[i