版权声明:本文为博主原创文章,未经博主允许不得转载。
文章目录
一、项目介绍
1.1 数据介绍
- 【数据集】Amazon 500000评论
1.2 项目步骤
- 数据预处理
- 构建Seq2Seq模型
- 训练网络
- 测试效果
二、代码实现
import pandas as pd
import numpy as np
import re
import tensorflow as tf
import time
from nltk.corpus import stopwords
from tensorflow.python.layers.core import Dense
from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
2.1 加载数据集
# Load the reviews CSV into a DataFrame (the path is relative to the working directory).
reviews = pd.read_csv('./data/reviews.csv')
数据预览如下图:
# Count the null values in each column
reviews.isnull().sum()
2.2 数据预处理
2.2.1 特征处理
# Drop every row that contains a null value, then discard the columns that
# are not needed further on.
reviews = reviews.dropna().drop(
    ['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
     'HelpfulnessDenominator', 'Score', 'Time'],
    axis=1,
)
2.2.2 全部转换成小写 / 缩写还原(如 don't → do not)/ 去停用词(只在描述中去掉)
# Maps lower-case English contractions to their expanded forms.
# clean_text() looks up whole whitespace-delimited tokens in this table.
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that would",
    "that's": "that is",
    "there'd": "there had",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where'd": "where did",
    "where's": "where is",
    "who'll": "who will",
    "who's": "who is",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
}
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop words as a set, loading the corpus only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def clean_text(text, remove_stopwords=True):
    """Normalise a piece of review text before building word embeddings.

    The text is lower-cased, contractions listed in ``contractions`` are
    expanded, URLs, HTML remnants and punctuation are stripped, and English
    stop words are optionally removed.

    Args:
        text: The raw string to clean.
        remove_stopwords: When true, drop every token found in NLTK's
            English stop-word list.

    Returns:
        The cleaned text as a single string.
    """
    # Convert words to lower case
    text = text.lower()

    # Replace contractions with their longer forms. Matching is done on whole
    # whitespace-delimited tokens, so a contraction directly followed by
    # punctuation (e.g. "don't,") is left untouched here.
    text = " ".join(contractions.get(word, word) for word in text.split())

    # Remove URLs. The split/join above has already collapsed newlines, so a
    # greedy ".*" would erase everything after the first URL; stop at the
    # next whitespace instead.
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'\<a href', ' ', text)
    # Strip "<br />" before the punctuation pass: that pass replaces "/" with
    # a space, after which this pattern could never match.
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'&', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/"]', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # Optionally, remove stop words
    if remove_stopwords:
        stops = _english_stopwords()
        text = " ".join(w for w in text.split() if w not in stops)

    return text
- We will remove the stopwords from the texts because they are of little use for training our model.
- However, we will keep them in our summaries so that they sound more like natural phrases.(保留Summary列的停用词)
# Clean the summaries (stop words are kept) and the review texts
# (stop words are removed, the default of clean_text).
clean_summaries = [
    clean_text(summary, remove_stopwords=False)
    for summary in reviews['Summary']
]
print('Summaries are complete.')

clean_texts = [clean_text(text) for text in reviews['Text']]
print('Texts are complete.')
2.2.3 统计
# Count the number of occurrences of each word in a set of text
def count_words(count_dict, text):
for sentence in text:
for