数据预处理:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
from nltk.corpus import stopwords
import time
from tensorflow.python.layers.core import Dense
from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
# Show which TensorFlow version is in use, then fetch the NLTK "stopwords"
# package (the downloader reports when it is already up to date).
_tf_version_banner = 'TensorFlow Version: {}'.format(tf.__version__)
print(_tf_version_banner)
import nltk
nltk.download('stopwords')
打印结果:
TensorFlow Version: 1.8.0
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
# Load the review data and keep only the first 10,000 rows.
reviews = pd.read_csv("Reviews.csv")
reviews = reviews.iloc[0:10000, :]

# Remove null values and unneeded features.
# dropna() runs before the column drop, so a row with a missing value in ANY
# column (including the ones removed just below) is discarded.
reviews = reviews.dropna()
# Use the `columns=` keyword: passing the axis positionally (`drop(labels, 1)`)
# was deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
reviews = reviews.drop(
    columns=[
        'Id',
        'ProductId',
        'UserId',
        'ProfileName',
        'HelpfulnessNumerator',
        'HelpfulnessDenominator',
        'Score',
        'Time',
    ],
)
# Renumber rows 0..n-1 so positional-style lookups such as reviews.Summary[i]
# keep working after rows were removed by dropna().
reviews = reviews.reset_index(drop=True)
# Preview of the first rows; only displayed when run as a notebook cell.
reviews.head()
打印结果:
# Inspecting some of the reviews
# For the first two rows, print a 1-based review number, the summary, the
# full review text, and then a blank separator line.
for i in range(2):
    print("Review #", i + 1)
    print(reviews.Summary[i])
    print(reviews.Text[i])
    print()
打印结果:
Review # 1
Good Quality Dog Food
I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than most.

Review # 2
Not as Advertised
Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".

contractions = {
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",