在本项目“Review-Generator”中,在“data”目录的记事本文件中保存了用户对商品的评论。我们使用TensorFlow中的Keras生成目标产品评论,以创建无监督的深度学习模型。
(1)编写文件createData.py处理“data”目录中的数据,将文本内容拆分为评论,并清除每一行的标点符号和大写字母,最后对每条评论换行处理。代码如下:
import string  # needed by str.maketrans below; the original omitted this import

# Read the raw review dump and emit two cleaned corpora:
#   data/cleanData.txt          - reviews with punctuation stripped
#   data/cleanData_withPunc.txt - reviews with punctuation kept
# `with` guarantees all three handles are closed (the original leaked
# reviewData and cleanDataPunc); mode 'w' already truncates, so the
# explicit truncate(0) was redundant and is dropped.
with open("data/data.txt", 'r') as reviewData, \
     open("data/cleanData.txt", 'w') as cleanData, \
     open("data/cleanData_withPunc.txt", 'w') as cleanDataPunc:
    onReview = 0  # 1 while we are inside a "reviewText" JSON field
    # Iterate over every line, split the text into reviews, and clean
    # punctuation from each line.
    for line in reviewData:
        curLine = ""
        for word in line.split():
            if word == "\"reviewText\":":
                onReview = 1
            if word == "\"summary\":" or word == "\"overall\":":
                onReview = 0
                break
            if onReview == 1:
                curLine += word + " "
        # Remove escaped newline sequences embedded in the review text
        curLine = curLine.replace('\\n', ' ')
        # Collapse double spaces (the original replaced ' ' with ' ', a no-op)
        curLine = curLine.replace('  ', ' ')
        # Slice bounds assume the literal prefix '"reviewText": "' and a
        # trailing '", ' — NOTE(review): verify against the actual data format.
        cleanDataPunc.write(curLine[15:-3])
        cleanDataPunc.write("\n")
        # Strip punctuation in one C-level pass
        # TODO(review): the spec also asks for lower-casing, but the original
        # never calls .lower(); left unchanged to keep output identical.
        curLine = curLine.translate(str.maketrans('', '', string.punctuation))
        # Trim the field-name prefix from the start of the line
        cleanData.write(curLine[11:])
        # One review per output line
        cleanData.write("\n")
(2)编写文件revgen.py,加载训练好的模型fake_review.hdf5,根据种子文本生成目标产品的评论。代码如下:
def create_dataset(window_size):
    """Build one-hot (X, y) character sequences from the cleaned review files.

    Slides a width-`window_size` window over every review in
    data/cleanData.txt; the target `y` is the same window shifted one
    character ahead (next-char prediction).

    Parameters
    ----------
    window_size : int
        Number of characters per training window.

    Returns
    -------
    tuple
        (vocab_size, X, y, char_indices, indices_char) where X and y have
        shape (num_windows, window_size, vocab_size) and the dicts map
        char <-> integer index.
    """
    # `with` ensures both handles are closed (the originals leaked).
    with open('data/cleanData.txt') as f:
        text = f.readlines()
    with open('data/cleanData_withPunc.txt') as f:
        text2 = f.read()
    print('corpus length:', len(text))
    # Vocabulary is taken from the punctuated corpus so characters like '.'
    # remain available to the generator.
    chars = sorted(list(set(text2)))
    print(chars)
    print('total chars:', len(chars))
    char_indices = dict((c, i) for i, c in enumerate(chars))
    indices_char = dict((i, c) for i, c in enumerate(chars))
    step = 1
    sentences = []
    next_chars = []
    for reviews in text:
        # One window per offset; target window is shifted one char right.
        for i in range(0, len(reviews) - window_size + 1, step):
            sentences.append(reviews[i: i + window_size])
            next_chars.append(reviews[i + 1:i + 1 + window_size])
    # np.bool was deprecated in NumPy 1.20 and removed in 1.24 — use the
    # builtin bool dtype instead (same memory layout, one byte per element).
    X = np.zeros((len(sentences), window_size, len(chars)), dtype=bool)
    y = np.zeros((len(sentences), window_size, len(chars)), dtype=bool)
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            X[i, t, char_indices[char]] = 1
    print(X.shape)
    for i, sentence in enumerate(next_chars):
        for t, char in enumerate(sentence):
            y[i, t, char_indices[char]] = 1
    print(y.shape)
    return len(chars), X, y, char_indices, indices_char
def create_model(input_dimension, epoch_num, X=None, y=None):
    """Build, train, and checkpoint the 2-layer LSTM character model.

    Parameters
    ----------
    input_dimension : int
        Vocabulary size (one-hot width of each timestep).
    epoch_num : int
        Number of training epochs.
    X, y : np.ndarray, optional
        Training inputs/targets of shape (samples, timesteps, vocab).
        Default to the module-level X and y for backward compatibility —
        the original body read those globals implicitly inside fit().

    Returns
    -------
    The trained Keras model; the lowest-loss weights are also saved to
    fake_review.hdf5 by the checkpoint callback.
    """
    # Preserve the original implicit-global behavior when not passed in.
    if X is None:
        X = globals()['X']
    if y is None:
        y = globals()['y']
    print('Create the model')
    model = Sequential()
    # timesteps=None allows variable-length sequences at prediction time.
    model.add(LSTM(512, input_shape=(None, input_dimension), return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(512, return_sequences=True))
    model.add(Dropout(0.2))
    # Per-timestep softmax over the vocabulary.
    model.add(Dense(input_dimension, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    print(model.summary())
    # Save only the best (lowest-loss) weights so load_model() later picks
    # up the best epoch rather than the last one.
    filepath = "fake_review.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
    callbacks_list = [checkpoint]
    model.fit(X, y, epochs=epoch_num, batch_size=128, callbacks=callbacks_list)
    return model
def generate_fake_review(seed_str, input_dimension, model, char_indices, indices_char, sentence_num):
    """Greedily extend *seed_str* one character at a time.

    At each step the full context is one-hot encoded, fed to *model*, and
    the argmax character of the final timestep is appended. Generation
    stops once the text contains more than *sentence_num* periods, or
    after 10000 steps as a hard cap. Returns the generated text
    (seed included).
    """
    print("seed string -->", seed_str)
    print('The generated text is:')
    generated = seed_str
    for step in range(10000):
        # Enough sentences produced? ('.' acts as the sentence delimiter.)
        if generated.count('.') > sentence_num:
            break
        # One-hot encode the entire current context.
        encoded = np.zeros((1, len(seed_str), input_dimension))
        for pos, ch in enumerate(seed_str):
            encoded[0, pos, char_indices[ch]] = 1.
        probs = model.predict(encoded, verbose=0)[0]
        # Greedy decode: most likely character at the last timestep.
        best = indices_char[np.argmax(probs[len(seed_str) - 1])]
        seed_str = seed_str + best
        generated = generated + best
        # Coarse progress indicator every 100 characters.
        if step % 100 == 0:
            print(step / 100)
    print(generated)
    return generated
if __name__ == '__main__':
    # Build the dataset, train (checkpointing the lowest-loss weights to
    # fake_review.hdf5), then reload the best checkpoint before generating.
    input_dimension, X, y, char_indices, indices_char = create_dataset(window_size=40)
    model = create_model(input_dimension, epoch_num=5)
    model = load_model('fake_review.hdf5')
    fake_text = generate_fake_review('i love the sushi', input_dimension, model, char_indices, indices_char, 10)
    # Context manager guarantees Output.txt is closed even if write() raises
    # (the original leaked the handle on any exception before close()).
    with open("Output.txt", "w") as text_file:
        text_file.write("GenerateText : \n %s" % fake_text)