(12-2)TensorFlow无监督学习实战:训练商品评论模型

在本项目“Review-Generator”中,在“data”目录的记事本文件中保存了用户对商品的评论。我们使用TensorFlow中的Keras生成目标产品评论,以创建无监督的深度学习模型。

1)编写文件createData.py处理“data”目录的数据,将文本内容拆分为评论,并清除每一行的标点符号(大小写保持原样),最后对每条评论换行处理。代码如下:

# createData.py -- extract the "reviewText" field from each line of the raw
# review dump (data/data.txt) and emit two cleaned corpora, one review per line:
#   data/cleanData.txt          -- punctuation stripped
#   data/cleanData_withPunc.txt -- punctuation kept
import string

# `with` guarantees all three handles are closed (the original leaked
# reviewData and cleanDataPunc). Mode "w" already truncates the output
# files, so the explicit truncate(0) was redundant and is dropped.
with open("data/data.txt", 'r') as reviewData, \
        open("data/cleanData.txt", 'w') as cleanData, \
        open("data/cleanData_withPunc.txt", 'w') as cleanDataPunc:
    # 1 while we are inside a "reviewText" value. The flag is deliberately
    # initialized once, outside the line loop, so a review that spans
    # several physical lines is still captured (same as the original).
    onReview = 0
    # Iterate each line, keep only the words belonging to the review text.
    for line in reviewData:
        curLine = ""
        for word in line.split():
            if(word == "\"reviewText\":"):
                onReview = 1
            if(word == "\"summary\":" or word == "\"overall\":"):
                onReview = 0
                break
            if(onReview == 1):
                curLine += word + " "
        # Remove literal "\n" escape sequences and collapse double spaces.
        curLine = curLine.replace('\\n', ' ')
        curLine = curLine.replace('  ', ' ')
        # [15:-3] trims the '"reviewText": "' prefix and the trailing '", '
        # from the punctuated copy -- TODO confirm offsets against the raw data.
        cleanDataPunc.write(curLine[15:-3])
        cleanDataPunc.write("\n")

        # Strip all punctuation in one C-level pass.
        # NOTE(review): the surrounding text also promises lower-casing, but
        # the original never called .lower(); behavior kept as-is.
        curLine = curLine.translate(str.maketrans('', '', string.punctuation))
        # [11:] trims the (now punctuation-free) 'reviewText ' prefix.
        cleanData.write(curLine[11:])

        # One review per output line.
        cleanData.write("\n")

2)编写文件revgen.py,训练字符级LSTM模型并加载训练好的权重文件fake_review.hdf5,基于种子文本生成新的商品评论。代码如下:

def create_dataset(window_size, clean_path='data/cleanData.txt',
                   punc_path='data/cleanData_withPunc.txt'):
    """Build one-hot encoded (X, y) character sequences for LSTM training.

    Slides a window of ``window_size`` characters over every review line of
    the cleaned corpus; ``y`` is the same window shifted one character ahead
    (next-character prediction, sequence-to-sequence).

    Args:
        window_size: length of each character window.
        clean_path:  path of the punctuation-free corpus (one review/line).
        punc_path:   path of the punctuation-preserving corpus; the character
                     vocabulary is built from this file.

    Returns:
        (vocab_size, X, y, char_indices, indices_char) where X and y are
        boolean arrays of shape (num_windows, window_size, vocab_size).
    """
    # `with` ensures both corpus files are closed (the original leaked them).
    with open(clean_path) as f:
        text = f.readlines()
    with open(punc_path) as f:
        text2 = f.read()
    print('corpus length:', len(text))

    # Vocabulary comes from the punctuated corpus; the clean corpus is
    # assumed to be a subset of these characters -- TODO confirm, otherwise
    # the indexing below raises KeyError.
    chars = sorted(set(text2))
    print(chars)

    print('total chars:', len(chars))
    char_indices = dict((c, i) for i, c in enumerate(chars))
    indices_char = dict((i, c) for i, c in enumerate(chars))

    step = 1
    sentences = []
    next_chars = []

    for reviews in text:
        for i in range(0, len(reviews) - window_size + 1, step):
            sentences.append(reviews[i: i + window_size])
            # NOTE(review): for the last window this shifted slice is one
            # character short, leaving an all-zero row in y (original
            # behavior, preserved).
            next_chars.append(reviews[i + 1:i + 1 + window_size])

    # np.bool was removed in NumPy 1.24; the builtin bool is the
    # documented replacement and behaves identically here.
    X = np.zeros((len(sentences), window_size, len(chars)), dtype=bool)
    y = np.zeros((len(sentences), window_size, len(chars)), dtype=bool)
    # One-hot encode every character of every window.
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            X[i, t, char_indices[char]] = 1
    print(X.shape)

    for i, sentence in enumerate(next_chars):
        for t, char in enumerate(sentence):
            y[i, t, char_indices[char]] = 1

    print(y.shape)

    return len(chars),  X, y, char_indices, indices_char

def create_model(input_dimension, epoch_num, train_X=None, train_y=None):
    """Build, compile and fit a 2-layer character-level LSTM.

    Args:
        input_dimension: vocabulary size (one-hot width per timestep).
        epoch_num:       number of training epochs.
        train_X/train_y: training tensors. The original body silently read
            the module-level globals ``X``/``y``; these optional parameters
            keep that behavior by default but let callers pass data
            explicitly, which removes the hidden global dependency.

    Returns:
        The fitted Keras model. The lowest-loss weights are also checkpointed
        to fake_review.hdf5, which __main__ reloads after training.
    """
    if train_X is None:
        train_X = X  # legacy fallback to module-level global
    if train_y is None:
        train_y = y  # legacy fallback to module-level global

    print('Create the model')
    model = Sequential()
    # input_shape=(None, dim) accepts variable-length sequences (Keras 2 API).
    model.add(LSTM(512, input_shape=(None, input_dimension), return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(512, return_sequences=True))
    model.add(Dropout(0.2))
    # Per-timestep softmax over the vocabulary.
    model.add(Dense(input_dimension, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    print(model.summary())

    # Save only the best (minimum training loss) weights seen so far.
    filepath = "fake_review.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
    callbacks_list = [checkpoint]

    model.fit(train_X, train_y, epochs=epoch_num, batch_size=128, callbacks=callbacks_list)

    return model

def generate_fake_review(seed_str, input_dimension,model,char_indices,indices_char,sentence_num):
    """Greedily extend ``seed_str`` one character at a time until more than
    ``sentence_num`` sentences (periods) have been produced, or 10000 steps.

    Returns the full generated text, seed included.
    """
    print("seed string -->", seed_str)
    print('The generated text is:')

    generated = seed_str

    for step in range(10000):
        # Stop once enough sentences have been emitted.
        if generated.count('.') > sentence_num:
            break

        # One-hot encode the (growing) seed for the model.
        encoded = np.zeros((1, len(seed_str), input_dimension))
        for pos, ch in enumerate(seed_str):
            encoded[0, pos, char_indices[ch]] = 1.

        probs = model.predict(encoded, verbose=0)[0]

        # Greedy decode: take the most probable character at the last timestep.
        predicted_char = indices_char[np.argmax(probs[len(seed_str) - 1])]
        seed_str += predicted_char
        generated += predicted_char

        # Coarse progress indicator, every 100 characters.
        if step % 100 == 0:
            print(step/100)

    print(generated)
    return generated


if __name__ == '__main__':
    # Build the one-hot dataset, train (checkpointing the best weights to
    # fake_review.hdf5), then reload that best checkpoint before generating.
    input_dimension, X, y, char_indices, indices_char = create_dataset(window_size=40)
    model = create_model(input_dimension, epoch_num= 5)
    # load_model restores the lowest-loss weights saved by ModelCheckpoint,
    # which may be better than the final in-memory state after fit.
    model = load_model('fake_review.hdf5')
    fake_text = generate_fake_review('i love the sushi',input_dimension, model, char_indices, indices_char, 10)

    # `with` guarantees the handle is closed even if the write raises
    # (the original could leak it on error).
    with open("Output.txt", "w") as text_file:
        text_file.write("GenerateText : \n  %s" % fake_text)

  • 3
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

码农三叔

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值