Python Natural Language Processing in Practice (8): Sentiment Analysis Techniques

Hands-On: Sentiment Analysis of Movie Reviews

Sentiment analysis identifies the emotional state expressed by a piece of text, where the text can be a sentence, a paragraph, or a whole document. It mainly involves two problems: text representation and text classification. Before deep learning, the mainstream representations were BOW (bag of words) and topic models, and the main classifiers were SVM and LR (logistic regression).
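For reference, such a pre-deep-learning baseline takes only a few lines with scikit-learn. The sketch below is illustrative only and assumes scikit-learn is installed; train_texts, train_labels, test_texts, and test_labels are hypothetical variables holding raw review strings and 0/1 labels, and are not defined anywhere in this article.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Bag-of-words features fed to a logistic regression classifier (hypothetical data variables)
bow_lr = make_pipeline(CountVectorizer(max_features=20000), LogisticRegression(max_iter=1000))
bow_lr.fit(train_texts, train_labels)
print(bow_lr.score(test_texts, test_labels))  # accuracy on the held-out reviews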

Loading the data: the IMDB sentiment analysis dataset. The training and test sets each contain 25,000 labeled movie reviews, rated on a 10-point scale; reviews with a score of 4 or less are negative.

# -*- coding: utf-8 -*-
import numpy as np

# Load the pretrained word-vector model: 400,000 words, each represented by a 50-dimensional vector
words_list = np.load('wordsList.npy')
print('Word list loaded')
words_list = words_list.tolist()  # convert to a Python list
words_list = [word.decode('UTF-8') for word in words_list]
word_vectors = np.load('wordVectors.npy')
print('Word vectors loaded')

print(len(words_list))
print(word_vectors.shape)

Home_index = words_list.index("home")
print(word_vectors[Home_index])
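To see how the word list and the vector matrix work together, here is a small illustrative sketch (not part of the original script) that maps a made-up sentence to vocabulary indices and then to its vectors; it assumes every word appears in the 400,000-word vocabulary.

sentence = "the movie was great".split()
indices = [words_list.index(w) for w in sentence]  # one vocabulary index per word
vectors = word_vectors[indices]                    # shape (4, 50): one 50-d vector per word
print(vectors.shape)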

# Load the movie review data
import os
from os.path import isfile, join

pos_files = ['pos/' + f for f in os.listdir('pos/') if isfile(join('pos/', f))]
neg_files = ['neg/' + f for f in os.listdir('neg/') if isfile(join('neg/', f))]
num_words = []
for pf in pos_files:
    with open(pf, "r", encoding='utf-8') as f:
        line = f.readline()
        counter = len(line.split())
        num_words.append(counter)
print('Positive reviews done')

for pf in neg_files:
    with open(pf, "r", encoding='utf-8') as f:
        line = f.readline()
        counter = len(line.split())
        num_words.append(counter)
print('Negative reviews done')

num_files = len(num_words)
print('Total number of files:', num_files)
print('Total number of words:', sum(num_words))
print('Average words per file:', sum(num_words) / len(num_words))

'''
# Visualize the distribution of review lengths
import matplotlib
import matplotlib.pyplot as plt
matplotlib.use('qt4agg')
# Specify a default font
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
# %matplotlib inline
plt.hist(num_words, 50, facecolor='g')
plt.xlabel('Text length')
plt.ylabel('Frequency')
plt.axis([0, 1200, 0, 8000])
plt.show()
'''

# Most reviews are under 230 words, so cap the sequence length
max_seg_len = 300

# Turn each review into a row of word indices, giving a 25000 x 300 index matrix

import re
strip_special_chars = re.compile("[^A-Za-z0-9 ]+")

def cleanSentence(string):
    # Lowercase, replace HTML line breaks, and strip all non-alphanumeric characters
    string = string.lower().replace("<br />", " ")
    return re.sub(strip_special_chars, "", string.lower())
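As a quick illustration (a made-up snippet, not from the dataset), and assuming the HTML line-break replacement above, cleanSentence behaves like this:

# cleanSentence("This movie was GREAT!<br />A must-see.")
# -> 'this movie was great a mustsee'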

print('Saving the index matrix...')
max_seg_num = 300
ids = np.zeros((num_files, max_seg_num), dtype="int32")
file_count = 0

'''
for pf in pos_files:
    with open(pf, "r", encoding="utf-8") as f:
        indexCounter = 0
        line = f.readline()
        cleanedLine = cleanSentence(line)
        split = cleanedLine.split()
        for word in split:
            try:
                ids[file_count][indexCounter] = words_list.index(word)
            except ValueError:
                ids[file_count][indexCounter] = 399999  # unknown word
            indexCounter = indexCounter + 1
            if indexCounter >= max_seg_num:
                break
        file_count = file_count + 1
        print(file_count)

print('Positive reviews indexed')

for nf in neg_files:
    with open(nf, "r", encoding="utf-8") as f:
        indexCounter = 0
        line = f.readline()
        cleanedLine = cleanSentence(line)
        split = cleanedLine.split()
        for word in split:
            try:
                ids[file_count][indexCounter] = words_list.index(word)
            except ValueError:
                ids[file_count][indexCounter] = 399999  # unknown word
            indexCounter = indexCounter + 1
            if indexCounter >= max_seg_num:
                break
        file_count = file_count + 1

# Save the index matrix to disk (the model below loads it back as idsMatrix.npy)
np.save('idsMatrix', ids)
print('Index matrix saved')
'''

# Model settings
batch_size = 24
lstm_units = 64
num_labels = 2
iterations = 200000
max_seg_num = 250  # must match the number of columns in the ids matrix loaded below
ids = np.load('idsMatrix.npy')

# Return one batch of training data: even slots draw positive reviews, odd slots draw negative ones
from random import randint

def get_train_batch():
    labels = []
    arr = np.zeros([batch_size, max_seg_num])
    for i in range(batch_size):
        if (i % 2 == 0):
            num = randint(1, 11499)
            labels.append([1, 0])
        else:
            num = randint(13499, 24999)
            labels.append([0, 1])
        arr[i] = ids[num-1: num]
    return arr, labels

def get_test_batch():
    labels = []
    arr = np.zeros([batch_size, max_seg_num])
    for i in range(batch_size):
        num = randint(11499, 13499)
        if (num <= 12499):
            labels.append([1, 0])
        else:
            labels.append([0, 1])
        arr[i] = ids[num-1: num]
    return arr, labels
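As a quick sanity check (an illustrative snippet, not part of the original script), a batch can be drawn and inspected like this:

batch_arr, batch_labels = get_train_batch()
print(batch_arr.shape)   # (24, 250): one row of word indices per review
print(batch_labels[:2])  # [[1, 0], [0, 1]]: alternating positive/negative one-hot labels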

num_dimensions = 300  # dimensions for each word vector

import tensorflow as tf
tf.reset_default_graph()

labels = tf.placeholder(tf.float32, [batch_size, num_labels])
input_data = tf.placeholder(tf.int32, [batch_size, max_seg_num])
data = tf.Variable(tf.zeros([batch_size, max_seg_num, num_dimensions]), dtype=tf.float32)
data = tf.nn.embedding_lookup(word_vectors, input_data)

# Configure the LSTM network
lstmCell = tf.contrib.rnn.BasicLSTMCell(lstm_units)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)  # dropout to reduce overfitting
value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)

# dynamic_rnn returns the hidden state at every time step; take the last time step's output,
# then multiply it by a weight matrix and add a bias to obtain the final label logits
weight = tf.Variable(tf.truncated_normal([lstm_units, num_labels]))
bias = tf.Variable(tf.constant(0.1, shape=[num_labels]))
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)

# Prediction correctness and accuracy metric
correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(labels, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Use the standard softmax cross-entropy as the loss and Adam as the optimizer
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer().minimize(loss)

sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False))
sess.run(tf.global_variables_initializer())
# For a meaningful accuracy, restore a trained checkpoint instead of evaluating freshly initialized weights:
#saver = tf.train.Saver()
#saver.restore(sess, tf.train.latest_checkpoint('models'))

iterations = 10  # evaluate on 10 test batches
for i in range(iterations):
    next_batch, next_batch_labels = get_test_batch()
    print("Accuracy:", (sess.run(
        accuracy, {input_data: next_batch, labels: next_batch_labels})) * 100)

'''
# Train the model and visualize loss and accuracy with TensorBoard
# (restore iterations to 200000 before training; the evaluation block above resets it to 10)
import datetime

sess = tf.InteractiveSession()
#tf.device("/cpu:0")
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

tf.summary.scalar('Loss', loss)
tf.summary.scalar('Accuracy', accuracy)
merged = tf.summary.merge_all()
logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
writer = tf.summary.FileWriter(logdir, sess.graph)

for i in range(iterations):
    # Fetch the next training batch
    nextBatch, nextBatchLabels = get_train_batch()
    sess.run(optimizer, {input_data: nextBatch, labels: nextBatchLabels})
    # Write a summary to TensorBoard every 50 steps
    if (i % 50 == 0):
        summary = sess.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
        writer.add_summary(summary, i)
    # Save a checkpoint every 10,000 steps
    if (i % 10000 == 0 and i != 0):
        save_path = saver.save(sess, "models/pretrained_lstm.ckpt", global_step=i)
        print("saved to %s" % save_path)
writer.close()
'''
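Once summaries are being written, the dashboard can be opened by running `tensorboard --logdir=tensorboard` (matching the logdir prefix used above) and visiting the address it prints in a browser.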
