python机器学习——文本情感分析(英文文本情感分析)

本人机器学习课程的小作业,记录一下,希望可以帮到一些小伙伴。
项目介绍,给一段英文文本(英文影评评论)来预测情感是正向还是负向
模型使用的是基于LSTM单元的循环神经网络(RNN)。
代码包括数据处理,模型训练,对新数据做出预测,并将预测结果(文本为正向情感的概率)保存到possubmission.txt中
软件:anaconda3

代码下载链接:下载链接
word文档(文本情感色彩分类技术报告)下载链接:下载链接

一.数据集介绍

数据集链接: https://pan.baidu.com/s/1oIXkaL_SL9GSN3S56ZwvWQ
提取码: qgtg

训练集labeledTrainData.tsv(24500条带标签的训练数据)
id sentiment review 分别表示:每段文本的唯一ID,情感色彩类别标签,待分析的文本数据。
在这里插入图片描述
测试集testData.tsv(22000条无标签的测试数据)
在这里插入图片描述

二.代码详解

import numpy as np
import tensorflow as tf
# Load the vocabulary a single time (the file used to be read twice in a row)
# and turn it into a plain list of str so that list.index() can be used on it.
wordsList = np.load('C:/NLP/wordsList.npy')  # word list (about 400k entries per the original note)
wordsList = [word.decode('UTF-8') for word in wordsList.tolist()]
# Embedding matrix; it is indexed below with positions taken from wordsList.
wordVectors = np.load('C:/NLP/wordVectors.npy')
# Sanity check: print the embedding row of a known word.
baseballIndex = wordsList.index('baseball')
print(wordVectors[baseballIndex])
import pandas as pd
# Read the labelled training reviews (tab-separated, backslash as escape char).
df = pd.read_csv(
    'C:/NLP/labeledTrainData.tsv',
    sep='\t',
    escapechar='\\',
)

numDimensions = 300  # last dimension of the embedding tensor declared later on
print("down")

maxSeqLength = 250  # number of word indices kept per review
import re
# Matches every run of characters other than ASCII letters, digits and spaces.
strip_special_chars = re.compile("[^A-Za-z0-9 ]+")


def cleanSentences(string):
    """Normalise a review before it is split into words.

    The text is lower-cased, every ``<br />`` tag is replaced by one space,
    and all remaining characters that are not ASCII letters, digits or
    spaces are removed.
    """
    lowered = string.lower()
    without_breaks = lowered.replace("<br />", " ")
    return strip_special_chars.sub("", without_breaks)

# One-off preprocessing, kept commented out for reference: it converts every
# training review into a row of word indices and stores the result on disk.
# NOTE(review): wordsList.index() is a linear scan of the vocabulary list for
# every word; a dict mapping word -> index would be much faster if this code
# is ever re-enabled.
# NOTE(review): 399999 is written for words missing from wordsList; presumably
# that is the last vocabulary row used as the "unknown word" slot - confirm.
# #Build the index matrix, giving a 24500*250 index matrix
# ids=np.zeros((24500,maxSeqLength),dtype='int32')
# #print(ids.shape) #prints (24500,250)

# fileCounter=0
# for pf in range(0,len(df)): 
#     #print(pf)
#     indexCounter=0
#     cleanedLine=cleanSentences(df['review'][pf])
#     split=cleanedLine.split()
#     for word in split:
#         try:
#             #print('111')
#             ids[fileCounter][indexCounter]=wordsList.index(word)
#         except ValueError:
#             ids[fileCounter][indexCounter]=399999
#         indexCounter=indexCounter+1
#         if indexCounter>=maxSeqLength:
#             break        
#     fileCounter=fileCounter+1 
# print('down1')
# np.save('C:/NLP/idsMatrix',ids)
# The code above was run once and then commented out: the index matrix it
# produced is stored in idsMatrix.npy, so it does not have to be rebuilt on
# every run and is simply loaded here.
ids=np.load('C:/NLP/idsMatrix.npy')
print(ids.shape)

# Helper functions
from random import randint


def _sampleIndexWithLabel(sentiment, trainSize):
    """Return a random row index in [0, trainSize) whose label equals *sentiment*.

    Uses rejection sampling on ``df['sentiment']``; it keeps drawing until a
    matching row is found, so it never returns if that class is absent from
    the first *trainSize* rows.
    """
    while True:
        idx = randint(0, trainSize - 1)
        if df['sentiment'][idx] == sentiment:
            return idx


def getTrainBatch(trainSize=19600):
    """Draw a random, class-balanced training batch.

    Rows are sampled (with replacement) from the first *trainSize* rows of the
    module-level ``ids`` matrix, using the labels in ``df['sentiment']``.
    Positive and negative examples alternate: even rows hold a review with
    sentiment 1 (label ``[1, 0]``), odd rows a review with sentiment 0
    (label ``[0, 1]``).

    The number of positive/negative pairs is derived from the module-level
    ``batchSize`` instead of a hard-coded 32, so the batch and its labels stay
    the same length when ``batchSize`` is changed. An odd ``batchSize`` is
    rounded down to the nearest even number.

    Args:
        trainSize: number of leading rows of ``ids``/``df`` that may be
            sampled. Defaults to 19600, the value previously hard-coded.

    Returns:
        A tuple ``(arr, labels)`` where ``arr`` is a NumPy array of shape
        ``(2 * (batchSize // 2), maxSeqLength)`` and ``labels`` is a list of
        one-hot ``[positive, negative]`` lists, one per row of ``arr``.
    """
    pairs = batchSize // 2
    arr = np.zeros([2 * pairs, maxSeqLength])
    labels = []
    for i in range(pairs):
        arr[2 * i] = ids[_sampleIndexWithLabel(1, trainSize)]
        labels.append([1, 0])
        arr[2 * i + 1] = ids[_sampleIndexWithLabel(0, trainSize)]
        labels.append([0, 1])
    return arr, labels
print('down')


# Hyper-parameters of the model and of the training run.
batchSize = 64  # number of reviews per training batch
lstmUnits = 64  # number of units in the LSTM cell
numClasses = 2  # number of output classes
# Number of training iterations.
# NOTE(review): the training loop in this file uses its own literal step
# count and does not read this value.
iterations = 50000
print('down')
import tensorflow as tf

# Build the TensorFlow 1.x graph: embedding lookup -> LSTM -> linear layer,
# plus the softmax output, accuracy, loss and optimizer ops.
tf.reset_default_graph()

# Placeholders: one row of numClasses label values and one row of
# maxSeqLength word indices per example; the batch dimension is left open.
labels = tf.placeholder(tf.float32, [None, numClasses])
input_data = tf.placeholder(tf.int32, [None, maxSeqLength])
# NOTE(review): this Variable is never used - `data` is rebound by the
# embedding lookup on the next statement. It is presumably still registered
# in the graph (and therefore handled by tf.train.Saver); removing it would
# likely shift the auto-generated names of the variables created after it and
# break loading of existing checkpoints - confirm before deleting.
data = tf.Variable(
    tf.zeros([batchSize, maxSeqLength, numDimensions]), dtype=tf.float32)
# Replace every word index by the corresponding row of wordVectors.
data = tf.nn.embedding_lookup(wordVectors, input_data)

lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
#lstmCell = tf.contrib.rnn.LSTMCell(lstm_units)
# NOTE(review): the keep probability is a constant, so dropout is presumably
# also applied when `pre` is evaluated for prediction - confirm whether that
# is intended.
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)
value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)

# Linear output layer mapping lstmUnits features to numClasses logits.
weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
# Swap the first two axes and take the last entry along the new first axis
# (assumes dynamic_rnn returned (batch, time, units), so this is the output
# of the final time step - TODO confirm).
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)

# Softmax over the logits; this is the tensor evaluated at prediction time.
pre=tf.nn.softmax(prediction)
print(pre)

# Accuracy: share of examples whose arg-max logit matches the arg-max label.
correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(labels, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Mean softmax cross-entropy between logits and labels, minimised with Adam
# using its default settings.
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
    logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer().minimize(loss)

print('down')

# Train the network: one optimizer step per random batch, metrics printed
# every 1000 steps and a checkpoint written every 10000 steps.
sess = tf.InteractiveSession()
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())
# NOTE(review): the step count is a literal here; the `iterations` setting
# defined earlier in this file is not consulted.
for i in range(0, 40001):
    nextBatch, nextBatchLabels = getTrainBatch()
    feedDict = {input_data: nextBatch, labels: nextBatchLabels}
    sess.run(optimizer, feedDict)
    if i % 1000 == 0 and i != 0:
        # Fetch both metrics in a single run so that they are computed from
        # the same forward pass instead of two separate evaluations.
        loss_, accuracy_ = sess.run([loss, accuracy], feedDict)
        print("i...{}".format(i), "loss... {}".format(loss_), "accuracy {}".format(accuracy_))
    if i % 10000 == 0 and i != 0:
        save_path = saver.save(sess, "C:/NLP/models1/pretrained_lstm.ckpt", global_step=i)
        print("saved to %s" % save_path)
print('down')
# Open a session and load the weights of a saved checkpoint into it; the
# prediction code that follows evaluates the model through this `sess`.
# NOTE(review): when the whole script runs top to bottom, the session created
# for training is still open at this point and is not closed in the visible
# code before `sess` is rebound.
# NOTE(review): the step-10000 checkpoint is restored although the training
# loop also saves at steps 20000, 30000 and 40000 - confirm this is the
# intended model.
sess=tf.InteractiveSession()
saver=tf.train.Saver()
saver.restore(sess,"C:/NLP/models1/pretrained_lstm.ckpt-10000")
print('down')

# Prediction: score every test review with the restored model and write the
# probability of the first class (the positive one, given the [1, 0] labels
# used for sentiment 1 in training) to a text file.
in3 = np.load('C:/NLP/test_review.npy')


x_test = in3

print(x_test.shape)

arr = np.zeros([1, maxSeqLength])
print(arr.shape)

# Preview the model output for the first two reviews. The network is run once
# per review so that the printed value is exactly the one that is stored.
numpy_data = []
for row in x_test[:2]:
    arr = np.array([row])
    probs = sess.run(pre, {input_data: arr})
    print("result for test:", probs)
    numpy_data.append(probs)
print('down')

# Score every row of the test matrix (previously a hard-coded 22000 rows),
# feeding one review at a time.
result_data = []
for row in x_test:
    arr = np.array([row])
    result_data.append(sess.run(pre, {input_data: arr}))
print('down')

# Each value is written on its own line, preceded by a newline as before, so
# the file still starts with an empty line. The context manager closes the
# file even if a write fails.
with open('C:/NLP/possubmission.txt', 'w') as f:
    for probs in result_data:
        f.write('\n' + str(probs[0][0]))
print('down')

最终possubmission.txt中的内容如下(存储的是为正向情感的概率):
在这里插入图片描述

已标记关键词 清除标记
<p style="font-size:medium;"> <span style="font-size:18px;color:#FF0000;">课程目标</span> </p> <p style="font-size:medium;"> <span style="font-size:18px;">学习完本门课程,您将对自然语言处理技术有更深入的了解, </span><span style="font-size:18px;">掌握基于深度学习情感分析方法;课程基于</span><span style="font-size:18px;">PyTorch</span><span style="font-size:18px;">主流框架实现,其中涉及深度学习主流框架</span><span style="font-size:18px;">LSTM</span><span style="font-size:18px;">模型以及自然语言处理的词向量;</span><span style="font-size:18px;">彻底掌握</span><span style="font-size:18px;">中文情感分析。</span> </p> <p style="font-size:medium;"> <span style="font-size:18px;color:#FF0000;">适用人群</span> </p> <p style="font-size:medium;"> <span style="font-size:18px;">想要从事NLP的在校学生、NLP研发工程师</span> </p> <p style="font-size:medium;"> <span style="font-size:18px;">自然语言处理从业者、深度学习爱好者</span> </p> <p style="font-size:medium;"> <span style="font-size:18px;color:#FF0000;">课程简介</span> </p> <p style="font-size:medium;"> <span style="font-size:18px;">NLP领域的热门应用,常用在舆情分析,文章分类,智能</span><span style="font-size:18px;">客服,情感分析等</span><span style="font-size:18px;">多个场景</span><span style="font-size:18px;">。情感分析作为</span><span style="font-size:18px;">自然语言处理的基础技术之一</span><span style="font-size:18px;">,常被用于电商评论、舆情监控、</span><span style="font-size:18px;color:#FF0000;">微博评论情感分析</span><span style="font-size:18px;">、话题监督等领域,</span><span style="font-size:18px;">因此深入</span><span style="font-size:18px;">掌握情感分析技术</span><span style="font-size:18px;">,是作为自然语言处理从</span><span style="font-size:18px;">业者必备技能</span><span style="font-size:18px;">,本课程以案例驱动出发,结合多个项目实战案例,覆盖多种算法,</span><span style="font-size:18px;">如</span><span style="font-size:18px;">RNN</span><span style="font-size:18px;">,</span><span style="font-size:18px;">LSTM</span><span style="font-size:18px;">等</span> </p> <p style="font-size:medium;"> <span style="font-size:18px;color:#FF0000;">课程要求:</span> </p> <p style="font-size:medium;"> <span style="font-size:18px;">(1)开发环境:python版本:Python3.7; </span><span 
style="font-size:18px;color:#FF0000;">torch 版本:</span><span style="font-size:18px;color:#FF0000;">1.</span><span style="font-size:18px;color:#FF0000;">3</span><span style="font-size:18px;color:#FF0000;">.0+; torch</span><span style="font-size:18px;color:#FF0000;">text</span><span style="font-size:18px;color:#FF0000;">版本</span><span style="font-size:18px;color:#FF0000;">:</span><span style="font-size:18px;color:#FF0000;">0.</span><span style="font-size:18px;color:#FF0000;">3</span><span style="font-size:18px;color:#FF0000;">.0</span><span style="font-size:18px;color:#FF0000;">+</span> </p> <p style="font-size:medium;"> <span style="font-size:18px;">(2)开发工具:Pycharm;</span> </p> <p style="font-size:medium;"> <span style="font-size:18px;">(3)学员基础:需要一定的Python基础,及深度学习基础;</span> </p> <p style="font-size:medium;"> <span style="font-size:18px;">(4)学员收货:</span><span style="font-size:18px;">掌握深度学习情感分类关键</span><span style="font-size:18px;">技术;</span> </p> <p style="font-size:medium;"> <span style="font-size:18px;">(5)学员资料:内含完整程序源码和数据集;</span> </p> <p style="font-size:medium;"> <span style="font-size:18px;">(6)课程亮点:专题技术,完整案例,全程实战操作,徒手撸代码。</span> </p> <p style="font-size:medium;"> <span style="font-size:18px;"><br /></span> </p> <p style="text-align:center;font-size:medium;"> <span style="font-size:18px;"><img src="https://img-bss.csdn.net/202002100142351682.png" alt="" /><br /></span> </p> <p style="text-align:center;font-size:medium;"> <span style="font-size:18px;"><img src="https://img-bss.csdn.net/202002100143361272.png" alt="" /><br /></span> </p> <p style="text-align:center;font-size:medium;"> <span style="font-size:18px;"><img src="https://img-bss.csdn.net/202002100144109896.png" alt="" /><br /></span> </p> <p style="text-align:center;font-size:medium;"> <span style="font-size:18px;"><img src="https://img-bss.csdn.net/202002100144545929.png" alt="" /><br /></span> </p> <p style="text-align:left;font-size:medium;"> <span style="font-size:32px;">案例5-情感分析功能点</span> </p> <p 
style="text-align:center;font-size:medium;"> <img src="https://img-bss.csdn.net/202002131018235991.png" alt="" /></p> <p style="text-align:center;font-size:medium;"> <br /></p> <p style="text-align:center;font-size:medium;"> <span style="font-size:18px;"><br /></span> </p> <p style="text-align:center;font-size:medium;"> <span style="font-size:18px;"><br /></span> </p> <p style="text-align:center;font-size:medium;"> <span style="font-size:18px;"><br /></span> </p>
相关推荐
©️2020 CSDN 皮肤主题: 大白 设计师:CSDN官方博客 返回首页