import warnings
# Suppress gensim UserWarning output
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
from gensim.models import word2vec
import jieba
import tensorflow as tf
import numpy as np
import time
# The random module provides functions for generating random numbers in various ways; randint() returns an integer within a specified range
from random import randint
# shuffle() randomly reorders all elements of a sequence
from random import shuffle
#----------------------------------
# Build the stop-word list by reading the file 停用词.txt
def makeStopWord():
    with open('停用词.txt','r',encoding = 'utf-8') as f:
        lines = f.readlines()
    stopWord = []
    for line in lines:
        words = jieba.lcut(line,cut_all = False)
        for word in words:
            stopWord.append(word)
    return stopWord
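# Usage sketch (illustrative, not part of the original script; assumes
# 停用词.txt sits next to this script, UTF-8 encoded, one stop word per line):
#   stopWord = makeStopWord()
#   print(len(stopWord), stopWord[:10])
# getWords() below reads the global stopWord, so makeStopWord() must be
# called once before any review file is processed.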
def words2Array(lineList):
    linesArray=[]
    wordsArray=[]
    steps = []
    for line in lineList:
        t = 0
        p = 0
        for i in range(MAX_SIZE):  # maximum words kept per review (25: truncate longer reviews, pad shorter ones)
            if i<len(line):
                try:  # append the word vector of every word in the line
                    wordsArray.append(model.wv.word_vec(line[i]))
                    p = p + 1
                except KeyError:  # word not in the word2vec vocabulary
                    t=t+1
                    continue
            else:  # pad reviews shorter than 25 words with 200-dimensional zero vectors
                wordsArray.append(np.array([0.0]*dimsh))
        for i in range(t):  # also pad for the out-of-vocabulary words skipped above
            wordsArray.append(np.array([0.0]*dimsh))
        steps.append(p)  # number of valid words in the review (padding excluded)
        linesArray.append(wordsArray)  # every review becomes a 25x200 matrix
        wordsArray = []
    linesArray = np.array(linesArray)  # 3-D array: (reviews, 25, 200)
    steps = np.array(steps)  # valid-word count per review; length equals the number of reviews
    return linesArray, steps
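# Illustrative shape check (not in the original script; assumes MAX_SIZE = 25,
# dimsh = 200 and the trained word2vec `model` are defined elsewhere in the
# article, and that 'pos.txt' is a hypothetical review file):
#   posLines = getWords('pos.txt')
#   posArray, posStep = words2Array(posLines)
#   posArray.shape  -> (len(posLines), 25, 200)
#   posStep.shape   -> (len(posLines),)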
def convert2Data(posArray, negArray, posStep, negStep):
    randIt = []
    data = []
    steps = []
    labels = []
    for i in range(len(posArray)):
        # positive review: 25x200 matrix, valid-word count, label
        # (with 3 classes the label would be [1,0,0] instead)
        randIt.append([posArray[i], posStep[i], [1,0]])
    for i in range(len(negArray)):  # negative review: 25x200 matrix, valid-word count, label
        randIt.append([negArray[i], negStep[i], [0,1]])
    shuffle(randIt)  # shuffle the samples
    for i in range(len(randIt)):
        data.append(randIt[i][0])    # the 25x200 matrix of every review goes into data
        steps.append(randIt[i][1])   # the valid-word count of every review goes into steps
        labels.append(randIt[i][2])  # the label of every review goes into labels
    data = np.array(data)
    steps = np.array(steps)
    return data, steps, labels
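# Sketch of the returned training data (illustrative; posArray/negArray and
# posStep/negStep come from words2Array as above):
#   data, steps, labels = convert2Data(posArray, negArray, posStep, negStep)
#   data.shape   -> (num_pos + num_neg, 25, 200)
#   steps.shape  -> (num_pos + num_neg,)
#   labels[0]    -> [1, 0] for a positive review, [0, 1] for a negative one
# Note that data and steps are converted to numpy arrays while labels stays a
# plain Python list.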
def getWords(file):
    wordList = []
    trans = []
    lineList = []
    with open(file,'r',encoding='utf-8') as f:
        lines = f.readlines()
    for line in lines:
        # strip the trailing newline, then segment the sentence with jieba
        trans = jieba.lcut(line.replace('\n',''), cut_all = False)
        for word in trans:
            if word not in stopWord:  # drop stop words
                wordList.append(word)
        lineList.append(wordList)
        wordList = []
    return lineList
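# Usage sketch (illustrative; the path is hypothetical and the global stopWord
# must already hold the list returned by makeStopWord()):
#   lineList = getWords('pos.txt')
#   lineList[0]  -> one review as a list of segmented, stop-word-filtered words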
def makeData(posPath,negPath):
    # get the word lists; the return type is [[word1,word2...],[word1,word2...],...]
    pos = getWords(posPath)
    print("The positive data's length is :", len(pos))