# Reference code: https://blog.csdn.net/sinat_34611224/article/details/83147812
# The pretrained corpus/model files required by this script can be obtained from the link above.
from allennlp.modules.elmo import Elmo, batch_to_ids
import re
import nltk
import numpy as np
from torch import nn
import tensorflow as tf
import torch.nn.functional as F
import torch
import pandas as pd
# 去除停用词
from nltk.corpus import stopwords
# Module-level hyper-parameters.
num_filters = 1024
num_layers = 4  # number of layers stacked inside the highway network
num_highways = 1024
output_dim = 200  # dimensionality of the final output vector
class Highway(nn.Module):
    """Stack of gated highway layers acting on the last dimension of the input.

    Every layer computes a sigmoid-activated linear transform of its input and
    a sigmoid gate, then blends the transform with the unchanged input
    according to that gate. The output has the same shape as the input.

    Args:
        num_layers (int): Number of layers in the highway encoder.
        hidden_size (int): Size of hidden activations.
    """

    def __init__(self, num_layers, hidden_size):
        super().__init__()
        # All transform layers are created before any gate layer so that the
        # parameter initialisation order stays the same.
        transform_layers = []
        for _ in range(num_layers):
            transform_layers.append(nn.Linear(hidden_size, hidden_size))
        gate_layers = []
        for _ in range(num_layers):
            gate_layers.append(nn.Linear(hidden_size, hidden_size))
        self.transforms = nn.ModuleList(transform_layers)
        self.gates = nn.ModuleList(gate_layers)

    def forward(self, x):
        """Run ``x`` through every highway layer in order and return the result."""
        hidden = x
        for transform_layer, gate_layer in zip(self.transforms, self.gates):
            # candidate, gate_value and hidden all share the shape of the input.
            candidate = torch.sigmoid(transform_layer(hidden))
            gate_value = torch.sigmoid(gate_layer(hidden))
            hidden = gate_value * candidate + (1 - gate_value) * hidden
        return hidden
def word_count(text):
    """Clean a raw review and split it into lower-cased, tokenised sentences.

    Stop words are NOT removed.

    Args:
        text (str): Raw review text.

    Returns:
        list[list[str]]: One inner list per sentence found by
        ``nltk.sent_tokenize``, holding the tokens produced by
        ``nltk.word_tokenize`` for that sentence.
    """
    # Drop URLs before any other cleaning. The optional "s" makes https links
    # match too; the previous pattern only recognised plain http ones.
    text = re.sub(r'https?[:.]+\S+', '', text)
    text = re.sub(r'[0-9]+', ' ', text)  # remove digits
    # Keep only letters and sentence punctuation. Quotes and commas are turned
    # into spaces here, so no separate replace for them is needed afterwards.
    text = re.sub(r'[^A-Za-z.?!]', ' ', text)
    # Turn every run of '.' and/or '!' into a single ". ". Replacing the
    # characters one by one made an ellipsis or "!!!" expand into several
    # stand-alone "." sentences.
    text = re.sub(r'[.!]+', '. ', text)
    text = text.strip()
    # The first sentence split is only used to re-join the sentences with single
    # spaces; the split that is returned is computed on the lower-cased text.
    rejoined = " ".join(nltk.sent_tokenize(text)).lower()
    return [nltk.word_tokenize(sentence)
            for sentence in nltk.sent_tokenize(rejoined)]
# Load the pre-trained ELMo model from its weights and options files.
weight_file = r"D:\BaiduNetdiskDownload\yelp\yelp_rzj\gcn\elmo\elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
options_file = r"D:\BaiduNetdiskDownload\yelp\yelp_rzj\gcn\elmo\elmo_2x4096_512_2048cnn_2xhighway_options.json"
# Asking for 1 output representation yields a single linearly weighted set of
# word vectors; asking for 2 would yield two differently weighted sets.
elmo = Elmo(options_file, weight_file, num_output_representations=1, dropout=0)
# use batch_to_ids to convert sentences to character ids
# Read the spreadsheet (only the column at index 7 is loaded) and keep each
# entry of its 'reviewContent' column as a single-line string.
file_path = pd.read_excel(
    r"D:\BaiduNetdiskDownload\yelp\yelp_rzj\res\data.xls",
    usecols=[7],
)
train_data = [
    str(review).replace('\n', '') for review in file_path['reviewContent']
]
# text="9Easily the most dramatic dish... ?A cloth pillow filled with mace scented air was placed in front of each of us and a dish with duck confit, duck breast and fois gras was placed on top. ?As we dug into this dish the air was slowly released from the pillow mixing with the food smells and leading us to visions of Alinea branded mace spray for linens. UPDATE: Check out the review in Wired: http://www.wired.com/wir…56"
# Encode every review and write one tab-separated row of values per review.
#
# The highway encoder has a fixed input size, so it is built once and shared by
# all reviews instead of being re-created with new random weights each time.
highways = Highway(num_layers, num_highways)
# Opening with mode 'w' truncates any existing file at this path. The context
# manager closes the file even if a review raises part-way through.
with open(r"D:\BaiduNetdiskDownload\yelp\yelp_rzj\gcn\elmo\elmo.xls", 'w', encoding='utf-8') as result:
    for text in train_data:
        sentence_lists = word_count(text)
        if not sentence_lists:
            # Nothing left after cleaning; emit an empty row so that output
            # rows stay aligned with the input reviews.
            result.write("\n")
            continue
        # Convert the tokenised sentences to character ids for ELMo.
        character_ids = batch_to_ids(sentence_lists)
        # Only forward passes are needed, so no autograd graph is recorded.
        with torch.no_grad():
            embeddings = elmo(character_ids)['elmo_representations'][0]
            token_embeds = highways(embeddings)
            # Flatten directly in torch; no TensorFlow/NumPy round trip needed.
            flat_embed = token_embeds.reshape(-1)
            # NOTE(review): the projection input size depends on the number of
            # sentences and tokens in this review, so the layer is rebuilt
            # (with new random weights) for every review. Rows written for
            # different reviews therefore come from different projections -
            # confirm this is intended.
            projection = nn.Linear(flat_embed.numel(), output_dim, bias=True)
            token_embeds = projection(flat_embed)  # reduce to output_dim values
        # Every value is followed by a tab (including the last one), then the
        # row is terminated with a newline.
        result.write("".join(str(value) + '\t' for value in token_embeds.tolist()))
        result.write("\n")
# print(token_embeds)