Extract word vectors with ELMo, then reduce their dimensionality with a highway network followed by a linear layer.

Code reference: https://blog.csdn.net/sinat_34611224/article/details/83147812
The required pretrained model files can be obtained from the link above.
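
The pipeline in miniature, sketched on a dummy tensor (the 1024 matches the representation size of the ELMo model used below; the other shapes are invented for illustration, and the highway block defined in the script leaves the shape unchanged before the projection):

import torch
from torch import nn

dummy = torch.randn(3, 12, 1024)  # one review: (sentences, tokens, elmo_dim)
flat = dummy.reshape(-1)          # flatten: length 3 * 12 * 1024
proj = nn.Linear(flat.shape[0], 200)
review_vec = proj(flat)           # a single 200-dim vector for the review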

from allennlp.modules.elmo import Elmo, batch_to_ids
import re
import nltk
from torch import nn
import torch
import pandas as pd
# For stop-word removal (currently disabled in word_count below)
from nltk.corpus import stopwords

num_layers = 4       # number of layers in the highway network
num_highways = 1024  # highway hidden size; equals the ELMo representation size
output_dim = 200     # final output dimension after projection


class Highway(nn.Module):
    """
    Args:
        num_layers (int): Number of layers in the highway encoder.
        hidden_size (int): Size of hidden activations.
    """
    def __init__(self, num_layers, hidden_size):
        super(Highway, self).__init__()
        self.transforms = nn.ModuleList([nn.Linear(hidden_size, hidden_size)
                                         for _ in range(num_layers)])
        self.gates = nn.ModuleList([nn.Linear(hidden_size, hidden_size)
                                    for _ in range(num_layers)])

    def forward(self, x):
        for transform, gate in zip(self.transforms, self.gates):
            # Shapes of h, z, and x are all (batch_size, seq_len, hidden_size)
            h = torch.sigmoid(transform(x))
            z = torch.sigmoid(gate(x))
            x = z * h + (1 - z) * x

        return x
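
# Quick sanity check (toy shapes, illustration only): each highway layer
# returns a gated mix of transform(x) and x itself, so the output shape
# always matches the input shape.
_hw = Highway(num_layers=2, hidden_size=8)
_x = torch.randn(3, 5, 8)  # (batch_size, seq_len, hidden_size)
assert _hw(_x).shape == _x.shape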

def word_count(text):  # NOTE: stop-word removal is currently disabled
    text = re.sub(r'http[:.]+\S+', '', text)   # strip URLs
    text = re.sub(r'[0-9]+', ' ', text)        # strip digits
    text = re.sub(r'[^A-Za-z.?!]', ' ', text)  # keep only letters and . ? !
    # Collapse ellipses into a single period so sentence splitting still works
    text = re.sub(r'\.{2,}', '. ', text)
    text = re.sub(r'!+', '. ', text)           # normalize exclamation marks
    text = text.strip()
    newSentence = nltk.sent_tokenize(text)     # split into sentences
    sentence = " ".join(newSentence).lower()   # join back so the text can be lowercased
    # filter_text = [word for word in sentence.split(' ') if word not in stopwords.words('english')]  # stop-word removal (disabled)
    # filter_text = " ".join(filter_text)
    filter_text = nltk.sent_tokenize(sentence)
    newWords = []
    for sent in filter_text:  # sent is a str
        a = nltk.word_tokenize(sent)
        # Single-token "sentences" (e.g. a stray '.') used to be dropped here,
        # but that broke reviews consisting of a single word, so keep everything.
        newWords.append(a)
    return newWords  # one token list per sentence
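
# Illustrative example of the output format, a list of token lists (one inner
# list per sentence), which is exactly what batch_to_ids expects:
#   word_count("Great food... Loved it!")
#   -> [['great', 'food', '.'], ['loved', 'it', '.']]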

# Load the pretrained ELMo model
options_file = r"D:\BaiduNetdiskDownload\yelp\yelp_rzj\gcn\elmo\elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = r"D:\BaiduNetdiskDownload\yelp\yelp_rzj\gcn\elmo\elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

# The 1 here asks for one set of linearly weighted word vectors;
# passing 2 would produce two independently weighted sets.
elmo = Elmo(options_file, weight_file, 1, dropout=0)
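# Illustrative shape check: for this model each token vector is 1024-dim,
# so a batch of two tokenized sentences gives (2, max_seq_len, 1024), e.g.:
#   ids = batch_to_ids([['hello', 'world'], ['hi']])
#   elmo(ids)['elmo_representations'][0].shape  # -> torch.Size([2, 2, 1024])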

# Read the review texts; column index 7 of the spreadsheet holds 'reviewContent'
file_path = pd.read_excel(r"D:\BaiduNetdiskDownload\yelp\yelp_rzj\res\data.xls", usecols=[7])
train_data = []
for d in file_path['reviewContent']:
    train_data.append(str(d).replace('\n', ''))

# text="9Easily the most dramatic dish... ?A cloth pillow filled with mace scented air was placed in front of each of us and a dish with duck confit, duck breast and fois gras was placed on top. ?As we dug into this dish the air was slowly released from the pillow mixing with the food smells and leading us to visions of Alinea branded mace spray for linens. UPDATE: Check out the review in Wired: http://www.wired.com/wir…56"
# Output file (tab-separated text). Mode 'w' truncates any existing content;
# 'a+' would append to it instead.
result = open(r"D:\BaiduNetdiskDownload\yelp\yelp_rzj\gcn\elmo\elmo.xls", 'w', encoding='utf-8')
# result = open(r"D:\BaiduNetdiskDownload\tensor5Text.xls", 'a+', encoding='gbk')

highways = Highway(num_layers, num_highways)  # build the highway block once, outside the loop
for text in train_data:
    sentence_lists = word_count(text)
    character_ids = batch_to_ids(sentence_lists)  # convert tokenized sentences to character ids
    # embeddings: (num_sentences, max_seq_len, 1024) for this ELMo model
    embeddings = elmo(character_ids)['elmo_representations'][0]
    token_embeds = highways(embeddings)  # the highway block preserves the shape
    # Flatten everything into one long vector, then project it down to
    # output_dim. Reference: https://zhuanlan.zhihu.com/p/466841781
    flat_embed = token_embeds.detach().reshape(-1)
    # The flattened length (num_sentences * max_seq_len * 1024) differs per
    # review, so the projection layer is rebuilt, with fresh random weights,
    # for every review.
    projection = nn.Linear(flat_embed.shape[0], output_dim, bias=True)
    review_vec = projection(flat_embed)
    for value in review_vec.detach().tolist():
        result.write(str(value))
        result.write('\t')  # '\t' moves to the next cell in the same row
    result.write('\n')      # one review per row
result.close()

# print(review_vec)
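
Despite the .xls extension, the output is plain tab-separated text with one 200-dimensional vector per line. A minimal sketch for loading it back (assuming the same path as above):

import numpy as np

vectors = []
with open(r"D:\BaiduNetdiskDownload\yelp\yelp_rzj\gcn\elmo\elmo.xls", encoding='utf-8') as f:
    for line in f:
        fields = [v for v in line.strip().split('\t') if v]
        vectors.append([float(v) for v in fields])
vectors = np.array(vectors)  # shape: (num_reviews, 200)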


