>- **🍨 本文为[🔗365天深度学习训练营](https://mp.weixin.qq.com/s/0dvHCaOoFnW8SCp3JpzKxg) 中的学习记录博客**
>- **🍖 原作者:[K同学啊](https://mtyjkh.blog.csdn.net/)**
任务:加载txt文件,并用EmbeddingBag & Embedding完成词嵌入
一、导入必要的包和数据
import pandas as pd
import torch
from torch import nn
import jieba as jb
# NOTE(review): the file name is just ".txt" (empty stem) — this looks
# truncated; confirm the real dataset file name in the Kaggle input dir.
data_dir = "/kaggle/input/text-data1/.txt"
# header=None: the txt file carries no header row, so every line becomes
# one row of the single column data[0].
data = pd.read_csv(data_dir, header=None)
data
结果:
二、使用jieba分词:
# Segment every raw sentence into a list of words with jieba.
# jb.lcut(s) is jieba's list-returning shorthand for list(jb.cut(s)).
tokenized_texts = [jb.lcut(sentence) for sentence in data[0]]
print(tokenized_texts)
输出分词结果:
三、制作词汇表:
# Build word->index and index->word lookup tables over all tokens.
# Fix: the original iterated over set(...), whose iteration order changes
# between runs (string hash randomization), so word_index was not
# reproducible. dict.fromkeys de-duplicates while preserving the order in
# which words first appear, giving a stable, deterministic vocabulary.
word_index = {}
index_word = {}
for i, word in enumerate(dict.fromkeys(word for text in tokenized_texts for word in text)):
    word_index[word] = i
    index_word[i] = word
vocab_size = len(word_index)  # number of distinct words
print(f"{vocab_size} / {word_index}")
词汇表大小:87
词汇表结果:
四、将文字转换为整数序列:
# Convert the first two tokenized sentences into LongTensors of vocab ids.
sequences1, sequences2 = (
    torch.tensor([word_index[tok] for tok in tokens], dtype=torch.long)
    for tokens in tokenized_texts[:2]
)
五、用Embedding将句子转换为词嵌入向量:
# nn.Embedding maps every token id to a trainable 4-dimensional vector,
# producing one vector per token (shape: [num_tokens, 4]).
embedding = nn.Embedding(vocab_size, 4)
embedding_sequences1 = embedding(sequences1)
embedding_sequences2 = embedding(sequences2)
for embedded in (embedding_sequences1, embedding_sequences2):
    print(embedded)
embedding结果:
tensor([[-2.2421, 0.4768, 1.5981, -0.1372],
[ 0.2662, 0.6506, -0.3813, 0.3465],
[ 0.2144, 0.6271, 1.4468, 0.2856],
[ 0.1479, 0.4894, -0.3237, -2.0757],
[-0.5640, -1.1852, 0.3003, 1.1728],
[ 0.4459, -0.7934, -0.1811, 1.3251],
[-1.4543, -1.3130, 1.0980, 0.8700],
[ 0.2644, -0.8209, 2.3713, -1.3098],
[ 0.2144, 0.6271, 1.4468, 0.2856],
[ 1.9804, -0.0224, -1.0361, 0.1449],
[-0.8875, 1.1185, 0.5886, -1.3458],
[-0.0645, 1.1644, -0.7693, -0.4122],
[ 0.7435, -0.4162, 1.2579, -1.7981],
[-0.2672, 0.4423, -1.7335, -2.1201],
[ 1.4095, -0.1287, -0.2783, 0.1564],
[ 1.2758, 0.7741, -1.6845, 0.0374],
[ 0.7291, 2.4709, -1.0184, 0.9997],
[ 0.3255, 0.1719, -1.2049, -0.7580],
[-1.1748, 1.2276, -0.2742, 1.5316],
[ 0.2144, 0.6271, 1.4468, 0.2856],
[ 0.4989, 0.2639, -1.3275, 1.4525],
[-0.2672, 0.4423, -1.7335, -2.1201],
[-1.6770, 0.5937, -0.4984, -0.0159],
[-0.1202, 0.6479, 0.8591, -1.0248],
[ 0.1757, -1.5443, 0.4225, -0.1653],
[-0.4791, -0.4607, -1.1054, -1.2024],
[-0.0297, 0.5840, 0.5421, 0.1201],
[ 0.6148, -1.5387, 1.0232, 0.2355],
[-1.2716, -0.9989, 1.1541, -1.3458],
[ 0.7014, 0.5741, -0.2984, 0.7311],
[-0.1746, -1.4164, 0.0412, 0.3107],
[ 0.6628, -0.1708, 0.2175, -1.2665],
[ 0.3255, 0.1719, -1.2049, -0.7580],
[-1.1748, 1.2276, -0.2742, 1.5316],
[-0.0645, 1.1644, -0.7693, -0.4122],
[ 1.2490, -0.8628, 0.3805, -0.6328],
[-0.2672, 0.4423, -1.7335, -2.1201],
[-0.3621, -0.1040, 0.9854, 0.6555],
[ 0.1479, 0.4894, -0.3237, -2.0757],
[-0.9348, -0.9895, 1.6729, -0.0273],
[ 1.2758, 0.7741, -1.6845, 0.0374],
[ 0.4989, 0.2639, -1.3275, 1.4525],
[-0.2672, 0.4423, -1.7335, -2.1201],
[ 0.5002, 0.2031, 0.5255, 0.2732],
[-0.9373, -0.8043, -0.7819, 0.6784],
[ 0.1671, -0.9603, -0.9574, -1.0333],
[-0.4921, 1.0120, -1.5466, -0.4126],
[-0.0189, -0.6197, 0.2134, 0.6700],
[ 0.3798, 1.1016, 0.9914, -1.7548],
[ 1.6996, 1.1439, 0.3383, -1.5814],
[ 0.6642, -0.8293, -0.0363, -0.6464],
[-1.1748, 1.2276, -0.2742, 1.5316],
[-0.2030, 0.4043, -0.4269, 2.0898],
[-0.9348, -0.9895, 1.6729, -0.0273],
[ 0.6830, 1.4578, -1.4581, -1.2760],
[ 0.0985, 2.2091, 0.4579, 0.0645],
[-0.8245, 0.7995, 1.2056, -0.8458],
[ 0.2116, 0.3476, 0.3539, -0.6649],
[-1.1390, -1.7131, 0.4298, -0.5274],
[-0.2672, 0.4423, -1.7335, -2.1201],
[ 1.8577, 0.3560, 0.3905, 0.1583],
[-0.3970, 0.4146, 0.0542, 0.7691],
[-0.8477, -0.1442, -1.3092, 0.3221],
[-1.1390, -1.7131, 0.4298, -0.5274],
[ 0.1671, -0.9603, -0.9574, -1.0333],
[-0.5640, -1.1852, 0.3003, 1.1728],
[-0.4246, 0.8655, -0.5637, 0.8559],
[-0.9348, -0.9895, 1.6729, -0.0273],
[ 0.2144, 0.6271, 1.4468, 0.2856],
[-0.3495, -2.5871, -0.7908, -1.3156],
[-0.4246, 0.8655, -0.5637, 0.8559],
[ 1.7937, 1.3631, -0.0138, -0.6104],
[-2.0114, 1.8538, 0.2406, -0.0205],
[ 0.2144, 0.6271, 1.4468, 0.2856],
[-0.0645, 1.1644, -0.7693, -0.4122]], grad_fn=<EmbeddingBackward0>)
tensor([[-0.3867, -0.0058, -0.7481, 0.4434],
[-0.4892, 1.1163, 1.5090, 0.9548],
[-0.3621, -0.1040, 0.9854, 0.6555],
[ 0.4989, 0.2639, -1.3275, 1.4525],
[-0.2672, 0.4423, -1.7335, -2.1201],
[ 1.9659, 0.6739, -0.7447, 2.0967],
[-0.8151, 0.5109, -0.5044, 0.2174],
[ 1.7830, -0.6902, 0.3569, 0.8761],
[-1.3112, 0.5679, 0.2629, 0.6237],
[ 1.1008, 0.3024, 2.0047, 1.1738],
[ 0.9640, -0.2232, 1.7577, 0.1672],
[ 0.6152, 0.7374, 0.0465, -0.4756],
[-0.1858, 0.1699, -0.6322, -0.4958],
[ 0.1092, -1.2058, -1.2382, 1.7523],
[ 1.5649, -1.1317, 0.6957, 0.4157],
[ 0.9640, -0.2232, 1.7577, 0.1672],
[ 1.7874, 1.4637, -1.2369, 0.4733],
[-0.0645, 1.1644, -0.7693, -0.4122],
[ 1.7830, -0.6902, 0.3569, 0.8761],
[-1.3112, 0.5679, 0.2629, 0.6237],
[ 1.1008, 0.3024, 2.0047, 1.1738],
[ 0.9640, -0.2232, 1.7577, 0.1672],
[ 0.2144, 0.6271, 1.4468, 0.2856],
[-0.4225, 0.2887, 1.0175, 0.1776],
[-0.3821, 0.1820, 0.1650, -0.6111],
[-0.5640, -1.1852, 0.3003, 1.1728],
[ 0.4481, 0.4344, -0.2955, -0.9457],
[ 1.5302, 1.3884, 0.0516, -0.4676],
[-1.1748, 1.2276, -0.2742, 1.5316],
[ 0.8415, -0.1924, -0.7211, -0.2267],
[-2.1970, 0.7031, -0.1178, -0.4499],
[ 1.2758, 0.7741, -1.6845, 0.0374],
[ 1.5492, -1.2600, -0.5016, 0.2411],
[-0.2672, 0.4423, -1.7335, -2.1201],
[ 0.2468, 0.4953, 0.2391, -0.4277],
[-1.8871, 0.4976, 0.3504, 1.2424],
[ 1.2758, 0.7741, -1.6845, 0.0374],
[-0.6616, -1.0267, -0.0144, 1.3585],
[ 0.2144, 0.6271, 1.4468, 0.2856],
[-0.7786, 0.8371, 0.0190, 1.0264],
[-0.6832, -0.7319, -0.8668, 0.6007],
[ 0.1757, -1.5443, 0.4225, -0.1653],
[-0.2672, 0.4423, -1.7335, -2.1201],
[ 0.3108, 0.0142, 0.3639, -0.9848],
[-0.6616, -1.0267, -0.0144, 1.3585],
[ 0.2144, 0.6271, 1.4468, 0.2856],
[-0.7786, 0.8371, 0.0190, 1.0264],
[-0.6832, -0.7319, -0.8668, 0.6007],
[ 0.6202, -0.6972, 0.9012, 0.0262],
[-0.0645, 1.1644, -0.7693, -0.4122],
[ 0.6907, 0.2946, -0.8080, -1.1677],
[-0.2672, 0.4423, -1.7335, -2.1201],
[ 1.5302, 1.3884, 0.0516, -0.4676],
[-1.1748, 1.2276, -0.2742, 1.5316],
[-0.2030, 0.4043, -0.4269, 2.0898],
[ 0.5002, 0.2031, 0.5255, 0.2732],
[ 1.5477, -2.2033, -0.3226, 0.1326],
[-0.2892, -2.0367, 0.9881, -0.8522],
[ 0.2144, 0.6271, 1.4468, 0.2856],
[-0.2672, 0.4423, -1.7335, -2.1201],
[-0.4246, 0.8655, -0.5637, 0.8559],
[-0.9348, -0.9895, 1.6729, -0.0273],
[ 0.0985, 2.2091, 0.4579, 0.0645],
[-0.8245, 0.7995, 1.2056, -0.8458],
[ 0.2116, 0.3476, 0.3539, -0.6649],
[-1.1390, -1.7131, 0.4298, -0.5274],
[-0.0645, 1.1644, -0.7693, -0.4122],
[ 0.7435, -0.4162, 1.2579, -1.7981],
[-0.2672, 0.4423, -1.7335, -2.1201],
[ 1.4095, -0.1287, -0.2783, 0.1564],
[ 0.3255, 0.1719, -1.2049, -0.7580],
[-1.1748, 1.2276, -0.2742, 1.5316],
[ 0.2144, 0.6271, 1.4468, 0.2856],
[ 0.0812, 1.0305, 1.2780, 0.4558],
[-0.2672, 0.4423, -1.7335, -2.1201],
[-1.6770, 0.5937, -0.4984, -0.0159],
[-0.0591, -1.0665, -0.5901, 0.2102],
[ 1.1762, 0.1607, 1.5622, 0.0729],
[ 0.2144, 0.6271, 1.4468, 0.2856],
[ 1.7830, -0.6902, 0.3569, 0.8761],
[-1.3112, 0.5679, 0.2629, 0.6237],
[ 1.1008, 0.3024, 2.0047, 1.1738],
[ 0.9640, -0.2232, 1.7577, 0.1672],
[ 0.9006, 0.0673, 0.6713, -1.2377]], grad_fn=<EmbeddingBackward0>)
六、embedding_bag的均值方法将句子转换成词嵌入向量:
# nn.EmbeddingBag pools (here: averages) all token embeddings within each
# "bag", yielding one 30-d vector per sentence instead of one per token.
embedding_bag = nn.EmbeddingBag(vocab_size, 30, mode="mean")
# Flatten both sentences into a single 1-D id tensor; offsets mark where
# each sentence (bag) begins inside it.
input_sequences = torch.cat((sequences1, sequences2))
offsets = torch.tensor([0, sequences1.numel()], dtype=torch.long)
eb_seq = embedding_bag(input_sequences, offsets)
print(eb_seq)
embedding bag的结果:
七、总结:
通过这次练习加上上一次的实践,除整数序列以外又学习了词嵌入操作,理解了文字转换数值的一些方法,希望能在接下来的任务中能够熟练运用这些方法并加深理解吧。