Pytorch处理文本数据规范流程

处理文本数据的 Torch 库 torchtext。(注意:本文使用的 torchtext.data.Field / TabularDataset / Iterator 等 API 自 torchtext 0.9 起被移至 torchtext.legacy,更高版本中已被移除;在新版 torchtext 上运行需改用 torchtext.legacy 或降级版本。)

import torch
import torchtext

建立Fields,Field确定了处理数据的方式

from torchtext.data import Field
# Field(...) — how a column of raw data is processed. Key arguments:
#   sequential=True,       # whether the data is a sequence; if False, tokenization is skipped
#   use_vocab=True,        # whether to build/use a Vocab; if False, data must already be numeric
#   init_token=None,       # token prepended to every example of this field
#   eos_token=None,        # token appended to every example of this field
#   fix_length=None,       # pad/truncate every example to this fixed length
#   dtype=torch.int64,
#   preprocessing=None,    # pipeline applied after tokenizing, before numericalizing
#   postprocessing=None,   # pipeline applied after numericalizing, before building the Tensor
#   lower=False,           # lowercase all text
#   tokenize=None,         # pass 'spacy' to use the SpaCy tokenizer
#   tokenizer_language='en',
#   include_lengths=False,
#   batch_first=False,     # whether returned tensors put the batch dimension first
#   pad_token='<pad>',     # token used for padding
#   unk_token='<unk>',     # token used for unknown (out-of-vocabulary) words
#   pad_first=False,       # pad at the beginning instead of the end
#   truncate_first=False,  # truncate at the beginning instead of the end
#   stop_words=None,       # words discarded during preprocessing
#   is_target=False)

# Define the fields: labels are already numeric scalars; text is
# whitespace-tokenized and lowercased.
LABEL = Field(sequential=False, use_vocab=False)

def tokenize(text):
    """Split raw text on whitespace."""
    return text.split()

TEXT = Field(sequential=True, tokenize=tokenize, lower=True)

建立数据库

from torchtext.data import TabularDataset
# TabularDataset: a Dataset of columns stored in CSV, TSV, or JSON format.
#   path:        path to the data file
#   format:      'csv', 'tsv', or 'json'
#   fields (list[tuple[str, Field]]): when given as a list, format must be
#                CSV or TSV and each (name, field) pair must line up
#                one-to-one with the file's columns; a None field drops
#                that column entirely
#   skip_header (bool): skip the first line of the file (for CSVs with a header)
#   csv_reader_params (dict): extra kwargs forwarded to the csv reader
#                (only relevant for csv/tsv)

# One (column name, Field) pair per CSV column, in file order.
# "id" is paired with None because we don't need it.
_label_columns = ["toxic", "severe_toxic", "threat",
                  "obscene", "insult", "identity_hate"]
tv_datafields = [("id", None), ("comment_text", TEXT)]
tv_datafields += [(name, LABEL) for name in _label_columns]

# splits() builds one dataset per file; the processing is identical for each.
trn, vld = TabularDataset.splits(
    path="data",          # root directory where the data lives
    train='train.csv',
    validation="valid.csv",
    format='csv',
    skip_header=True,     # the CSVs have a header row — don't process it as data
    fields=tv_datafields)
 
# The test file has no label columns: just the (dropped) id and the text.
tst_datafields = [("id", None), ("comment_text", TEXT)]
tst = TabularDataset(
    path="data/test.csv",  # single file, so no splits() here
    format='csv',
    skip_header=True,      # skip the header row so it isn't processed as data
    fields=tst_datafields)
trn[0]#an Example object: tokenized at this point, but not yet numericalized
<torchtext.data.example.Example at 0x1a9d8d39d88>
#field.build_vocab()
# Makes torchtext traverse the training data bound to the TEXT field and
# register every word in the vocabulary; it can also build an embedding
# matrix automatically. Out-of-vocabulary words are mapped to <unk>.
TEXT.build_vocab(trn)

构造迭代器

# Build the batch iterators.
from torchtext.data import Iterator, BucketIterator
# Iterator(
#   dataset,
#   batch_size,
#   sort_key=None,          # key used to group same-length examples, minimizing padding
#   device=None,
#   batch_size_fn=None,
#   train=True,             # whether this iterator is over the training set
#   repeat=False,           # whether to repeat the iterator across epochs
#   shuffle=None,
#   sort=None,
#   sort_within_batch=None)

train_iter, val_iter = BucketIterator.splits(
    (trn, vld),                                 # datasets the iterators draw from
    batch_sizes=(64, 64),
    device=-1,                                  # -1 means CPU; pass a GPU index to use the GPU
    sort_key=lambda ex: len(ex.comment_text),   # BucketIterator groups examples by this key
    sort_within_batch=False,
    repeat=False)                               # don't repeat, so each epoch ends cleanly

# Plain Iterator for the test set: no bucketing or sorting needed.
test_iter = Iterator(
    tst, batch_size=64, device=-1,
    sort=False, sort_within_batch=False, repeat=False)

for batch_idx, batch in enumerate(train_iter):
    print(batch_idx, batch)
    for item in batch:
        print(item)

0 
[torchtext.data.batch.Batch of size 1]
	[.comment_text]:[torch.LongTensor of size 1x1]
	[.toxic]:[torch.LongTensor of size 1]
	[.severe_toxic]:[torch.LongTensor of size 1]
	[.threat]:[torch.LongTensor of size 1]
	[.obscene]:[torch.LongTensor of size 1]
	[.insult]:[torch.LongTensor of size 1]
	[.identity_hate]:[torch.LongTensor of size 1]
(tensor([[2]]), tensor([1]), tensor([1]), tensor([1]), tensor([1]), tensor([1]), tensor([1]))
None
  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值