Pytorch处理文本数据规范流程

处理文本数据的 Torch 库 torchtext。(注意:本文使用的 torchtext.data.Field / TabularDataset / Iterator 等 API 自 torchtext 0.9 起被移至 torchtext.legacy,更高版本中已被移除;在新版 torchtext 上运行需改用 torchtext.legacy 或降级版本。)

import torch
import torchtext

建立Fields,Field确定了处理数据的方式

from torchtext.data import Field
# Field(...) — how a column of raw data is processed. Key arguments:
#   sequential=True,       # whether the data is a sequence; if False, tokenization is skipped
#   use_vocab=True,        # whether to build/use a Vocab; if False, data must already be numeric
#   init_token=None,       # token prepended to every example of this field
#   eos_token=None,        # token appended to every example of this field
#   fix_length=None,       # pad/truncate every example to this fixed length
#   dtype=torch.int64,
#   preprocessing=None,    # pipeline applied after tokenizing, before numericalizing
#   postprocessing=None,   # pipeline applied after numericalizing, before building the Tensor
#   lower=False,           # lowercase all text
#   tokenize=None,         # pass 'spacy' to use the SpaCy tokenizer
#   tokenizer_language='en',
#   include_lengths=False,
#   batch_first=False,     # whether returned tensors put the batch dimension first
#   pad_token='<pad>',     # token used for padding
#   unk_token='<unk>',     # token used for unknown (out-of-vocabulary) words
#   pad_first=False,       # pad at the beginning instead of the end
#   truncate_first=False,  # truncate at the beginning instead of the end
#   stop_words=None,       # words discarded during preprocessing
#   is_target=False)

# Define the fields: labels are already numeric scalars; text is
# whitespace-tokenized and lowercased.
LABEL = Field(sequential=False, use_vocab=False)

def tokenize(text):
    """Split raw text on whitespace."""
    return text.split()

TEXT = Field(sequential=True, tokenize=tokenize, lower=True)

建立数据库

from torchtext.data import TabularDataset
# TabularDataset: a Dataset of columns stored in CSV, TSV, or JSON format.
#   path:        path to the data file
#   format:      'csv', 'tsv', or 'json'
#   fields (list[tuple[str, Field]]): when given as a list, format must be
#                CSV or TSV and each (name, field) pair must line up
#                one-to-one with the file's columns; a None field drops
#                that column entirely
#   skip_header (bool): skip the first line of the file (for CSVs with a header)
#   csv_reader_params (dict): extra kwargs forwarded to the csv reader
#                (only relevant for csv/tsv)

# One (column name, Field) pair per CSV column, in file order.
# "id" is paired with None because we don't need it.
_label_columns = ["toxic", "severe_toxic", "threat",
                  "obscene", "insult", "identity_hate"]
tv_datafields = [("id", None), ("comment_text", TEXT)]
tv_datafields += [(name, LABEL) for name in _label_columns]

# splits() builds one dataset per file; the processing is identical for each.
trn, vld = TabularDataset.splits(
    path="data",          # root directory where the data lives
    train='train.csv',
    validation="valid.csv",
    format='csv',
    skip_header=True,     # the CSVs have a header row — don't process it as data
    fields=tv_datafields)
 
# The test file has no label columns: just the (dropped) id and the text.
tst_datafields = [("id", None), ("comment_text", TEXT)]
tst = TabularDataset(
    path="data/test.csv",  # single file, so no splits() here
    format='csv',
    skip_header=True,      # skip the header row so it isn't processed as data
    fields=tst_datafields)
trn[0]#an Example object: tokenized at this point, but not yet numericalized
<torchtext.data.example.Example at 0x1a9d8d39d88>
#field.build_vocab()
# Makes torchtext traverse the training data bound to the TEXT field and
# register every word in the vocabulary; it can also build an embedding
# matrix automatically. Out-of-vocabulary words are mapped to <unk>.
TEXT.build_vocab(trn)

构造迭代器

# Build the batch iterators.
from torchtext.data import Iterator, BucketIterator
# Iterator(
#   dataset,
#   batch_size,
#   sort_key=None,          # key used to group same-length examples, minimizing padding
#   device=None,
#   batch_size_fn=None,
#   train=True,             # whether this iterator is over the training set
#   repeat=False,           # whether to repeat the iterator across epochs
#   shuffle=None,
#   sort=None,
#   sort_within_batch=None)

train_iter, val_iter = BucketIterator.splits(
    (trn, vld),                                 # datasets the iterators draw from
    batch_sizes=(64, 64),
    device=-1,                                  # -1 means CPU; pass a GPU index to use the GPU
    sort_key=lambda ex: len(ex.comment_text),   # BucketIterator groups examples by this key
    sort_within_batch=False,
    repeat=False)                               # don't repeat, so each epoch ends cleanly

# Plain Iterator for the test set: no bucketing or sorting needed.
test_iter = Iterator(
    tst, batch_size=64, device=-1,
    sort=False, sort_within_batch=False, repeat=False)

for batch_idx, batch in enumerate(train_iter):
    print(batch_idx, batch)
    for item in batch:
        print(item)

0 
[torchtext.data.batch.Batch of size 1]
	[.comment_text]:[torch.LongTensor of size 1x1]
	[.toxic]:[torch.LongTensor of size 1]
	[.severe_toxic]:[torch.LongTensor of size 1]
	[.threat]:[torch.LongTensor of size 1]
	[.obscene]:[torch.LongTensor of size 1]
	[.insult]:[torch.LongTensor of size 1]
	[.identity_hate]:[torch.LongTensor of size 1]
(tensor([[2]]), tensor([1]), tensor([1]), tensor([1]), tensor([1]), tensor([1]), tensor([1]))
None
  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值