Sentiment classification with PyTorch (wordavg & LSTM & CNN)

import torch
import torchtext
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe
import spacy
from spacy.lang.en import English
import random
import torch.nn as nn
import torch.nn.functional as F

SEED = 1234
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

1. Data preparation

nlp = English()  # choose the tokenizer
TEXT = data.Field(tokenize=nlp)
# TEXT = data.Field(lower= True)
LABEL = data.LabelField(dtype=torch.float)
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)  # load the IMDB dataset

train_data, valid_data = train_data.split(split_ratio=0.7, random_state=random.seed(SEED))  # split off a validation set
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')
Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000
print(vars(train_data[0]),'\n',len(vars(train_data[0])['text']))
print(vars(train_data[0])['text'][0],'\n',type((vars(train_data[0])['text'][0])))
{'text': This movie has got to be one of the worst I have ever seen make it to DVD!!! The story line might have clicked if the film had more funding and writers that would have cut the nonsense and sickly scenes that I highly caution parents on.... But the story line is like a loose cannon. If there was such a thing as a drive thru movie maker-this one would have sprung from that.It reminded me a lot of the quickie films that were put out in the 1960's, poor script writing and filming. <br /><br />The only sensible characters in the whole movie was the bartender and beaver. The rest of the film, could have easily been made by middle school children. I give this film a rating of 1 as it is truly awful and left my entire family with a sense of being cheated. My advice-Don't Watch It!!!, 'label': 'neg'} 
 173
This 
 <class 'spacy.tokens.token.Token'>
# As shown above, the words produced by the English() tokenizer are not plain strings:
# each element is a spacy Token, so we convert every token to str.
for dataset in [train_data, valid_data, test_data]:  # use `dataset`, not `data`, to avoid shadowing torchtext.data
    for i in range(len(dataset)):
        example = dataset[i]
        example.text = [str(token) for token in example.text]
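# Not from the original post: a minimal sketch of an alternative that skips the conversion
# loop, assuming a torchtext version whose Field accepts tokenize='spacy' (that option
# hands back plain strings instead of Token objects):
# TEXT = data.Field(tokenize='spacy')
# LABEL = data.LabelField(dtype=torch.float)
# train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)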

# build the vocabulary
TEXT.build_vocab(train_data, max_size=25000,vectors='glove.6B.100d',unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)
print(len(TEXT.vocab),len(LABEL.vocab))
print(TEXT.vocab.freqs.most_common(20))
25002 2
[('the', 203566), (',', 192495), ('.', 165539), ('and', 109443), ('a', 109116), ('of', 100702), ('to', 93766), ('is', 76328), ('in', 61255), ('I', 54004), ('it', 53508), ('that', 49187), ('"', 44285), ("'s", 43329), ('this', 42445), ('-', 37165), ('/><br', 35752), ('was', 35034), ('as', 30384), ('with', 29774)]
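# A minimal sketch (not from the original post) of how the pretrained vectors loaded by
# build_vocab are typically copied into a model's embedding layer; EMBEDDING_DIM = 100
# matches glove.6B.100d, and the <unk>/<pad> rows are zeroed out:
EMBEDDING_DIM = 100
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
embedding = nn.Embedding(len(TEXT.vocab), EMBEDDING_DIM, padding_idx=PAD_IDX)
embedding.weight.data.copy_(TEXT.vocab.vectors)              # copy the glove.6B.100d vectors
embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)  # unknown words start from zero
embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)  # padding contributes nothing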
# build the iterators (essentially dataloaders)
train_iterator, valid_iterator, test_iterator = torchtext.data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=64,  # assumed batch size; the original snippet is cut off here
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
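# A quick sanity check (not from the original post): pull one batch from the iterator.
# batch.text has shape [sequence length, batch size] (batch_first defaults to False)
# and batch.label has shape [batch size]:
batch = next(iter(train_iterator))
print(batch.text.shape, batch.label.shape)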