1. Dataset introduction
This post uses the AG News dataset, a news-classification corpus with 4 evenly balanced classes. Because the number of classes is so small, hierarchical softmax is not used.
Link: https://pan.baidu.com/s/1YJNwiEn7L8HpgCWXInIHdA
Extraction code: ui2w
Tool: Jupyter
Word-level n-gram features are icing on the cake for fastText rather than essential: they merely compensate for the bag-of-words representation's disregard of word order, as the toy sketch below shows. In my reproduction I compare the model with and without n-gram features, because I consider the n-gram-free fastText to be the essence of the model; once you grasp that, the other refinements are easy to understand. Each news item has a title and a description, and I classify using the description only.
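To make the word-order point concrete: two sentences can have identical bags of words yet different bigram sets. A toy sketch (the sentences are just illustrations):
from collections import Counter
a="dog bites man".split()
b="man bites dog".split()
print(Counter(a)==Counter(b))   #True: the bag of words cannot tell them apart
bigrams=lambda s:{tuple(s[i:i+2]) for i in range(len(s)-1)}
print(bigrams(a)==bigrams(b))   #False: bigrams preserve local word order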
2. FastText classification without n-gram features
2.1 Reading the data
from collections import Counter
from sklearn.model_selection import train_test_split
import pandas as pd
We split the training data into a training set and a validation set (lazy shortcut: I use the validation set directly as the test set; it never participates in training, so nothing leaks into it).
data=pd.read_csv(r"./ag_news/train.csv",header=None,encoding="utf-8")
train,val=train_test_split(data,test_size=0.2,shuffle=True)
#for simplicity, take only the description column
train_x=list(train.iloc[:,2])
train_y=list(train.iloc[:,0])
val_x=list(val.iloc[:,2])
val_y=list(val.iloc[:,0])
Clean each sentence (only a simple cleanup here):
#data cleaning
def clear(data):
    for i,s in enumerate(data):
        clears=""
        #iterate over every character of the current sentence
        for char in s:
            #keep the character if it is a letter or a space
            if(char.isalpha() or char==" "):
                clears+=char
        data[i]=clears
clear(train_x)
clear(val_x)
Build the vocabulary and the index mappings:
#concatenate all sentences to build the vocabulary
document=""
for s in train_x:
    document+=" "+str(s).lower()
#split on spaces and do a simple cleanup
clear_d=[]
for word in document.split(" "):
    if(word.isalpha()):
        clear_d.append(word)
freq=dict(Counter(clear_d))
word_freq={}
#keep words whose frequency is at least 10
for k,v in freq.items():
    if v>=10:
        word_freq[k]=v
del freq
idx2word=["<pad>"]+list(word_freq.keys())+["<unk>"]
word2idx={w:idx for idx,w in enumerate(idx2word)}
vocab_size=len(idx2word)
del word_freq
Next, encode the sentences:
def corpus_encode(corpus):
    #tokenization
    for i in range(len(corpus)):
        #split each sentence on spaces
        sentences=corpus[i].lower().split(" ")
        #ids of the current sentence
        c=[]
        for word in sentences:
            #only encode alphabetic tokens
            if(word.isalpha()):
                c.append(word2idx.get(word,word2idx["<unk>"]))
        #replace the sentence with its encoding
        corpus[i]=c
corpus_encode(val_x)
corpus_encode(train_x)
2.2 Data iterator
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
Sentences differ in length, so we pad them to a common size. The zero pad vectors do not affect the embedding sum, but padding would be counted in the sentence length when taking the mean, so we must record each sentence's true length in advance:
#record each sentence's true length for the mean later
lensTrain=list(map(len,train_x))
lensVal=torch.tensor(list(map(len,val_x)),dtype=torch.float)
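To see why the true length matters, a toy check (the numbers are made up): padding contributes zeros to the sum, so dividing by the padded length would understate the mean.
v=torch.tensor([[1.,1.],[3.,3.],[0.,0.]])  #2 real token vectors + 1 zero pad row
print(v.sum(0)/3)  #mean over the padded length: tensor([1.3333, 1.3333]) -- too small
print(v.sum(0)/2)  #mean over the true length: tensor([2., 2.]) -- correct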
Next come the format conversions and the padding of the sentences:
train_x=list(map(lambda x: torch.tensor(x),train_x))
val_x=list(map(lambda x: torch.tensor(x), val_x))
#the original labels are 1 2 3 4
#CrossEntropyLoss() expects labels of the form 0 1 2 3, hence the -1
#train_y goes into the data generator, so its elements are kept as tensors
train_y=list(map(lambda x: torch.tensor(x)-1, train_y))
val_y=list(map(lambda x: x-1, val_y))
val_y=torch.tensor(val_y)
#pad the sentences
train_x_encode=pad_sequence(train_x,batch_first=True)
val_x_encode=pad_sequence(val_x,batch_first=True)
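Note that pad_sequence fills with 0 by default, which is exactly word2idx["<pad>"] here and will match padding_idx=0 in the embedding below. A quick illustration:
demo=pad_sequence([torch.tensor([5,7]),torch.tensor([9])],batch_first=True)
print(demo)  #[[5, 7], [9, 0]]: the shorter sequence is padded with 0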
class mydata(Dataset):
    def __init__(self,train_encode,train_y,lens):
        super(mydata,self).__init__()
        self.train_encode=train_encode
        self.train_y=train_y
        self.lens=lens
    def __len__(self):
        return len(self.train_encode)
    def __getitem__(self,idx):
        return self.train_encode[idx],self.train_y[idx],self.lens[idx]
batch_size=20
dataset=mydata(train_x_encode,train_y,lensTrain)
dataloader=DataLoader(dataset,batch_size=batch_size)
Test the data generator:
for x,y,lens in dataloader:
    print(x.shape,y.shape,lens.shape)
    break
Output:
torch.Size([20, 146]) torch.Size([20]) torch.Size([20])
2.3 Model
from torch.nn import Module
import torch.nn as nn
The model is just the CBOW model:
d_model=50
class cbow(Module):
    def __init__(self,vocab_size,d_model,class_num):
        super(cbow,self).__init__()
        self.vocab_size=vocab_size
        self.d_model=d_model
        self.class_num=class_num
        self.embed=nn.Embedding(vocab_size,d_model,padding_idx=0)
        self.linear=nn.Linear(d_model,class_num)
    def forward(self,x,lens):
        #x[batch,maxlen], lens[batch]
        x=self.embed(x)#x[batch,maxlen,d_model]
        #sum the word vectors
        x=x.sum(dim=1)#x[batch,d_model]
        #out-of-place unsqueeze, so repeated validation calls don't mutate lens
        lens=lens.unsqueeze(1)#lens[batch,1]
        #take the mean over the true length
        x=x/lens
        output=self.linear(x)
        return output
Test the model:
model=cbow(vocab_size,d_model,4)
model(x,lens).shape
Output:
torch.Size([20, 4])
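One detail worth verifying: padding_idx=0 keeps the <pad> row of the embedding at zero (its gradient is also zeroed during training), which is what makes "sum, then divide by the true length" a correct mean. A quick sanity check:
#the <pad> embedding is all zeros, so padded positions add nothing to the sum
print(torch.allclose(model.embed.weight[0],torch.zeros(d_model)))  #True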
2.4 Training
from torch.optim import Adagrad
epochs=50
lr=0.01
cuda=torch.cuda.is_available()
if(cuda):
    model=model.cuda()
optimize=Adagrad(model.parameters(),lr)
lossCul=nn.CrossEntropyLoss()
for epoch in range(epochs):
    #running total of the loss
    allloss=0
    for step,(x,y,lens) in enumerate(dataloader):
        if cuda:
            x=x.cuda()
            y=y.cuda()
            lens=lens.cuda()
        output=model(x,lens)
        #cross-entropy loss (maximum likelihood)
        loss=lossCul(output,y)
        optimize.zero_grad()
        loss.backward()
        optimize.step()
        #.item() detaches the value so the computation graph is not kept around
        allloss+=loss.item()
        if((step+1)%500==0):
            print("epochs:",epoch+1," iter:",step+1," loss:",allloss/(step+1))
        #evaluate on the validation set
        if((step+1)%2000==0):
            if(cuda):
                val_x_encode=val_x_encode.cuda()
                val_y=val_y.cuda()
                lensVal=lensVal.cuda()
            with torch.no_grad():
                output=model(val_x_encode,lensVal)
            output=output.argmax(dim=1)
            #accuracy is a fair metric here because the classes are balanced
            acc=(output==val_y).sum().item()/len(val_y)
            print("epochs:",epoch+1," iter:",step+1," acc:",acc)
Earlier I summed the embeddings directly and validation accuracy only reached about 83%, whereas averaging the embeddings reaches about 90% (some gap is normal, since my preprocessing and test set differ from the paper's, which reports 91.5%). So the embeddings must be averaged.
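The gap also makes sense: with sum pooling the magnitude of the sentence vector grows with its length even when the content is the same, and the linear classifier has to absorb that variation. A toy illustration (the embedding table is random):
emb=torch.randn(100,50)                    #toy embedding table
short=emb[torch.tensor([3,7])]             #a 2-word sentence
long_=emb[torch.tensor([3,7,3,7])]         #the same words repeated
print(short.sum(0).norm(),long_.sum(0).norm())     #the sum scales with length
print(torch.allclose(short.mean(0),long_.mean(0))) #the mean does not: True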
(Screenshot of my run at 60+ epochs; I originally set 50 epochs and ran it once more.)
3. FastText classification with bigram features
Much of the code is repeated from Section 2; the only addition is the n-gram feature handling.
3.1 Reading the data
from collections import Counter
from sklearn.model_selection import train_test_split
import pandas as pd
data=pd.read_csv(r"./ag_news/train.csv",header=None,encoding="utf-8")
train,val=train_test_split(data,test_size=0.2,shuffle=True)
#take only the news description
train_x=list(train.iloc[:,2])
train_y=list(train.iloc[:,0])
val_x=list(val.iloc[:,2])
val_y=list(val.iloc[:,0])
#data cleaning
def clear(data):
    for i,s in enumerate(data):
        clears=""
        #iterate over every character of the current sentence
        for char in s:
            #keep the character if it is a letter or a space
            if(char.isalpha() or char==" "):
                clears+=char
        data[i]=clears
clear(train_x)
clear(val_x)
#concatenate all sentences to build the vocabulary
document=""
for s in train_x:
    document+=" "+str(s).lower()
#split on spaces and do a simple cleanup
clear_d=[]
for word in document.split(" "):
    if(word.isalpha()):
        clear_d.append(word)
freq=dict(Counter(clear_d))
idx2word=["<pad>"]+list(freq.keys())+["<unk>"]
word2idx={w:idx for idx,w in enumerate(idx2word)}
vocab_size=len(idx2word)
del freq
def corpus_encode(corpus):
    #tokenization
    for i in range(len(corpus)):
        #split each sentence on spaces
        s1=corpus[i]
        sentences=corpus[i].lower().split(" ")
        #ids of the current sentence
        c=[]
        for word in sentences:
            #only encode alphabetic tokens
            if(word.isalpha()):
                c.append(word2idx.get(word,word2idx["<unk>"]))
        if(len(c)==0):
            print("original sentence:",s1)
            print("after splitting:",sentences)
        #replace the sentence with its encoding
        corpus[i]=c
corpus_encode(val_x)
corpus_encode(train_x)
3.2 Word-level n-gram features
Features=set()
def word_N_gram(s,N=2,train=False):
    features=[]
    for i in range(len(s)-N+1):
        f=str(s[i:i+N])
        if(train):
            Features.add(f)#collect the set of features during training
        features.append(f)
    return features
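For intuition, here is what the function produces on a toy encoded sentence (the ids are made up): each feature is the stringified pair of adjacent word ids.
print(word_N_gram([5,12,9]))  #['[5, 12]', '[12, 9]']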
#generate each sentence's n-gram features
train_n_gram=[]
for s in train_x:
    train_n_gram.append(word_N_gram(s,train=True))
val_n_gram=[]
for s in val_x:
    val_n_gram.append(word_N_gram(s))
#build an index table for the n-gram features
idx2ngram=["[<pad>]"]+list(Features)+["[<unk>]"]
ngram2idx={w:c for c,w in enumerate(idx2ngram)}
ngram_size=len(idx2ngram)
#encode the features
def encode_gram(n_gram):
    for i,s in enumerate(n_gram):
        feature=[]
        for word in s:
            feature.append(ngram2idx.get(word,ngram2idx["[<unk>]"]))
        n_gram[i]=feature
encode_gram(train_n_gram)
encode_gram(val_n_gram)
3.3 Building the data iterator
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
#record each sentence's length for the mean later
lensTrainWord=list(map(len,train_x))
lensValWord=torch.tensor(list(map(len,val_x)),dtype=torch.float)
lensTrainNGram=list(map(len,train_n_gram))
lensValNGram=torch.tensor(list(map(len,val_n_gram)),dtype=torch.float)
train_x=list(map(lambda x: torch.tensor(x),train_x))
val_x=list(map(lambda x: torch.tensor(x), val_x))
train_n_gram=list(map(lambda x: torch.tensor(x),train_n_gram))
val_n_gram=list(map(lambda x:torch.tensor(x),val_n_gram))
#the original labels are 1 2 3 4
#CrossEntropyLoss() expects labels of the form 0 1 2 3, hence the -1
#train_y goes into the data generator, so its elements are kept as tensors
train_y=list(map(lambda x: torch.tensor(x)-1, train_y))
val_y=list(map(lambda x: x-1, val_y))
val_y=torch.tensor(val_y)
#pad the sentences
train_x=pad_sequence(train_x,batch_first=True)
val_x=pad_sequence(val_x,batch_first=True)
train_n_gram=pad_sequence(train_n_gram,batch_first=True)
val_n_gram=pad_sequence(val_n_gram,batch_first=True)
class mydata(Dataset):
    def __init__(self,train_x,train_Ngram,train_y,lensX,lensNGram):
        super(mydata,self).__init__()
        self.train_x=train_x
        self.train_Ngram=train_Ngram
        self.train_y=train_y
        self.lensX=lensX
        self.lensNGram=lensNGram
    def __len__(self):
        return len(self.train_x)
    def __getitem__(self,idx):
        return self.train_x[idx],self.train_Ngram[idx],self.lensX[idx],self.lensNGram[idx],self.train_y[idx]
batch_size=20
data=mydata(train_x,train_n_gram,train_y,lensTrainWord,lensTrainNGram)
dataloader=DataLoader(data,batch_size,shuffle=True,num_workers=0)
Test:
for x,ngram,lenx,lenNgram,y in dataloader:
    print(x.shape,ngram.shape,lenx.shape,lenNgram.shape,y.shape)
    break
Output:
torch.Size([20, 173]) torch.Size([20, 172]) torch.Size([20]) torch.Size([20]) torch.Size([20])
3.4 Model
import torch.nn as nn
from torch.nn import Module
d_model=50
class_num=4
ngram_size=len(ngram2idx)
class cbow(Module):
    def __init__(self,vocab_size,ngram_size,d_model,class_num):
        super(cbow,self).__init__()
        self.vocab_size=vocab_size
        self.d_model=d_model
        self.class_num=class_num
        self.embed1=nn.Embedding(vocab_size,d_model,padding_idx=0)
        self.embed2=nn.Embedding(ngram_size,d_model,padding_idx=0)
        self.linear=nn.Linear(d_model,class_num)
    def forward(self,x,ngram,lensX,lenNgram):
        #x[batch,maxlen1], ngram[batch,maxlen2], lensX[batch], lenNgram[batch]
        x=self.embed1(x)#x[batch,maxlen1,d_model]
        ngram=self.embed2(ngram)#ngram[batch,maxlen2,d_model]
        #sum the word vectors
        x=x.sum(dim=1)#x[batch,d_model]
        ngram=ngram.sum(dim=1)#ngram[batch,d_model]
        x=x+ngram
        lens=lensX+lenNgram
        lens=lens.unsqueeze(1)#lens[batch,1]
        #mean over the combined count of words and n-grams
        x=x/lens
        output=self.linear(x)
        return output
Test:
model=cbow(vocab_size,ngram_size,d_model,class_num)
model(x,ngram,lenx,lenNgram).shape
Output:
torch.Size([20, 4])
3.5 Training
from torch.optim import Adagrad
epochs=50
lr=0.01
cuda=torch.cuda.is_available()
if(cuda):
    model=model.cuda()
optimize=Adagrad(model.parameters(),lr)
lossCul=nn.CrossEntropyLoss()
for epoch in range(epochs):
    #running total of the loss
    allloss=0
    for step,(x,ngram,lenx,lenNgram,y) in enumerate(dataloader):
        if cuda:
            x=x.cuda()
            ngram=ngram.cuda()
            y=y.cuda()
            lenx=lenx.cuda()
            lenNgram=lenNgram.cuda()
        output=model(x,ngram,lenx,lenNgram)
        #cross-entropy loss (maximum likelihood)
        loss=lossCul(output,y)
        optimize.zero_grad()
        loss.backward()
        optimize.step()
        #.item() detaches the value so the computation graph is not kept around
        allloss+=loss.item()
        if((step+1)%500==0):
            print("epochs:",epoch+1," iter:",step+1," loss:",allloss/(step+1))
        #evaluate on the validation set
        if((step+1)%2000==0):
            if(cuda):
                val_x=val_x.cuda()
                val_y=val_y.cuda()
                val_n_gram=val_n_gram.cuda()
                lensValNGram=lensValNGram.cuda()
                lensValWord=lensValWord.cuda()
            with torch.no_grad():
                output=model(val_x,val_n_gram,lensValWord,lensValNGram)
            output=output.argmax(dim=1)
            #accuracy is a fair metric here because the classes are balanced
            acc=(output==val_y).sum().item()/len(val_y)
            print("epochs:",epoch+1," iter:",step+1," acc:",acc)