Part 0: LSTM parameters in PyTorch
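As a quick reference, here is a minimal sketch of the `nn.LSTM` arguments used later in this post and the tensor shapes involved (assuming the default `batch_first=False`, which the models below rely on):

```python
import torch
import torch.nn as nn

# The arguments used throughout this post: input_size, hidden_size, num_layers,
# dropout (applied between stacked layers) and bidirectional.
lstm = nn.LSTM(input_size=200, hidden_size=200, num_layers=2,
               dropout=0.1, bidirectional=True)   # batch_first defaults to False

x = torch.randn(64, 32, 200)           # (seq_len, batch, input_size)
output, (h_n, c_n) = lstm(x)
print(output.shape)                    # (seq_len, batch, num_directions * hidden_size) -> (64, 32, 400)
print(h_n.shape, c_n.shape)            # (num_layers * num_directions, batch, hidden_size) -> (4, 32, 200)
```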
Part 1: Data
Tencent's 8-million-word pretrained static word vectors: Tencent AI Lab Embedding Corpus for Chinese Words and Phrases
How to load them: see the companion post 腾讯词向量使用 (a loading sketch follows below)
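A minimal loading sketch with gensim's `KeyedVectors` (the same call that `build_embedding` further below uses; the path and `limit` follow the Config values, and `limit` keeps memory usage manageable):

```python
from gensim.models import KeyedVectors

# Load only the first 500k vectors of the (very large) text-format file.
w2v = KeyedVectors.load_word2vec_format('./data/Tencent_AILab_ChineseEmbedding.txt',
                                        binary=False, limit=500000)
print(w2v.vector_size)       # 200
print(w2v['中国'][:5])        # first 5 dimensions of one word vector (assuming the word is within the loaded subset)
```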
Part 2: Overall pipeline
0. Config class for hyperparameters and paths
class Config():
    model_name = 'lstm_attention'   # available models: "lstm_attention", "lstm"
    learning_rate = 0.0006          # learning rate
    max_seq = 64                    # max sequence length fed to the LSTM; the dataloader trims each batch to that batch's longest real length, so the actual per-batch length may be shorter
    batch_size = 32                 # batch size
    epochs = 200                    # number of training epochs
    embedding_dim = 200             # word embedding dimension
    layer_num = 2                   # number of LSTM layers
    num_classes = 2                 # number of label classes
    dropout = 0.1                   # dropout probability (keep probability is 1 - dropout)
    bidirectional = True            # whether to use a bidirectional LSTM
    hidden_dim = 200                # LSTM hidden_size
    vocab_most_common = 55000       # keep the top 55,000 words by frequency (the full vocab has 64,221 words)
    pretrain_w2v_limit = 500000     # number of Tencent pretrained embeddings to load
    w2v_grad = True                 # whether the word embeddings are trainable
    focal_loss = False              # whether to use focal loss
    num_workers = 4                 # number of dataloader workers
    info_interval = 160             # log every N batches during training
    stop_word_path = './data/stopword.txt'                      # stop-word file
    pretrain_w2v = './data/Tencent_AILab_ChineseEmbedding.txt'  # Tencent 8M-word pretrained static vectors
    vocab_save_path = './word2vec/Vocab_MostCommon{}.txt'.format(vocab_most_common)  # vocab filtered (stop words, low-frequency words) and sorted by frequency
    embedding_path = './word2vec/Embedding_MostCommon{}_PretrainLimit{}.txt'.format(vocab_most_common, pretrain_w2v_limit)
    source_data = './data/online_shopping_10_cats.csv'
    train_data = './data/train.txt'
    val_data = './data/validation.txt'
    test_data = './data/test.txt'
    predict_data = './data/predict.txt'   # data to run prediction on
    checkpoint = './model/{}.ckpt'.format(model_name)
1. Data preprocessing
(1) Deduplicate the data and split it into train / validation / test sets
class CreateModelData():
"""
给定 一个csv原始数据分成3分,生成 7:3:1的train,数据,格式为:target text
"""
def __init__(self):
pass
def load_csv_data(self,csv_data):
"""
加载、去重、shuffle
"""
source_df = pd.read_csv(csv_data)
        # strip leading/trailing whitespace from the text column
        source_df.iloc[:,-1] = source_df.iloc[:,-1].str.strip()
        # drop rows with any missing value and remove duplicate rows
        source_df = source_df.dropna(how='any')
        source_df = source_df.drop_duplicates()
        # shuffle the rows
        index_shuffle = np.random.permutation(len(source_df))
        source_df = source_df.iloc[index_shuffle,:]
return source_df
def split_data_to_train_eval_test(self,dataframe):
"""
对每个一cat类型、label、类别别分割为tran、eval、test,分割比例 7:2:1
"""
cats = dataframe.loc[:,'cat'].unique()
labels = dataframe.loc[:,'label'].unique()
train_df = pd.DataFrame(columns=dataframe.columns[-2:])
val_df = pd.DataFrame(columns=dataframe.columns[-2:])
test_df = pd.DataFrame(columns=dataframe.columns[-2:])
for cat in cats:
dataframe_cat = dataframe[dataframe.loc[:,'cat'] == cat].loc[:,dataframe.columns[-2:]]
for label in labels:
dataframe_label = dataframe_cat[dataframe_cat.loc[:,'label'] == label]
size = dataframe_label.shape[0]
train_end_idx = int(size * 0.7)
val_end_idx = int(size * 0.9)
train_df = pd.concat([train_df,dataframe_label.iloc[:train_end_idx,:]],axis=0)
val_df = pd.concat([val_df, dataframe_label.iloc[train_end_idx:val_end_idx, :]], axis=0)
test_df = pd.concat([test_df, dataframe_label.iloc[val_end_idx:, :]], axis=0)
return train_df,val_df,test_df
def save_csv(self,dataframe,path):
"""
保存文件为 csv
"""
dataframe.to_csv(path,sep='\t',header=None,index=None)
def forward(self,source_data_path):
"""
执行函数
"""
source_df = self.load_csv_data(csv_data = source_data_path)
        # split 7:2:1 into train / val / test
        train_df,val_df,test_df = self.split_data_to_train_eval_test(dataframe=source_df)
        # save
        print("Source data: {} rows. After splitting -> train:{}  val:{}  test:{}, saved to '{}' / '{}' / '{}'".format(len(source_df),
              len(train_df),len(val_df),len(test_df),'./data/train.data','./data/val.data','./data/test.data'))
self.save_csv(train_df,'./data/train.data')
self.save_csv(val_df,'./data/val.data')
self.save_csv(test_df,'./data/test.data')
(2) Tokenization; removal of noise fields, whitespace and digits; lowercasing; stop-word filtering
- Tokenization with jieba
- Removal of noise fields, whitespace, digits and crawler tags; English lowercasing; stop-word dictionary (a small regex sketch for the explicit noise/digit cleanup follows below)
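The class below leans on the stop-word list for most of this cleanup; if you also want to strip digits and HTML-like crawler tags explicitly, a small pre-filter could look like the following (a hypothetical helper, not part of the original pipeline):

```python
import re

def clean_text(text: str) -> str:
    """Remove HTML-like tags, digits and redundant whitespace before tokenization."""
    text = re.sub(r'<[^>]+>', ' ', text)       # crawler/HTML tags such as <br/>
    text = re.sub(r'\d+', ' ', text)           # digits
    text = re.sub(r'\s+', ' ', text).strip()   # collapse whitespace
    return text.lower()                        # lowercase English letters
```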
# data that comes with labels
class DataProcessWithTarget():
    """
    ************* Preprocessing for the train / validation / test sets (files contain a target column) **************
    The data goes through:
    ① jieba tokenization
    ② stop-word removal (low-frequency words are removed later, when the vocab is built), whitespace stripping and English lowercasing
    ③ saving the tokenized result
    """
def __init__(self):
pass
def load_csv(self,path):
data_df = pd.read_csv(path,sep='\t',header=None)
target = data_df.iloc[:,-2]
data = data_df.iloc[:,-1]
return data,target
def load_stopword(self, path):
"""
加载停止词
"""
stop_word = []
with open(path, 'r', encoding='utf-8-sig') as f:
for line in f:
line = line.strip()
if line:
stop_word.append(line)
return stop_word
def jieba_(self,text,stop_word):
"""
jieba 分词的函数
① 这里我进行停止词
② 原始数据噪声词、空格、数字、爬虫标签、英文大小写预处理
"""
words = jieba.lcut(text)
words_list = []
# 对单词的预处理:
for word in words:
if word not in stop_word:
# 去除分词中的空格,并且将英文转化为小写
word = word.strip()
word = word.lower()
if word:
words_list.append(word)
return words_list
def save_file(self,target,data,path):
        if len(target) != len(data):
            raise Exception('target and data lengths do not match!')
with open(path,'w',encoding='utf-8') as w:
for idx in range(len(data)):
word_str = ' '.join(data[idx])
w.write(str(target[idx]))
w.write('\t')
w.write(word_str)
w.write('\n')
def forward(self,source_path,stop_word_path,report_path):
"""
主函数
return 分词结果 X,标签 target
"""
print('正在预处理:"{}"数据,处理后保存至:"{}",请稍等...'.format(source_path,report_path))
# 加载csv
data,target = self.load_csv(path=source_path)
# 加载 stop word
stop_word = self.load_stopword(stop_word_path)
# 分词、停止词、原始数据噪声词、空格、数字、爬虫标签、英文大小写预处理
data_list = []
target_list = []
for idx in range(len(target)):
word_list = self.jieba_(data.iloc[idx],stop_word=stop_word)
if word_list:
data_list.append(word_list)
target_list.append(target.iloc[idx])
else:
print('数据:"{}",行号:{}数据预处理后有空值,去除处理'.format(source_path,idx+1))
# 保存
self.save_file(target=target_list,data=data_list,path = report_path)
return data_list,target_list
# unlabeled data at prediction time; preprocessing must match the train/val/test pipeline exactly
class DataProcessNoTarget():
    """
    Preprocessing for the data the model predicts on after deployment (must be identical to the preprocessing used at training time)
    return: the predict-set X array
    """
def __init__(self):
pass
def load_data(self,path):
text_list = []
with open(path,'r',encoding='utf-8') as f:
for line in f:
line = line.strip()
if line:
text_list.append(line)
return text_list
def load_stopword(self, path):
"""
加载停止词
"""
stop_word = []
with open(path, 'r', encoding='utf-8-sig') as f:
for line in f:
line = line.strip()
if line:
stop_word.append(line)
return stop_word
def jieba_(self,text,stop_word):
"""
jieba 分词的函数
① 这里我进行停止词
② 原始数据噪声词、空格、数字、爬虫标签、英文大小写预处理
③ 映射为 id,并截取填充
"""
words = jieba.lcut(text)
words_list = []
# 对单词的预处理:
for word in words:
if word not in stop_word:
# 去除分词中的空格,并且将英文转化为小写
word = word.strip()
word = word.lower()
if word:
words_list.append(word)
return words_list
def data_2_id(self,vocab_2_id, max_seq, text):
"""
将 text 数据生成 model 输入数据 X 与 label。
通过 vocab 映射为 id
① 确定文本的最长长度,超过进行截取,不足的用 PAD 填充
② 由于vocab去除了低词频的词,所以也要用到 UNK 标签
return: X矩阵,2D 维度 numpy,Y 向量 1D 维度 numpy
"""
def padding(max_seq, X):
""" Pad 或 截取到相同长度,pad的值放在真实数据的前面 """
if len(X) < max_seq:
while len(X) < max_seq:
X.insert(0,vocab_2_id['<PAD>'])
else:
X = X[:max_seq]
return X
X = []
for line in text:
# mapping 为 id,注意 UNK 标签
line = [vocab_2_id[word] if word in vocab_2_id else vocab_2_id["<UNK>"] for word in line]
# padding 或 截取 为 固定长度,pad的值放在真实数据的前面
line = padding(max_seq=max_seq, X=line)
# 保存 X
X.append(line)
return np.array(X)
def forward(self,source_path,stop_word_path,vocab_2_id,max_seq):
"""
主函数
return predict数据映射的id numpy 矩阵
"""
print('正在预处理:"{}"数据,请稍等...'.format(source_path))
# 加载csv
data = self.load_data(path=source_path)
# 加载 stop word
stop_word = self.load_stopword(stop_word_path)
# 分词、停止词、原始数据噪声词、空格、数字、爬虫标签、英文大小写预处理
data_list = []
for idx in range(len(data)):
word_list = self.jieba_(data[idx],stop_word=stop_word)
if word_list:
data_list.append(word_list)
else:
print('数据:"{}",行号:{}数据预处理后有空值,去除处理'.format(source_path,idx+1))
# 映射填充为id
data = self.data_2_id(vocab_2_id=vocab_2_id,max_seq=max_seq,text=data_list)
return data
2. Build the vocab and the static word-embedding table
(1) Build the vocab
- Build the vocab from train data and val data together (the code adds the <PAD> and <UNK> tokens)
- Remove low-frequency words: keep only the top most_common words by frequency
def build_vocab(train_data,val_data,save_path,most_common = None):
"""
使用 train data 和 val data 共同生成vocab,添加标签 <PAD> <UNK>,使用过滤词,词频从高到低排序
① 低频词去除【保留前 most_common 个词】
"""
vocab_dict = {}
paths = [train_data,val_data]
for _path in paths:
with open(_path,'r',encoding='utf-8-sig') as f:
for line in f:
line = line.strip()
if line:
                    word_list = line.split()[1:]  # .split() with no argument splits on any whitespace; [1:] skips the label
for word in word_list:
if word not in vocab_dict:
vocab_dict[word] = 1
else:
vocab_dict[word] = vocab_dict[word] + 1
    # keep only the top most_common words
    if most_common is not None:
        ordered_vocab = Counter(vocab_dict).most_common(most_common)
    else:
        ordered_vocab = Counter(vocab_dict).most_common()  # no argument returns all words, sorted by frequency
# 建立 vocab2id 字典,并加入 <PAD> <UNK> 标签
vocab_dict = collections.OrderedDict()
vocab_dict["<PAD>"] = 0
vocab_dict["<UNK>"] = 1
for word,counts in ordered_vocab:
if word not in vocab_dict:
vocab_dict[word] = len(vocab_dict)
# 保存 vocab_2_id
vocab_size = len(vocab_dict)
with open(save_path,'w',encoding = 'utf-8') as w:
for idx,(k,v) in enumerate(vocab_dict.items()):
w.write('{}\t{}'.format(k,v))
if idx + 1 < vocab_size:
w.write('\n')
return vocab_dict
(2) Build the static word-embedding table
- Use the vocab and the Tencent pretrained vectors to build an embedding table with one row per vocab entry
def build_embedding(vocab_2_id,pretrain_w2v,save_path):
"""
使用 腾讯 预训练的词向量构建预训练词向量表, 用 numpy 保存txt格式数组
"""
# 加载腾讯词向量,limit 用于限制加载词向量的个数
pretrain_w2v_model = KeyedVectors.load_word2vec_format(pretrain_w2v,binary=False,limit=config.pretrain_w2v_limit) # limit 用于限制加载词汇表大小
# 初始化 embedding table
vocab_dim = len(vocab_2_id)
embed_dim = pretrain_w2v_model.vector_size
embedding_table = np.random.uniform(-1.,1.,(vocab_dim,embed_dim))
# 将 预训练词向量 对embedding表进行赋值
for word,index in vocab_2_id.items():
try:
embedding_table[index] = pretrain_w2v_model[word]
except KeyError:
pass
# 保存 embedding 表
np.savetxt(save_path,embedding_table)
return embedding_table
(3) Pad/truncate and convert to ids
def data_2_id(vocab_2_id,max_seq,file_path):
"""
将 text 数据生成 model 输入数据 X 与 label。
通过 vocab 映射为 id
① 确定文本的最长长度,超过进行截取,不足的用 PAD 填充
② 由于vocab去除了低词频的词,所以也要用到 UNK 标签
return: X矩阵,2D 维度 numpy,Y 向量 1D 维度 numpy
"""
def padding(max_seq,X):
""" Pad 或 截取到相同长度,pad的值放在真实数据的前面 """
if len(X) < max_seq:
while len(X) < max_seq:
X.insert(0,vocab_2_id['<PAD>'])
else:
X = X[:max_seq]
return X
label = []
X = []
with open(file_path,'r',encoding='utf-8-sig') as f:
for line in f:
line = line.strip()
if line:
line_list = line.split() # .split() 默认使用任意个空格作为分隔符
# 获取 label 标签
label.append(int(line_list[0])) # 标签需要用 int 转化
# 获取 X
X_tmp = line_list[1:]
# mapping 为 id,注意 UNK 标签
X_tmp = [vocab_2_id[word] if word in vocab_2_id else vocab_2_id["<UNK>"] for word in X_tmp ]
# padding 或 截取 为 固定长度,pad的值放在真实数据的前面
X_tmp = padding(max_seq=max_seq,X=X_tmp)
# 保存 X
X.append(X_tmp)
return np.array(X),np.array(label)
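Tying the pieces together, a typical one-off preparation run might look like this (a sketch using the functions and `Config` fields defined above; `build_embedding` reads `Config.pretrain_w2v_limit` internally):

```python
# Build the vocab from train+val, then the embedding table, then the id matrices.
vocab_2_id = build_vocab(Config.train_data, Config.val_data,
                         Config.vocab_save_path, most_common=Config.vocab_most_common)
embedding_table = build_embedding(vocab_2_id, Config.pretrain_w2v, Config.embedding_path)
X_train, y_train = data_2_id(vocab_2_id, Config.max_seq, Config.train_data)
print(X_train.shape, y_train.shape)   # (num_samples, max_seq), (num_samples,)
```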
3. Dataset and DataLoader
- Map the data to ids and pad/truncate every sequence to a fixed length
- Build the Dataset and DataLoader
class Data_Set(Dataset):
"""
生成 dataset
"""
def __init__(self,X,Label=None):
"""
X: 2D numpy int64
Label: 1D numpy int64
"""
self.X = X
self.Label = Label
def __len__(self):
return len(self.X)
def __getitem__(self,idx):
if self.Label is not None:
X = torch.tensor(self.X[idx],dtype=torch.int64) # 使用torch默认的整形数据
Label = torch.tensor(self.Label[idx],dtype=torch.int64)
return X,Label
# 考虑predict阶段没有label
else:
X = torch.tensor(self.X[idx],dtype=torch.int64)
return X
def collate_fn(batch):
"""
参数:batch 是 list 类型
DataLoader 中定义的 collate_fn 函数,用于对一个batch的数据进行处理
② 将 batch 数据转化为tensor
① 去除一个batch中多余的 PAD ,将数据最长长度调整为batch中最长样本的真实长度
"""
def intercept(X):
"""
X dim: [batch,T]
将tensor截取为真实值的最长度,要注意PAD必须为0才可执行
"""
max_seq = torch.max(torch.sum(X>=1,dim=1))
return X[:,-max_seq:]
X_list = []
label_list =[]
for item in batch:
if isinstance(item, tuple):
X,target_label = item # X dim: [batch,T]
if not (torch.is_tensor(X) and torch.is_tensor(target_label)):
X = torch.tensor(X)
target_label = torch.tensor(target_label)
X_list.append(X)
label_list.append(target_label)
# 考虑到预测没有标签
else:
X = item
if not torch.is_tensor(X):
X = torch.tensor(X)
X_list.append(X)
if label_list:
X = torch.stack(X_list,dim=0) # X dim: [batch,T]
label = torch.stack(label_list,dim=0)
return intercept(X), label
else:
X = torch.stack(X_list,dim=0) # X dim: [batch,T]
return intercept(X)
def get_vocab(file_path):
"""
加载 vocab_2_id
"""
vocab_dict = collections.OrderedDict()
with open(file_path,'r',encoding='utf-8-sig') as f:
for line in f:
line = line.strip()
if line:
key,value = line.split()
vocab_dict[key] = int(value)
return vocab_dict
def get_pretrain_embedding(file_path):
"""
加载 腾讯预训练 embedding
"""
embedding = np.loadtxt(file_path)
return embedding
def sort_eval(X,Y=None):
"""
X: 2D
接受验证集与测试集的 X Y array,对其真实长度从到小进行排序
return 验证集与测试集排序后的 X,Y
"""
if Y is not None:
seq_len = np.sum(X>0,axis=1)
datas = list(zip(X,Y,seq_len))
datas = sorted(datas,key=lambda i:i[-1])
X,Y,_ = zip(*datas)
return X,Y
    else:
        seq_len = np.sum(X > 0, axis=1)
        datas = list(zip(X, seq_len))
        datas = sorted(datas, key=lambda i: i[-1])
        X, _ = zip(*datas)
        return X
if __name__ == '__main__':
pass
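A quick sanity check of the `collate_fn` trimming behaviour (a sketch; run it where `collate_fn` is importable, and note it relies on `<PAD>` having id 0, which is exactly why the vocab fixes `<PAD>` to 0):

```python
import torch

# Two left-padded samples of real length 4 and 3, padded to max_seq = 6.
batch = [(torch.tensor([0, 0, 5, 6, 7, 8]), torch.tensor(1)),
         (torch.tensor([0, 0, 0, 9, 10, 11]), torch.tensor(0))]
X, y = collate_fn(batch)
print(X.shape)   # torch.Size([2, 4]) -- trimmed to the longest real length in the batch
print(X)         # tensor([[ 5,  6,  7,  8], [ 0,  9, 10, 11]])
```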
4. Model definition
class LSTM_Model(nn.Module):
def __init__(self,
vocab_size,
n_class,
embedding_dim,
hidden_dim,
num_layers,
dropout,
bidirectional,
embedding_weights=None,
train_w2v=True,
**kwargs):
super(LSTM_Model, self).__init__()
self.vocab_size = vocab_size
self.n_class = n_class
self.embedding_dim = embedding_dim
self.hidden_dim = hidden_dim
self.num_layers = num_layers
self.dropout = dropout
self.bidirectional = bidirectional
self.embedding_weights = embedding_weights
self.train_w2v = train_w2v
# 构建 embedding 层
if self.embedding_weights is not None:
            self.embedding_weights = torch.tensor(self.embedding_weights,
                                                  dtype=torch.float32)  # numpy defaults to float64; convert to float32 so the dtype matches the model weights
self.embedding = nn.Embedding.from_pretrained(self.embedding_weights)
self.embedding.weight.requires_grad = self.train_w2v
else: # 保证预测的情况无需传入 预训练的embedding表
self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
self.embedding.weight.requires_grad = self.train_w2v
nn.init.uniform_(self.embedding.weight, -1., 1.)
# 构建 lstm
self.lstm = nn.LSTM(input_size=self.embedding_dim,
hidden_size=self.hidden_dim,
num_layers=self.num_layers,
dropout=self.dropout,
bidirectional=self.bidirectional)
        # bidirectional
        if self.bidirectional:
            # FC layers; the outputs of the first and last time steps are concatenated, hence 4 * hidden_dim
            self.fc1 = nn.Linear(4 * self.hidden_dim, self.hidden_dim)
self.fc2 = nn.Linear(self.hidden_dim, self.n_class)
else:
# FC
self.fc1 = nn.Linear(self.hidden_dim, self.n_class)
def forward(self, x):
# 0、embedding
embeddings = self.embedding(x) # (B,T) --> (B,T,D)
# 1、LSTM
outputs, states = self.lstm(embeddings.permute([1, 0, 2])) # lstm 默认 输入维度为 (seq,batch,dim),因此这里需要进行转换
if self.bidirectional:
input_tmp = torch.cat([outputs[0],outputs[-1]],dim=-1)
outputs = F.relu(self.fc1(input_tmp))
outputs = self.fc2(outputs)
else:
outputs = self.fc1(outputs[-1])
return outputs
class LSTM_Attention(nn.Module):
def __init__(self,
vocab_size,
n_class,
embedding_dim,
hidden_dim,
num_layers,
dropout,
bidirectional,
embedding_weights = None,
train_w2v=True,
**kwargs):
super(LSTM_Attention,self).__init__()
self.vocab_size = vocab_size
self.n_class = n_class
self.embedding_dim = embedding_dim
self.hidden_dim = hidden_dim
self.num_layers = num_layers
self.dropout = dropout
self.bidirectional = bidirectional
self.embedding_weights = embedding_weights
self.train_w2v = train_w2v
# 构建 embedding 层
if self.embedding_weights is not None:
            self.embedding_weights = torch.tensor(self.embedding_weights,dtype=torch.float32)  # numpy defaults to float64; convert to float32 so the dtype matches the model weights
self.embedding = nn.Embedding.from_pretrained(self.embedding_weights)
self.embedding.weight.requires_grad = self.train_w2v
else: # 保证预测的情况无需传入 预训练的embedding表
self.embedding = nn.Embedding(self.vocab_size,self.embedding_dim)
self.embedding.weight.requires_grad = self.train_w2v
nn.init.uniform_(self.embedding.weight,-1.,1.)
# 构建 lstm
self.lstm = nn.LSTM(input_size=self.embedding_dim,
hidden_size=self.hidden_dim,
num_layers=self.num_layers,
dropout=self.dropout,
bidirectional=self.bidirectional)
# 双向
if self.bidirectional:
# attention
self.attention1 = nn.Linear(2 * self.hidden_dim,2 * self.hidden_dim)
self.attention2 = nn.Linear(2 * self.hidden_dim,1)
# FC
self.fc1 = nn.Linear(2 * self.hidden_dim, self.hidden_dim)
self.fc2 = nn.Linear(self.hidden_dim,self.n_class)
else:
# attention
self.attention1 = nn.Linear(self.hidden_dim, self.hidden_dim)
self.attention2 = nn.Linear(self.hidden_dim,1)
# FC
self.fc1 = nn.Linear(self.hidden_dim, self.hidden_dim)
self.fc2 = nn.Linear(self.hidden_dim,self.n_class)
def forward(self,x):
# 0、embedding
embeddings = self.embedding(x) # (B,T) --> (B,T,D)
# 1、LSTM
outputs,states = self.lstm(embeddings.permute([1,0,2])) # lstm 默认 输入维度为 (seq,batch,dim),因此这里需要进行转换
T,B,D = outputs.size() # D = 2 * hidden_dim
outputs = outputs.permute([1,0,2])
# attention
u = torch.tanh(self.attention1(outputs))
v = self.attention2(u)
att_scores = F.softmax(v,dim=1)
encoding = torch.sum(torch.mul(outputs,att_scores),dim=1)
# FC
outputs = F.relu6(self.fc1(encoding))
outputs=self.fc2(outputs)
return outputs
if __name__ == '__main__':
lstm_attention = LSTM_Attention(10000,2,200,256,2,0.2,bidirectional=True,embedding_weights=None,train_w2v=True)
print(lstm_attention)
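For reference, the attention in `LSTM_Attention.forward` is plain additive self-attention over the time steps; in formula form it computes

$$u_t = \tanh(W_1 h_t + b_1),\qquad \alpha_t = \operatorname{softmax}_t\!\left(w_2^\top u_t + b_2\right),\qquad c = \sum_t \alpha_t h_t ,$$

where $h_t$ is the (bi)LSTM output at step $t$ (`attention1` plays the role of $W_1$, `attention2` the role of $w_2$) and the context vector $c$ is what `fc1`/`fc2` classify.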
5. Model instantiation
# build the model
if Config.model_name == 'lstm_attention':
model = LSTM_Attention( vocab_size = len(vocab_2_id),
n_class = Config.num_classes,
embedding_dim = Config.embedding_dim,
hidden_dim = Config.hidden_dim,
num_layers = Config.layer_num,
dropout = Config.dropout,
bidirectional = Config.bidirectional,
embedding_weights = embedding_table,
train_w2v = Config.w2v_grad
)
# print(model.embedding.weight)
else:
model = LSTM_Model(vocab_size = len(vocab_2_id),
n_class = Config.num_classes,
embedding_dim = Config.embedding_dim,
hidden_dim = Config.hidden_dim,
num_layers = Config.layer_num,
dropout = Config.dropout,
bidirectional = Config.bidirectional,
embedding_weights = embedding_table,
train_w2v = Config.w2v_grad
)
print('Model-"{}" 细节:\n'.format(Config.model_name),model)
view_will_trained_params(model,model_name=Config.model_name)
6. Optimizer (layer-wise learning rates), loss function, learning-rate scheduler
# optimizer with layer-wise learning rates
# the embedding layer is initialized from the Tencent pretrained vectors, so it gets a smaller learning rate, typically around one tenth of the normal one
special_layers = nn.ModuleList([model.embedding])
# collect the ids of the special layer's parameter tensors
special_layers_ids = list(map(lambda x: id(x), special_layers.parameters()))
# parameters of the remaining (basic) layers
basic_params = filter(lambda x: id(x) not in special_layers_ids, model.parameters())
optimizer = optim.Adam([{'params': filter(lambda p: p.requires_grad, basic_params)},
{'params': filter(lambda p: p.requires_grad, special_layers.parameters()), 'lr': 8e-5}],
lr=Config.learning_rate)
import torch
import numpy as np
import torch.nn.functional as F
import math
from sklearn.metrics import confusion_matrix,accuracy_score,recall_score,precision_score,f1_score
def view_will_trained_params(model,model_name):
"""
********** 查看模型哪些层的参数参与训练,哪些层的参数被固定了 ************
"""
train_params = []
for name,param in model.named_parameters():
if param.requires_grad == True:
train_params.append((name,param.shape))
print("\n{} 模型将要参与训练的层为:\n".format(model_name),train_params,end='\n\n\n')
def get_device():
dev = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(dev)
return device
def focal_loss(output, target, alpha=1.0, gamma=2.0, *args, **kwargs):
"""
********** 给定模型前向传播的输出[batch,class]与真实值target[class,],计算loss误差 ************
1. 仅仅在训练的时候使用 focal_loss ,验证时不使用 focal_loss
2. 默认情况下不进行聚合
"""
assert np.ndim(output) == 2
assert np.ndim(target) == 1
assert len(output) == len(target)
ce_loss = F.cross_entropy(input=output, target=target, reduction="none") # 这里必须使用 none 模式, ce_loss dim: [B,]
pt = torch.exp(-ce_loss) # pt dim: [B,]
# 构建 focal_loss
focalloss = (alpha * (torch.tensor(1.0) - pt) ** gamma * ce_loss).mean()
return focalloss
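# For reference, the quantity computed above is the standard focal loss
#     FL(p_t) = -alpha * (1 - p_t) ** gamma * log(p_t),   with p_t = exp(-CE),
# i.e. the cross entropy down-weighted for easy (high-confidence) samples, which
# focuses training on hard or minority-class examples.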
def cross_entropy(output, target, *args, **kwargs):
"""
普通的交叉熵损失函数,默认情况下不进行聚合
"""
assert np.ndim(output) == 2
assert np.ndim(target) == 1
assert len(output) == len(target)
ce_loss = F.cross_entropy(input=output, target=target, reduction="mean") # ce_loss 是一个均值
return ce_loss
class WarmupCosineLR():
def __init__(self,optimizer,warmup_iter:int,lrs_min:list = [1e-5,],T_max:int = 10):
"""
******************* pytorch自定义学习率 预热warmup + Cosline 余弦衰减 **************************
具体可看文章:https://blog.csdn.net/qq_36560894/article/details/114004799?utm_medium=distribute.pc_relevant.none-task-blog-2%7Edefault%7EBlogCommendFromMachineLearnPai2%7Edefault-13.control&depth_1-utm_source=distribute.pc_relevant.none-task-blog-2%7Edefault%7EBlogCommendFromMachineLearnPai2%7Edefault-13.control
Args:
optimizer (Optimizer): pytotch 优化器
warmup_iter: 预热的最大epoch
lrs_min: list, optimizer 学习率一一对应的最小值
T_max:余弦半周期,该值必须比 warmup_iter 大
特点:
① 支持分层学习率多组学习率衰减
"""
self.optimizer = optimizer
self.warmup_iter = warmup_iter
self.lrs_min = lrs_min
self.T_max = T_max
self.base_lrs = [i['lr'] for i in optimizer.param_groups]
def get_lr(self):
if self.iter < self.warmup_iter:
return [i * self.iter *1. / self.warmup_iter for i in self.base_lrs]
else:
return [self.lrs_min[idx] + 0.5*(i-self.lrs_min[idx])*(1.0+math.cos((self.iter-self.warmup_iter)/(self.T_max-self.warmup_iter)*math.pi)) \
for idx,i in enumerate(self.base_lrs)]
def step(self,iter:int):
if iter == 0:
iter = iter + 1
self.iter = iter
# 获取当前epoch学习率
decay_lrs = self.get_lr()
# 更新学习率
for param_group, lr in zip(self.optimizer.param_groups, decay_lrs):
param_group['lr'] = lr
def get_score(target,predict):
"""
给定真实的变迁target 与 预测的标签predict ,计算 acc、recall、precision、F1
"""
import warnings
warnings.filterwarnings('ignore')
assert np.ndim(target) == 1
assert np.ndim(predict) == 1
assert np.shape(target) == np.shape(predict)
con_matrix = confusion_matrix(y_true=target,y_pred=predict)
# 计算acc
acc = accuracy_score(y_true=target,y_pred=predict)
# 计算 macro recall
recall = recall_score(y_true=target,y_pred=predict,average='macro')
# 计算 macro precision
precision = precision_score(y_true=target,y_pred=predict,average='macro')
# 计算 macro F1
F1 = f1_score(y_true=target,y_pred=predict,average='macro')
return (acc,recall,precision,F1),con_matrix
if __name__ == "__main__":
    # 0. Warmup + cosine LR curve
import torch.optim as optim
import matplotlib.pyplot as plt
optimizer = optim.Adam(params=[torch.ones((3,4),requires_grad=True)],lr=0.01)
scheduler_ = WarmupCosineLR(optimizer,
warmup_iter=5,
lrs_min=[0.001,],
T_max=50)
lr = optimizer.param_groups[0]['lr']
print(lr)
y = []
x = []
for epoch in range(200):
scheduler_.step(epoch+1)
print(optimizer.param_groups[0]['lr'])
y.append(optimizer.param_groups[0]['lr'])
x.append(epoch+1)
plt.plot(x,y)
plt.show()
# 计算分值
y_t = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
y_p = [1,1,1,0,0,1,1,0,1,0,2,2,1,1,1,1,0,1,1]
print(get_score(y_t,y_p))
7. Training (train & eval) and saving the model state dict
Steps for using gradient clipping:
1. Compute the loss
2. Backpropagate the loss
3. Clip the gradients
4. Let the optimizer update the parameters
optimizer.zero_grad()
output = model(data)
loss = criterion(output, targets)
loss.backward()
# gradient clipping; note the trailing underscore: clip_grad_norm_ is the in-place, non-deprecated API
torch.nn.utils.clip_grad_norm_(filter(lambda p: p.requires_grad, model.parameters()), args.clip)
optimizer.step()
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import
from __future__ import with_statement
from model import LSTM_Attention,LSTM_Model
from data_process import data_2_id
from loader_utils import get_vocab,get_pretrain_embedding,Data_Set,collate_fn,sort_eval
from model_utils import view_will_trained_params,focal_loss,cross_entropy,WarmupCosineLR,get_score
from create_config import Config
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import copy
import os
def train_one_epoch(model,device,optimizer,loss_fun,metric_fun,train_loader,current_epoch,info_interval:int=None):
"""
********** 一个epoch模型训练 ************
关于 model.eval() model.train() with torch.no_grad() with torch.set_grad_enabled(bool) 区别
return:
① batch_losses:每个batch均值loss列表
② 整个epoch 的 acc,recall,precision,F1
"""
print('Training ... ')
model.train()
model.to(device)
LRs = [i['lr'] for i in optimizer.param_groups] # 获取当前epoch 优化器 optimizer 学习率组
batch_losses = []
batch_targets = []
batch_predicts = []
for idx, (input_x, target) in enumerate(train_loader):
input_x, target = input_x.to(device), target.to(device)
optimizer.zero_grad()
output = model(input_x) # 前向传播
loss = loss_fun(output, target, alpha=1.0, gamma=2.0)
loss.backward() # 反向传播计算梯度
optimizer.step() # 更新
batch_losses.append(loss.item())
# 计算score
pre = torch.argmax(output, dim=1)
pre = pre.cpu().numpy().reshape(-1).tolist()
target = target.cpu().numpy().reshape(-1).tolist()
(acc,recall,precision,F1),con_matrix = metric_fun(target=target,predict=pre)
batch_targets.extend(target)
batch_predicts.extend(pre)
if info_interval is not None:
if idx % info_interval == 0:
print("Epoch:{}\t[{}\{}\t\t{:.2f}%]\tLoss:{:.8f}\tScores: < acc:{:.3f}%\t"\
"macro_recall:{:.3f}%\tmacro_precision:{:.3f}%\tmacro_F1:{:.3f}%\t >\t\tBatch input_x shape:{}".format(
current_epoch, idx * len(input_x),
len(train_loader.dataset), 100. * (idx / len(train_loader)),loss.item(),
100. * acc,100. * recall,100. * precision,100. * F1,input_x.shape
))
# 计算一个epoch的score
(epoch_acc, epoch_recall, epoch_precision, epoch_F1), con_matrix = metric_fun(target=batch_targets, predict=batch_predicts)
print("Epoch Info :\tLoss:{:.8f}\tScores: <\tacc:{:.3f}%\t "\
"macro_recall:{:.3f}%\t macro_precision:{:.3f}%\t macro_F1:{:.3f}%\t>\tLRs:{}".format(
np.mean(batch_losses),100. * epoch_acc,100. * epoch_recall,100. * epoch_precision,100. * epoch_F1,LRs
))
return batch_losses,[epoch_acc, epoch_recall, epoch_precision, epoch_F1]
def eval_one_epoch(model,device,loss_fun,metric_fun,eval_loader):
"""
********** 一个epoch模型验证 ************
关于 model.eval() model.train() with torch.no_grad() with torch.set_grad_enabled(bool) 区别
return: batch_losses 每个batch均值loss列表,batch_scores 每个batch的 acc,recall,precision,F1
"""
print('Evaling ... ')
model.eval() # 开启与dropout、BN层,它不会阻止梯度的计算,只不过回传参数,因此,eval 模式使用 with torch.no_grad() 还是很有必要的,加快计算速度。
model.to(device)
batch_losses = []
batch_targets = []
batch_predicts = []
with torch.no_grad():
for idx, (input_x, target) in enumerate(eval_loader):
input_x, target = input_x.to(device), target.to(device)
output = model(input_x) # 前向传播
loss = loss_fun(output, target, alpha=1.0, gamma=2.0)
batch_losses.append(loss.item())
# 计算score
pre = torch.argmax(output, dim=1)
pre = pre.cpu().numpy().reshape(-1).tolist()
target = target.cpu().numpy().reshape(-1).tolist()
(acc, recall, precision, F1), con_matrix = metric_fun(target=target, predict=pre)
batch_targets.extend(target)
batch_predicts.extend(pre)
# 计算一个epoch的score
(epoch_acc, epoch_recall, epoch_precision, epoch_F1), con_matrix = metric_fun(target=batch_targets, predict=batch_predicts)
print(
"Epoch Info :\tLoss:{:.8f}\tScores: Scores: <\tacc:{:.3f}%\t "\
"macro_recall:{:.3f}%\t macro_precision:{:.3f}%\t macro_F1:{:.3f}%\t>".format(
np.mean(batch_losses), 100. * epoch_acc, 100. * epoch_recall,
100. * epoch_precision, 100. * epoch_F1
))
return batch_losses,[epoch_acc, epoch_recall, epoch_precision, epoch_F1]
def train(model,device,optimizer,scheduler_fun,loss_fun,epochs,metric_fun,info_interval,checkpoint,train_loader,eval_loader):
"""
********** 模型训练 ************
return:
① train_losses,eval_losses: 2D list ,(epoch,batch_num)
② train_scores,eval_scores: 2D list,(epoch,4)acc,recall,precision,F1
"""
# 判断加载已保留的最优的模型参数【支持断点续传】
best_scores = [-0.000001,-0.000001,-0.000001,-0.000001] # 定义初始的acc,recall,precision,F1的值
history_epoch,best_epoch = 0,0 # 定义历史训练模型epoch次数初始值、最优模型的epoch初始值
best_params = copy.deepcopy(model.state_dict()) # 获取模型的最佳参数,OrderDict属于链表,对其更该引用的变量也会变动,因此这里要用到深拷贝
best_optimizer = copy.deepcopy(optimizer.state_dict())
LRs = [i['lr'] for i in optimizer.param_groups]
if os.path.exists(checkpoint):
"""
为了保证 gpu/cpu 训练的模型参数可以相互加载,这里在load时使用 map_location=lambda storage, loc: storage 来控制,详情请看文章:
https://blog.csdn.net/nospeakmoreact/article/details/89634039?utm_medium=distribute.pc_relevant.none-task-blog-2%7Edefault%7EBlogCommendFromMachineLearnPai2%7Edefault-1.withoutpai&depth_1-utm_source=distribute.pc_relevant.none-task-blog-2%7Edefault%7EBlogCommendFromMachineLearnPai2%7Edefault-1.withoutpai
"""
if torch.cuda.is_available():
ck_dict = torch.load(checkpoint, map_location=lambda storage, loc: storage.cuda()) # 使用 gpu 读取 模型参数
else:
ck_dict = torch.load(checkpoint, map_location=lambda storage, loc: storage) # 使用 cpu 读取模型参数
best_scores = ck_dict['best_score']
history_epoch,best_epoch = ck_dict['epochs'],ck_dict['best_epochs']
model.load_state_dict(ck_dict['best_params'])
# optimizer.load_state_dict(ck_dict['optimizer'])
# if torch.cuda.is_available():
# """
# 重载optimizer的参数时将所有的tensor都放到cuda上(optimizer保存时默认放在cpu上了),详情见:
# https://blog.csdn.net/weixin_41848012/article/details/105675735
# """
# for state in optimizer.state.values():
# for k, v in state.items():
# if torch.is_tensor(v):
# state[k] = v.cuda()
best_params = copy.deepcopy(model.state_dict()) # 获取模型的最佳参数,OrderDict属于链表,对其更该引用的变量也会变动,因此这里要用到深拷贝
# best_optimizer = copy.deepcopy(optimizer.state_dict())
LRs = [i['lr'] for i in optimizer.param_groups]
print('From "{}" load history model params:\n\tTrained Epochs:{}\n\t'\
'Best Model Epoch:{}\n\t各层学习率 LRs 为:{}\n\tBest Score:<\tacc:{:.3f}%\t'\
' macro_recall:{:.3f}%\t macro_precision:{:.3f}%\t macro_F1:{:.3f}%\t>\n'.format(
checkpoint, history_epoch,best_epoch,LRs
, 100. * best_scores[0],100. * best_scores[1]
,100. * best_scores[2],100. * best_scores[3]))
# print(best_params)
# print(best_optimizer)
# Train
train_losses =[]
eval_losses = []
train_scores = []
eval_scores = []
for epoch in range(1,epochs + 1):
        # set this epoch's learning rates
        scheduler_fun.step(history_epoch + epoch)  # use the cumulative epoch count so the LR keeps following the warmup + cosine schedule after resuming
LRs = [i['lr'] for i in optimizer.param_groups]
# train & eval
train_batch_loss,train_score = train_one_epoch(model=model,
device=device,
optimizer=optimizer,
loss_fun=loss_fun,
metric_fun=metric_fun,
train_loader=train_loader,
current_epoch=history_epoch+epoch,
info_interval=info_interval)
print()
eval_batch_loss,eval_score = eval_one_epoch(model=model,
device=device,
loss_fun=loss_fun,
metric_fun=metric_fun,
eval_loader=eval_loader)
train_losses.append(train_batch_loss)
eval_losses.append(eval_batch_loss)
train_scores.append(train_score)
eval_scores.append(eval_score)
        # save the model when the validation F1 exceeds the best F1 so far
        if best_scores[3] < eval_score[3]:
            print('Previous best F1: {:.3f}%, new F1: {:.3f}%, optimizer LRs: {}; saving updated model parameters\n'.format(100.*best_scores[3],100.*eval_score[3],LRs))
best_scores = eval_score
best_params = copy.deepcopy(model.state_dict())
best_optimizer = copy.deepcopy(optimizer.state_dict())
best_epoch = history_epoch + epoch
        else:
            print("Best epoch so far: {}, best validation F1: {:.3f}%; no improvement this epoch\n".format(best_epoch,100.* best_scores[3]))
ck_dict = {
"best_score":best_scores,
"best_params":best_params,
"optimizer":best_optimizer,
'epochs':history_epoch + epoch,
'best_epochs':best_epoch
}
torch.save(ck_dict,checkpoint)
# 训练结束,将模型赋予最优的参数
model.load_state_dict(best_params)
return model,train_losses,eval_losses,train_scores,eval_scores
if __name__ == '__main__':
dev = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(dev)
    # load data
    vocab_2_id = get_vocab(Config.vocab_save_path)  # vocab size = vocab_most_common + 2 (<PAD>, <UNK>)
    embedding_table = get_pretrain_embedding(Config.embedding_path)  # (vocab_size, 200); numpy floats default to float64 and are converted to float32 tensors inside the model
# DataSet DataLoader
X_train,target_train = data_2_id(vocab_2_id,Config.max_seq,Config.train_data)
kwargs = {'num_workers':Config.num_workers,'pin_memory':True} if torch.cuda.is_available() else {'num_workers':Config.num_workers}
train_dataset = Data_Set(X_train,target_train)
train_loader = DataLoader(dataset=train_dataset,
batch_size=Config.batch_size,
shuffle=True,
collate_fn = collate_fn,
**kwargs
)
    print('First batch from the dataloader:')
    first_batch = next(iter(train_loader))
    print(first_batch, first_batch[0].shape)
X_val,target_val = data_2_id(vocab_2_id,Config.max_seq,Config.val_data)
    # sort the eval data by real length (short to long) so each batch wastes fewer PAD columns
X_val,target_val = sort_eval(X_val,target_val)
val_dataset = Data_Set(X_val,target_val)
val_loader = DataLoader(dataset=val_dataset,
batch_size=Config.batch_size,
shuffle=False,
collate_fn = collate_fn,
**kwargs
)
# 模型搭建
if Config.model_name == 'lstm_attention':
model = LSTM_Attention( vocab_size = len(vocab_2_id),
n_class = Config.num_classes,
embedding_dim = Config.embedding_dim,
hidden_dim = Config.hidden_dim,
num_layers = Config.layer_num,
dropout = Config.dropout,
bidirectional = Config.bidirectional,
embedding_weights = embedding_table,
train_w2v = Config.w2v_grad
)
# print(model.embedding.weight)
else:
model = LSTM_Model(vocab_size = len(vocab_2_id),
n_class = Config.num_classes,
embedding_dim = Config.embedding_dim,
hidden_dim = Config.hidden_dim,
num_layers = Config.layer_num,
dropout = Config.dropout,
bidirectional = Config.bidirectional,
embedding_weights = embedding_table,
train_w2v = Config.w2v_grad
)
print('Model-"{}" 细节:\n'.format(Config.model_name),model)
view_will_trained_params(model,model_name=Config.model_name)
# 优化器、学习率调整器、LOSS函数,设置分层学习率
special_layers = nn.ModuleList([model.embedding])
# 获取特等层的参数列表的内存id列表
special_layers_ids = list(map(lambda x: id(x), special_layers.parameters()))
# 基础层的参数列表
basic_params = filter(lambda x: id(x) not in special_layers_ids, model.parameters())
optimizer = optim.Adam([{'params': filter(lambda p: p.requires_grad, basic_params)},
{'params': filter(lambda p: p.requires_grad, special_layers.parameters()), 'lr': 8e-5}],
lr=Config.learning_rate)
scheduler_fun = WarmupCosineLR(optimizer,warmup_iter=4,lrs_min=[5e-5,1e-6],T_max=40)
# train
if Config.focal_loss:
loss_fun = focal_loss
else:
loss_fun = cross_entropy
train(model=model,
device=device,
optimizer=optimizer,
scheduler_fun=scheduler_fun,
loss_fun=loss_fun,
epochs=Config.epochs,
metric_fun=get_score,
info_interval=Config.info_interval,
checkpoint=Config.checkpoint,
train_loader=train_loader,
eval_loader=val_loader)
8. Test-set evaluation
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import
from __future__ import with_statement
from model import LSTM_Attention,LSTM_Model
from data_process import data_2_id
from loader_utils import get_vocab, Data_Set, collate_fn,sort_eval
from model_utils import get_score
from create_config import Config
from torch.utils.data import DataLoader
import torch
import numpy as np
import os
import re
def eval_one_epoch(model, device, metric_fun, eval_loader):
"""
********** 一个epoch模型验证 ************
"""
print('Predict ... ')
model.eval() # 开启与dropout、BN层,它不会阻止梯度的计算,只不过回传参数,因此,eval 模式使用 with torch.no_grad() 还是很有必要的,加快计算速度。
model.to(device)
batch_targets = []
batch_predicts = []
error_samples = []
with torch.no_grad():
for idx, (input_x, target) in enumerate(eval_loader):
input_x, target = input_x.to(device), target.to(device)
output = model(input_x) # 前向传播
# 计算score
pre = torch.argmax(output, dim=1)
error_x = input_x[target != pre]
error_target = pre[target != pre]
pre = pre.cpu().numpy().reshape(-1).tolist()
target = target.cpu().numpy().reshape(-1).tolist()
error_x = error_x.cpu().numpy().tolist()
error_target = error_target.cpu().numpy().tolist()
batch_targets.extend(target)
batch_predicts.extend(pre)
error_samples.append((error_target,error_x))
# 计算一个epoch的score
(epoch_acc, epoch_recall, epoch_precision, epoch_F1), con_matrix = metric_fun(target=batch_targets,
predict=batch_predicts)
print(
"Epoch Info :\tScores: Scores: <\tacc:{:.3f}%\t macro_recall:{:.3f}%\t"\
" macro_precision:{:.3f}%\t macro_F1:{:.3f}%\t>".format(100. * epoch_acc, 100. * epoch_recall,
100. * epoch_precision, 100. * epoch_F1
))
return [epoch_acc, epoch_recall, epoch_precision, epoch_F1],con_matrix,error_samples
def predict(model,device, metric_fun,checkpoint,predict_loader):
"""
********** 模型测试 ************
"""
# 判断加载已保留的最优的模型参数
if os.path.exists(checkpoint):
if torch.cuda.is_available():
ck_dict = torch.load(checkpoint, map_location=lambda storage, loc: storage.cuda()) # 使用 gpu 读取 模型参数
else:
ck_dict = torch.load(checkpoint, map_location=lambda storage, loc: storage) # 使用 cpu 读取模型参数
best_scores = ck_dict['best_score']
history_epoch,best_epoch = ck_dict['epochs'],ck_dict['best_epochs']
model.load_state_dict(ck_dict['best_params'])
print(
'From "{}" load history model params:\n\tTrained Epochs:{}\n\tBest Model Epoch:{}\n'\
'\tBest Score:<\tacc:{:.3f}%\t macro_recall:{:.3f}%\t macro_precision:{:.3f}%\t macro_F1:{:.3f}%\t>\n\t'.format(
checkpoint, history_epoch,best_epoch, 100. * best_scores[0], 100. * best_scores[1], 100. * best_scores[2],
100. * best_scores[3]))
# predict
eval_score,con_matrix,error_samples = eval_one_epoch(model=model,
device=device,
metric_fun=metric_fun,
eval_loader=predict_loader)
else:
print('Model not exists .... ')
eval_score = None
con_matrix = None
error_samples = None
exit()
return eval_score,con_matrix,error_samples
if __name__ == '__main__':
dev = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(dev)
    # load data; no need to load the pretrained embedding table here, the saved model parameters already contain the trained embeddings
    vocab_2_id = get_vocab(Config.vocab_save_path)  # vocab size = vocab_most_common + 2 (<PAD>, <UNK>)
# DataSet DataLoader
X_test, target_test = data_2_id(vocab_2_id, Config.max_seq, Config.test_data)
    # sort the test data by real length (short to long) so each batch wastes fewer PAD columns
X_test, target_test = sort_eval(X_test, target_test)
kwargs = {'num_workers': Config.num_workers, 'pin_memory': True} if torch.cuda.is_available() else {
'num_workers': Config.num_workers}
test_dataset = Data_Set(X_test, target_test)
test_loader = DataLoader(dataset=test_dataset,
batch_size=Config.batch_size,
shuffle=False,
collate_fn=collate_fn,
**kwargs
)
    print('First batch from the dataloader:')
    first_batch = next(iter(test_loader))
    print(first_batch, first_batch[0].shape)
# 模型搭建
if Config.model_name == 'lstm_attention':
model = LSTM_Attention(vocab_size=len(vocab_2_id),
n_class=Config.num_classes,
embedding_dim=Config.embedding_dim,
hidden_dim=Config.hidden_dim,
num_layers=Config.layer_num,
dropout=Config.dropout,
bidirectional=Config.bidirectional,
embedding_weights=None, # 预测的情况下会加载
train_w2v=Config.w2v_grad
)
# print(model.embedding.weight)
else:
model = LSTM_Model(vocab_size=len(vocab_2_id),
n_class=Config.num_classes,
embedding_dim=Config.embedding_dim,
hidden_dim=Config.hidden_dim,
num_layers=Config.layer_num,
dropout=Config.dropout,
bidirectional=Config.bidirectional,
embedding_weights=None,
train_w2v=Config.w2v_grad
)
print('Model-"{}" 细节:\n'.format(Config.model_name), model)
# predict
_,con_matrix,error_samples = predict(model=model,
device=device,
metric_fun=get_score,
checkpoint=Config.checkpoint,
predict_loader=test_loader)
    print('Confusion matrix:\n',con_matrix)
    # save the misclassified test samples
    print('Saving misclassified test samples to "{}"'.format('./data/test_error_sample.data'))
error_target,error_x = zip(*error_samples)
error_target_ = []
error_x_ =[]
for i in range(len(error_target)):
for j in range(len(error_target[i])):
error_target_.append(error_target[i][j])
error_x_.append(error_x[i][j])
print(len(error_target_),len(error_x_))
vocab_keys = list(vocab_2_id.keys())
error_x_ = [np.array(vocab_keys)[np.array(i)].tolist() for i in error_x_]
with open('./data/test_error_sample.data','w',encoding='utf-8') as w:
for idx in range(len(error_target_)):
word_str = ''.join(error_x_[idx])
            word_str = re.sub(r'<PAD>\s*', '', word_str)
w.write(str(error_target_[idx]))
w.write('\t')
w.write(word_str)
w.write('\n')
9. Prediction without targets
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2021/8/20 23:06
# @Author :
# @Site :
# @File : predict.py
# @Software: PyCharm
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import
from __future__ import with_statement
from model import LSTM_Attention, LSTM_Model
from data_process import data_2_id, DataProcessNoTarget
from loader_utils import get_vocab, Data_Set, collate_fn
from create_config import Config
from torch.utils.data import DataLoader
import torch
import torch.nn.functional as F
import os
import numpy as np
def eval_one_epoch(model, device, eval_loader):
"""
********** 一个epoch模型验证 ************
"""
print('Predict ... ')
model.eval() # 开启与dropout、BN层,它不会阻止梯度的计算,只不过回传参数,因此,eval 模式使用 with torch.no_grad() 还是很有必要的,加快计算速度。
model.to(device)
batch_predicts = []
batch_probs = []
with torch.no_grad():
for idx, input_x in enumerate(eval_loader):
input_x = input_x.to(device)
output = model(input_x) # 前向传播
output = F.softmax(output,dim=-1)
            # take the predicted class and its probability
            prob,pre = torch.max(output,dim=-1)
            prob = prob.cpu().numpy().reshape(-1).tolist()
            pre = pre.cpu().numpy().reshape(-1).tolist()
            batch_predicts.extend(pre)
            batch_probs.extend(prob)   # extend (not append) keeps both lists flat and aligned sample-by-sample
return np.array(batch_predicts),np.array(batch_probs)
def predict(model, device, checkpoint, predict_loader):
"""
********** 模型测试 ************
"""
# 判断加载已保留的最优的模型参数
if os.path.exists(checkpoint):
if torch.cuda.is_available():
ck_dict = torch.load(checkpoint, map_location=lambda storage, loc: storage.cuda()) # 使用 gpu 读取 模型参数
else:
ck_dict = torch.load(checkpoint, map_location=lambda storage, loc: storage) # 使用 cpu 读取模型参数
best_scores = ck_dict['best_score']
history_epoch, best_epoch = ck_dict['epochs'], ck_dict['best_epochs']
model.load_state_dict(ck_dict['best_params'])
print(
'From "{}" load history model params:\n\tTrained Epochs:{}\n\tBest Model Epoch:{}\n' \
'\tBest Score:<\tacc:{:.3f}%\t macro_recall:{:.3f}%\t macro_precision:{:.3f}%\t macro_F1:{:.3f}%\t>\n\t'.format(
checkpoint, history_epoch, best_epoch, 100. * best_scores[0], 100. * best_scores[1],
100. * best_scores[2],
100. * best_scores[3]))
# predict
predict_array,probs_array = eval_one_epoch(model=model,
device=device,
eval_loader=predict_loader)
else:
print('Model not exists .... ')
predict_array = None
probs_array = None
exit()
return predict_array,probs_array
if __name__ == '__main__':
dev = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(dev)
# 数据加载
vocab_2_id = get_vocab(Config.vocab_save_path) # 词汇表 50002
# DataSet DataLoader
X_predict = DataProcessNoTarget().forward(Config.predict_data,Config.stop_word_path,vocab_2_id,Config.max_seq)
kwargs = {'num_workers': Config.num_workers, 'pin_memory': True} if torch.cuda.is_available() else {
'num_workers': Config.num_workers}
predict_dataset = Data_Set(X_predict)
predict_loader = DataLoader(dataset=predict_dataset,
batch_size=Config.batch_size,
shuffle=False,
collate_fn=collate_fn,
**kwargs
)
    print('First batch from the dataloader:')
    first_batch = next(iter(predict_loader))
    print(first_batch, first_batch[0].shape)
# 模型搭建
if Config.model_name == 'lstm_attention':
model = LSTM_Attention(vocab_size=len(vocab_2_id),
n_class=Config.num_classes,
embedding_dim=Config.embedding_dim,
hidden_dim=Config.hidden_dim,
num_layers=Config.layer_num,
dropout=Config.dropout,
bidirectional=Config.bidirectional,
embedding_weights=None, # 预测的情况下会加载
train_w2v=Config.w2v_grad
)
# print(model.embedding.weight)
else:
model = LSTM_Model(vocab_size=len(vocab_2_id),
n_class=Config.num_classes,
embedding_dim=Config.embedding_dim,
hidden_dim=Config.hidden_dim,
num_layers=Config.layer_num,
dropout=Config.dropout,
bidirectional=Config.bidirectional,
embedding_weights=None,
train_w2v=Config.w2v_grad
)
print('Model-"{}" 细节:\n'.format(Config.model_name), model)
# predict
predict_array,probs_array = predict(model=model,
device=device,
checkpoint=Config.checkpoint,
predict_loader=predict_loader)
    print('Prediction results:\nlabels: {}\nconfidence: {}'.format(np.array(['讨厌','喜欢'])[predict_array],probs_array))
Part 3: Issues encountered during training
3.1 Model overfitting
1. First rule out problems with the model itself:
   1.1 Switching to a simpler model did not remove the overfitting, so a modelling bug was ruled out.
   1.2 How should a bidirectional LSTM distinguish outputs at PAD positions from outputs at real tokens?
       Forward direction: since the text is padded at the front, the output of the last time step of the forward pass is a representation of the whole sentence.
       Backward direction: in the backward pass the earliest time steps are all PAD positions, so their outputs should not be used; the sentence representation should instead be taken according to each sample's real length within the batch (see the sketch below).
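A sketch of how one could take the PAD positions into account when building the sentence representation from a left-padded BiLSTM output (hypothetical helper, not part of the original code; `lengths` would be the per-sample real lengths, e.g. `(x != 0).sum(dim=1)`):

```python
import torch

def bilstm_sentence_repr(outputs: torch.Tensor, lengths: torch.Tensor) -> torch.Tensor:
    """
    outputs: (T, B, 2*H) from a bidirectional LSTM fed left-padded batches
    lengths: (B,) real (non-PAD) length of each sample
    Returns (B, 2*H): forward state at the last time step concatenated with the
    backward state taken at each sample's first real token.
    """
    T, B, D = outputs.shape
    H = D // 2
    fwd_last = outputs[-1, :, :H]                          # forward direction has seen all real tokens by the last step
    first_real = T - lengths                               # index of the first non-PAD token per sample
    bwd_first = outputs[first_real, torch.arange(B), H:]   # backward direction at the first real token
    return torch.cat([fwd_last, bwd_first], dim=-1)
```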
Data problems:
1. Is the dataset imbalanced?
2. Some samples are noisy and may need to be cleaned with a machine-learning method such as OneClassSVM.
3. Was the raw data shuffled? Does train_dataloader shuffle, and are val_loader/test_loader instead sorted by real length from short to long (to avoid filling batches with too many PAD tokens)?
4. The stop-word dictionary and frequency-based filtering may remove useful words (in sentiment analysis, particles such as 哈, 吗, 呵 carry subjective tone and are better kept).
Training problems:
5. The collate_fn defined for the dataloader trims each batch to its longest real length, so different batches may have different sequence lengths.
6. If a pretrained component is used (here the pretrained embeddings), are layer-wise learning rates applied? Too large a learning rate makes the pretrained weights oscillate.
7. Choice of learning rate, loss function (e.g. focal loss) and learning-rate decay strategy.
   Focal loss mainly targets class imbalance and correctly labeled but hard samples; if the data contains a lot of noise, however, it can lower accuracy.
3.2 Frequently asked questions
1. Why is the <PAD> id 0 rather than <UNK>?
   Fixing <PAD> to 0 makes it easy to compute each sample's real length later (count the non-zero ids in a batch).
2. Why is the padding placed at the front?
   With front padding, the last time step of the forward pass is real data, so there is no need to index into the outputs by each sample's real length to find the last real time step.
3. The best validation/test acc, macro recall, macro precision and macro F1 are about 92.5%; training further leads to overfitting.
   Cause: the raw data contains some noisy samples. Inspecting the misclassified test samples shows that:
   ① some samples are mislabeled in the original data
   ② some samples express an ambiguous sentiment
   Proposed remedy:
   Run outlier detection on the data before splitting into train/val/test; classic machine-learning options include OneClassSVM and Isolation Forest (see the sketch below).
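A rough sketch of that idea (hypothetical: represent each sample by the mean of its word vectors and let IsolationForest flag outliers before splitting the data):

```python
import numpy as np
from sklearn.ensemble import IsolationForest

def flag_outliers(sample_vectors: np.ndarray, contamination: float = 0.02) -> np.ndarray:
    """
    sample_vectors: (N, D) array, one vector per sample (e.g. the mean of its word embeddings).
    Returns a boolean mask that is True for samples flagged as probable noise/outliers.
    """
    iso = IsolationForest(contamination=contamination, random_state=0)
    labels = iso.fit_predict(sample_vectors)   # +1 = inlier, -1 = outlier
    return labels == -1
```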