1.gensim的使用
先上一个3.x的api转化成4.x的链接:https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4
刚上手的时候发现网上很多写法都是3.x的版本,自己环境的版本是4.x导致运行时很多错误,但是想实现的功能又是3.x的写法的例子多,有了这个可以自己查询然后转化。
from gensim.models import Word2Vec as wtv
model = wtv(tmp_seqs,min_count=2,window=5,sg=1,vector_size=16,epochs=10)# vector_size is the embedding dimension; input is tokenized sequences like [['AAA','TTT'],['CCC','GGG']]; sg picks the training algorithm: 0 = CBOW, 1 = skip-gram
model.save("./word2vec_model")# persist the trained model (and its word vectors) to disk
word_model = wtv.load("./word2vec_model") # reload the saved model, including word-vector parameters
word_model.train([["ACT","ACG"]], total_examples=1, epochs=10) # continue training on a new corpus to update the word vectors
# Convert DNA sequence strings into lists of k-mer embedding vectors.
# Input format is ['ACTG', 'ACTG']; each string is split into overlapping
# 3-mers ('ACTG' -> 'ACT', 'CTG') that are looked up in the Word2Vec vocabulary.
def seqs_convert_array(f, w2v_model=None, k=3):
    """Convert each sequence string in *f* to a list of k-mer vectors.

    Args:
        f: iterable of sequence strings, e.g. ['ACTG', 'ACTG'].
        w2v_model: object exposing a ``.wv`` mapping (a trained gensim
            Word2Vec model); defaults to the module-level ``model``.
        k: k-mer window length (default 3, the original behavior).

    Returns:
        A list with one inner list of embedding vectors per input sequence.
    """
    if w2v_model is None:
        w2v_model = model  # fall back to the globally trained model
    wv = w2v_model.wv
    new_seqs = []
    for sentence in f:
        # a string of length n yields n - k + 1 overlapping k-mers
        new_seqs.append([wv[sentence[i:i + k]] for i in range(len(sentence) - k + 1)])
    return new_seqs
2.使用sklearn包记录对要训练的feature和label进行切分和数据打包
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import torch
# labels l must be a flat 1-D list of class indices, e.g. l = [1, 0, 1, 0]
X_train, X_eval, y_train, y_eval = train_test_split(new_arrays, l, test_size=0.2, random_state=666)
# convert the splits to tensors
X_train = torch.tensor(X_train)
X_eval = torch.tensor(X_eval)
y_train = torch.tensor(y_train)
y_eval = torch.tensor(y_eval)
# package the training split for batched loading
batch_size = 256
train_dataset = TensorDataset(X_train, y_train)
# read the data one shuffled batch at a time
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
3.构建lstm神经网络
这里暂时就用了一个lstm网络和一个全连接层进行二分类,lstm将输入特征经隐藏层转化为16维,先实现架构,后续再调参。h0和c0两个初始状态参数暂时没用到,先注释掉,后续再调参。
class LSTMModel(nn.Module):
    """LSTM encoder followed by a linear readout for sequence classification.

    The hidden state of the last time step is mapped by a fully-connected
    layer to ``output_dim`` class logits.
    """

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        """
        Args:
            input_dim: feature size of each time step of the input.
            hidden_dim: LSTM hidden-state size.
            layer_dim: number of stacked LSTM layers.
            output_dim: number of output classes (logits).
        """
        super(LSTMModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        # batch_first=True -> tensors are shaped (batch, seq, feature)
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)
        # readout layer: final hidden state -> class logits
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Run the LSTM over x of shape (batch, seq_len, input_dim).

        Returns:
            Logits tensor of shape (batch, output_dim).
        """
        # When (h0, c0) are omitted, PyTorch initializes them to zeros,
        # which is exactly what the previous hand-rolled init did.
        out, _ = self.lstm(x)
        # keep only the last time step's hidden state: (batch, hidden_dim)
        return self.fc(out[:, -1, :])
4.训练模型
这里写的比较简洁,刚上路暂时先写个简洁版,后续完善一下这个训练的模型,写成一个完善一点的函数进行调参
# Hyper-parameters: 16-dim k-mer embeddings in, 16-dim hidden state,
# one LSTM layer, two output classes, 20 training epochs.
input_dim = 16
hidden_dim = 16
layer_dim = 1
output_dim = 2
epochs = 20
model = LSTMModel(input_dim, hidden_dim, layer_dim, output_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_func = torch.nn.CrossEntropyLoss()
for epoch in range(epochs):
    model.train()
    for x, y in train_dataloader:
        optimizer.zero_grad()
        out = model(x)            # forward pass -> (batch, output_dim) logits
        loss = loss_func(out, y)  # cross-entropy against class-index labels
        loss.backward()           # backpropagate
        optimizer.step()          # update parameters
    # .item() prints the scalar value rather than a tensor repr;
    # this is the loss of the last batch of the epoch.
    print("Training loss : {}".format(loss.item()))
    # evaluate on the held-out split without tracking gradients
    model.eval()
    with torch.no_grad():
        pred = model(X_eval)
        eval_loss = loss_func(pred, y_eval)
    print("Eval loss : {}".format(eval_loss.item()))
5.后续
第一次写博客记录,暂时先这样,后续慢慢积累,希望这个习惯能坚持下去。很多时候代码写过一次就忘记了,又要上网查了复制修改,现在我就可以很快地找到自己写过的代码来复制修改,完结~