CNN and LSTM for Binary Classification of DNA-Binding Proteins (Python + Keras)


Contents

  • Word-to-vector encoding
  • Binding-protein sequence correction
  • Word embedding
  • 1D CNN implementation
  • LSTM implementation

from __future__ import print_function
import numpy as np

np.random.seed(1337)  # for reproducibility

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.convolutional import Convolution1D, MaxPooling1D

try:
    import cPickle  # Python 2
except ImportError:
    import pickle as cPickle  # Python 3


def trans(str1):
    """Map each amino-acid letter of a protein sequence to an integer index."""
    # The 20 standard amino acids get 1-20; X (unknown) and the rare/ambiguous
    # codes B, U, J, Z, O get 21-26.
    dic = {'A': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8,
           'K': 9, 'L': 10, 'M': 11, 'N': 12, 'P': 13, 'Q': 14, 'R': 15,
           'S': 16, 'T': 17, 'V': 18, 'W': 19, 'Y': 20, 'X': 21, 'B': 22,
           'U': 23, 'J': 24, 'Z': 25, 'O': 26}
    return [dic.get(ch) for ch in str1]
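
A quick sanity check of the encoding (the input string here is a made-up example):

print(trans("ACDW"))  # -> [1, 2, 3, 19]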


def createTrainData(str1):
    """Parse a CSV file whose lines have the form: proteinId,sequence,label."""
    sequence_num = []
    label_num = []
    for line in open(str1):
        proteinId, seq, label = line.split(",")
        seq = seq.strip(' \t\r\n')
        sequence_num.append(trans(seq))
        label_num.append(int(label.strip(' \t\r\n')))

    return sequence_num, label_num
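
The file layout this parser expects, inferred from the code above, is one comma-separated record per line; the identifiers and sequences below are invented placeholders:

P00001,MKVLAAGIW,1
P00002,GHSTRNDQE,0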



# Encode the raw CSV once and cache the (sequences, labels) pair as a pickle.
a, b = createTrainData("positive_and_negative.csv")
cPickle.dump((a, b), open("data.pkl", "wb"))

def createTrainTestData(str_path, nb_words=None, skip_top=0,
                        maxlen=None, test_split=0.25, seed=113,
                        start_char=1, oov_char=2, index_from=3):
    """Load the pickled (sequences, labels) pair and produce a shuffled
    train/test split, following the conventions of keras.datasets.imdb."""
    X, labels = cPickle.load(open(str_path, "rb"))

    # Shuffle sequences and labels in unison by reseeding between shuffles.
    np.random.seed(seed)
    np.random.shuffle(X)
    np.random.seed(seed)
    np.random.shuffle(labels)

    # Prepend a start marker and shift every index up by index_from.
    if start_char is not None:
        X = [[start_char] + [w + index_from for w in x] for x in X]
    elif index_from:
        X = [[w + index_from for w in x] for x in X]

    # Optionally drop sequences longer than maxlen.
    if maxlen:
        new_X = []
        new_labels = []
        for x, y in zip(X, labels):
            if len(x) < maxlen:
                new_X.append(x)
                new_labels.append(y)
        X = new_X
        labels = new_labels
    if not X:
        raise Exception('After filtering for sequences shorter than maxlen=' +
                        str(maxlen) + ', no sequence was kept. '
                        'Increase maxlen.')

    if not nb_words:
        nb_words = max([max(x) for x in X])

    # Map out-of-vocabulary indices to oov_char, or drop them entirely.
    if oov_char is not None:
        X = [[oov_char if (w >= nb_words or w < skip_top) else w for w in x]
             for x in X]
    else:
        X = [[w for w in x if skip_top <= w < nb_words] for x in X]

    # Split into train and test partitions.
    split = int(len(X) * (1 - test_split))
    X_train = np.array(X[:split])
    y_train = np.array(labels[:split])
    X_test = np.array(X[split:])
    y_test = np.array(labels[split:])

    return (X_train, y_train), (X_test, y_test)
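
One consequence of the defaults is worth spelling out: with the call used below (nb_words=23, start_char=1, index_from=3), any raw alphabet index of 20 or above is shifted to 23 or more and collapses into oov_char, so Y and all of the rare codes end up in a single bucket. A small illustration:

raw = trans("AYX")                                # [1, 20, 21]
shifted = [1] + [w + 3 for w in raw]              # start_char, index_from -> [1, 4, 23, 24]
clipped = [2 if w >= 23 else w for w in shifted]  # nb_words=23 -> [1, 4, 2, 2]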



# Embedding
max_features = 23
maxlen = 1000
embedding_size = 128

# Convolution (each layer sets its own filter_length below)
nb_filter = 64
pool_length = 2

# LSTM
lstm_output_size = 70

# Training
batch_size = 128
nb_epoch = 100


print('Loading data...')
(X_train, y_train), (X_test, y_test) = createTrainTestData("data.pkl",nb_words=max_features, test_split=0.2)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
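
# Note: pad_sequences left-pads with 0 by default, e.g. [[1, 4, 2]] with
# maxlen=6 becomes [[0, 0, 0, 1, 4, 2]]; index 0 is produced only by padding,
# never by trans(), so no real residue is shadowed.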

print('Build model...')

model = Sequential()
model.add(Embedding(max_features, embedding_size, input_length=maxlen))
model.add(Dropout(0.5))
model.add(Convolution1D(nb_filter=nb_filter,
                        filter_length=10,
                        border_mode='valid',
                        activation='relu',
                        subsample_length=1))
model.add(MaxPooling1D(pool_length=pool_length))
model.add(Convolution1D(nb_filter=nb_filter,
                        filter_length=5,
                        border_mode='valid',
                        activation='relu',
                        subsample_length=1))
model.add(MaxPooling1D(pool_length=pool_length))
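
# Shape check (valid padding): 1000 -> 991 after the k=10 conv, -> 495 after
# pooling, -> 491 after the k=5 conv, -> 245 after pooling; the LSTM therefore
# consumes 245 timesteps of 64 features.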

model.add(LSTM(lstm_output_size))
model.add(Dense(1))
model.add(Activation('sigmoid'))  # sigmoid, not relu: binary cross-entropy expects a probability

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch,
          validation_data=(X_test, y_test))

#json_string = model.to_json()
#open('my_model_rat.json', 'w').write(json_string)
#model.save_weights('my_model_rat_weights.h5')
score, acc = model.evaluate(X_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)
print('***********************************************************************')
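
The listing above uses the Keras 1.x API (Convolution1D, border_mode, nb_epoch). For readers on a current TensorFlow/Keras install, the same architecture can be written as below; this is a minimal sketch assuming tensorflow 2.x, not the code the original experiments were run with:

from tensorflow.keras import Sequential
from tensorflow.keras.layers import (Embedding, Dropout, Conv1D,
                                     MaxPooling1D, LSTM, Dense)

model = Sequential([
    Embedding(max_features, embedding_size, input_length=maxlen),
    Dropout(0.5),
    Conv1D(filters=nb_filter, kernel_size=10, padding='valid', activation='relu'),
    MaxPooling1D(pool_size=pool_length),
    Conv1D(filters=nb_filter, kernel_size=5, padding='valid', activation='relu'),
    MaxPooling1D(pool_size=pool_length),
    LSTM(lstm_output_size),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=batch_size, epochs=nb_epoch,
          validation_data=(X_test, y_test))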



GitHub link: code implementation
Paper: PLOS ONE
Data: datasets

