生物序列生成onehot编码

  • 定义生成 one-hot 编码的类 hot_dna
  • 每次读入一行描述行(以 ">" 开头)和紧随其后的一行序列
  • 将每条序列的 one-hot 矩阵追加写入 csv 文件
import pandas as pd
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import re

# fasta = ">description\nAAAAAAAAAAAAAAACCCCCCCCCCCGGGGGGGGGGGGGTTTTTTTTTTTTTTTT\n"
class hot_dna:
    """One-hot encode a single sequence given as FASTA text or a bare string.

    Parameters
    ----------
    fasta : str
        Either a FASTA record (a header line starting with ">" followed by
        one or more sequence lines) or a bare sequence without a header.
    alphabet : str or sequence of str, optional
        Symbols that define the one-hot columns, in column order. When
        omitted, the columns are the sorted distinct symbols found in this
        sequence, so the width of ``onehot`` can differ between sequences.
        Pass e.g. ``"ACGT"`` to get the same columns for every sequence.

    Attributes
    ----------
    name : str
        The header line (including the leading ">"), or
        ``'unknown_sequence'`` when the input has no header.
    sequence : str
        The raw input text exactly as passed in (header included).
    categories : list of str
        The symbol represented by each one-hot column.
    integer : numpy.ndarray
        Column vector of shape (n, 1) holding the column index of each symbol.
    onehot : numpy.ndarray
        Float matrix of shape (n, len(categories)) with a single 1.0 per row.

    Raises
    ------
    ValueError
        If ``alphabet`` is given and the sequence contains a symbol that is
        not part of it.
    """

    def __init__(self, fasta, alphabet=None):
        # Imported locally so the class only needs NumPy and does not rely on
        # names bound elsewhere in the module.
        import numpy as np

        # Work on trimmed, non-empty lines so that trailing newlines or "\r"
        # are never encoded as if they were sequence symbols.
        lines = [ln.strip() for ln in fasta.splitlines()]
        lines = [ln for ln in lines if ln]

        # check for and grab sequence name
        if lines and lines[0].startswith(">"):
            name = lines[0]
            # A record may wrap its sequence over several lines.
            sequence = "".join(lines[1:])
        else:
            name = 'unknown_sequence'
            sequence = "".join(lines)

        # integer encode the sequence
        if alphabet is None:
            # np.unique returns the sorted distinct symbols together with the
            # index of each input symbol within that sorted list.
            categories, codes = np.unique(
                np.array(list(sequence)), return_inverse=True
            )
            categories = [str(symbol) for symbol in categories]
        else:
            categories = list(alphabet)
            column_of = {symbol: i for i, symbol in enumerate(categories)}
            unknown = sorted(set(sequence) - set(column_of))
            if unknown:
                raise ValueError(
                    "sequence contains symbols outside the alphabet: "
                    + ", ".join(repr(symbol) for symbol in unknown)
                )
            codes = [column_of[symbol] for symbol in sequence]

        codes = np.asarray(codes, dtype=np.intp).reshape(-1)

        # one hot the sequence: row i of the identity matrix is the one-hot
        # vector for column index i, so indexing by `codes` builds the matrix.
        onehot_encoded_seq = np.eye(len(categories))[codes]
        integer_encoded_seq = codes.reshape(len(codes), 1)

        # add the attributes to self
        self.name = name
        self.sequence = fasta
        self.categories = categories
        self.integer = integer_encoded_seq
        self.onehot = onehot_encoded_seq

inputfile = "H_sapiens_acc_sample__len398_pos.fasta"
savefile = "SpliceRover_H_sapiens_acc_pos.csv"

# Read the FASTA file as (header line, sequence line) pairs and append the
# one-hot matrix of every record to the CSV file.
with open(inputfile, "r") as f:
    # Blank lines are dropped first: they hold no record data and would
    # otherwise shift the header/sequence pairing.
    data = [line for line in f if line.strip()]

if len(data) % 2:
    # A final line without a partner cannot form a record, and indexing its
    # missing partner would raise IndexError.
    print("warning: ignoring unpaired trailing line in " + inputfile)

# Step through the lines two at a time; index is the 0-based header position.
for index in range(0, len(data) - 1, 2):
    fasta = data[index] + data[index + 1]
    my_hottie = hot_dna(fasta)
    onehot = pd.DataFrame(my_hottie.onehot)
    # mode="a+" appends, so the rows of all records (and of any earlier run
    # that wrote the same file) accumulate in one CSV. The number of columns
    # is whatever width hot_dna produced for that record.
    onehot.to_csv(savefile, index=False, header=False, mode="a+")

  • 0
    点赞
  • 16
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值