import re

import numpy as np
import pandas as pd
from numpy import argmax
from numpy import array
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
# Example input:
# fasta = ">description\nAAAAAAAAAAAAAAACCCCCCCCCCCGGGGGGGGGGGGGTTTTTTTTTTTTTTTT\n"
class hot_dna:
    """Integer- and one-hot-encode a single DNA sequence.

    The encoding is fitted on the sequence itself: categories are the sorted
    distinct characters that occur in *this* sequence. A sequence that lacks
    a base (or contains an extra symbol such as ``N``) therefore produces a
    different number of one-hot columns than one containing exactly A/C/G/T.

    Attributes:
        name: The header line (leading ``>`` included) when the input starts
            with one, otherwise ``'unknown_sequence'``.
        sequence: The raw ``fasta`` argument exactly as passed in (header and
            newlines included) -- kept that way for backward compatibility.
        integer: Integer array of shape ``(n, 1)``; each entry is the index of
            the residue within the sorted distinct characters of the sequence.
        onehot: Float array of shape ``(n, k)`` with a single 1.0 per row,
            where ``k`` is the number of distinct characters in the sequence.
    """

    def __init__(self, fasta):
        """Parse ``fasta`` and compute its integer and one-hot encodings.

        Args:
            fasta: Either a FASTA record (``>header`` line followed by one or
                more sequence lines) or a bare sequence string.
        """
        # Check for and grab the sequence name. Only a leading ">" marks a
        # header; all remaining lines belong to the sequence, so records
        # wrapped over several lines are encoded in full.
        lines = fasta.splitlines()
        if lines and lines[0].startswith(">"):
            name, body = lines[0], lines[1:]
        else:
            name, body = 'unknown_sequence', lines
        # Joining stripped lines keeps line breaks and padding out of the
        # encoding (they would otherwise become a category of their own).
        sequence = "".join(part.strip() for part in body)

        # Get the sequence into an array of single characters.
        seq_array = array(list(sequence))

        # Integer-encode: np.unique returns the sorted distinct characters and,
        # for every position, the index of its character in that sorted list.
        # This mirrors LabelEncoder + a dense OneHotEncoder without relying on
        # OneHotEncoder's `sparse` keyword, which newer scikit-learn releases
        # no longer accept.
        classes, codes = np.unique(seq_array, return_inverse=True)

        # Column vector, the shape the original encoder pipeline produced.
        integer_encoded_seq = codes.reshape(len(codes), 1)

        # One-hot: pick row `code` of the identity matrix for each position.
        onehot_encoded_seq = np.eye(len(classes))[codes]

        # Add the attributes to self.
        self.name = name
        self.sequence = fasta
        self.integer = integer_encoded_seq
        self.onehot = onehot_encoded_seq
# Input FASTA file and the CSV file that receives the one-hot rows.
inputfile = "H_sapiens_acc_sample__len398_pos.fasta"
savefile = "SpliceRover_H_sapiens_acc_pos.csv"

# Read every line up front; the context manager closes the file, so no
# explicit close() call is needed afterwards.
with open(inputfile, "r") as f:
    data = f.readlines()

# Records are expected as alternating lines: even indices (starting from 0)
# hold the header, the following odd index holds the sequence. Pairing the two
# slices with zip() stops cleanly at a trailing unpaired line instead of
# indexing past the end of the list.
for header_line, sequence_line in zip(data[0::2], data[1::2]):
    fasta = header_line + sequence_line
    my_hottie = hot_dna(fasta)
    onehot = pd.DataFrame(my_hottie.onehot)
    # Append mode: each record's rows are added below the previous ones, and
    # re-running the script keeps extending an existing output file.
    onehot.to_csv(savefile, index=False, header=False, mode="a+")