- 把生物序列转化为onehot编码
- 读取文件(注意这个文件没有fasta文件的描述行,只有序列行)
- 保存为csv,每行表示一个字符
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
# Class label stored in the 'Class' column of every row written by
# read_seq_save_onehot.
flag = 0
def Process_one_hot(input_word):
    """One-hot encode every symbol of a sequence.

    The categories are the distinct symbols found in ``input_word``, in
    sorted order, so column ``j`` of the result corresponds to the j-th
    smallest symbol (e.g. ``A, C, G, T`` for a DNA string that contains all
    four bases).

    The encoding is done with plain NumPy instead of
    ``OneHotEncoder(sparse=False)``: the ``sparse`` keyword was renamed to
    ``sparse_output`` in scikit-learn 1.2 and later removed, so the old call
    fails with ``TypeError`` on current releases.

    Args:
        input_word: A string (or any iterable of hashable, sortable symbols)
            to encode, one symbol per output row.

    Returns:
        A ``float64`` array of shape ``(len(input_word), n_distinct_symbols)``
        with exactly one ``1.0`` per row.

    Raises:
        ValueError: If ``input_word`` is empty (there is nothing to fit the
            categories on).
    """
    symbols = np.asarray(list(input_word))
    if symbols.size == 0:
        raise ValueError("Process_one_hot() requires a non-empty sequence")

    # np.unique returns the sorted distinct symbols plus, for every input
    # position, the index of its symbol within that sorted list.
    categories, codes = np.unique(symbols, return_inverse=True)

    # Row i of the identity matrix is the one-hot vector for category i.
    return np.eye(len(categories), dtype=np.float64)[codes.reshape(-1)]
def read_seq_save_onehot(read_file, save_file):
    """Read a plain sequence file, one-hot encode it and append it to a CSV.

    The input file is expected to contain only sequence lines (no FASTA
    description lines). All lines are concatenated into one sequence, each
    base is one-hot encoded, and one CSV row per base is appended to
    ``save_file`` with the columns ``Class, A, C, G, T``. ``Class`` is filled
    with the module-level ``flag`` value. The resulting frame is also printed.

    Args:
        read_file: Path of the text file holding the sequence lines.
        save_file: Path (or writable buffer) of the CSV to append to.

    Raises:
        ValueError: If the sequence does not contain exactly the four bases
            A, C, G and T (case-insensitive). The one-hot columns are
            labelled by position, so any other alphabet would either fail
            with an opaque shape error or silently mislabel the columns.
    """
    # Local import so this function does not rely on a file-level `import os`.
    import os

    base_columns = ['A', 'C', 'G', 'T']

    # Join once instead of growing a string line by line; strip() also drops
    # stray spaces/tabs that would otherwise become extra one-hot categories.
    with open(read_file, "r") as f:
        result_str = "".join(line.strip() for line in f).upper()

    # Process_one_hot orders its columns by sorted distinct symbol, which only
    # matches base_columns when the alphabet is exactly A, C, G, T.
    found_symbols = sorted(set(result_str))
    if found_symbols != base_columns:
        raise ValueError(
            "sequence in %r must contain exactly the bases %s, found %s"
            % (read_file, base_columns, found_symbols)
        )

    onehot_encoded = Process_one_hot(result_str)
    save = pd.DataFrame(onehot_encoded, columns=base_columns)
    save['Class'] = flag
    print(save)

    # In append mode, only emit the header when starting a new/empty file;
    # otherwise every run would insert another header row mid-file.
    is_path = isinstance(save_file, (str, os.PathLike))
    has_content = (
        is_path
        and os.path.isfile(save_file)
        and os.path.getsize(save_file) > 0
    )
    save.to_csv(
        save_file,
        mode='a',
        columns=['Class', 'A', 'C', 'G', 'T'],
        index=False,
        header=not has_content,
    )
# Script driver: encode the sequence text file and append the result to the
# CSV of the same base name.
read_file = "EI_false_test.txt"   # input: sequence lines only
save_file = "EI_false_test.csv"   # output: one row per base
read_seq_save_onehot(read_file=read_file, save_file=save_file)