DNA特征提取:ANF(Accumulated Nucleotide Frequency,累积核苷酸频率)

DNA特征提取 专栏收录该内容
0 篇文章 0 订阅

import sys, os, platform
import pandas as pd
import re
import os

def ANF(fastas, **kw):
    """Encode sequences by accumulated nucleotide frequency (ANF).

    For the 0-based position ``j`` of a sequence, the emitted value is the
    number of times the character at ``j`` occurs in ``sequence[0:j + 1]``
    divided by ``j + 1``.

    Args:
        fastas: Iterable of records whose element ``[1]`` is the sequence
            string, e.g. ``[name, sequence, label, ...]``.
        **kw: Accepted for call compatibility; not used.

    Returns:
        A list holding one list of floats per record, each as long as that
        record's sequence. An empty input yields an empty list.
    """
    encodings = []
    for record in fastas:
        sequence = record[1]
        # Running tally per character: the count of the current character in
        # the prefix ending here, without rescanning the prefix each time.
        seen = {}
        row = []
        for position, base in enumerate(sequence, start=1):
            seen[base] = seen.get(base, 0) + 1
            row.append(seen[base] / position)
        encodings.append(row)
    return encodings

def read_nucleotide_sequences(file):
    """Parse a FASTA file into ``[name, sequence, label, label_train]`` rows.

    The first whitespace-delimited token of each ``>`` header line is split on
    ``|`` into name, label (default ``'0'``) and train/test tag (default
    ``'training'``). Sequence lines are joined and upper-cased, every
    character outside ``ACGTU-`` becomes ``-``, and ``U`` is rewritten to
    ``T``.

    Args:
        file: Path of the FASTA file to read.

    Returns:
        A list with one ``[name, sequence, label, label_train]`` list per
        record, in file order.

    Raises:
        SystemExit: With status 1 (after printing a message) when the file is
            missing or contains no ``>`` character.
    """
    if not os.path.exists(file):
        print('Error: file %s does not exist.' % file)
        sys.exit(1)

    with open(file) as handle:
        text = handle.read()

    if '>' not in text:
        print('Error: the input file %s seems not in FASTA format!' % file)
        sys.exit(1)

    parsed = []
    # Anything before the first '>' is not a record, hence the [1:].
    for chunk in text.split('>')[1:]:
        lines = chunk.split('\n')
        fields = lines[0].split()[0].split('|')
        residues = ''.join(lines[1:]).upper()
        residues = re.sub('[^ACGTU-]', '-', residues)

        name = fields[0]
        label = '0'
        if len(fields) >= 2:
            label = fields[1]
        label_train = 'training'
        if len(fields) >= 3:
            label_train = fields[2]

        parsed.append([name, residues.replace('U', 'T'), label, label_train])
    return parsed



def main(input_path="F:/M6A/oriTest/FULL/MOLM/allSingle/",
         out_path="F:/M6A/featureFile/all/ANF/"):
    """Write an ANF feature CSV for every file found in ``input_path``.

    Each regular file in ``input_path`` is parsed with
    ``read_nucleotide_sequences``, encoded with ``ANF``, and written to
    ``out_path`` as ``<input name without extension>.csv`` with no header row
    and no index column. The input file name is printed after each file is
    written.

    Args:
        input_path: Directory containing the FASTA files to encode.
        out_path: Existing directory that receives the CSV files.
    """
    for inputfile_name in os.listdir(input_path):
        source = os.path.join(input_path, inputfile_name)
        # os.listdir also returns sub-directories, which cannot be opened
        # as FASTA files.
        if not os.path.isfile(source):
            continue
        fasta = read_nucleotide_sequences(source)
        df = pd.DataFrame(ANF(fasta))
        # splitext drops whatever extension is present rather than assuming a
        # fixed 6-character ".fasta" suffix.
        stem = os.path.splitext(inputfile_name)[0]
        df.to_csv(os.path.join(out_path, stem + ".csv"),
                  header=False, index=False)
        print(inputfile_name)

# Script entry point: this call sits at module top level, so the conversion
# runs whenever the file is executed (and also if it is imported).
main()
  • 0
    点赞
  • 1
    评论
  • 0
    收藏
  • 一键三连
    一键三连
  • 扫一扫,分享海报

©️2021 CSDN 皮肤主题: 游动-白 设计师:白松林 返回首页
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、C币套餐、付费专栏及课程。

余额充值