批量将 PDB 文件转换为一个 jsonl 文件

最新推荐文章于 2024-07-20 17:12:48 发布

Mario cai

最新推荐文章于 2024-07-20 17:12:48 发布

阅读量225

点赞数

分类专栏： AI制药 文章标签： python 人工智能

本文链接：https://blog.csdn.net/caihaihua0572/article/details/127801737

版权

AI制药专栏收录该内容

7 篇文章 3 订阅

订阅专栏

from Bio import PDB
import numpy as np
import pandas as pd
import scipy.stats as stats
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import xlwt
import re
import os
import time
from tqdm import *
from time import time
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import offsetbox
from sklearn import manifold, datasets
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import pandas as pd
import seaborn as sns
import jsonlines  # 导入

# Lookup table from three-letter residue name to one-letter code.
# The two sequences are paired position by position; the final pair maps
# the placeholder name 'GAP' to '-'.
seq2 = dict(zip(
    (
        'ALA', 'CYS', 'ASP', 'GLU', 'PHE', 'GLY', 'HIS',
        'ILE', 'LYS', 'LEU', 'MET', 'ASN', 'PRO', 'GLN',
        'ARG', 'SER', 'THR', 'VAL', 'TRP', 'TYR', 'GAP',
    ),
    'ACDEFGHIKLMNPQRSTVWY-',
))

# Directory holding the input PDB files, directory receiving the jsonl
# output, and the chain identifier that is extracted from every file.
PDB_DIR = r'D:\ProteinMPNN-main\TS500\pdb'
JSONL_DIR = r'D:\ProteinMPNN-main\TS500\jsonl'
CHAIN_ID = 'V'

# Backbone atoms in the order they must appear on four consecutive
# coordinate records, and the exact four-character atom-name field
# (PDB columns 13-16) of each one.
BACKBONE_ATOMS = ('N', 'CA', 'C', 'O')
BACKBONE_FIELDS = (' N  ', ' CA ', ' C  ', ' O  ')


def atom_xyz(record):
    """Return ``[x, y, z]`` read from the fixed coordinate columns of *record*.

    The columns used are 31-38, 39-46 and 47-54 (1-based), i.e. the slices
    ``[30:38]``, ``[38:46]`` and ``[46:54]``.

    Raises:
        ValueError: if one of the three fields is not a valid float, for
            example because the line is truncated.
    """
    return [float(record[30:38]), float(record[38:46]), float(record[46:54])]


def parse_backbone(lines, chain_id, residue_codes=None):
    """Extract backbone coordinates and the sequence of one chain.

    Only ``ATOM`` / ``HETATM`` records are considered, so interleaved records
    of other types (e.g. ``ANISOU``) do not break the scan.  A residue is
    accepted when four consecutive coordinate records carry the atom names
    N, CA, C, O (in that order) and the first of them belongs to *chain_id*.
    Residues whose three-letter name is missing from *residue_codes* are
    skipped as a whole, so coordinates and sequence always stay aligned.

    Args:
        lines: iterable of text lines of a PDB file (a list or an open file).
        chain_id: one-character chain identifier to extract.
        residue_codes: mapping from three-letter residue name to one-letter
            code; defaults to the module-level ``seq2`` table.

    Returns:
        A ``(coords, sequence)`` tuple.  ``coords`` maps each of 'N', 'CA',
        'C', 'O' to a list of ``[x, y, z]`` lists (one per accepted residue)
        and ``sequence`` is the matching one-letter string.

    Raises:
        ValueError: if a matched record has malformed coordinate fields.
    """
    if residue_codes is None:
        residue_codes = seq2

    records = [ln for ln in lines if ln.startswith(('ATOM  ', 'HETATM'))]
    coords = {atom: [] for atom in BACKBONE_ATOMS}
    letters = []

    # Stop three records before the end so a full N/CA/C/O quadruple always
    # exists; this avoids indexing past the last record.
    for start in range(len(records) - 3):
        quad = records[start:start + 4]
        if quad[0][21:22] != chain_id:
            continue
        if tuple(rec[12:16] for rec in quad) != BACKBONE_FIELDS:
            continue
        # Residue name lives in columns 18-20; column 17 is the alternate
        # location indicator and must not be part of the lookup key.
        letter = residue_codes.get(quad[0][17:20].strip())
        if letter is None:
            continue
        for atom, rec in zip(BACKBONE_ATOMS, quad):
            coords[atom].append(atom_xyz(rec))
        letters.append(letter)

    return coords, ''.join(letters)


def build_record(name, chain_id, coords, sequence):
    """Assemble the dictionary that is written as one jsonl line.

    Args:
        name: structure name stored under the "name" key.
        chain_id: chain identifier used as suffix of the per-chain keys.
        coords: mapping as returned by :func:`parse_backbone`.
        sequence: one-letter sequence of the chain.

    Returns:
        A dict with the keys ``coords_chain_<id>``, ``name``,
        ``num_of_chains``, ``seq`` and ``seq_chain_<id>`` (in that order).
    """
    chain_coords = {
        f'{atom}_chain_{chain_id}': coords[atom] for atom in BACKBONE_ATOMS
    }
    return {
        f"coords_chain_{chain_id}": chain_coords,
        "name": name,
        'num_of_chains': 1,
        'seq': sequence,
        f"seq_chain_{chain_id}": sequence,
    }


def collect_records(pdb_dir, chain_id):
    """Parse every file in *pdb_dir* and return the list of jsonl records.

    Directory entries that are not regular files are ignored, and files in
    which no residue of *chain_id* was found contribute no record.  The
    record name is the file name without its extension.
    """
    records = []
    for filename in tqdm(os.listdir(pdb_dir)):
        path = os.path.join(pdb_dir, filename)
        if not os.path.isfile(path):
            continue
        # The context manager closes the file even if parsing fails.
        with open(path, 'r') as handle:
            coords, sequence = parse_backbone(handle, chain_id)
        if not sequence:
            continue
        name = os.path.splitext(filename)[0]
        records.append(build_record(name, chain_id, coords, sequence))
    return records


def main():
    """Convert all PDB files in ``PDB_DIR`` into ``<CHAIN_ID>.jsonl``."""
    records = collect_records(PDB_DIR, CHAIN_ID)
    out_path = os.path.join(JSONL_DIR, f"{CHAIN_ID}.jsonl")
    with jsonlines.open(out_path, 'w') as writer:
        for record in records:
            writer.write(record)


if __name__ == "__main__":
    main()