from Bio import PDB
import numpy as np
import pandas as pd
import scipy.stats as stats
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import xlwt
import re
import os
import time
from tqdm import *
from time import time
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import offsetbox
from sklearn import manifold, datasets
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import pandas as pd
import seaborn as sns
import jsonlines # 导入
# Lookup table from three-letter residue names to one-letter amino-acid
# codes; the extra 'GAP' entry maps to the '-' placeholder character.
seq2 = dict(
    ALA='A',
    CYS='C',
    ASP='D',
    GLU='E',
    PHE='F',
    GLY='G',
    HIS='H',
    ILE='I',
    LYS='K',
    LEU='L',
    MET='M',
    ASN='N',
    PRO='P',
    GLN='Q',
    ARG='R',
    SER='S',
    THR='T',
    VAL='V',
    TRP='W',
    TYR='Y',
    GAP='-',
)
# Batch-convert the backbone atoms of one chain from every PDB file in a
# directory into a single JSON Lines file (one record per structure).

# Input directory, chain to extract and output file. The values are the ones
# the script has always used.
PDB_DIR = r'D:\ProteinMPNN-main\TS500\pdb'
CHAIN_ID = 'V'
OUTPUT_PATH = fr"D:\ProteinMPNN-main\TS500\jsonl/{CHAIN_ID}.jsonl"

# Module-level accumulators filled by main(); entries at the same index
# belong to the same structure.
TOTAL = []  # not filled by this script; kept so the name still exists
NAME = []
COORDS = []
SEQU = []

# Backbone atoms expected on four consecutive lines, paired with the end
# column of the slice used to read each atom name (the CA check reads one
# column more than the others).
_BACKBONE_ATOMS = (('N', 15), ('CA', 16), ('C', 15), ('O', 15))


def entry_name(filename):
    """Return the record name for a PDB file name (the name minus extension).

    The last '.' found within the first 20 characters marks where the name
    ends. When there is no '.' in that window, slicing with the -1 returned
    by ``rfind`` would silently drop the final character, so that case falls
    back to ``os.path.splitext``.
    """
    dot = filename[0:20].rfind('.')
    if dot == -1:
        return os.path.splitext(filename)[0]
    return filename[:dot]


def _xyz(line):
    """Parse the x, y and z coordinates from one atom line as a list."""
    # The x slice starts after the insertion-code column (index 26) and the
    # z slice stops before the occupancy field (index 54), so neither a
    # residue insertion code nor a six-character occupancy leaks into the
    # parsed numbers.
    return [
        float(line[27:38].replace(" ", "")),
        float(line[38:46].replace(" ", "")),
        float(line[46:54].replace(" ", "")),
    ]


def extract_backbone(lines, chain_id, residue_codes):
    """Collect N/CA/C/O coordinates and the sequence of one chain.

    A residue is recognised when four consecutive lines carry the atom names
    N, CA, C and O (in that order) and the first of them belongs to
    ``chain_id``. Only the N line's chain column is checked.

    Args:
        lines: Text lines of a PDB file.
        chain_id: Chain identifier to extract.
        residue_codes: Mapping from residue name to one-letter code.

    Returns:
        A ``(coords, sequence)`` tuple. ``coords`` maps
        ``'<atom>_chain_<chain_id>'`` to a list of ``[x, y, z]`` lists, in
        the order N, CA, C, O. When no residue matches, ``({}, '')``.

    Raises:
        KeyError: If a matched residue name is missing from
            ``residue_codes``.
        ValueError: If a coordinate field of a matched atom is not a number.
    """
    # Only the text before the first tab of each line is inspected.
    records = [line.strip(' ').split('\t')[0] for line in lines]
    atoms = {atom: [] for atom, _ in _BACKBONE_ATOMS}
    sequence = []

    # A residue spans four lines, so a start index needs three lines after
    # it; this bound prevents reading past the end of a truncated file.
    for start in range(len(records) - 3):
        first = records[start]
        if first[20:22].replace(" ", "") != chain_id:
            continue
        window = records[start:start + 4]
        if any(
            record[13:end].replace(" ", "") != atom
            for record, (atom, end) in zip(window, _BACKBONE_ATOMS)
        ):
            continue
        for record, (atom, _) in zip(window, _BACKBONE_ATOMS):
            atoms[atom].append(_xyz(record))
        sequence.append(str(residue_codes[first[16:20].replace(" ", "")]))

    if not atoms['N']:
        return {}, ''
    coords = {
        f'{atom}_chain_{chain_id}': atoms[atom] for atom, _ in _BACKBONE_ATOMS
    }
    return coords, ''.join(sequence)


def main():
    """Convert every file in PDB_DIR and write all records to OUTPUT_PATH.

    Directory entries that are not regular files, and files in which no
    backbone atoms of CHAIN_ID are found, are skipped. Results are also
    appended to the module-level NAME, COORDS and SEQU lists.
    """
    pdb_dir = os.path.abspath(PDB_DIR)
    for filename in tqdm(os.listdir(pdb_dir)):
        path = os.path.join(pdb_dir, filename)
        if not os.path.isfile(path):
            continue
        # The context manager closes the handle even if parsing fails.
        with open(path, 'r') as handle:
            lines = handle.readlines()
        coords, sequence = extract_backbone(lines, CHAIN_ID, seq2)
        if not coords:
            continue
        NAME.append(entry_name(filename))
        COORDS.append(coords)
        SEQU.append(sequence)

    # The output is written once, after all inputs have been parsed.
    with jsonlines.open(OUTPUT_PATH, 'w') as w:
        for name, coords, sequence in zip(NAME, COORDS, SEQU):
            w.write({f"coords_chain_{CHAIN_ID}": coords, "name": name, 'num_of_chains': 1,
                     'seq': sequence, f"seq_chain_{CHAIN_ID}": sequence,
                     })


if __name__ == "__main__":
    main()
# Batch-convert PDB files into a single JSONL file.
# (Web-page footer: latest recommended article published 2024-07-02 10:24:12.)