Tutorial Basic Modeling
https://salilab.org/modeller/tutorial/basic.html
1. 搜索与目标序列相关的已知结构
从官网下载源文件
分析
- TvLDH.ali 存放目标序列,需为 PIR 格式
>P1;TvLDH
sequence:TvLDH:::::::0.00: 0.00
MSEAAHVLITGAAGQIGYILSHWIASGELYGDRQVYLHLLDIPPAMNRLTALTMELEDCAFPHLAGFVATTDPKA
AFKDIDCAFLVASMPLKPGQVRADLISSNSVIFKNTGEYLSKWAKPSVKVLVIGNPDNTNCEIAMLHAKNLKPEN
FSSLSMLDQNRAYYEVASKLGVDVKDVHDIIVWGNHGESMVADLTQATFTKEGKTQKVVDVLDHDYVFDTFFKKI
GHRAWDILEHRGFTSAASPTKAAIQHMKAWLFGTAPGEVLSMGIPVPEGNPYGIKPGVVFSFPCNVDKEGKIHVV
EGFKVNDWLREKLDFTEKDLFHEKEIALNHLAQGG*
- pdb_95.pir 存放非冗余的PDB序列
C; Produced by MODELLER
>P1;1swyA
structureX:1swy: 1 :A: 164 :A:MOL_ID 1; MOLECULE LYSOZYME; CHAIN A; SYNONYM LYSIS PROTEIN, MURAMIDASE,:MOL_ID 1; ORGANISM_SCIENTIFIC BACTERIOPHAGE T4; ORGANISM_COMMON VIRUS; GE: 1.06:-1.00
MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNCNGVITKDEAEKLFNQDVAAAV
RGILRNAKLKPVYDSLDAVRECALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVI
TTFRTGTWDAYKNL*
C; Produced by MODELLER
>P1;1a6m
structureX:1a6m: 1 : : 151 : :MOL_ID 1; MOLECULE MYOGLOBIN; CHAIN NULL; BIOLOGICAL_UNIT MONOMER:MOL_ID 1; ORGANISM_SCIENTIFIC PHYSETER CATODON; ORGANISM_COMMON SPERM WHAL: 1.00:-1.00
VLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRFKHLKTEAEMKASEDLKKHGVTVLTALGAI
LKKKGHHEAELKPLAQSHATKHKIPIKYLEFISEAIIHVLHSRHPGDFGADAQGAMNKALELFRKDIAAKYKELG
Y*
...
C; Produced by MODELLER
>P1;1zkoA
structureX:1zko: -4 :A: 123 :A:MOL_ID 1; MOLECULE GLYCINE CLEAVAGE SYSTEM H PROTEIN; CHAIN A, B; ENGINEE:MOL_ID 1; ORGANISM_SCIENTIFIC THERMOTOGA MARITIMA; ORGANISM_COMMON BACTERI: 1.65:-1.00
HHHHHLKMKKYTKTHEWVSIEDKVATVGITNHAQEQLGDVVYVDLPEVGREVKKGEVVASIESVKAAADVYAPLS
GKIVEVNEKLDTEPELINKDPEGEGWLFKMEISDEGELEDLLDEQAYQEFCAQ*
- build_profile.py 用于在已知结构的序列数据库(pdb_95.pir)中搜索与目标序列 TvLDH 相关的序列
from modeller import *
# build_profile.py: scan the pdb_95 sequence database for sequences related
# to the target sequence in TvLDH.ali, then write the hits out both as a
# profile (build_profile.prf) and as an alignment (build_profile.ali).
log.verbose()
# Initialize the Modeller environment
env = environ()
#-- Prepare the input files
#-- Read in the sequence database
# Create a sequence_db object to hold the protein sequence data
sdb = sequence_db(env)
# Read the non-redundant PDB sequences from a PIR-format file. Sequences with
# fewer than 30 or more than 4000 residues, and non-standard sequences, are
# discarded (minmax_db_seq_len / clean_sequences).
sdb.read(seq_database_file='pdb_95.pir', seq_database_format='PIR',
chains_list='ALL', minmax_db_seq_len=(30, 4000), clean_sequences=True)
#-- Write the sequence database in binary form
# Write the sequences read in the previous step to a binary file
sdb.write(seq_database_file='pdb_95.bin', seq_database_format='BINARY',
chains_list='ALL')
#-- Now, read in the binary database
# Read the binary database back in. If the same data is used repeatedly, the
# two steps above only need to be run once; later runs can load the binary
# file directly.
sdb.read(seq_database_file='pdb_95.bin', seq_database_format='BINARY',
chains_list='ALL')
#-- Read in the target sequence/alignment
# Create an alignment object and load the sequence data from TvLDH.ali. It is
# converted to a profile below; a profile holds similar data to an alignment
# but is more compact, which suits searching a sequence database.
aln = alignment(env)
aln.append(file='TvLDH.ali', alignment_format='PIR', align_codes='ALL')
#-- Convert the input sequence/alignment into
# profile format
prf = aln.to_profile()
#-- Scan sequence database to pick up homologous sequences
# Search the database; matching sequences are added to the profile.
# NOTE(review): max_aln_evalue=0.01 is presumably the e-value cutoff for
# accepted hits and n_prof_iterations=1 a single search pass -- confirm
# against the Modeller manual.
prf.build(sdb, matrix_offset=-450, rr_file='${LIB}/blosum62.sim.mat',
gap_penalties_1d=(-500, -50), n_prof_iterations=1,
check_profile=False, max_aln_evalue=0.01)
#-- Write out the profile in text format
# Write the profile to build_profile.prf
prf.write(file='build_profile.prf', profile_format='TEXT')
#-- Convert the profile back to alignment format
aln = prf.to_alignment()
#-- Write out the alignment file
aln.write(file='build_profile.ali', alignment_format='PIR')
- build_profile.prf 存放搜索到的相似序列
# Number of sequences: 30
# Length of profile : 335
# N_PROF_ITERATIONS : 3
# GAP_PENALTIES_1D : -900.0 -50.0
# MATRIX_OFFSET : 0.0
# RR_FILE : $(LIB)/as1.sim.mat
1 TvLDH S 0 335 1 335 0 0 0 0. 0.0 MSEAAHVLITGAAGQIGYILSHWIASGELYGDRQVYLHLLDIPPAMNRLTALTMELEDCAFPHLAGFVATTDPKAAFKDIDCAFLVASMPLKPGQVRADLISSNSVIFKNTGEYLSKWAKPSVKVLVIGNPDNTNCEIAMLHAKNLKPENFSSLSMLDQNRAYYEVASKLGVDVKDVHDIIVWGNHGESMVADLTQATFTKEGKTQKVVDVLDHDYVFDTFFKKIGHRAWDILEHRGFTSAASPTKAAIQHMKAWLFGTAPGEVLSMGIPVPEGNPYGIKPGVVFSFPCNVDKEGKIHVVEGFKVNDWLREKLDFTEKDLFHEKEIALNHLAQGG
2 1a5z X 1 312 75 242 63 229 164 28. 0.85E-08 --------------------------------------------------------------------------ADLKGSDVVIVAAGVPQKPGETRLQLLGRNARVMKEIARNVSKYAPDSI-VIVVTNPVDV-LTYFFLKESGMDPRKFGSGTVLDTARLRTLIAQHCGFSPRSVH-V