hhblits
是 HMM-HMM(Hidden Markov Model to Hidden Markov Model)比对方法的一部分,也是 HMMER 软件套件中的工具之一。与 hhsearch
类似,hhblits
也用于进行高效的蛋白质序列比对,特别擅长于检测远缘同源性。
hh-suite 源码 https://github.com/soedinglab/hh-suite
以下是 hhblits
的一些主要特点和用途:
-
HMM-HMM比对:
hhblits
使用隐藏马尔可夫模型(HMM)来表示蛋白质家族的模型,从而能够更全面地考虑多序列信息。这使得它相对于传统的序列-序列比对方法更能捕捉蛋白质家族的模式和结构。 -
迭代比对:
hhblits
支持迭代比对的策略,通过在每一轮中更新模型,逐渐提高比对的灵敏度。这种策略对于发现相对远缘同源蛋白质非常有用。 -
远缘同源性检测: 与其他 HMM-HMM比对方法类似,
hhblits
被设计用于检测远缘同源蛋白质,即那些在序列上相对较远但在结构和功能上相似的蛋白质。 -
数据库搜索: 用户可以使用
hhblits
在大型蛋白质数据库中搜索与给定蛋白质序列相似的蛋白质。 -
灵敏度和特异性:
hhblits
的设计目标是在保持高灵敏度的同时,降低假阳性的比对,以提高比对的特异性。
import glob
import os
import subprocess
from typing import Any, List, Mapping, Optional, Sequence
from absl import logging
import contextlib
import shutil
import tempfile
import time
_HHBLITS_DEFAULT_P = 20
_HHBLITS_DEFAULT_Z = 500
@contextlib.contextmanager
def tmpdir_manager(base_dir: Optional[str] = None):
"""Context manager that deletes a temporary directory on exit."""
tmpdir = tempfile.mkdtemp(dir=base_dir)
try:
yield tmpdir
finally:
shutil.rmtree(tmpdir, ignore_errors=True)
@contextlib.contextmanager
def timing(msg: str):
logging.info('Started %s', msg)
tic = time.time()
yield
toc = time.time()
logging.info('Finished %s in %.3f seconds', msg, toc - tic)
class HHBlits:
"""Python wrapper of the HHblits binary."""
def __init__(self,
*,
binary_path: str,
databases: Sequence[str],
n_cpu: int = 4,
n_iter: int = 3,
e_value: float = 0.001,
maxseq: int = 1_000_000,
realign_max: int = 100_000,
maxfilt: int = 100_000,
min_prefilter_hits: int = 1000,
all_seqs: bool = False,
alt: Optional[int] = None,
p: int = _HHBLITS_DEFAULT_P,
z: int = _HHBLITS_DEFAULT_Z):
"""Initializes the Python HHblits wrapper.
Args:
binary_path: The path to the HHblits executable.
databases: A sequence of HHblits database paths. This should be the
common prefix for the database files (i.e. up to but not including
_hhm.ffindex etc.)
n_cpu: The number of CPUs to give HHblits.
n_iter: The number of HHblits iterations.
e_value: The E-value, see HHblits docs for more details.
maxseq: The maximum number of rows in an input alignment. Note that this
parameter is only supported in HHBlits version 3.1 and higher.
realign_max: Max number of HMM-HMM hits to realign. HHblits default: 500.
maxfilt: Max number of hits allowed to pass the 2nd prefilter.
HHblits default: 20000.
min_prefilter_hits: Min number of hits to pass prefilter.
HHblits default: 100.
all_seqs: Return all sequences in the MSA / Do not filter the result MSA.
HHblits default: False.
alt: Show up to this many alternative alignments.
p: Minimum Prob for a hit to be included in the output hhr file.
HHblits default: 20.
z: Hard cap on number of hits reported in the hhr file.
HHblits default: 500. NB: The relevant HHblits flag is -Z not -z.
Raises:
RuntimeError: If HHblits binary not found within the path.
"""
self.binary_path = binary_path
self.databases = databases
self.n_cpu = n_cpu
self.n_iter = n_iter
self.e_value = e_value
self.maxseq = maxseq
self.realign_max = realign_max
self.maxfilt = maxfilt
self.min_prefilter_hits = min_prefilter_hits
self.all_seqs = all_seqs
self.alt = alt
self.p = p
self.z = z
def query(self, input_fasta_path: str) -> List[Mapping[str, Any]]:
"""Queries the database using HHblits."""
with tmpdir_manager() as query_tmp_dir:
a3m_path = os.path.join(query_tmp_dir, 'output.a3m')
db_cmd = []
for db_path in self.databases:
db_cmd.append('-d')
db_cmd.append(db_path)
cmd = [
self.binary_path,
'-i', input_fasta_path,
'-cpu', str(self.n_cpu),
'-oa3m', a3m_path,
'-o', '/dev/null',
'-n', str(self.n_iter),
'-e', str(self.e_value),
'-maxseq', str(self.maxseq),
'-realign_max', str(self.realign_max),
'-maxfilt', str(self.maxfilt),
'-min_prefilter_hits', str(self.min_prefilter_hits)]
if self.all_seqs:
cmd += ['-all']
if self.alt:
cmd += ['-alt', str(self.alt)]
if self.p != _HHBLITS_DEFAULT_P:
cmd += ['-p', str(self.p)]
if self.z != _HHBLITS_DEFAULT_Z:
cmd += ['-Z', str(self.z)]
cmd += db_cmd
print("cmd:",' '.join(cmd))
logging.info('Launching subprocess "%s"', ' '.join(cmd))
process = subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
with timing('HHblits query'):
stdout, stderr = process.communicate()
retcode = process.wait()
if retcode:
# Logs have a 15k character limit, so log HHblits error line by line.
logging.error('HHblits failed. HHblits stderr begin:')
for error_line in stderr.decode('utf-8').splitlines():
if error_line.strip():
logging.error(error_line.strip())
logging.error('HHblits stderr end')
raise RuntimeError('HHblits failed\nstdout:\n%s\n\nstderr:\n%s\n' % (
stdout.decode('utf-8'), stderr[:500_000].decode('utf-8')))
with open(a3m_path) as f:
a3m = f.read()
raw_output = dict(
a3m=a3m,
output=stdout,
stderr=stderr,
n_iter=self.n_iter,
e_value=self.e_value)
return [raw_output]
if __name__ == "__main__":
hhblits_binary_path = "/home/zheng/software/hhsuite-3.3.0-SSE2-Linux/bin/hhblits"
input_fasta_path = "/home/zheng/test/Q94K49.fasta"
#hhsuite数据库下载地址:https://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/hhsuite_dbs/
scop70_database_path = "/home/zheng/database/scop70_1.75_hhsuite3/scop70_1.75"
pdb70_database_path = "/home/zheng/database/pdb70_from_mmcif_latest/pdb70"
## 单一数据库
#hhblits_runner = HHBlits(binary_path = hhblits_binary_path,
# databases = [scop70_database_path])
## 多个数据库
database_lst = [scop70_database_path, pdb70_database_path]
hhblits_runner = HHBlits(binary_path = hhblits_binary_path,
databases = database_lst)
result = hhblits_runner.query(input_fasta_path)
print(len(result))
print(result[0]['a3m'])
#print(result)