python运行Hmmsearch二进制命令的包装器类

最新推荐文章于 2024-09-15 22:31:42 发布
qq_27390023
最新推荐文章于 2024-09-15 22:31:42 发布
阅读量226
点赞数
文章标签： python 生物信息学
本文链接：https://blog.csdn.net/qq_27390023/article/details/134399650
版权
hmmsearch 是 HMMER 软件包中的一个命令行工具，它使用轮廓隐马尔可夫模型（profile HMM）在蛋白质序列数据库中搜索同源序列，常用于生物信息学中的蛋白质序列分析。隐马尔可夫模型（HMM）是一种统计模型，特别适用于对具有状态转移概率的序列数据进行建模。给定一个由已知蛋白质家族的多序列比对构建的 HMM，hmmsearch 可以在序列数据库中找出与该家族相似的序列，从而帮助推测未知序列可能的结构和功能。下面的代码用 Python 封装了 hmmbuild 和 hmmsearch 两个二进制程序。

import dataclasses
import os
import re
import subprocess
from typing import List, Optional, Sequence
from absl import logging
import contextlib
import tempfile
import shutil
import tempfile
import time

@dataclasses.dataclass(frozen=True)
class TemplateHit:
  """Immutable record describing a single template hit.

  The field notes below describe how `parse_hmmsearch_a3m` in this file fills
  the record in; other producers may use the fields differently.
  """
  # 1-based position of the entry in the parsed a3m list (entries that the
  # parser skips still consume an index).
  index: int
  # Hit identifier, formatted as '<pdb_id>_<chain>' by the hmmsearch parser.
  name: str
  # Number of upper-case characters in the aligned hit sequence.
  aligned_cols: int
  # Optional score; the hmmsearch parser always stores None here.
  sum_probs: Optional[float]
  # The query sequence the hit was found for.
  query: str
  # Aligned hit sequence (upper-cased by the hmmsearch parser).
  hit_sequence: str
  # Per-column indices for the query, as returned by `_get_indices`.
  # NOTE(review): `_get_indices` is not visible here - confirm its semantics.
  indices_query: List[int]
  # Per-column indices for the hit, as returned by `_get_indices`.
  indices_hit: List[int]


def convert_stockholm_to_a3m(stockholm_format: str,
                             max_sequences: Optional[int] = None,
                             remove_first_row_gaps: bool = True) -> str:
  """Converts an MSA in Stockholm format to the A3M format.

  Args:
    stockholm_format: The alignment as a Stockholm-formatted string. Rows that
      are split over several blocks are concatenated per sequence name.
    max_sequences: If set (and non-zero), only the first `max_sequences`
      distinct sequence names are kept.
    remove_first_row_gaps: If True, the first sequence is treated as the query:
      columns where the query has a gap ('-') are dropped when the row also has
      a gap there, and emitted in lower case (an insertion) otherwise.

  Returns:
    The alignment as an A3M/FASTA-style string with a terminating newline. Each
    header is '>name description', with the description taken from the
    matching '#=GS <name> DE ...' line (empty when there is none). An input
    without any alignment rows yields a single newline.
  """
  lines = stockholm_format.splitlines()

  # Pass 1: collect the aligned rows, stitching multi-block alignments.
  sequences = {}
  for line in lines:
    # Blank lines, markup ('#...') and the end marker ('//') carry no residues.
    if not line.strip() or line.startswith(('#', '//')):
      continue
    seqname, aligned_seq = line.split(maxsplit=1)
    if seqname not in sequences:
      # Once the limit is hit, new names are ignored but rows that extend an
      # already-kept sequence are still appended.
      if max_sequences and len(sequences) >= max_sequences:
        continue
      sequences[seqname] = ''
    sequences[seqname] += aligned_seq

  # Pass 2: collect descriptions, only for sequences that were kept. Example:
  # #=GS UniRef90_Q9H5Z4/4-78            DE [subseq from] cDNA: FLJ22755 ...
  descriptions = {}
  for line in lines:
    if line[:4] != '#=GS':
      continue
    columns = line.split(maxsplit=3)
    seqname, feature = columns[1:3]
    if feature != 'DE' or seqname not in sequences:
      continue
    descriptions[seqname] = columns[3] if len(columns) == 4 else ''
    if len(descriptions) == len(sequences):
      break  # Every kept sequence has its description.

  # Pass 3: convert each row from sto to a3m.
  query_non_gaps = None
  if remove_first_row_gaps and sequences:
    # The query is assumed to be the first sequence of the alignment.
    query_sequence = next(iter(sequences.values()))
    query_non_gaps = [res != '-' for res in query_sequence]

  a3m_sequences = {}
  for seqname, sto_sequence in sequences.items():
    # Dots are optional in a3m format and are commonly removed.
    # NOTE(review): dots are stripped before the query mask is applied, so a
    # row containing '.' is shifted relative to the mask - confirm inputs used
    # with remove_first_row_gaps=True never contain '.'.
    out_sequence = sto_sequence.replace('.', '')
    if query_non_gaps is not None:
      kept = []
      for is_query_residue, res in zip(query_non_gaps, out_sequence):
        if is_query_residue:
          kept.append(res)
        elif res != '-':
          # Residue aligned to a query gap: an insertion, written lower-case.
          kept.append(res.lower())
      out_sequence = ''.join(kept)
    a3m_sequences[seqname] = out_sequence

  fasta_chunks = (f">{k} {descriptions.get(k, '')}\n{a3m_sequences[k]}"
                  for k in a3m_sequences)
  return '\n'.join(fasta_chunks) + '\n'  # Include terminating newline.


def parse_hmmsearch_a3m(query_sequence: str,
                        a3m_string: str,
                        skip_first: bool = True) -> Sequence[TemplateHit]:
  """Parses an a3m string produced by hmmsearch.

  Relies on the module-level helpers `parse_fasta`, `_get_indices` and
  `_parse_hmmsearch_description`.

  Args:
    query_sequence: The query sequence.
    a3m_string: The a3m string produced by hmmsearch.
    skip_first: Whether to skip the first sequence in the a3m string.

  Returns:
    A list with one `TemplateHit` per protein entry of the a3m string, in input
    order. The list is empty when no entry qualifies.
  """
  # Zip the sequences and descriptions together, optionally dropping the first
  # (query) entry.
  parsed_a3m = list(zip(*parse_fasta(a3m_string)))
  if skip_first:
    parsed_a3m = parsed_a3m[1:]

  indices_query = _get_indices(query_sequence, start=0)

  hits = []
  for i, (hit_sequence, hit_description) in enumerate(parsed_a3m, start=1):
    if 'mol:protein' not in hit_description:
      continue  # Skip non-protein chains.
    metadata = _parse_hmmsearch_description(hit_description)
    # Aligned columns are only the match states, i.e. the upper-case residues
    # ('-' is never upper-case, so gaps are excluded automatically).
    aligned_cols = sum(1 for r in hit_sequence if r.isupper())
    # metadata.start is shifted by one before being used as the index offset.
    indices_hit = _get_indices(hit_sequence, start=metadata.start - 1)

    hits.append(TemplateHit(
        index=i,
        name=f'{metadata.pdb_id}_{metadata.chain}',
        aligned_cols=aligned_cols,
        sum_probs=None,
        query=query_sequence,
        hit_sequence=hit_sequence.upper(),
        indices_query=indices_query,
        indices_hit=indices_hit,
    ))

  # Return the whole collection, not just the last hit that was built.
  return hits


@contextlib.contextmanager
def tmpdir_manager(base_dir: Optional[str] = None):
  """Yields a freshly created temporary directory and removes it afterwards.

  Args:
    base_dir: Optional parent directory for the temporary directory; passed to
      `tempfile.mkdtemp` as `dir`.

  Yields:
    The path of the temporary directory. The directory and its contents are
    deleted when the `with` block exits, whether or not it raised.
  """
  scratch_dir = tempfile.mkdtemp(dir=base_dir)
  try:
    yield scratch_dir
  finally:
    # Best-effort cleanup: errors while deleting are ignored.
    shutil.rmtree(scratch_dir, ignore_errors=True)


@contextlib.contextmanager
def timing(msg: str):
  """Logs the start of the wrapped block and, on normal exit, its duration.

  Args:
    msg: Label inserted into both log messages.

  Yields:
    Nothing. If the wrapped block raises, the exception propagates and the
    'Finished' message is not logged.
  """
  logging.info('Started %s', msg)
  started_at = time.time()
  yield
  elapsed = time.time() - started_at
  logging.info('Finished %s in %.3f seconds', msg, elapsed)


class Hmmsearch(object):
  """Python wrapper of the hmmsearch binary.

  A query Stockholm MSA is first turned into a profile HMM with hmmbuild; the
  profile is then searched against a sequence database with hmmsearch and the
  resulting alignment (Stockholm format) is returned as a string.
  """

  def __init__(self,
               *,
               binary_path: str,
               hmmbuild_binary_path: str,
               database_path: str,
               flags: Optional[Sequence[str]] = None,
               n_cpu: int = 8):
    """Initializes the Python hmmsearch wrapper.

    Args:
      binary_path: The path to the hmmsearch executable.
      hmmbuild_binary_path: The path to the hmmbuild executable. Used to build
        an hmm from an input MSA.
      database_path: The path to the hmmsearch database (FASTA format).
      flags: List of flags to be used by hmmsearch. When None, a permissive
        default set of filter and E-value thresholds is used.
      n_cpu: Value passed to hmmsearch's `--cpu` option.

    Raises:
      ValueError: If the database file does not exist.
    """
    self.binary_path = binary_path
    self.hmmbuild_runner = Hmmbuild(binary_path=hmmbuild_binary_path)
    self.database_path = database_path
    self.n_cpu = n_cpu
    if flags is None:
      # Default hmmsearch run settings.
      flags = ['--F1', '0.1',
               '--F2', '0.1',
               '--F3', '0.1',
               '--incE', '100',
               '-E', '100',
               '--domE', '100',
               '--incdomE', '100']
    self.flags = flags

    if not os.path.exists(self.database_path):
      logging.error('Could not find hmmsearch database %s', database_path)
      raise ValueError(f'Could not find hmmsearch database {database_path}')

  @property
  def output_format(self) -> str:
    """Format of the string returned by `query` ('sto' = Stockholm)."""
    return 'sto'

  @property
  def input_format(self) -> str:
    """Format of the MSA string expected by `query` ('sto' = Stockholm)."""
    return 'sto'

  def query(self, msa_sto: str) -> str:
    """Queries the database using hmmsearch using a given stockholm msa.

    Args:
      msa_sto: The query MSA in Stockholm format.

    Returns:
      The hmmsearch output alignment in Stockholm format.

    Raises:
      RuntimeError: If hmmbuild or hmmsearch exits with a non-zero code.
    """
    # 'hand' makes hmmbuild pick the consensus columns from the reference
    # annotation in the MSA instead of choosing them automatically.
    hmm = self.hmmbuild_runner.build_profile_from_sto(msa_sto,
                                                      model_construction='hand')
    return self.query_with_hmm(hmm)

  def query_with_hmm(self, hmm: str) -> str:
    """Queries the database using hmmsearch using a given hmm.

    Args:
      hmm: The profile in HMM text format.

    Returns:
      The contents of the alignment file written by hmmsearch (`-A`), in
      Stockholm format.

    Raises:
      RuntimeError: If hmmsearch exits with a non-zero code.
    """
    with tmpdir_manager() as query_tmp_dir:
      hmm_input_path = os.path.join(query_tmp_dir, 'query.hmm')
      out_path = os.path.join(query_tmp_dir, 'output.sto')
      with open(hmm_input_path, 'w', encoding='utf-8') as f:
        f.write(hmm)

      cmd = [
          self.binary_path,
          '--noali',  # Don't include the alignment in stdout.
          '--cpu', str(self.n_cpu),
      ]
      # If adding flags, we have to do so before the output and input:
      if self.flags:
        cmd.extend(self.flags)
      cmd.extend([
          '-A', out_path,
          hmm_input_path,
          self.database_path,
      ])

      logging.info('Launching sub-process %s', cmd)
      with timing(
          f'hmmsearch ({os.path.basename(self.database_path)}) query'):
        # The exit code is inspected below so that the error can carry the
        # captured stdout/stderr.
        process = subprocess.run(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=False)

      if process.returncode:
        raise RuntimeError(
            'hmmsearch failed:\nstdout:\n%s\n\nstderr:\n%s\n' % (
                process.stdout.decode('utf-8'),
                process.stderr.decode('utf-8')))

      # Read the result before the temporary directory is removed.
      with open(out_path, encoding='utf-8') as f:
        out_msa = f.read()

    return out_msa

  def get_template_hits(self,
                        output_string: str,
                        input_sequence: str) -> Sequence[TemplateHit]:
    """Gets parsed template hits from the raw string output by the tool.

    Args:
      output_string: The Stockholm alignment returned by `query` or
        `query_with_hmm`.
      input_sequence: The query sequence the search was run for.

    Returns:
      The result of `parse_hmmsearch_a3m` on the converted alignment.
    """
    # Keep every column and every row: the alignment is converted without
    # dropping query-gap columns and no entry is skipped as "the query".
    a3m_string = convert_stockholm_to_a3m(output_string,
                                          remove_first_row_gaps=False)
    template_hits = parse_hmmsearch_a3m(
        query_sequence=input_sequence,
        a3m_string=a3m_string,
        skip_first=False)
    return template_hits


class Hmmbuild(object):
  """Python wrapper of the hmmbuild binary."""

  def __init__(self,
               *,
               binary_path: str,
               singlemx: bool = False):
    """Initializes the Python hmmbuild wrapper.

    The binary path is only stored here; it is not checked until a profile is
    built.

    Args:
      binary_path: The path to the hmmbuild executable.
      singlemx: Whether to use --singlemx flag. If True, it forces HMMBuild to
        just use a common substitution score matrix.
    """
    self.binary_path = binary_path
    self.singlemx = singlemx

  def build_profile_from_sto(self, sto: str,
                             model_construction: str = 'fast') -> str:
    """Builds a HMM for the aligned sequences given as a Stockholm string.

    Args:
      sto: A string with the aligned sequences in the Stockholm format.
      model_construction: Whether to use reference annotation in the msa to
        determine consensus columns ('hand') or default ('fast').

    Returns:
      A string with the profile in the HMM format.

    Raises:
      RuntimeError: If hmmbuild fails.
      ValueError: If `model_construction` is neither 'hand' nor 'fast'.
    """
    return self._build_profile(sto, model_construction=model_construction)

  def build_profile_from_a3m(self, a3m: str) -> str:
    """Builds a HMM for the aligned sequences given as an A3M string.

    Args:
      a3m: A string with the aligned sequences in the A3M format.

    Returns:
      A string with the profile in the HMM format.

    Raises:
      RuntimeError: If hmmbuild fails.
    """
    # Header lines are kept verbatim; in sequence lines the lower-case
    # (inserted) residues are removed. Every line gets a trailing newline.
    msa = ''.join(
        (line if line.startswith('>') else re.sub('[a-z]+', '', line)) + '\n'
        for line in a3m.splitlines())
    return self._build_profile(msa, model_construction='fast')

  def _build_profile(self, msa: str, model_construction: str = 'fast') -> str:
    """Builds a HMM for the aligned sequences given as an MSA string.

    Args:
      msa: A string with the aligned sequences, in A3M or STO format.
      model_construction: Whether to use reference annotation in the msa to
        determine consensus columns ('hand') or default ('fast').

    Returns:
      A string with the profile in the HMM format.

    Raises:
      RuntimeError: If hmmbuild fails.
      ValueError: If unspecified arguments are provided.
    """
    if model_construction not in {'hand', 'fast'}:
      raise ValueError(f'Invalid model_construction {model_construction} - '
                       'only hand and fast supported.')

    with tmpdir_manager() as query_tmp_dir:
      input_query = os.path.join(query_tmp_dir, 'query.msa')
      output_hmm_path = os.path.join(query_tmp_dir, 'output.hmm')

      with open(input_query, 'w', encoding='utf-8') as f:
        f.write(msa)

      cmd = [self.binary_path]
      # If adding flags, we have to do so before the output and input:
      if model_construction == 'hand':
        cmd.append(f'--{model_construction}')
      if self.singlemx:
        cmd.append('--singlemx')
      cmd.extend([
          '--amino',
          output_hmm_path,
          input_query,
      ])

      logging.info('Launching subprocess %s', cmd)
      with timing('hmmbuild query'):
        # The exit code is inspected below so that the error can carry the
        # captured stdout/stderr.
        process = subprocess.run(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=False)
        stdout = process.stdout.decode('utf-8')
        stderr = process.stderr.decode('utf-8')
        logging.info('hmmbuild stdout:\n%s\n\nstderr:\n%s\n', stdout, stderr)

      if process.returncode:
        raise RuntimeError('hmmbuild failed\nstdout:\n%s\n\nstderr:\n%s\n'
                           % (stdout, stderr))

      # Read the profile before the temporary directory is removed.
      with open(output_hmm_path, encoding='utf-8') as f:
        hmm = f.read()

    return hmm


if __name__ == "__main__":
  # Manual smoke test of the Hmmsearch class.
  # NOTE(review): all paths below are hard-coded to one developer's machine;
  # adjust them before running this script elsewhere.
  hmm_search_path = "/home/zheng/anaconda3/envs/deep_learning/bin/hmmsearch"
  hmm_build_path = "/home/zheng/anaconda3/envs/deep_learning/bin/hmmbuild"
  # FASTA file that hmmsearch searches against.
  database_path = "/home/zheng/test/test_database/globins45.fa"  
  # The constructor raises ValueError if `database_path` does not exist.
  hmmer_runner = Hmmsearch(binary_path = hmm_search_path,
                           hmmbuild_binary_path = hmm_build_path,
                           database_path = database_path)
  # The query is a Stockholm-format MSA read from disk.
  with open("/home/zheng/test/HBB_HUMAN.sto") as f:
    sto_str = f.read()
  # Builds a profile with hmmbuild, runs hmmsearch and prints the resulting
  # Stockholm alignment.
  result = hmmer_runner.query(sto_str)
  print(result)