我想通过阅读源码了解alphafold2的实现细节,但看完这个py文件后发现里面并没有实现细节,只有对传入参数的设置。不过既然读完了代码,还是做一下记录。
1. 检查必需的参数:数据库文件夹(data_dir)、fasta文件(fasta_paths)、模板最大日期(max_template_date)
运行main函数
if __name__ == '__main__':
    # Mark these three flags as required, then hand control to app.run.
    flags.mark_flags_as_required(
        ['data_dir', 'fasta_paths', 'max_template_date'])
    app.run(main)
2. main()函数
设置数据库路径(8个)、模板路径、输出文件路径
if len(argv) > 1:
raise app.UsageError('Too many command-line arguments.')
# You can individually override the following paths if you have placed the
# data in locations other than the FLAGS.data_dir.
# Path to the Uniref90 database for use by JackHMMER.
uniref90_database_path = os.path.join(
FLAGS.data_dir, 'uniref90', 'uniref90.fasta')
# Path to the Uniprot database for use by JackHMMER.
uniprot_database_path = os.path.join(
FLAGS.data_dir, 'uniprot', 'uniprot.fasta')
# Path to the MGnify database for use by JackHMMER.
mgnify_database_path = os.path.join(
FLAGS.data_dir, 'mgnify', 'mgy_clusters_2018_12.fa')
# Path to the BFD database for use by HHblits.
bfd_database_path = os.path.join(
FLAGS.data_dir, 'bfd',
'bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt')
# Path to the Small BFD database for use by JackHMMER.
small_bfd_database_path = os.path.join(
FLAGS.data_dir, 'small_bfd', 'bfd-first_non_consensus_sequences.fasta')
# Path to the Uniclust30 database for use by HHblits.
uniclust30_database_path = os.path.join(
FLAGS.data_dir, 'uniclust30', 'uniclust30_2018_08', 'uniclust30_2018_08')
# Path to the PDB70 database for use by HHsearch.
pdb70_database_path = os.path.join(FLAGS.data_dir, 'pdb70', 'pdb70')
# Path to the PDB seqres database for use by hmmsearch.
pdb_seqres_database_path = os.path.join(
FLAGS.data_dir, 'pdb_seqres', 'pdb_seqres.txt')
# Path to a directory with template mmCIF structures, each named <pdb_id>.cif.
template_mmcif_dir = os.path.join(FLAGS.data_dir, 'pdb_mmcif', 'mmcif_files')
# Path to a file mapping obsolete PDB IDs to their replacements.
obsolete_pdbs_path = os.path.join(FLAGS.data_dir, 'pdb_mmcif', 'obsolete.dat')
alphafold_path = pathlib.Path(__file__).parent.parent
data_dir_path = pathlib.Path(FLAGS.data_dir)
if alphafold_path == data_dir_path or alphafold_path in data_dir_path.parents:
raise app.UsageError(
f'The download directory {FLAGS.data_dir} should not be a subdirectory '
f'in the AlphaFold repository directory. If it is, the Docker build is '
f'slow since the large databases are copied during the image creation.')
为模型使用的每个文件和目录创建挂载点,并把传给容器的命令行参数中的路径改为对应的挂载路径
for i, fasta_path in enumerate(FLAGS.fasta_paths):
mount, target_path = _create_mount(f'fasta_path_{i}', fasta_path)
mounts.append(mount)
target_fasta_paths.append(target_path)
command_args.append(f'--fasta_paths={",".join(target_fasta_paths)}')
把fasta文件所在的目录映射到/mnt/fasta_path_{i}目录下,返回挂载对象(Mount)和fasta文件在挂载目录下的路径
def _create_mount(mount_name: str, path: str) -> Tuple[types.Mount, str]:
    """Build a read-only bind mount exposing `path` inside the container.

    Directories are mounted directly. For anything that is not a directory,
    the parent directory is mounted and the returned path points at the
    entry of the same name below the mount target.

    Args:
        mount_name: Name of the mount directory created under
            _ROOT_MOUNT_DIRECTORY.
        path: Host path of the file or directory; made absolute before use.

    Returns:
        A tuple of the types.Mount object and the path, as a string, at which
        `path` is reachable through the mount target.

    Raises:
        ValueError: If the host directory to be mounted does not exist.
    """
    path = pathlib.Path(path).absolute()
    target_path = pathlib.Path(_ROOT_MOUNT_DIRECTORY, mount_name)

    # Only directories are bind-mounted: a file is exposed via its parent.
    is_directory = path.is_dir()
    source_path = path if is_directory else path.parent
    if not source_path.exists():
        raise ValueError(f'Failed to find source directory "{source_path}" to '
                         'mount in Docker container.')

    logging.info('Mounting %s -> %s', source_path, target_path)
    mount = types.Mount(target=str(target_path), source=str(source_path),
                        type='bind', read_only=True)

    # For a file, append its name so the result addresses the file itself.
    mounted_path = target_path if is_directory else target_path / path.name
    return mount, str(mounted_path)
将path相对路径转换为绝对路径
path = pathlib.Path(path).absolute()
目标路径为:/mnt/fasta_path_{i} i=0,1,2,...(enumerate从0开始计数)
target_path = pathlib.Path(_ROOT_MOUNT_DIRECTORY, mount_name)
根据传入参数model_preset的不同(是否为多聚体multimer),选择不同的数据库
if FLAGS.model_preset == 'multimer':
database_paths.append(('uniprot_database_path', uniprot_database_path))
database_paths.append(('pdb_seqres_database_path',
pdb_seqres_database_path))
else:
database_paths.append(('pdb70_database_path', pdb70_database_path))
挂载数据库路径
for name, path in database_paths:
if path:
mount, target_path = _create_mount(name, path)
mounts.append(mount)
command_args.append(f'--{name}={target_path}')
挂载输出路径/mnt/output/
output_target_path = os.path.join(_ROOT_MOUNT_DIRECTORY, 'output')
mounts.append(types.Mount(output_target_path, FLAGS.output_dir, type='bind'))
初始化docker客户端
client = docker.from_env()
运行docker容器,返回一个Container对象
container = client.containers.run(
image=FLAGS.docker_image_name,
command=command_args,
device_requests=device_requests,
remove=True,
detach=True,
mounts=mounts,
user=FLAGS.docker_user,
environment={
'NVIDIA_VISIBLE_DEVICES': FLAGS.gpu_devices,
# The following flags allow us to make predictions on proteins that
# would typically be too long to fit into GPU memory.
'TF_FORCE_UNIFIED_MEMORY': '1',
'XLA_PYTHON_CLIENT_MEM_FRACTION': '4.0',
})
environment:其中TF_FORCE_UNIFIED_MEMORY和XLA_PYTHON_CLIENT_MEM_FRACTION这两个环境变量使我们能够预测那些通常因序列太长而无法放入GPU内存的蛋白质。
run()函数的解析如下图:参考 http://t.csdn.cn/Qwqel
有一些没有写出来,应该就是运行时设置的一些参数
更多关于Python第三方库操作Docker的内容可以参考官方手册:https://docker-py.readthedocs.io/en/stable/index.html
查看官方文档后把剩下的参数列出:
device_requests | list列表 | 将主机的资源(例如GPU)公开给容器 |
mounts | list列表 | 要添加到容器的挂载列表(每一项为docker.types.Mount对象) |
environment | dict or list 字典或列表 | 要在容器内设置的环境变量 |