nanopolish中fast5文件multi_to_single和ont_fast5_api的使用

最新推荐文章于 2024-05-20 10:02:05 发布

随风而逝*

最新推荐文章于 2024-05-20 10:02:05 发布

阅读量2.9k

点赞数 6

文章标签： nanopolish fast5 生物信息甲基化 python

本文链接：https://blog.csdn.net/weixin_41869644/article/details/88956794

版权

1.对于包含多条reads的合并（multi-read）fast5文件，需要使用ont_fast5_api按read-id将其拆分成每条read一个的single-read fast5文件。

conda create -y --name ont-fast5-api python=3.6  #创建py版本为3.6的虚拟环境

conda activate ont-fast5-api  #进入虚拟环境

pip install ont-fast5-api  #安装ont-fast5-api这个模块

或者从GitHub克隆源码进行安装。

git clone https://github.com/nanoporetech/ont_fast5_api
cd ont_fast5_api
python setup.py install

2.multi_to_single_fast5

multi_to_single_fast5 -h
usage: [-h] -i INPUT_PATH -s SAVE_PATH [--recursive] [-t THREADS] [-v]

optional arguments:
  -h, --help            show this help message and exit
  -i INPUT_PATH, --input_path INPUT_PATH
                        MultiRead fast5 file or path to directory of MultiRead
                        files
  -s SAVE_PATH, --save_path SAVE_PATH
                        Folder to output SingleRead fast5 files to
  --recursive           Search recursively through folders for for MultiRead
                        fast5 files
  -t THREADS, --threads THREADS
                        Number of threads to use
  -v, --version         show program's version number and exit

3.single_to_multi_fast5

single_to_multi_fast5 -h
usage:  [-h] -i INPUT_PATH -s SAVE_PATH [-f FILENAME_BASE] [-n BATCH_SIZE]
        [-t THREADS] [--recursive] [-v]

optional arguments:
  -h, --help            show this help message and exit
  -i INPUT_PATH, --input_path INPUT_PATH
                        Folder containing single read fast5 files
  -s SAVE_PATH, --save_path SAVE_PATH
                        Folder to output multi read files to
  -f FILENAME_BASE, --filename_base FILENAME_BASE
                        Root of output filename, default='batch' ->
                        'batch_0.fast5'
  -n BATCH_SIZE, --batch_size BATCH_SIZE
                        Number of reads per multi-read file
  -t THREADS, --threads THREADS
                        Number of threads to use
  --recursive           Search recursively through folders for for single_read
                        fast5 files
  -v, --version         show program's version number and exit

4.我主要用的是multi_to_single_fast5，把合并的fast5拆分成单条read的fast5：

multi_to_single_fast5 -i /home/fast5 -s /output/ --recursive -t 3

5.附上multi_to_single_fast5的脚本供大家参考：

from __future__ import division

from argparse import ArgumentParser
from multiprocessing import Pool
from collections import deque
import logging
import h5py
import os

from ont_fast5_api import CURRENT_FAST5_VERSION, __version__
from ont_fast5_api.conversion_tools.conversion_utils import get_fast5_file_list, get_progress_bar
from ont_fast5_api.fast5_file import Fast5File
from ont_fast5_api.multi_fast5 import MultiFast5File

# Configure the root logger so that INFO-level (and more severe) records are emitted.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Passed as the ``exc_info`` argument of the logger.error calls in
# convert_multi_to_single; False keeps tracebacks out of the failure messages.
exc_info = False



class EmptyFast5(Fast5File):
    """Fast5File subclass that opens its HDF5 file without building the full fast5 layout.

    ``create_single_f5`` uses it as the destination file and then copies the
    groups of a single read into it.
    """

    def _initialise_file(self):
        """Open the HDF5 file, stamp it with the current file version and mark it open.

        NOTE(review): presumably overrides the initialisation hook that
        Fast5File runs when a file is opened - confirm against Fast5File.
        """
        # We don't want to create/validate the full f5 data structure as most fields won't exist yet
        self.handle = h5py.File(self.filename, self.mode)
        # Only the version attribute is written here; everything else is copied in by the caller.
        self.handle.attrs['file_version'] = CURRENT_FAST5_VERSION
        self._is_open = True


def batch_convert_multi_files_to_single(input_path, output_folder, threads, recursive):
    """Convert every multi-read fast5 file found under ``input_path`` in parallel.

    Each input file is handed to ``convert_multi_to_single`` in a worker
    process and gets its own numbered subfolder (``"0"``, ``"1"``, ...) inside
    ``output_folder``. Once all workers are done, a tab-separated
    ``filename_mapping.txt`` is written to ``output_folder`` with one row per
    produced single-read file.

    Args:
        input_path: Passed to ``get_fast5_file_list`` to locate the input files.
        output_folder: Destination folder; created if it does not exist.
        threads: Number of worker processes given to ``Pool``.
        recursive: Passed to ``get_fast5_file_list`` together with ``input_path``.
    """
    workers = Pool(threads)
    multi_files = get_fast5_file_list(input_path, recursive)
    progress = get_progress_bar(len(multi_files))

    def _tick(_finished):
        # Called once per completed task; the task's result is not needed
        # here, only the fact that one more file is done.
        progress.update(progress.currval + 1)

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Queue one conversion task per input file, using its index as subfolder name.
    pending = [
        workers.apply_async(convert_multi_to_single,
                            args=(source, output_folder, str(index)),
                            callback=_tick)
        for index, source in enumerate(multi_files)
    ]

    # No more tasks will be submitted; wait for all of them to finish.
    workers.close()
    workers.join()

    mapping_path = os.path.join(output_folder, "filename_mapping.txt")
    with open(mapping_path, 'w') as mapping:
        mapping.write("multi_read_file\tsingle_read_file\n")
        for task in pending:
            names = task.get()
            # The first entry is the multi-read file name; the rest are the
            # single-read files created from it.
            source_name = names.popleft()
            for single_name in names:
                mapping.write("{}\t{}\n".format(source_name, single_name))
    progress.finish()


def convert_multi_to_single(input_file, output_folder, subfolder):
    """Split one multi-read fast5 file into one single-read fast5 file per read.

    Every read is written to ``<output_folder>/<subfolder>/<read_id>.fast5``.
    Ordinary errors are logged instead of raised, so a single bad read (or an
    unreadable input file) does not abort the rest of the conversion.

    Args:
        input_file: Path of the multi-read fast5 file to split.
        output_folder: Root folder for the output.
        subfolder: Name of the folder inside ``output_folder`` that receives
            the single-read files of this input file.

    Returns:
        A deque whose first element is the base name of ``input_file``,
        followed by the base name of every single-read file that was written
        successfully.
    """
    results = deque([os.path.basename(input_file)])
    try:
        with MultiFast5File(input_file, 'r') as multi_f5:
            for read_id in multi_f5.get_read_ids():
                # Handle failures per read so the remaining reads are still converted.
                try:
                    read = multi_f5.get_read(read_id)
                    output_file = os.path.join(output_folder, subfolder, "{}.fast5".format(read_id))
                    create_single_f5(output_file, read)
                    results.append(os.path.basename(output_file))
                except Exception as e:
                    logger.error("{}\n\tFailed to copy read '{}' from {}"
                                 "".format(str(e), read_id, input_file), exc_info=exc_info)
    except Exception as e:
        logger.error("{}\n\tFailed to copy files from: {}"
                     "".format(e, input_file), exc_info=exc_info)
    # Return after the try statement rather than from a ``finally`` clause:
    # a ``return`` inside ``finally`` discards any in-flight exception, which
    # would silently swallow KeyboardInterrupt / SystemExit as well.
    return results


def create_single_f5(output_file, read):
    """Write the groups of one read into a new single-read fast5 file.

    The groups found in ``read.handle`` are copied as follows:

    * ``Raw`` goes to ``Raw/Reads/Read_<read_number>``, where ``read_number``
      is taken from the attributes of the ``Raw`` group;
    * ``channel_id``, ``context_tags`` and ``tracking_id`` go below
      ``UniqueGlobalKey``;
    * every other group is copied under its original name.

    Args:
        output_file: Path of the file to create. Its parent folder is created
            when missing; a bare file name is written to the current folder.
        read: Read object obtained from a ``MultiFast5File``; only its
            ``handle`` attribute is used.
    """
    output_dir = os.path.dirname(output_file)
    # A bare file name has an empty dirname and os.makedirs('') raises, so
    # only create a folder when the path actually names one.
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)
    with EmptyFast5(output_file, 'w') as single_f5:
        for group in read.handle:
            if group == "Raw":
                read_number = read.handle["Raw"].attrs["read_number"]
                single_f5.handle.copy(read.handle[group], "Raw/Reads/Read_{}".format(read_number))
            elif group in ("channel_id", "context_tags", "tracking_id"):
                # Create the shared parent group on first use.
                if "UniqueGlobalKey" not in single_f5.handle:
                    single_f5.handle.create_group("UniqueGlobalKey")
                single_f5.handle.copy(read.handle[group], "UniqueGlobalKey/{}".format(group))
            else:
                single_f5.handle.copy(read.handle[group], group)


def main():
    """Parse the command-line arguments and run the multi-to-single conversion."""
    # The program name is deliberately left empty, as in the original script.
    parser = ArgumentParser("")
    parser.add_argument('-i', '--input_path', required=True,
                        help="MultiRead fast5 file or path to directory of MultiRead files")
    parser.add_argument('-s', '--save_path', required=True,
                        help="Folder to output SingleRead fast5 files to")
    # Help text previously contained a duplicated word ("for for").
    parser.add_argument('--recursive', action='store_true',
                        help="Search recursively through folders for MultiRead fast5 files")
    parser.add_argument('-t', '--threads', type=int, default=1, required=False,
                        help="Number of threads to use")
    parser.add_argument('-v', '--version', action='version', version=__version__)
    args = parser.parse_args()

    batch_convert_multi_files_to_single(args.input_path, args.save_path, args.threads, args.recursive)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

6.具体参考：https://github.com/nanoporetech/ont_fast5_api/tree/master/ont_fast5_api

随风而逝*

关注

6
点赞
踩
6

收藏

觉得还不错? 一键收藏
4
评论
nanopolish中fast5文件multi_to_single和ont_fast5_api的使用

1.针对许多reads-id合并的fast5文件，我们需要使用ont_fast5_api去根据reads-id拆分成单个fast5文件。conda create -y --name ont-fast5-api python=3.6 #创建py版本为3.6的虚拟环境conda activate ont-fast5-api #进入虚拟环境pip install ont-fast5-a...
复制链接

扫一扫