真核微生物基因组质量评估工具EukCC的安装和详细使用方法-CSDN博客

本文链接：https://blog.csdn.net/2401_87033955/article/details/142179739

eukcc folder
–out outfolder
–threads 8
–links linktable.csv
binfolder


EukCC 首先将分别对所有bins进行运行。随后，它会识别那些至少达到50%完整度但尚未超过100-improve\_percent的中等质量bins。接下来，它会找出那些通过至少100对端读配对与这些中等质量bins相连接的bins。若经过合并后bin的质量评分有所提高，则该bin将会被合并。


已合并的bins可以在输出文件夹中找到。


### 警示


![](https://img-blog.csdnimg.cn/direct/d419b279698d4ffcbc7d16bab53f4b1b.png)


blinks.py

#!/usr/bin/env python3
import pysam
from Bio import SeqIO
from collections import defaultdict
import os
import argparse
import logging
import csv

def is_in(read, contig_map, within=1000):
if read.reference_name not in contig_map.keys():
return False
if read.reference_start <= within or read.reference_end <= within:
return True
elif read.reference_start > (
contig_map[read.reference_name] - within
) or read.reference_end > (contig_map[read.reference_name] - within):
return True
else:
return False

def keep_read(read, contig_map, within=1000, min_ANI=98, min_cov=0):
ani = (
(read.query_alignment_length - read.get_tag(“NM”))
/ float(read.query_alignment_length)
* 100
)
cov = read.query_alignment_length / float(read.query_length) * 100

if ani >= min_ANI and cov >= min_cov and is_in(read, contig_map, within) is True:
    return True
else:
    return False

def contig_map(bindir, suffix=“.fa”):
m = {}
for f in os.listdir(bindir):
if f.endswith(suffix) is False:
continue
path = os.path.join(bindir, f)
with open(path, “r”) as handle:
for record in SeqIO.parse(handle, “fasta”):
m[record.name] = len(record.seq)
return m

def bin_map(bindir, suffix=“.fa”):
contigs = defaultdict(str)
contigs_per_bin = defaultdict(int)
for f in os.listdir(bindir):
if f.endswith(suffix) is False:
continue
path = os.path.join(bindir, f)
binname = os.path.basename(f)
with open(path, “r”) as handle:
for record in SeqIO.parse(handle, “fasta”):
contigs[record.name] = binname
contigs_per_bin[binname] += 1
return contigs, contigs_per_bin

def read_pair_generator(bam):
“”"
Generate read pairs in a BAM file or within a region string.
Reads are added to read_dict until a pair is found.
From: https://www.biostars.org/p/306041/
“”"
read_dict = defaultdict(lambda: [None, None])
for read in bam.fetch():
if not read.is_paired or read.is_secondary or read.is_supplementary:
continue
qname = read.query_name
if qname not in read_dict:
if read.is_read1:
read_dict[qname][0] = read
else:
read_dict[qname][1] = read
else:
if read.is_read1:
yield read, read_dict[qname][1]
else:
yield read_dict[qname][0], read
del read_dict[qname]

def read_bam_file(bamf, link_table, cm, within, ANI):
samfile = pysam.AlignmentFile(bamf, “rb”)

# generate link table
logging.info("Parsing Bam file. This can take a few moments")
for read, mate in read_pair_generator(samfile):
    if keep_read(read, cm, within, min_ANI=ANI) and keep_read(
        mate, cm, within, min_ANI=ANI
    ):
        # fill in the table
        link_table[read.reference_name][mate.reference_name] += 1
        if read.reference_name != mate.reference_name:
            link_table[mate.reference_name][read.reference_name] += 1

return link_table

def main():
# set arguments
# arguments are passed to classes
parser = argparse.ArgumentParser(
description=“Evaluate completeness and contamination of a MAG.”
)
parser.add_argument(“bindir”, type=str, help=“Run script on these bins”)
parser.add_argument(
“bam”,
type=str,
help=“Bam file(s) with reads aligned against all contigs making up the bins”,
nargs=“+”,
)
parser.add_argument(
“–out”,
“-o”,
type=str,
required=False,
help=“Path to output table (Default: links.csv)”,
default=“links.csv”,
)
parser.add_argument(
“–ANI”, type=float, required=False, help=“ANI of matching read”, default=99
)
parser.add_argument(
“–within”,
type=int,
required=False,
help=“Within this many bp we need the read to map”,
default=1000,
)
parser.add_argument(
“–contigs”,
“-c”,
action=“store_true”,
default=False,
help=“Instead of bins print contigs”,
)
parser.add_argument(
“–quiet”,
“-q”,
dest=“quiet”,
action=“store_true”,
default=False,
help=“Silcence most output”,
)
parser.add_argument(
“–debug”,
“-d”,
action=“store_true”,
default=False,
help=“Debug and thus ignore safety”,
)
args = parser.parse_args()

# define logging
logLevel = logging.INFO
if args.quiet:
    logLevel = logging.WARNING
elif args.debug:
    logLevel = logging.DEBUG
logging.basicConfig(
    format="%(asctime)s %(message)s",
    datefmt="%d-%m-%Y %H:%M:%S: ",
    level=logLevel,
)

bindir = args.bindir

cm = contig_map(bindir)
bm, contigs_per_bin = bin_map(bindir)
logging.debug("Found {} contigs".format(len(cm)))

link_table = defaultdict(lambda: defaultdict(int))
bin_table = defaultdict(lambda: defaultdict(int))

# iterate all bam files
for bamf in args.bam:
    link_table = read_bam_file(bamf, link_table, cm, args.within, args.ANI)

logging.debug("Created link table with {} entries".format(len(link_table)))

# generate bin table
for contig_1, dic in link_table.items():
    for contig_2, links in dic.items():
        bin_table[bm[contig_1]][bm[contig_2]] += links

logging.debug("Created bin table with {} entries".format(len(bin_table)))

out_data = []
logging.debug("Constructing output dict")
if args.contigs:
    for contig_1, linked in link_table.items():
        for contig_2, links in linked.items():
            out_data.append(
                {
                    "bin_1": bm[contig_1],
                    "bin_2": bm[contig_2],
                    "contig_1": contig_1,
                    "contig_2": contig_2,
                    "links": links,
                    "bin_1_contigs": contigs_per_bin[bm[contig_1]],
                    "bin_2_contigs": contigs_per_bin[bm[contig_2]],
                }
            )
else:
    for bin_1, dic in bin_table.items():
        for bin_2, links in dic.items():
            out_data.append({"bin_1": bin_1, "bin_2": bin_2, "links": links})

logging.debug("Out data has {} rows".format(len(out_data)))
# results
logging.info("Writing output")
with open(args.out, "w") as fout:
    if len(out_data) > 0:
        cout = csv.DictWriter(fout, fieldnames=list(out_data[0].keys()))
        cout.writeheader()
        for row in out_data:
            cout.writerow(row)
    else:
        logging.warning("No rows to write")

if name == “main”:
main()


scripts/filter\_euk\_bins.py

#!/usr/bin/env python3

This file is part of the EukCC (https://github.com/openpaul/eukcc).

Copyright © 2019 Paul Saary

This program is free software: you can redistribute it and/or modify

it under the terms of the GNU General Public License as published by

the Free Software Foundation, version 3.

This program is distributed in the hope that it will be useful, but

WITHOUT ANY WARRANTY; without even the implied warranty of

MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

General Public License for more details.

You should have received a copy of the GNU General Public License

along with this program. If not, see http://www.gnu.org/licenses/.

provides all file operation functions

used inthis package

import os
import argparse
import subprocess
import logging
import tempfile
import gzip
from multiprocessing import Pool

backup fasta handler, so we can use readonly directories

class fa_class:
def init(self, seq, name, long_name):
self.seq = seq
self.name = name
self.long_name = long_name

def __str__(self):
    return self.seq

def __len__(self):
    return len(self.seq)

def Fasta(path):
“”"
Iterator for fasta files
“”"
entry = False

with open(path) as fin:
    for line in fin:
        if line.startswith(">"):
            if entry is not False:
                entry.seq = "".join(entry.seq)
                yield entry
            # define new entry
            long_name = line.strip()[1:]
            name = long_name.split()[0]
            entry = fa_class([], name, long_name)
        else:
            entry.seq.append(line.strip())
    # yield last one
    entry.seq = "".join(entry.seq)
    yield entry

def gunzip(path, tmp_dir):
“”"
Gunzip a file for EukRep
“”"
if path.endswith(“.gz”):
fna_path = os.path.join(tmp_dir, “contigs.fna”)
logging.debug(“Going to unzip fasta into {}”.format(fna_path))
with gzip.open(path, “r”) as fin, open(fna_path, “w”) as fout:
for line in fin:
fout.write(line.decode())
path = fna_path
logging.debug(“Done unzipping {}”.format(fna_path))

return path

class EukRep:
“”“Class to call and handle EukRep data”“”

def __init__(self, fasta, eukout, bacout=None, minl=1500, tie="euk"):
    self.fasta = fasta
    self.eukout = eukout
    self.bacout = bacout
    self.minl = minl
    self.tie = tie

def run(self):
    # command list will be called
    cmd = [
        "EukRep",
        "--min",
        str(self.minl),
        "-i",
        self.fasta,
        "--seq_names",
        "-ff",
        "--tie",
        self.tie,
        "-o",
        self.eukout,
    ]
    if self.bacout is not None:
        cmd.extend(["--prokarya", self.bacout])

    subprocess.run(cmd, check=True, shell=False)

    self.read_result()

def read_result(self):
    self.euks = self.read_eukfile(self.eukout)

    self.bacs = set()
    if self.bacout is not None:
        self.bacs = self.read_eukfile(self.bacout)

def read_eukfile(self, path):
    lst = set()
    with open(path) as infile:
        for line in infile:
            lst.add(line.strip())
    return lst

class bin:
def init(self, path, eukrep):
self.e = eukrep
self.bname = os.path.basename(path)
self.path = os.path.abspath(path)

def __str__(self):
    return "{} euks: {} bacs: {}".format(self.bname, self.table["euks"], self.table["bacs"])

def stats(self):
    """read bin content and figure genomic composition"""
    logging.debug("Loading bin")
    fa_file = Fasta(self.path)
    stats = {"euks": 0, "bacs": 0, "NA": 0, "sum": 0}
    # loop and compute stats

最后的话

最近很多小伙伴找我要Linux学习资料，于是我翻箱倒柜，整理了一些优质资源，涵盖视频、电子书、PPT等共享给大家！

资料预览

给大家整理的视频资料：

给大家整理的电子书资料：

如果本文对你有帮助，欢迎点赞、收藏、转发给朋友，让我有持续创作的动力！
logging.debug(“Loading bin”)
fa_file = Fasta(self.path)
stats = {“euks”: 0, “bacs”: 0, “NA”: 0, “sum”: 0}
# loop and compute stats