SemiBin2安装
GitHub - BigDataBiology/SemiBin: SemiBin: metagenomics binning with self-supervised deep learning (https://github.com/BigDataBiology/SemiBin)
# Create a dedicated conda environment and install SemiBin 2.1.0 into it.
conda create --name SemiBin
conda activate SemiBin
conda install --channel conda-forge --channel bioconda semibin=2.1.0
# Print the help text to confirm that the installation works.
SemiBin2 --help
使用(我选择的是自己构建训练集的方法)
构建训练集有两种方法:1-1. 使用所有 contig.fa 构建一个训练集;1-2. 从所有 contig.fa 中选取一部分来构建训练集
# Step 1: map the reads back to the contigs to obtain ${num}_aligen_sort.bam.
# Expects ${software}, ${assembly}, ${i} (contig FASTA file name inside
# ${assembly}) and ${num} (prefix of the paired FASTQ files) to be set.
"${software}/bwa-mem2/bwa-mem2" index "${assembly}/${i}"
"${software}/bwa-mem2/bwa-mem2" mem -a -t 180 "${assembly}/${i}" \
  "${num}.1.fq.gz" "${num}.2.fq.gz" > "${num}_aligen.sam"
samtools view -Sb -@180 "${num}_aligen.sam" > "${num}_aligen.bam"
samtools sort -@180 "${num}_aligen.bam" -o "${num}_aligen_sort.bam"
# Drop the intermediates. -f keeps rm quiet for files that were never created
# (nothing above writes ${num}_mem2.bam.bai).
rm -f -- "${num}_aligen.sam" "${num}_aligen.bam" "${num}_mem2.bam.bai"
# Clean up the files matching ${num}*.fa.* in the assembly directory
# (presumably the bwa-mem2 index files). Delete only when the cd succeeded,
# and abort on an empty ${num} so the glob can never widen to "*.fa.*".
cd "${assembly}" && rm -f -- "${num:?}"*.fa.*
2.对每个contig.fa生成特征(data.csv/data_split.csv文件)
这里也有两种方法,一个是多样本模式(可用于一个动物的N个重复),一个是单样本模式
单样本(简单一点,我选择用这个,免得出错哈哈)
# Single-sample mode: build the features for one contig file from its sorted
# BAM. Expansions are quoted so unusual sample names stay a single argument.
SemiBin2 generate_sequence_features_single -i "${num}.fa" -b "${num}_aligen_sort.bam" -o "${num}_output"
多样本
# Suppose S1-S5 are five replicates from one animal; this step writes
# output/concatenated.fa.gz.
SemiBin2 concatenate_fasta --input-fasta S1.fa S2.fa S3.fa S4.fa S5.fa --output output
# Before extracting features, map each sample's FASTQ reads against the
# concatenated FASTA to produce the S*.sorted.bam files used below.
SemiBin2 generate_sequence_features_multi \
  -i output/concatenated.fa.gz \
  -b S1.sorted.bam S2.sorted.bam S3.sorted.bam S4.sorted.bam S5.sorted.bam \
  -o output
3.训练自监督(我选了18个)
#!/bin/bash
# Self-supervised training of a single SemiBin2 model from 18 samples.
source /home/zhongpei/miniconda3/bin/activate SemiBin

# Build the feature-file lists for samples R0001..R0018 in a loop instead of
# spelling out all 36 paths by hand; both lists stay in the same sample order.
data_files=()
split_files=()
for n in {1..18}; do
  # %04d zero-pads the sample number to match the directory names.
  printf -v sample_dir 'Unknown_CA010-001R%04d.fastp_output' "${n}"
  data_files+=("${sample_dir}/data.csv")
  split_files+=("${sample_dir}/data_split.csv")
done

time SemiBin2 train_self \
  --train-from-many \
  -p 100 \
  --engine auto \
  --data "${data_files[@]}" \
  --data-split "${split_files[@]}" \
  -o fan_18model
metabat2安装
# Stable release version: download the tarball and unpack it.
wget https://bitbucket.org/berkeleylab/metabat/get/master.tar.gz
tar --extract --gzip --verbose --file master.tar.gz
cd berkeleylab-metabat-*
# Dependencies.
sudo apt-get install libboost-all-dev
# Run the installation: configure, build, test and install from a fresh
# build directory; each step runs only if the previous one succeeded.
mkdir build \
  && cd build \
  && cmake .. -DCMAKE_INSTALL_PREFIX=. \
  && make \
  && make test \
  && make install
metabat2使用
# Directory holding the MetaBAT2 binaries; named once instead of repeating
# the long path on every command.
metabat_bin="${software}/metabat2/berkeleylab-metabat-5947766d9d4c/build/bin"
# Write the per-contig depth table from the sorted BAM, then bin the contigs
# using that table.
"${metabat_bin}/jgi_summarize_bam_contig_depths" --outputDepth "${num}.depth.txt" "${num}_aligen_sort.bam"
"${metabat_bin}/metabat2" -i "${assembly}/${i}" -a "${num}.depth.txt" -o "${num}_metabat2" -t 90
BASALT安装(集成了多种分箱工具和分箱优化)
#修改~/.condarc
channels:
- https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda/
- https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/
- https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/
- https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/
- defaults
git clone https://github.com/EMBL-PKU/BASALT.git
cd BASALT
# If you run into errors, download basalt_env.yml from Tencent Weiyun
# (https://share.weiyun.com/xXdRiDkl) and create the conda environment from it:
conda env create -n BASALT --file basalt_env.yml
# Activate the new environment so the packages below are installed into
# BASALT rather than into whichever environment is currently active.
conda activate BASALT
mamba install pytorch torchvision torchaudio -c pytorch
pip install torch torchvision
mamba install -c conda-forge tensorboardx
#安装的同时下载训练好的模型
https://share.weiyun.com/r33c2gqa
# Grant permissions on the BASALT executables.
# NOTE(review): mode 777 also makes them world-writable; kept as written —
# confirm whether 755 would be sufficient.
chmod -R 777 /home/XX/mambaforge/envs/BASALT/bin/*
# Make sure ~/.cache exists as a directory first; otherwise mv would rename
# the archive to a plain file called ~/.cache.
mkdir -p ~/.cache
mv BASALT.zip ~/.cache
# Unpack only when the directory change succeeded.
cd ~/.cache && unzip BASALT.zip
# Print the help text to check the installation.
BASALT -h
使用
#下载测试集
https://figshare.com/articles/dataset/BASALT_demo_files/22323424
#全部解压之后进入Data*文件夹
# Run basalt.sh under nohup so that it ignores the hangup signal sent when the
# terminal is closed.
nohup bash basalt.sh
#!/bin/bash
# Run BASALT on two assemblies together with their paired-end reads.
source /home/zhongpei/miniconda3/bin/activate BASALT
# Every continued line must end in " \" (space, then backslash): without the
# space the shell glues the next option onto the preceding file name, e.g.
# "..._contigs.fa-s".
# -a takes the assemblies separated by commas; -s takes the read files, with
# "," between the two mates of a sample and "/" between samples.
BASALT -a Unknown_CA010-001R0001.fastp_megahit_contigs.fa,Unknown_CA010-001R0002.fastp_megahit_contigs.fa \
  -s Unknown_CA010-001R0001.fastp.1.fq.gz,Unknown_CA010-001R0001.fastp.2.fq.gz/Unknown_CA010-001R0002.fastp.1.fq.gz,Unknown_CA010-001R0002.fastp.2.fq.gz \
  -t 140 -m 400 -qc checkm2