利用之前已经封装好环境的bioi_t2t_v1.tar
镜像进行下列分析
数据准备
数据存放在:
/home/ATCG03/day3_t2t/T2T/data
0.0 Kmer分析
- 使用 jellyfish 对二代/HiFi测序数据进行kmer分析
# Shared locations for this walkthrough: workdir receives the analysis
# output, datadir is where the sequencing data is stored.
workdir=/work/T2T/myt2t
datadir=/work/T2T/data
# -p creates missing parent directories and makes a re-run harmless; cd only
# runs when the directory exists, so later output cannot land somewhere else.
mkdir -p "$workdir/0.kmer" && cd "$workdir/0.kmer"
以二代测序数据为例,生成reads
解压脚本
# Write one "gzip -dc <file>" generator command per second-generation
# (illu_*.gz) read file; jellyfish receives this list through -g.
# The shell expands the glob directly instead of parsing ls output.
for reads_gz in "$datadir"/illu_*.gz; do
  # Skip the literal pattern that is left behind when nothing matches.
  [ -e "$reads_gz" ] && printf 'gzip -dc %s\n' "$reads_gz"
done > generate.file
# Run the k-mer analysis (21-mers, written to Kmer_21)
jellyfish count -t 4 -C -m 21 -s 1G -g generate.file -G 2 -o Kmer_21
# Summarize the k-mer counts
jellyfish stats -o Kmer_21.stat Kmer_21
# Build the k-mer frequency histogram
jellyfish histo -v -t 4 -h 10000000 -o Kmer_21.histo Kmer_21
- 基因组大小估计计数
# Estimate the genome size from the 21-mer histogram written by jellyfish histo.
genomescope.R \
  -i Kmer_21.histo \
  -o genoscope_out \
  -p 2 \
  -k 21 \
  -m 10000000
0.1 ONT和HiFi数据组装合并/混合组装
方法一:ONT和HiFi数据分别组装后合并
1.HiFi
基因组组装
# Path settings
workdir=/work/T2T/myt2t
datadir=/work/T2T/data

# Print every GFA segment record (lines starting with "S") as a FASTA entry:
# field 2 becomes the ">" header line, field 3 the sequence line.
# Arguments: GFA file(s) to read; reads stdin when none are given.
# Outputs:   FASTA text on stdout.
gfa_to_fasta() {
  awk '/^S/{print ">"$2; print $3}' "$@"
}

# HiFi genome assembly
mkdir -p "$workdir/1.1_hifi_assembly_hifiasm" && cd "$workdir/1.1_hifi_assembly_hifiasm"
# Convert hifi.asm.bp.p_ctg.gfa to FASTA only when hifiasm succeeded.
# The awk pattern must be closed (/^S/); with the slash missing awk aborts
# with a syntax error and hifi.fa is left empty.
hifiasm -o hifi.asm -t 8 "$datadir/hifi.fastq.gz" \
  && gfa_to_fasta hifi.asm.bp.p_ctg.gfa > hifi.fa
#结果统计(注意:下面的示例输出各项均为0,说明hifi.fa为空,即上一步GFA转FASTA没有成功;正常的组装结果各项应为非零值)
[root@b7395b3b4b89 18:55:50 /work/T2T/myt2t/1.1_hifi_assembly_hifiasm]# assembly-stats hifi.fa
stats for hifi.fa
sum = 0, n = 0, ave = 0.00, largest = 0
N50 = 0, n = 0
N60 = 0, n = 0
N70 = 0, n = 0
N80 = 0, n = 0
N90 = 0, n = 0
N100 = 0, n = 0
N_count = 0
Gaps = 0
- 用 nextdenovo 软件组装ONT基因组
# Dedicated directory for the NextDenovo ONT assembly. -p tolerates re-runs,
# and cd only runs when the directory exists.
mkdir -p "$workdir/1.2_ONT_assembly_nextdenovo" && cd "$workdir/1.2_ONT_assembly_nextdenovo"
# List the ONT read file found under datadir; run.cfg refers to this list
# through input_fofn.
find "$datadir" -name ont.fastq.gz > input.fofn
#导入输入文件
配置.cfg
文件,命名为run.cfg
[General]
job_type = local
# local, slurm, sge, pbs, lsf
job_prefix = nextDenovo
task = all
# all, correct, assemble
rewrite = yes
# yes/no
deltmp = yes
# delete temporary files
parallel_jobs = 2
# number of parallel jobs (default 10)
input_type = raw
# raw, corrected
read_type = ont
# clr, ont, hifi
input_fofn = input.fofn
# list of read file paths
workdir = out
# output directory
[correct_option]
read_cutoff = 5k
# only reads longer than 5k are assembled
genome_size = 146m
# estimated genome size
pa_correction = 4
# number of parallel tasks for the correct step (relates to memory use)
sort_options = -m 2g -t 5
minimap2_options_raw = -t 5
correction_options = -p 5
[assemble_option]
minimap2_options_cns = -t 5
# The key is spelled nextgraph_options (with an underscore), matching the
# other *_options keys in this file.
nextgraph_options = -a 1
# Run NextDenovo with the run.cfg configuration file
nextDenovo run.cfg
- 组装后的ONT数据的nextPolish
# Dedicated directory for polishing the ONT assembly. -p tolerates re-runs,
# and cd only runs when the directory exists.
mkdir -p "$workdir/1.3_ONT_polish_nextpolish" && cd "$workdir/1.3_ONT_polish_nextpolish"
# Create a symlink to the NextDenovo assembly; -f replaces a link left over
# from an earlier run instead of failing.
ln -sf ../1.2_ONT_assembly_nextdenovo/out/03.ctg_graph/nd.asm.fasta contig.fa
# Polish with the HiFi, ONT and Illumina data: write one file list per data type
find "$datadir" -name hifi.fastq.gz > ./hifi.fofn
find "$datadir" -name ont.fastq.gz > ./lgs.fofn
find "$datadir" -name 'illu_*' > ./sgs.fofn
#运行nextPolish
配置.cfg
文件,命名为run.cfg
[General]
job_type = local
# execution environment, e.g. local, slurm, sge, pbs
job_prefix = nextPolish
task = best
rewrite = yes
rerun = 3
parallel_jobs = 6
# number of parallel jobs (default 10)
multithread_jobs = 5
genome = contig.fa
genome_size = auto
workdir = ./output
# output directory
# {multithread_jobs} is a placeholder for the multithread_jobs value above;
# it must be spelled with an underscore.
polish_options = -p {multithread_jobs}
# Section headers use square brackets and underscores.
[sgs_option]
sgs_fofn = ./sgs.fofn
sgs_options = -max_depth 100 -bwa
[lgs_option]
lgs_fofn = ./lgs.fofn
lgs_options = -min_read_len 1k -max_depth 100
# -t and its value must stay on the same line as the rest of the option string.
lgs_minimap2_options = -x map-ont -t {multithread_jobs}
[hifi_option]
hifi_fofn = ./hifi.fofn
hifi_options = -min_read_len 1k -max_depth 100
# NOTE(review): confirm that the minimap2 used by NextPolish supports the map-hifi preset.
hifi_minimap2_options = -x map-hifi
# Run NextPolish with the run.cfg configuration file
nextPolish run.cfg
Polish后的ONT和HiFi数据合并
未完待续!!!
方法二:ONT和HiFi原始数据混合组装