bwa shm 测试

# 从既往的项目中随便找出2条比对记录
le /mnt/lustre/user/wubin/03.Clinical_Data/PMseq/03.run/20240415130508/V350254454/24P9100213/04.map_PMDB/library1.V350254454_L02_32.bacteria_mask_001.tab.xls|head -3|cut -f1-3 >example.map.txt

# 找出对应的fasta序列
perl get_fa.pl bacteria.mask.part_001.fa example.map.txt example.fa


# 给fasta序列添加上注释信息
#>NZ_CP040684.1__bacteria__Pseudomonas__Pseudomonas_aeruginosa__Pseudomonas_aeruginosa_strain=C79
# 逻辑:给>NZ_CP040684.1添加上注释,分别是“accession”,“category”,“genus_name”,“species_name”, “organism_name”
# 不同单元之间的注释用“__”连接,同一单元内部的空格被替换成"_"
perl anno_fa.pl example.fa example.anno.fa example.map

# 构建bwa mem库

bwa index -a bwtsw example.anno.fa 

# 提取fq序列,生成新的fq文件
perl get_fq.pl example.map.txt V350254454_L02_32.unmap.fq.gz example.fq.gz


# 比对,看有没有什么问题
bwa mem -t 2 -M -Y -a -R '@RG\tID:24P9100213\tPL:MGI-2000\tLB:library1\tSM:24P9100213\tCN:genelab' example.anno.fa example.fq.gz -o example.reads.sam

# 结果:发现只要将accession的信息用下划线连接起来,长一点似乎无所谓。
# 还要再看看";"之类的符号是否影响


#=======================================重新注释ref_fa文件后再比对=======================================
# 给fasta序列添加上注释信息
perl anno_fa.pl example.fa example.anno_check_semicolon.fa example.map --add_string ";XXXXX"

# 构建bwa mem库

bwa index -a bwtsw example.anno_check_semicolon.fa

# 比对,看有没有什么问题
bwa mem -t 2 -M -Y -a -R '@RG\tID:24P9100213\tPL:MGI-2000\tLB:library1\tSM:24P9100213\tCN:genelab' example.anno_check_semicolon.fa example.fq.gz -o example.check_semicolon.reads.sam

# 测试结果证明,ref_fa中的 ';' 不会影响比对,只有空格' '才会影响。


# 给fasta序列添加上注释信息
perl anno_fa.pl example.fa example.anno_check_colon.fa example.map --add_string ":XXXXX"

# 构建bwa mem库

bwa index -a bwtsw example.anno_check_colon.fa

# 比对,看有没有什么问题
bwa mem -t 2 -M -Y -a -R '@RG\tID:24P9100213\tPL:MGI-2000\tLB:library1\tSM:24P9100213\tCN:genelab' example.anno_check_colon.fa example.fq.gz -o example.check_colon.reads.sam

# 测试结果证明,ref_fa中的 ':' 不会影响比对,只有空格' '才会影响。



# 给fasta序列添加上注释信息
perl anno_fa.pl example.fa example.anno_check_space.fa example.map --add_string " XXXXX"

# 构建bwa mem库

bwa index -a bwtsw example.anno_check_space.fa

# 比对,看有没有什么问题
bwa mem -t 2 -M -Y -a -R '@RG\tID:24P9100213\tPL:MGI-2000\tLB:library1\tSM:24P9100213\tCN:genelab' example.anno_check_space.fa example.fq.gz -o example.check_space.reads.sam

# 测试结果证明,ref_fa中的 ' ' 不会影响比对,但比对结果指挥显示第一个空格前的那一部分。
# 所以就是:坚决不能有空格


# 给fasta序列添加上注释信息
perl anno_fa.pl example.fa example.anno_check_bracket.fa example.map --add_string ":(catogory)XXXXX__(Genus)XXXXX__(species)XXXXXX__(Strain)XXXXX"

# 构建bwa mem库

bwa index -a bwtsw example.anno_check_bracket.fa

# 比对,看有没有什么问题
bwa mem -t 2 -M -Y -a -R '@RG\tID:24P9100213\tPL:MGI-2000\tLB:library1\tSM:24P9100213\tCN:genelab' example.anno_check_bracket.fa example.fq.gz -o example.check_bracket.reads.sam

# 测试结果证明,ref_fa中的 ' ' 不会影响比对,但比对结果指挥显示第一个空格前的那一部分。
# 所以就是:坚决不能有空格



# 复制一份,改成这种形式: >(catogory)XXXXX__(Genus)XXXXX__(species)XXXXXX__(Strain)XXXXX__bacteria__Pseudomonas__Pseudomonas_aeruginosa__Pseudomonas_aeruginosa_strain=R11
cp example.anno_check_bracket.fa example.anno_check_bracket2.fa


# 构建bwa mem库

bwa index -a bwtsw example.anno_check_bracket2.fa

# 比对,看有没有什么问题
bwa mem -t 2 -M -Y -a -R '@RG\tID:24P9100213\tPL:MGI-2000\tLB:library1\tSM:24P9100213\tCN:genelab' example.anno_check_bracket2.fa example.fq.gz -o example.check_bracket2.reads.sam

#=======================================试试bwa shm 重复的问题=======================================
# 如果一个文件a.fa 被bwa shm a.fa操作过
# 然后对a.fa做修改,修改后的文件仍然叫 a.fa
# 然后bwa shm a.fa
# 此时 /dev/shm/bwactl 文件中会有2个a.fa
# 此时再运行: bwa mem -t 2 -M -Y -a -R '@RG\tID:24P9100213\tPL:MGI-2000\tLB:library1\tSM:24P9100213\tCN:genelab' a.fa example.fq.gz -o a.fa.sam
# 就会出现以下报错:
# bwa: bwa.c:329: bwa_mem2idx: Assertion `k == l_mem' failed.
# [M::bam2fq_mainloop] discarded 0 singletons
# [M::bam2fq_mainloop] processed 0 reads
# [M::bam2fq_mainloop] discarded 0 singletons
# [M::bam2fq_mainloop] processed 0 reads

# 第1次修改
# 给fasta序列添加上注释信息
perl anno_fa.pl example.fa a.fa example.map --add_string ";MMMMM"


# 构建bwa mem库
bwa index -a bwtsw a.fa

# bwa shm 建库
bwa shm a.fa


# 此时看看发生了什么
# ls /dev/shm/
# bwactl  bwaidx-a.fa


# 删除bwaidx-a.fa,修改a.fa,重新建库
rm /dev/shm/bwaidx-a.fa

# 此时如果不修改a.fa,就直接再运行一次 bwa shm a.fa,就会有如下报错:
# bwa shm a.fa
#[M::bwa_idx_load_from_disk] read 0 ALT contigs
# shm_open(): File exists
#[E::main_shm] failed to stage the index in shared memory

# 第2次修改
# 给fasta序列添加上注释信息
perl anno_fa.pl example.fa a.fa example.map --add_string ";KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKAAAAAAAAA"


# 构建bwa mem库
bwa index -a bwtsw a.fa

# bwa shm 建库
bwa shm a.fa

# 此时可以看看bwactl中有没有重复
bwa shm -l



$perl anno_fa.pl example.fa a.fa example.map --add_string ";KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKAAAAAAAAA"
[wubin@login2 ~/05.tests/PMSeq/PMDB_fa_format]
$bwa index -a bwtsw a.fa
[bwa_index] Pack FASTA... 0.23 sec
[bwa_index] Construct BWT for the packed sequence...
[BWTIncCreate] textLength=26052488, availableWord=13832852
[BWTIncConstructFromPacked] 10 iterations done. 22817064 characters processed.
[bwt_gen] Finished constructing BWT in 12 iterations.
[bwa_index] 7.06 seconds elapse.
[bwa_index] Update BWT... 0.17 sec
[bwa_index] Pack forward-only FASTA... 0.14 sec
[bwa_index] Construct SA from BWT and Occ... 2.03 sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa index -a bwtsw a.fa
[main] Real time: 9.754 sec; CPU: 9.641 sec
[wubin@login2 ~/05.tests/PMSeq/PMDB_fa_format]
$bwa shm a.fa
[M::bwa_idx_load_from_disk] read 0 ALT contigs
shm_open(): File exists
[E::main_shm] failed to stage the index in shared memory
[wubin@login2 ~/05.tests/PMSeq/PMDB_fa_format]
$bwa shm a.fa
[M::bwa_idx_load_from_disk] read 0 ALT contigs
[main] Version: 0.7.17-r1188
[main] CMD: bwa shm a.fa
[main] Real time: 0.070 sec; CPU: 0.070 sec
[wubin@login2 ~/05.tests/PMSeq/PMDB_fa_format]
$bwa shm -l|grep -w a.fa
[main] Version: 0.7.17-r1188
[main] CMD: bwa shm -l
[main] Real time: 0.000 sec; CPU: 0.002 sec
a.fa    22825884
a.fa    22825884
a.fa    22825884
a.fa    22825884
a.fa    22826020






# $bwa shm -l
# hg19.fa 5490045949
# T2T.mask.part_001_test.fa       5536455571
# example.anno_check_bracket.fa   22825996
# T2T.mask.part_001_test2.fa      5536455571
# a.fa    22825884
# a.fa    22825884
# [main] Version: 0.7.17-r1188
# [main] CMD: bwa shm -l
# [main] Real time: 0.000 sec; CPU: 0.002 sec


# 比对,看有没有什么问题
bwa mem -t 2 -M -Y -a -R '@RG\tID:24P9100213\tPL:MGI-2000\tLB:library1\tSM:24P9100213\tCN:genelab' a.fa example.fq.gz -o a.fa.reads.sam

[wubin@login2 ~/05.tests/PMSeq/PMDB_fa_format]
$bwa mem -t 2 -M -Y -a -R '@RG\tID:24P9100213\tPL:MGI-2000\tLB:library1\tSM:24P9100213\tCN:genelab' a.fa example.fq.gz -o a.fa.reads.sam
bwa: bwa.c:329: bwa_mem2idx: Assertion `k == l_mem' failed.
ÒÑ·ÅÆú


# 第三次修改
# 给fasta序列添加上注释信息
perl anno_fa.pl example.fa a.fa example.map --add_string ";MMMMM"


# 构建bwa mem库
bwa index -a bwtsw a.fa

# bwa shm 建库
bwa shm a.fa


$perl anno_fa.pl example.fa a.fa example.map --add_string ";MMMMM"
[wubin@login2 ~/05.tests/PMSeq/PMDB_fa_format]
$bwa index -a bwtsw a.fa
[bwa_index] Pack FASTA... 0.23 sec
[bwa_index] Construct BWT for the packed sequence...
[BWTIncCreate] textLength=26052488, availableWord=13832852
[BWTIncConstructFromPacked] 10 iterations done. 22817064 characters processed.
[bwt_gen] Finished constructing BWT in 12 iterations.
[bwa_index] 7.33 seconds elapse.
[bwa_index] Update BWT... 0.16 sec
[bwa_index] Pack forward-only FASTA... 0.06 sec
[bwa_index] Construct SA from BWT and Occ... 2.15 sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa index -a bwtsw a.fa
[main] Real time: 10.051 sec; CPU: 9.945 sec
[wubin@login2 ~/05.tests/PMSeq/PMDB_fa_format]
$bwa shm a.fa
[M::bwa_idx_load_from_disk] read 0 ALT contigs
shm_open(): File exists
[E::main_shm] failed to stage the index in shared memory
[wubin@login2 ~/05.tests/PMSeq/PMDB_fa_format]
$bwa shm a.fa
[M::bwa_idx_load_from_disk] read 0 ALT contigs
[main] Version: 0.7.17-r1188
[main] CMD: bwa shm a.fa
[main] Real time: 0.073 sec; CPU: 0.072 sec
[wubin@login2 ~/05.tests/PMSeq/PMDB_fa_format]
$bwa shm -l |grep -w a.fa
[main] Version: 0.7.17-r1188
[main] CMD: bwa shm -l
[main] Real time: 0.000 sec; CPU: 0.002 sec
a.fa    22825884
a.fa    22825884
a.fa    22825884
a.fa    22825884
a.fa    22826020
a.fa    22825884

[wubin@login2 ~/05.tests/PMSeq/PMDB_fa_format]
$bwa mem -t 2 -M -Y -a -R '@RG\tID:24P9100213\tPL:MGI-2000\tLB:library1\tSM:24P9100213\tCN:genelab' a.fa example.fq.gz -o a.fa.reads.sam
[M::main_mem] load the bwa index from shared memory
[M::process] read 2 sequences (86 bp)...
[M::mem_process_seqs] Processed 2 reads in 0.002 CPU sec, 0.001 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -t 2 -M -Y -a -R @RG\tID:24P9100213\tPL:MGI-2000\tLB:library1\tSM:24P9100213\tCN:genelab -o a.fa.reads.sam a.fa example.fq.gz
[main] Real time: 0.003 sec; CPU: 0.005 sec






#===============================关于bwa shm a.fa============================
# 可以看出,反复做bwa shm的操作,一次成功,一次失败,下一次又成功
# 失败的时候会=========删除/dev/shm/bwaidx-T2T.mask.part_001.fa========
# 再次运行bwa shm, 成功后会====再次创建 /dev/shm/bwaidx-T2T.mask.part_001.fa====
# 但在这个过程中, bwactl中的T2T.mask.part_001.fa会====每成功一次,就增加一个记录====

[wubin@login2 /mnt/lustre/user/wubin/03.Clinical_Data/PMseq/03.run/20240409130508_test2/V350251040/24P9000196/sh.e.o/R02.Map_host]
$bwa shm -l |grep T2T.mask.part_001.fa
[main] Version: 0.7.17-r1188
[main] CMD: bwa shm -l
[main] Real time: 0.000 sec; CPU: 0.003 sec
T2T.mask.part_001.fa    5536451524
T2T.mask.part_001.fa    5536455715
T2T.mask.part_001.fa    5536455571
T2T.mask.part_001.fa    5536455571
T2T.mask.part_001.fa    5536455571
T2T.mask.part_001.fa    5536455571
[wubin@login2 /mnt/lustre/user/wubin/03.Clinical_Data/PMseq/03.run/20240409130508_test2/V350251040/24P9000196/sh.e.o/R02.Map_host]
$bwa shm  /mnt/lustre/user/wubin/01.Program/Scripts/03.data/PMSeq/02.genomes_refseq/Library/human/mask_split/T2T.mask.part_001.fa
[M::bwa_idx_load_from_disk] read 0 ALT contigs

shm_open(): File exists
[E::main_shm] failed to stage the index in shared memory
[wubin@login2 /mnt/lustre/user/wubin/03.Clinical_Data/PMseq/03.run/20240409130508_test2/V350251040/24P9000196/sh.e.o/R02.Map_host]
$
[wubin@login2 /mnt/lustre/user/wubin/03.Clinical_Data/PMseq/03.run/20240409130508_test2/V350251040/24P9000196/sh.e.o/R02.Map_host]
$bwa shm -l |grep T2T.mask.part_001.fa
[main] Version: 0.7.17-r1188
[main] CMD: bwa shm -l
[main] Real time: 0.000 sec; CPU: 0.003 sec
T2T.mask.part_001.fa    5536451524
T2T.mask.part_001.fa    5536455715
T2T.mask.part_001.fa    5536455571
T2T.mask.part_001.fa    5536455571
T2T.mask.part_001.fa    5536455571
T2T.mask.part_001.fa    5536455571
[wubin@login2 /mnt/lustre/user/wubin/03.Clinical_Data/PMseq/03.run/20240409130508_test2/V350251040/24P9000196/sh.e.o/R02.Map_host]
$bwa shm  /mnt/lustre/user/wubin/01.Program/Scripts/03.data/PMSeq/02.genomes_refseq/Library/human/mask_split/T2T.mask.part_001.fa
[M::bwa_idx_load_from_disk] read 0 ALT contigs
[main] Version: 0.7.17-r1188
[main] CMD: bwa shm /mnt/lustre/user/wubin/01.Program/Scripts/03.data/PMSeq/02.genomes_refseq/Library/human/mask_split/T2T.mask.part_001.fa
[main] Real time: 10.388 sec; CPU: 10.372 sec
[wubin@login2 /mnt/lustre/user/wubin/03.Clinical_Data/PMseq/03.run/20240409130508_test2/V350251040/24P9000196/sh.e.o/R02.Map_host]
$bwa shm -l |grep T2T.mask.part_001.fa
[main] Version: 0.7.17-r1188
[main] CMD: bwa shm -l
[main] Real time: 0.000 sec; CPU: 0.002 sec
T2T.mask.part_001.fa    5536451524
T2T.mask.part_001.fa    5536455715
T2T.mask.part_001.fa    5536455571
T2T.mask.part_001.fa    5536455571
T2T.mask.part_001.fa    5536455571
T2T.mask.part_001.fa    5536455571
T2T.mask.part_001.fa    5536455571


#=======================================最终测试出的逻辑================================
#$bwa shm -l |grep -w a.fa
#[main] Version: 0.7.17-r1188
#[main] CMD: bwa shm -l
#[main] Real time: 0.000 sec; CPU: 0.002 sec
#a.fa    22825884
#a.fa    22825884
#a.fa    22825884
#a.fa    22825884
#a.fa    22826020
#a.fa    22825884


# 可以看出bwactl中的a.fa有6条记录
# 但bwaidx-a.fa只有一条
# 前提: 存在/dev/shm/bwactl
# 1. 当/dev/shm 目录下有bwaidx-a.fa时,就直接用/dev/shm/bwaidx-a.fa
#   如果/dev/shm 目录下没有bwaidx-a.fa时,就直接从/home/wubin/05.tests/PMSeq/PMDB_fa_format/a.fa载入参考序列
#
# 2. 当bwactl中有多条a.fa的记录时,bwa 采用的是第一条记录
#
# 3. 当bwactl中第一条a.fa与/dev/shm/bwaidx-a.fa不匹配时,比对就会报错:
#  bwa: bwa.c:329: bwa_mem2idx: Assertion k == l_mem failed.

# 4. 这里的匹配,似乎还不是指 bwactl中第一条a.fa 与 /dev/shm/bwaidx-a.fa 是由同一个a.fa 被 bwa shm 产生
#   而是某种差异不要太大? 这一点还没有测试清楚。
#   既往测试中,只要改动不大,比如将后缀从 XXXXX 改成了YYYYY,
#   新增的记录都是 a.fa    22825884, 此时新的/dev/shm/bwaidx-a.fa也能兼容旧的bwactl

# 5. 反复做bwa shm的操作,一次成功,一次失败,下一次又成功
# 失败的时候会=========删除/dev/shm/bwaidx-T2T.mask.part_001.fa========
# 再次运行bwa shm, 成功后会====再次创建 /dev/shm/bwaidx-T2T.mask.part_001.fa====
# 但在这个过程中, bwactl中的T2T.mask.part_001.fa会====每成功一次,就增加一个记录====

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值