dbNSFP4.6a 是按染色体拆分成多个文件的,需要先整理成一个文件,才能用于 SnpEff 注释突变。
dbNSFP_sort.pl 可从 https://pcingola.github.io/SnpEff/snpsift/dbnsfp/ 下载。直接使用该脚本处理全部数据需要的内存太大:笔者在 150GB 内存的服务器上运行过,没有成功。
# Write the first 1000 decompressed lines of every per-chromosome file to
# temp/<same file name>, re-compressed with gzip (presumably a small sample
# for trial runs -- confirm intent).
mkdir -p temp # the redirection below fails if temp/ does not exist
for id in dbNSFP4.6a_variant.chr*.gz; do
  [[ -e "${id}" ]] || continue # the glob matched nothing
  # head stops reading after 1000 lines, so zcat may report a broken pipe;
  # that is expected here and does not affect the sample.
  zcat "${id}" | head -n 1000 | gzip > "temp/${id}"
done
# One-shot variant: decompress all per-chromosome files into a single stream
# and let dbNSFP_sort.pl (arguments "7 8") order it, saving the result as
# dbNSFP<version>_hg19.txt.
version="4.6a"
zcat "dbNSFP${version}_variant.chr"* \
  | perl ../dbNSFP_sort.pl 7 8 \
  > "dbNSFP${version}_hg19.txt"
下面的脚本调用 dbNSFP_sort.pl 按染色体分别整理:不必把全部数据一次读入内存,而是逐条染色体处理,最后再合并。
#!/bin/bash
#
# Build one dbNSFP table from the per-chromosome downloads while handing
# dbNSFP_sort.pl only one chromosome's worth of rows at a time.
#
# Inputs  (current directory): dbNSFP${version}_variant.chr<N>.gz
# Helper  : ../dbNSFP_sort.pl, always invoked with the arguments "7 8"
# Output  : dbNSFP${version}_hg19.txt = one header line, then every row whose
#           first column starts with ".", then the remaining rows grouped by
#           chromosome in the order of the `chromosomes` array.
# Scratch : dot_<chr>.txt, nodot_<chr>.txt and newNodot_<chr>.txt in the
#           current directory; each is deleted as soon as it has been consumed.
#
# Every stage is checked: a failing zcat/perl/awk aborts the run instead of
# silently producing a truncated table.

set -o pipefail # a failing zcat or perl must fail the whole pipeline

version="4.6a"
sorter="../dbNSFP_sort.pl"
output="dbNSFP${version}_hg19.txt"
chromosomes=(1 10 11 12 13 14 15 16 17 18 19 2 20 21 22 3 4 5 6 7 8 9 M X Y)

# Print an error message on stderr and stop the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

[[ -f "${sorter}" ]] || die "sort helper not found: ${sorter}"

# Step 1: sort each downloaded file on its own and split the sorted stream into
#   newNodot_<chr>.txt  the first line that does not start with "."
#                       (taken to be the header line)
#   dot_<chr>.txt       rows starting with "."
#   nodot_<chr>.txt     every other row
for chr in "${chromosomes[@]}"; do
  input="dbNSFP${version}_variant.chr${chr}.gz"
  [[ -r "${input}" ]] || die "cannot read ${input}"

  # awk only creates a file once it prints to it, so create all three up front;
  # truncating also makes a re-run after an aborted attempt safe.
  : > "dot_${chr}.txt" && : > "nodot_${chr}.txt" && : > "newNodot_${chr}.txt" \
    || die "cannot create scratch files for chr${chr}"

  zcat "${input}" \
    | perl "${sorter}" 7 8 \
    | awk -v dot="dot_${chr}.txt" \
          -v rest="nodot_${chr}.txt" \
          -v hdr="newNodot_${chr}.txt" '
        /^\./        { print > dot; next }
        !header_seen { print > hdr; header_seen = 1; next }
                     { print > rest }
      ' \
    || die "sorting ${input} failed"
done

# The final table gets exactly one header line. The header is assumed to be
# identical in every per-chromosome file, so the first one is used.
head -n 1 "newNodot_${chromosomes[0]}.txt" > "${output}" \
  || die "cannot write ${output}"

# Step 2: after sorting, a row's first column may name a different chromosome
# than the file it came from, so re-bucket every row by that column. Each
# nodot file is read once and its rows are appended to the matching
# newNodot_<name>.txt. Rows whose first column is not in `chromosomes` are
# left out of the final table; they are counted and reported on stderr.
chr_list="${chromosomes[*]}"
for chr in "${chromosomes[@]}"; do
  awk -v list="${chr_list}" -v src="nodot_${chr}.txt" '
      BEGIN {
        n = split(list, names, " ")
        for (i = 1; i <= n; i++) known[names[i]] = 1
      }
      {
        # Take the text before the first tab without splitting the whole row.
        tab = index($0, "\t")
        name = (tab > 1) ? substr($0, 1, tab - 1) : ""
        if (name in known) {
          print >> ("newNodot_" name ".txt")
        } else {
          dropped++
        }
      }
      END {
        if (dropped) {
          printf "warning: %d row(s) in %s had an unlisted first column and were skipped\n", dropped, src > "/dev/stderr"
        }
      }
    ' "nodot_${chr}.txt" \
    || die "redistributing nodot_${chr}.txt failed"
  rm "nodot_${chr}.txt"
done

# Step 3: sort each bucket. The bucket starts with a header line; tail drops
# the first line of the sorted output so that only the copy already written to
# ${output} remains.
for chr in "${chromosomes[@]}"; do
  echo "dealing ============> newNodot_${chr}.txt"
  perl "${sorter}" 7 8 < "newNodot_${chr}.txt" \
    | tail -n +2 > "nodot_${chr}.txt" \
    || die "sorting newNodot_${chr}.txt failed"
  rm "newNodot_${chr}.txt"
done

# Step 4: assemble the final table: all "." rows first, then the sorted
# buckets in array order.
for chr in "${chromosomes[@]}"; do
  cat "dot_${chr}.txt" >> "${output}" || die "appending dot_${chr}.txt failed"
  rm "dot_${chr}.txt"
done
for chr in "${chromosomes[@]}"; do
  cat "nodot_${chr}.txt" >> "${output}" || die "appending nodot_${chr}.txt failed"
  rm "nodot_${chr}.txt"
done
最后,用 bgzip 压缩合并后的文件,并用 tabix 建立索引:
# Compress the merged table with bgzip (which produces <name>.gz), then build
# a tabix index on it: column 1 is the sequence name, column 2 is both the
# start and the end position.
version="4.6a"
merged_table="dbNSFP${version}_hg19.txt"
bgzip "${merged_table}"
tabix -s 1 -b 2 -e 2 "${merged_table}.gz"