基因家族分析

最新推荐文章于 2023-08-28 18:41:56 发布
小梁学生信
最新推荐文章于 2023-08-28 18:41:56 发布
阅读量2.4k
点赞数 1
本文链接：https://blog.csdn.net/a2505568381/article/details/118001505
版权

#下载拟南芥基因组信息
#wget ftp://ftp.ensemblgenomes.org/pub/plants/release-41/fasta/arabidopsis_thaliana/dna/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.gz
#wget ftp://ftp.ensemblgenomes.org/pub/plants/release-41/fasta/arabidopsis_thaliana/cds/Arabidopsis_thaliana.TAIR10.cds.all.fa.gz
#wget ftp://ftp.ensemblgenomes.org/pub/plants/release-41/fasta/arabidopsis_thaliana/pep/Arabidopsis_thaliana.TAIR10.pep.all.fa.gz
#wget ftp://ftp.ensemblgenomes.org/pub/plants/release-41/gff3/arabidopsis_thaliana/Arabidopsis_thaliana.TAIR10.41.gff3.gz
#
#解压压缩文件
#gunzip *gz

#观察数据，ID保持一致,也就是gff中第9列，ID标签和parent标签与蛋白序列和cds序列里面的ID是否一致；
#处理GFF 文件里面ID中一些不必要的信息，gene:  transcript: 删除；与蛋白质中的ID保持一致：Arabidopsis_thaliana.TAIR10.pep.all.fa 

#sed 's#gene:##' Arabidopsis_thaliana.TAIR10.41.gff3|sed  's#transcript:##' |sed 's#CDS:##' >Arabidopsis_thaliana.TAIR10.41.gff31
#mv Arabidopsis_thaliana.TAIR10.41.gff31 Arabidopsis_thaliana.TAIR10.41.gff3

#获取基因与mRNA的对应关系，后面会用到；
perl script/mRNAid_to_geneid.pl Arabidopsis_thaliana.TAIR10.41.gff3 mRNA2geneID.txt
perl script/geneid_to_mRNAid.pl Arabidopsis_thaliana.TAIR10.41.gff3 geneid2mRNAid.txt

#第一次搜索结构域
hmmsearch --domtblout WRKY_hmm_out.txt --cut_tc WRKY.hmm Arabidopsis_thaliana.TAIR10.pep.all.fa

#提取结构域序列，最后的evalue值根据实际情况可调,注意脚本提取的是第一个domain，如要提取其他domain，请修改脚本27行$a[9]==1为第一个，$a[9]==2为第二个，依次类推
perl script/domain_xulie.pl WRKY_hmm_out.txt Arabidopsis_thaliana.TAIR10.pep.all.fa WRKY_domain.fa 1.2e-28

###########以下部分为建立物种特异模型再次搜索，可根据自己基因家族情况选做这部分内容#############################
#clusterW多序列比对快捷方法
echo "1\nWRKY_domain.fa\n2\n1\nWRKY_domain.aln\nWRKY_domain.dnd\nX\n\n\nX\n" |clustalw

#利用比对结果建立物种特异hmm模型
hmmbuild WRKY_domain_new.hmm WRKY_domain.aln

#新建物种特异hmm模型，再次搜索

hmmsearch --domtblout WRKY_domain_new_out.txt --cut_tc WRKY_domain_new.hmm Arabidopsis_thaliana.TAIR10.pep.all.fa

############################################################################################################

#筛选 hmm搜索结果，可以用excel手动筛选，筛选标准，
#筛选EValue  <0.001
#如果只想用hmmer搜索一次，可将下面的文件：WRKY_domain_new_out.txt 替换成第一次hmmer搜索生成的文件：WRKY_hmm_out.txt
grep -v "^#" WRKY_domain_new_out.txt|awk '$7<0.001 {print}' >WRKY_domain_new_out_selected.txt


#去除重复的hmmer搜索的转录本ID，多个转录本ID保留一个作为基因的代表，此步建议对脚本输出的文件手动筛选，挑选ID：
perl script/select_redundant_mRNA.pl mRNA2geneID.txt WRKY_domain_new_out_selected.txt WRKY_remove_redundant_IDlist.txt

#请手动挑选完mRNA的ID放在第一列，也就是挑选一个转录本ID代表这个基因，存成新的文件WRKY_removed_redundant_IDlist.txt：
#利用脚本得到对应基因的蛋白序列：
perl script/get_fa_by_id.pl WRKY_removed_redundant_IDlist.txt Arabidopsis_thaliana.TAIR10.pep.all.fa WRKY_pep_need_to_confirm.fa


#将上面WRKY_pep_need_to_confirm.fa文件中的蛋白序列，再手动验证一下，把不需要的ID删除，最终确认的ID存成新文件：WRKY_removed_redundant_and_confirmed_IDlist.txt
#手动确认结构域，CDD，SMART，PFAM
#确定分子量大小：http://web.expasy.org/protparam/
#perl script/stat_protein_fa.pl WRKY_pep_need_to_confirm.fa WRKY_pep_need_to_confirm.MW.txt
#三大数据库网站，筛选之后去除一些不确定的基因ID，最终得到可靠的基因家族基因列表,存储在文件：WRKY_removed_redundant_and_confirmed_IDlist.txt ; 

#脚本提取hmm结果文件，重新筛选一下hmm的结果：

#get_data_by_id.pl 会读取第一个文件的第一列ID，然后到第二个文件中去筛选对应ID（第二个文件第一列作为ID）的数据输出到第三个文件中
perl script/get_data_by_id.pl WRKY_removed_redundant_and_confirmed_IDlist.txt WRKY_domain_new_out_selected.txt WRKY_domain_new_out_removed_redundant.txt

#截取得到序列上的保守结构域序列，注意脚本提取的是第一个domain，如要提取其他domain，请修改脚本27行$a[9]==1为第一个，$a[9]==2为第二个，依次类推

perl script/domain_xulie.pl WRKY_domain_new_out_removed_redundant.txt Arabidopsis_thaliana.TAIR10.pep.all.fa WRKY_domain_confirmed.fa 0.1

#得到对应基因的蛋白序列全长，脚本会读取第一个文件的第一列的ID信息，根据ID提取相应的序列：

perl script/get_fa_by_id.pl WRKY_removed_redundant_and_confirmed_IDlist.txt Arabidopsis_thaliana.TAIR10.pep.all.fa WRKY_pep_confirmed.fa

#得到对应基因的cds序列，脚本会读取第一个文件的第一列的ID信息，根据ID提取相应的序列：

perl script/get_fa_by_id.pl WRKY_removed_redundant_and_confirmed_IDlist.txt Arabidopsis_thaliana.TAIR10.cds.all.fa WRKY_cds_confirmed.fa






########################进化树分析##########################################
#cd $workdir 回到工作路径
mkdir gene_tree_analysis
cd gene_tree_analysis
cp ../WRKY_domain_confirmed.fa .
cp ../WRKY_pep_confirmed.fa .
cp ../WRKY_cds_confirmed.fa .
cp ../WRKY_domain_new_out_removed_redundant.txt .


#########################利用meme软件做motif分析################################33

#cd $workdir
mkdir meme_motif_analysis
cd meme_motif_analysis
#搜索结构域：
#-nmotifs 10  搜索motif的总个数
#-minw 6   motif的最短长度
#-maxw 50   motif的最大长度

/biosoft/meme/meme-v4.12.0/bin/meme ../WRKY_pep_confirmed.fa -protein -oc ./ -nostatus -time 18000 -maxsize 6000000 -mod anr -nmotifs 10 -minw 6 -maxw 100

#蛋白质保守结构域搜索
								输出结果	因马可夫模型	蛋白序列
hmmsearch --cut_tc --domtblout HSP_xian.txt HSP20.hmm HSP_pep_confirmed.fa

##################################基因结构分析structure####################

#回到工作路径 my_gene_family

mkdir gene_structure_analysis
cd gene_structure_analysis
cp ../WRKY_removed_redundant_and_confirmed_IDlist.txt .



#获得基因的在染色体上的外显子，CDS，UTR位置信息，用于绘制基因结构图
#注意脚本读取的是第一个（-in1）文件第一列信息，里面是转录本ID；然后把转录本对应的位置cds utr等信息筛选出来
perl ../script/get_gene_exon_from_gff.pl -in1 WRKY_removed_redundant_and_confirmed_IDlist.txt -in2 ../Arabidopsis_thaliana.TAIR10.41.gff3 -out gene_exon_info.gff


################################基因定位到染色体###############################################
# 回到工作路径  my_gene_family
mkdir map_to_chr
cd map_to_chr
cp ../WRKY_removed_redundant_and_confirmed_IDlist.txt .    #WRKY基因家族ID列表文件


#获得基因的在染色体上的位置信息，用于绘制染色体位置图,注意提取的是基因位置还是mRNA位置,以下代码是提取的 mRNA位置
perl ../script/get_gene_weizhi.pl -in1 WRKY_removed_redundant_and_confirmed_IDlist.txt -in2 ../Arabidopsis_thaliana.TAIR10.41.gff3 -out mrna_location.txt

#获得基因组染色体长度：
samtools faidx ../Arabidopsis_thaliana.TAIR10.dna.toplevel.fa

cp ../Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.fai .

#绘图参考：http://www.omicsclass.com/article/397



#######################共线性分析，基因加倍与串联重复分析MCScanX##################################
#回到工作路径  my_gene_family

mkdir MCScanX
cd MCScanX

mkdir mcscan

#获取基因对应的蛋白序列信息，注意：基因的第一个转录本为代表序列；
#从geneid2mRNAid.txt文件中每个基因挑一个转录本ID存储到allmRNAID.txt（一列）

#获取基因组基因对应转录本的位置信息
perl ../script/get_mRNA_position.pl allmRNAID.txt ../Arabidopsis_thaliana.TAIR10.41.gff3 AT.gff

#获取对应蛋白序列
perl ../script/get_fa_by_id.pl allmRNAID.txt ../Arabidopsis_thaliana.TAIR10.pep.all.fa pep.fa

#blast建库
makeblastdb -in pep.fa  -dbtype prot -title pep.fa

#blastall  比对，所有基因对所有基因
blastall -i pep.fa -d pep.fa -e 1e-10  -p blastp  -b 5 -v 5 -m 8 -o mcscan/AT.blast
cp AT.gff mcscan/AT.gff


#运行MCSCANX  查找共线性
/biosoft/MCScanX/MCScanX/MCScanX mcscan/AT

#下载示例文件，自己分析时需要用自己研究物种的染色体文件，和前面鉴定基因家族基因列表文件
#wget http://chibba.pgml.uga.edu/mcscan2/examples/family.ctl
#wget http://chibba.pgml.uga.edu/mcscan2/examples/MADS_box_family.txt

sed -i 's#at##g' family.ctl

#基因家族共线性绘图，注意MCSCAN绘图要回到：/biosoft/MCScanX/MCScanX/downstream_analyses 才能绘图
cd /biosoft/MCScanX/MCScanX/downstream_analyses 
sudo java family_circle_plotter -g /home/manager/gene2019/my_gene_family/MCScanX/mcscan/AT.gff -s /home/manager/gene2019/my_gene_family/MCScanX/mcscan/AT.collinearity -c /home/manager/gene2019/my_gene_family/MCScanX/family.ctl -f /home/manager/gene2019/my_gene_family/MCScanX/MADS_box_family.txt -o /home/manager/gene2019/my_gene_family/MCScanX/WRKY.circle.PNG


#找到我们分析的基因家族的共线性分析关系（大片段复制现象）：同样要回到mcscan的安装路径，才能分析

cd /biosoft/MCScanX/MCScanX/downstream_analyses 
perl /biosoft/MCScanX/MCScanX/downstream_analyses/detect_collinearity_within_gene_families.pl -i /home/manager/gene2019/my_gene_family/MCScanX/MADS_box_family.txt -d /home/manager/gene2019/my_gene_family/MCScanX/mcscan/AT.collinearity -o /home/manager/gene2019/my_gene_family/MCScanX/WRKY.collinear.pairs

#从MCSCAN分析的结果文件：AT.tandem 提取串联重复基因可以看，：http://www.omicsclass.com/article/399
cd /home/manager/gene2019/my_gene_family/MCScanX
perl ../script/get_tandem_gene.pl -id gene_list.txt -tandem mcscan/AT.tandem -od ./ -name WRKY

########基因加倍分析绘图，circos###############
#cd $workdir 回到工作路径
mkdir circos
cd circos

cp ../MCScanX/mcscan/AT.collinearity .   
cp ../MCScanX/WRKY.collinear.pairs .

#准备circos绘图数据文件，脚本从gff里面获得位置信息并整理数据
perl ../script/colline_v3.pl -gff ../MCScanX/AT.gff -list WRKY.collinear.pairs -colline AT.collinearity -od data -name WRKY

#绘图，主要准备config3.txt配置文件，以及染色体长度文件等等。
cd data

/biosoft/circos/circos-0.69-6/bin/circos -conf config3.txt -outputdir ./ -outputfile WRKY



###############################复制基因查找 blast方法 及KAKS分析#################################

#回到工作路径  my_gene_family

mkdir gene_duplication_kaks_blast
cd gene_duplication_kaks_blast
cp ../WRKY_removed_redundant_and_confirmed_IDlist.txt .
cp ../WRKY_cds_confirmed.fa .
#blast建库,DNA序列,all vs all 比对，结果说明见：http://www.omicsclass.com/article/505
makeblastdb -in WRKY_cds_confirmed.fa -dbtype nucl -title WRKY_cds_confirmed.fa 
blastall -i WRKY_cds_confirmed.fa -d WRKY_cds_confirmed.fa -p blastn -e 1e-20  -m 8 -o WRKY_cds_confirmed_blast.out

#获取基因cds序列的长度：
samtools faidx WRKY_cds_confirmed.fa

perl ../script/KAKS_SHAIXUAN.pl -in1 WRKY_cds_confirmed.fa.fai -in2 WRKY_cds_confirmed_blast.out -out duplication_gene.out

###kaks分析###

echo "AT1G66600.1\nAT1G66560.1" >dupid.txt
perl ../script/get_fa_by_id.pl dupid.txt WRKY_cds_confirmed.fa dup_gene_paired1.fa
echo "1\ndup_gene_paired1.fa\n2\n1\ndup_gene_paired1.aln\ndup_gene_paired1.dnd\nX\n\n\nX\n" |clustalw

#格式转换axt  如果遇到报错not equal，可参考：http://www.omicsclass.com/article/700
/biosoft/KaKs_Calculator2.0/src/AXTConvertor dup_gene_paired1.aln dup_gene_paired1.axt
/biosoft/KaKs_Calculator2.0/bin/Linux/KaKs_Calculator  -i dup_gene_paired1.axt -o dup_gene_paired1.kaks.result

#注意筛选基因的位置信息：在同一条染色体，且距离小于100K
#分离时间计算：http://www.omicsclass.com/question/896

##############################不同物种之间的共线性分析#############################################
# 回到工作路径  my_gene_family
mkdir mcscan_between_genome
cd mcscan_between_genome

#获取基因对应的cds序列信息，注意：基因的第一个转录本为代表序列；
#从geneid2mRNAid.txt文件中每个基因挑一个转录本ID存储到allCDSID.txt（一列）

#得到基因组上所有基因的位置信息，bed文件；以及cds序列;
perl ../script/get_mRNA_bed.pl -in1 ../Arabidopsis_thaliana.TAIR10.41.gff3 -in2  allCDSID.txt -out ATH.bed

#获取基因对应的cds序列
perl ../script/get_fa_by_id.pl allCDSID.txt ../Arabidopsis_thaliana.TAIR10.cds.all.fa ATH.cds


#同样的道理准备，准备白菜的基因组，bed文件和，cds文件；

#处理ID：
#sed 's#gene:##' Brassica_rapa.Brapa_1.0.41.chr.gff3|sed  's#transcript:##' |sed 's#CDS:##' >Brassica_rapa.Brapa_1.0.41.chr.gff31
#mv Brassica_rapa.Brapa_1.0.41.chr.gff31 Brassica_rapa.Brapa_1.0.41.chr.gff3

perl ../script/mRNAid_to_geneid.pl Brassica_rapa.Brapa_1.0.41.chr.gff3 BrmRNA2geneID.txt
perl ../script/geneid_to_mRNAid.pl Brassica_rapa.Brapa_1.0.41.chr.gff3 Brgeneid2mRNAid.txt


#获取白菜基因对应的cds序列信息，注意：基因的第一个转录本为代表序列；
#从Brgeneid2mRNAid.txt文件中每个基因挑一个转录本ID存储到BrallCDSID.txt（一列）

#得到白菜基因组上所有基因的位置信息，bed文件；以及cds序列;
perl ../script/get_mRNA_bed.pl -in1 Brassica_rapa.Brapa_1.0.41.chr.gff3 -in2  BrallCDSID.txt -out rapa.bed

#获取基因对应的cds序列
perl ../script/get_fa_by_id.pl BrallCDSID.txt Brassica_rapa.Brapa_1.0.cds.all.fa rapa.cds


#注意，不知道为什么，这个python版本的mcscan如果ID后面有.1 运行会报错，所以我们把拟南芥和白菜的ID统一都去除一下.1

sed -i 's#\.1##' rapa.cds
sed -i 's#\.1##' ATH.cds
sed -i 's#\.1##' rapa.bed
sed -i 's#\.1##' ATH.bed

/biosoft/miniconda/miniconda2/bin/python -m jcvi.compara.catalog ortholog ATH rapa --cscore=0.7
#对共线性区域进行过滤
/biosoft/miniconda/miniconda2/bin/python -m jcvi.compara.synteny screen --minsize=0 --minspan=30 --simple ATH.rapa.anchors   ATH.rapa.anchors.new
#绘制共线性图片：准备两个配置文件为输入文件：

/biosoft/miniconda/miniconda2/bin/python -m jcvi.graphics.karyotype  --format=pdf  --figsize=15x5 mcscan_seqid mcscan_layout


########################结合转录组分析基因家族成员表达量########################################

#回到工作路径  my_gene_family

mkdir rna_seq_heatmap
cd rna_seq_heatmap

cp ../WRKY_removed_redundant_and_confirmed_IDlist.txt .
sed -i 's/\.1//' WRKY_removed_redundant_and_confirmed_IDlist.txt


perl ../script/get_data_by_id.pl WRKY_removed_redundant_and_confirmed_IDlist.txt GSE121407_fpkm.txt gene_family_fpkm.txt


###########################################以下blast为可选分析内容########################################################################
#blastp比对寻找基因家族成员，WRKY部分
#NCBI上搜索WRKY蛋白序列：搜索条件：WRKY[title] NOT putative[title] AND plants[filter]
#参考文献：https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-018-4955-8             

#回到工作路径  my_gene_family
mkdir blast
cd blast

#blast比对首先建库  #蛋白质序列
makeblastdb -in WRKY_NCBI_pep.fasta -dbtype prot -title WRKY_NCBI_pep.fasta   
#
#blastp比对
blastall -i ../Arabidopsis_thaliana.TAIR10.pep.all.fa -d WRKY_NCBI_pep.fasta -p blastp -e 1e-10 -b 1 -v 1 -m 8 -o ncbi_WRKY_blast.out 

#获得比对上的候选基因，存储在wrky.fa文件中
perl ../script/get_fa_by_id.pl ncbi_WRKY_blast.out ../Arabidopsis_thaliana.TAIR10.pep.all.fa wrky.fa

#然后将 wrky.fa 提交到 NCBI CDD  SMART 上确认，把不存在结构域的基因ID可以删除

#######################基因上游顺势作用原件分析#######################################

#回到工作路径 my_gene_family

mkdir gene_promoter
cd gene_promoter
cp ../WRKY_removed_redundant_and_confirmed_IDlist.txt .

#得到基因在染色体上的位置，此脚本会把基因组所有的序列读入内存，如果基因组较大，可能因为内存不足使脚本运行不成功，可以分染色体分开分析：
perl ../script/get_gene_weizhi.pl -in1 WRKY_removed_redundant_and_confirmed_IDlist.txt -in2 ../Arabidopsis_thaliana.TAIR10.41.gff3 -out mrna_location.txt
#根据位置信息提取，promoter序列 1500
perl ../script/get_promoter.pl ../Arabidopsis_thaliana.TAIR10.dna.toplevel.fa mrna_location.txt promoter.fa

#生成 GSDS配置文件
cat WRKY_removed_redundant_and_confirmed_IDlist.txt|awk 'BEGIN{OFS="\t"}{print $1,"0","1500","CDS","."}' >gene.bed
#生成feature文件
cat PlantCARE_9210__plantCARE/plantCARE_output_PlantCARE_9210.tab|grep "Arabidopsis"|awk -F"\t"  'BEGIN{OFS="\t"} {print $1,$4,$4+length($3),$2}'>feature.bed
小梁学生信
关注
1
点赞
踩
27

收藏

觉得还不错? 一键收藏
0
评论
基因家族分析

#下载拟南芥基因组信息#wget ftp://ftp.ensemblgenomes.org/pub/plants/release-41/fasta/arabidopsis_thaliana/dna/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.gz#wget ftp://ftp.ensemblgenomes.org/pub/plants/release-41/fasta/arabidopsis_thaliana/cds/Arabidopsis_thaliana.
复制链接

扫一扫