使用awk将gencode gtf注释文件转换为bed格式

最新推荐文章于 2024-07-29 08:00:39 发布

melody359

最新推荐文章于 2024-07-29 08:00:39 发布

阅读量2.8k

点赞数 2

分类专栏：生物信息

本文链接：https://blog.csdn.net/qq_42756195/article/details/108441565

版权

生物信息专栏收录该内容

2 篇文章 1 订阅

订阅专栏

gencode注释文件下载

1.gencode注释文件下载地址：https://www.gencodegenes.org/human/release_35lift37.html

安装gawk

安装gawk主要是为了调用gensub功能，有关gensub功能解释见：
http://blog.sina.com.cn/s/blog_6e5e78bf0100waml.html

sudo apt-get install gawk

格式转换

#首先看一下gtf文件，我们所需要的是基因的位置注释性息，也就是第三列为gene的行
#description: evidence-based annotation of the human genome (GRCh38), version 35 (Ensembl 101), mapped to GRCh37 with gencode-backmap
#provider: GENCODE
#contact: gencode-help@ebi.ac.uk
#format: gff3
#date: 2020-06-03
chr1	HAVANA	gene	11869	14409	.	+	.	gene_id "ENSG00000223972.5_4"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; level 2; hgnc_id "HGNC:37102"; havana_gene "OTTHUMG00000000961.2_4"; remap_status "full_contig"; remap_num_mappings 1; remap_target_status "overlap";
chr1	HAVANA	transcript	11869	14409	.	+	.	gene_id "ENSG00000223972.5_4"; transcript_id "ENST00000456328.2_1"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-202"; level 2; transcript_support_level 1; hgnc_id "HGNC:37102"; tag "basic"; havana_gene "OTTHUMG00000000961.2_4"; havana_transcript "OTTHUMT00000362751.1_1"; remap_num_mappings 1; remap_status "full_contig"; remap_target_status "overlap";
chr1	HAVANA	exon	11869	12227	.	+	.	gene_id "ENSG00000223972.5_4"; transcript_id "ENST00000456328.2_1"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-202"; exon_number 1; exon_id "ENSE00002234944.1_1"; level 2; transcript_support_level 1; hgnc_id "HGNC:37102"; tag "basic"; havana_gene "OTTHUMG00000000961.2_4"; havana_transcript "OTTHUMT00000362751.1_1"; remap_original_location "chr1:+:11869-12227"; remap_status "full_contig";
chr1	HAVANA	exon	12613	12721	.	+	.	gene_id "ENSG00000223972.5_4"; transcript_id "ENST00000456328.2_1"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-202"; exon_number 2; exon_id "ENSE00003582793.1_1"; level 2; transcript_support_level 1; hgnc_id "HGNC:37102"; tag "basic"; havana_gene "OTTHUMG00000000961.2_4"; havana_transcript "OTTHUMT00000362751.1_1"; remap_original_location "chr1:+:12613-12721"; remap_status "full_contig";
chr1	HAVANA	exon	13221	14409	.	+	.	gene_id "ENSG00000223972.5_4"; transcript_id "ENST00000456328.2_1"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-202"; exon_number 3; exon_id "ENSE00002312635.1_1"; level 2; transcript_support_level 1; hgnc_id "HGNC:37102"; tag "basic"; havana_gene "OTTHUMG00000000961.2_4"; havana_transcript "OTTHUMT00000362751.1_1"; remap_original_location "chr1:+:13221-14409"; remap_status "full_contig";
#🔛看一下目标格式
1	11869	14409	DDX11L1	+	transcribed_unprocessed_pseudogene
1	14404	29570	WASH7P	-	unprocessed_pseudogene
1	29554	31109	MIR1302-2HG	+	lncRNA
1	34554	36081	FAM138A	-	lncRNA
1	52473	53312	OR4G4P	+	unprocessed_pseudogene
1	57598	64116	OR4G11P	+	transcribed_unprocessed_pseudogene
1	65419	71585	OR4F5	+	protein_coding
1	89295	133723	AL627309.1	-	lncRNA
1	89551	91105	AL627309.3	-	lncRNA
1	131025	134836	CICP27	+	processed_pseudogene
#即第一列 为染色体编号 原始文件中带有chr，第二列第三列为基因位置，第四列为gene_symbol,第五列是我自己额外想要的基因类型的注释
#如果每一行的列数一致的话，就比较好操作了，直接提取对应列就可以了
awk awk 'BEGIN{ORS="\t"}{print NF}'  gencode.v35lift37.annotation.gtf
	36	38	38	38	38	38	38	36	38	38	38	38	36	38	38	38	26	48	50	50	50	50	50	50	50	50	50	50	50	50	50	50	50	50	50	50	
#可以看到每一行的列書目並不相同,就原文件來說，我們的第1 4 5列位置是 相同
#首先挑選出第三列是 gene 的行
awk '$3=="gene" {print $0}' gencode.v35lift37.annotation.gtf >gencode.v35lift37.annotation_justGene.gtf 
#看一下
more gencode.v35lift37.annotation_justGene.gtf
chr1	HAVANA	gene	11869	14409	.	+	.	gene_id "ENSG00000223972.5_4"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; level 2; hgnc_id "HGNC:37102"; havana_ge
ne "OTTHUMG00000000961.2_4"; remap_status "full_contig"; remap_num_mappings 1; remap_target_status "overlap";
chr1	HAVANA	gene	14404	29570	.	-	.	gene_id "ENSG00000227232.5_4"; gene_type "unprocessed_pseudogene"; gene_name "WASH7P"; level 2; hgnc_id "HGNC:38034"; havana_gene "OTTHUMG00
000000958.1_4"; remap_status "full_contig"; remap_num_mappings 1; remap_target_status "overlap";
chr1	HAVANA	gene	29554	31109	.	+	.	gene_id "ENSG00000243485.5_8"; gene_type "lncRNA"; gene_name "MIR1302-2HG"; level 2; hgnc_id "HGNC:52482"; tag "ncRNA_host"; havana_gene "OT
THUMG00000000959.2_8"; remap_status "full_contig"; remap_num_mappings 1; remap_target_status "overlap";
chr1	HAVANA	gene	34554	36081	.	-	.	gene_id "ENSG00000237613.2_5"; gene_type "lncRNA"; gene_name "FAM138A"; level 2; hgnc_id "HGNC:32334"; havana_gene "OTTHUMG00000000960.1_5";
 remap_status "full_contig"; remap_num_mappings 1; remap_target_status "overlap";
chr1	HAVANA	gene	52473	53312	.	+	.	gene_id "ENSG00000268020.3_4"; gene_type "unprocessed_pseudogene"; gene_name "OR4G4P"; level 2; hgnc_id "HGNC:14822"; havana_gene "OTTHUMG00
000185779.1_4"; remap_status "full_contig"; remap_num_mappings 1; remap_target_status "overlap";
#接下來選出我想要的 1 4 5  列 以及 gene_name gene_type
awk '{printf $1=gensub("^chr","",1,$1)"\t"} {printf $4"\t"$5"\t"}{n=10;while($n !="gene_name"){n++}} {printf $(n+1)=gensub(/[";]/,"","g",$(n+1))"\t"$7"\t"} {n=10;while($n!="gene_type"){n++}} {printf  $(n+1)=gensub(/[";]/,"","g",$(n+1))"\n"}' gencode.v35lift37.annotation_justGene.gtf  >gencode.v35lift37.annotation_gene_type.bed
#genesub(pattern,target,替換幾次,替换第及列)
#{n=10;while($n !="gene_name"){n++}} 寻找第几列为gene_name
#{printf $(n+1)=gensub(/[";]/,"","g",$(n+1))"\t"$7"\t"} 上一步找到了第及列是gene_name 则第n+1列肯定是我们要的内容，同时替换掉其中的冒号和分号，$7是链的这正负，顺便输出了一下 
#{n=10;while($n!="gene_type"){n++}} {printf  $(n+1)=gensub(/[";]/,"","g",$(n+1))"\n" 用来寻找gene_type