gencode注释文件下载
1.gencode注释文件下载地址:https://www.gencodegenes.org/human/release_35lift37.html
安装gawk
安装gawk主要是为了调用gensub功能,有关gensub功能解释见:
http://blog.sina.com.cn/s/blog_6e5e78bf0100waml.html
sudo apt-get install gawk
格式转换
#首先看一下gtf文件,我们所需要的是基因的位置注释性息,也就是第三列为gene的行
#description: evidence-based annotation of the human genome (GRCh38), version 35 (Ensembl 101), mapped to GRCh37 with gencode-backmap
#provider: GENCODE
#contact: gencode-help@ebi.ac.uk
#format: gff3
#date: 2020-06-03
chr1 HAVANA gene 11869 14409 . + . gene_id "ENSG00000223972.5_4"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; level 2; hgnc_id "HGNC:37102"; havana_gene "OTTHUMG00000000961.2_4"; remap_status "full_contig"; remap_num_mappings 1; remap_target_status "overlap";
chr1 HAVANA transcript 11869 14409 . + . gene_id "ENSG00000223972.5_4"; transcript_id "ENST00000456328.2_1"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-202"; level 2; transcript_support_level 1; hgnc_id "HGNC:37102"; tag "basic"; havana_gene "OTTHUMG00000000961.2_4"; havana_transcript "OTTHUMT00000362751.1_1"; remap_num_mappings 1; remap_status "full_contig"; remap_target_status "overlap";
chr1 HAVANA exon 11869 12227 . + . gene_id "ENSG00000223972.5_4"; transcript_id "ENST00000456328.2_1"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-202"; exon_number 1; exon_id "ENSE00002234944.1_1"; level 2; transcript_support_level 1; hgnc_id "HGNC:37102"; tag "basic"; havana_gene "OTTHUMG00000000961.2_4"; havana_transcript "OTTHUMT00000362751.1_1"; remap_original_location "chr1:+:11869-12227"; remap_status "full_contig";
chr1 HAVANA exon 12613 12721 . + . gene_id "ENSG00000223972.5_4"; transcript_id "ENST00000456328.2_1"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-202"; exon_number 2; exon_id "ENSE00003582793.1_1"; level 2; transcript_support_level 1; hgnc_id "HGNC:37102"; tag "basic"; havana_gene "OTTHUMG00000000961.2_4"; havana_transcript "OTTHUMT00000362751.1_1"; remap_original_location "chr1:+:12613-12721"; remap_status "full_contig";
chr1 HAVANA exon 13221 14409 . + . gene_id "ENSG00000223972.5_4"; transcript_id "ENST00000456328.2_1"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-202"; exon_number 3; exon_id "ENSE00002312635.1_1"; level 2; transcript_support_level 1; hgnc_id "HGNC:37102"; tag "basic"; havana_gene "OTTHUMG00000000961.2_4"; havana_transcript "OTTHUMT00000362751.1_1"; remap_original_location "chr1:+:13221-14409"; remap_status "full_contig";
#🔛看一下目标格式
1 11869 14409 DDX11L1 + transcribed_unprocessed_pseudogene
1 14404 29570 WASH7P - unprocessed_pseudogene
1 29554 31109 MIR1302-2HG + lncRNA
1 34554 36081 FAM138A - lncRNA
1 52473 53312 OR4G4P + unprocessed_pseudogene
1 57598 64116 OR4G11P + transcribed_unprocessed_pseudogene
1 65419 71585 OR4F5 + protein_coding
1 89295 133723 AL627309.1 - lncRNA
1 89551 91105 AL627309.3 - lncRNA
1 131025 134836 CICP27 + processed_pseudogene
#即第一列 为染色体编号 原始文件中带有chr,第二列第三列为基因位置,第四列为gene_symbol,第五列是我自己额外想要的基因类型的注释
#如果每一行的列数一致的话,就比较好操作了,直接提取对应列就可以了
awk awk 'BEGIN{ORS="\t"}{print NF}' gencode.v35lift37.annotation.gtf
36 38 38 38 38 38 38 36 38 38 38 38 36 38 38 38 26 48 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50
#可以看到每一行的列書目並不相同,就原文件來說,我們的第1 4 5列位置是 相同
#首先挑選出第三列是 gene 的行
awk '$3=="gene" {print $0}' gencode.v35lift37.annotation.gtf >gencode.v35lift37.annotation_justGene.gtf
#看一下
more gencode.v35lift37.annotation_justGene.gtf
chr1 HAVANA gene 11869 14409 . + . gene_id "ENSG00000223972.5_4"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; level 2; hgnc_id "HGNC:37102"; havana_ge
ne "OTTHUMG00000000961.2_4"; remap_status "full_contig"; remap_num_mappings 1; remap_target_status "overlap";
chr1 HAVANA gene 14404 29570 . - . gene_id "ENSG00000227232.5_4"; gene_type "unprocessed_pseudogene"; gene_name "WASH7P"; level 2; hgnc_id "HGNC:38034"; havana_gene "OTTHUMG00
000000958.1_4"; remap_status "full_contig"; remap_num_mappings 1; remap_target_status "overlap";
chr1 HAVANA gene 29554 31109 . + . gene_id "ENSG00000243485.5_8"; gene_type "lncRNA"; gene_name "MIR1302-2HG"; level 2; hgnc_id "HGNC:52482"; tag "ncRNA_host"; havana_gene "OT
THUMG00000000959.2_8"; remap_status "full_contig"; remap_num_mappings 1; remap_target_status "overlap";
chr1 HAVANA gene 34554 36081 . - . gene_id "ENSG00000237613.2_5"; gene_type "lncRNA"; gene_name "FAM138A"; level 2; hgnc_id "HGNC:32334"; havana_gene "OTTHUMG00000000960.1_5";
remap_status "full_contig"; remap_num_mappings 1; remap_target_status "overlap";
chr1 HAVANA gene 52473 53312 . + . gene_id "ENSG00000268020.3_4"; gene_type "unprocessed_pseudogene"; gene_name "OR4G4P"; level 2; hgnc_id "HGNC:14822"; havana_gene "OTTHUMG00
000185779.1_4"; remap_status "full_contig"; remap_num_mappings 1; remap_target_status "overlap";
#接下來選出我想要的 1 4 5 列 以及 gene_name gene_type
awk '{printf $1=gensub("^chr","",1,$1)"\t"} {printf $4"\t"$5"\t"}{n=10;while($n !="gene_name"){n++}} {printf $(n+1)=gensub(/[";]/,"","g",$(n+1))"\t"$7"\t"} {n=10;while($n!="gene_type"){n++}} {printf $(n+1)=gensub(/[";]/,"","g",$(n+1))"\n"}' gencode.v35lift37.annotation_justGene.gtf >gencode.v35lift37.annotation_gene_type.bed
#genesub(pattern,target,替換幾次,替换第及列)
#{n=10;while($n !="gene_name"){n++}} 寻找第几列为gene_name
#{printf $(n+1)=gensub(/[";]/,"","g",$(n+1))"\t"$7"\t"} 上一步找到了第及列是gene_name 则第n+1列肯定是我们要的内容,同时替换掉其中的冒号和分号,$7是链的这正负,顺便输出了一下
#{n=10;while($n!="gene_type"){n++}} {printf $(n+1)=gensub(/[";]/,"","g",$(n+1))"\n" 用来寻找gene_type