有时候我们会遇到 gene id 与 protein id 对不上的情况,此时需要从 GTF 文件中提取二者的对应关系。
使用蛋白质序列进行 eggNOG 注释后,结果如下:query 一列是以 XP_ 开头的蛋白 ID。
但通过 featureCounts 进行表达定量后得到的 gene id 如下:
如何找到对应关系?
R 可以完成
通过对 GTF 文件的第 9 列(属性列,读入后列名为 X9)进行处理,把 gene id 与对应的 protein id 整理成一张对应表。
以下是代码
library(dplyr)
library(readr)
library(stringr)
library(tidyr)
# Read the gzipped GTF annotation as a headerless, tab-delimited table.
# With col_names = FALSE the columns get the default names X1, X2, ...,
# which the code below relies on (X3 = feature type, X9 = attributes).
# Text following "#" is treated as a comment, which skips the header lines.
gtf <- read_delim(
  file = "~/Program/crab/data/GCF_024679095.1_ASM2467909v1_genomic.gtf.gz",
  delim = "\t",
  comment = "#",
  col_names = FALSE,
  escape_double = FALSE,
  trim_ws = TRUE
)
# Build a gene_id <-> protein_id lookup table from the GTF annotation.
#
# Input:  `gtf`, the headerless GTF table read above (X3 = feature type,
#         X9 = attribute string of `key "value";` pairs).
# Output: `name`, a table with one row per unique (gene_id, protein_id)
#         pair, restricted to protein ids starting with "XP_".

# Keep only CDS records; the protein_id attribute is taken from these rows.
cds <- gtf %>% filter(X3 == "CDS")

# Extract each attribute by its key instead of by its position. The number
# and order of attributes differ between records (and a value may itself
# contain ";"), so splitting X9 on ";" and taking a fixed field can return
# the wrong attribute. `\\b` prevents matching a longer key that merely ends
# in "gene_id"/"protein_id". str_match() yields NA when the key is absent.
name <- cds %>%
  transmute(
    gene_id = str_match(X9, "\\bgene_id \"([^\"]+)\"")[, 2],
    protein_id = str_match(X9, "\\bprotein_id \"([^\"]+)\"")[, 2]
  ) %>%
  # One protein spans many CDS rows; collapse them to a single pair.
  distinct() %>%
  # Keep only XP_ accessions; rows with a missing protein_id (NA) are
  # dropped here as well, because filter() discards NA conditions.
  filter(str_detect(protein_id, "^XP_"))