下载 GTF 注释文件（Ensembl release 75，GRCh37）
# Fetch the Ensembl release 75 (GRCh37) human gene annotation archive.
wget ftp://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/Homo_sapiens.GRCh37.75.gtf.gz
# The extraction step reads the uncompressed .gtf file, so unpack it here.
# Decompressing via -c and a redirect leaves the downloaded .gz in place.
gunzip -c Homo_sapiens.GRCh37.75.gtf.gz > Homo_sapiens.GRCh37.75.gtf
提取蛋白编码基因的位置信息（染色体、起始、终止、基因名字段）
# Write chromosome, start, end and the tenth split field for every
# protein-coding gene record to all_gene_positions.txt (tab-separated).
#
# Fields are split on tab, '=', ':' and ';', so $1-$8 are the fixed GTF
# columns and the attribute column is broken into one field per attribute.
# NOTE(review): in the release 75 layout $10 is expected to be
# ` gene_name "<symbol>"` (the R step that follows relies on this) --
# verify the attribute order before reusing this with another release.
#
# Only lines that *start* with '#' are header comments; a '#' appearing
# elsewhere in a record must not cause that record to be dropped.
awk -F "[\t=:;]" \
'BEGIN{OFS="\t"} !/^#/ && /protein_coding/ && $3=="gene"{print $1,$4,$5,$10}' \
Homo_sapiens.GRCh37.75.gtf > all_gene_positions.txt
用 R 整理提取结果（依赖 tidyr 和 dplyr 包）
# Turn the raw extraction output into a clean four-column table
# (chr, start, end, gene) and write it out as tab-separated text.

# separate() is provided by tidyr and select() by dplyr; attach both so the
# snippet runs in a fresh R session.
library(tidyr)
library(dplyr)

# The input has no header row, so the columns arrive as V1..V4.
all_gene <- read.delim("./all_gene_positions.txt", header = FALSE)

# V4 carries the attribute text with a leading space, so splitting on a
# single space yields an empty piece, the attribute key and the gene symbol.
all_gene <- separate(
  all_gene,
  col = V4,
  into = c("none", "none2", "gene"),
  sep = " "
)

# Drop the two placeholder pieces by name rather than by column position.
all_gene <- select(all_gene, -c(none, none2))
colnames(all_gene) <- c("chr", "start", "end", "gene")

write.table(
  all_gene,
  "./gene_position_clean.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)