Affymetrix Human Transcriptome Array 2.0芯片 - Gene_ID转换

最新推荐文章于 2024-07-30 17:44:52 发布

Ehontoo

最新推荐文章于 2024-07-30 17:44:52 发布

阅读量1.3k

点赞数 1

文章标签： r语言

本文链接：https://blog.csdn.net/Ehontoo/article/details/134153339

版权

本文介绍了AffymetrixGPL16686、GPL17586和GPL19251三种基因芯片平台的区别，以及使用R语言进行GSE77532和GSE110359数据处理的方法，包括手动和biomaRt包的id转换和数据过滤过程。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

GPL16686芯片平台分析

天涯清水
IP属地: 重庆
0.635
2022.06.20 22:33:45
字数 302
阅读 3,005

https://www.jianshu.com/p/d3463f3a3a67

芯片分析中经常会遇到Affymetrix Human Transcriptome Array 2.0芯片,由于目前还没有现成的R包可以用，因此分析方法也不统一。见生信技能树Jimmy老师HTA2.0芯片比较麻烦，其实这类常见的有3个平台，3种类型：
- GPL17586 [HTA-2_0] Affymetrix Human Transcriptome Array 2.0 [transcript (gene) version]
- GPL19251 [HuGene-2_0-st] Affymetrix Human Gene 2.0 ST Array [probe set (exon) version]
- GPL16686 [HuGene-2_0-st] Affymetrix Human Gene 2.0 ST Array [transcript (gene) version]

对于这三种平台可以去Affymetrix的官网去查看其区别，也可以去NCBI去查看:
GPL17586
GPL19251
GPL16686

方法一

我按照芯片分析的一般流程进行分析，以GPL16686平台，GSE77532 为例：

读入soft文件，手动下载GSE77532对应的soft文件，实际应该下载GPL16686对应的GPL16686.soft文件，网速原因，只能退而求其次。

rm(list = ls())
options(stringsAsFactors = F)

#读入soft文件
library(GEOquery)
gse77532 <- getGEO(filename = "GSE77532_family.soft.gz",destdir = ".")
dim(gse77532)

y <- gse77532@gpls$GPL16686@dataTable@table
dim(y)

head(y)
y[1:4,1:8]

id转换

#### id conversion

library(clusterProfiler)
ENTREZID<- bitr(y[,6], fromType = "ACCNUM", 
                toType=c("SYMBOL","ENSEMBL","ENTREZID"),
                OrgDb = org.Hs.eg.db)
ls(package:clusterProfiler)

dim(ENTREZID)
ENTREZID[1:5,1:4]

save(y,ENTREZID,file = "ids.Rdata")

#ids过滤探针
table(y$GB_ACC %in% ENTREZID$ACCNUM)
y1 <- y[y$GB_ACC %in% ENTREZID$ACCNUM,]
y1[1:5,1:8]
y2 <- y1[,c(1,6)]
names(y2) <- c("probe_id","ACCNUM")

#合并y2与ENTREZID
ids <- merge(y2,ENTREZID,by ="ACCNUM",all=F)
ids[1:5,1:5]
dim(ids)
##载入表达矩阵
load("GSE77532_exprSet.Rdata")

exprSet <- exprSet2
exprSet[1:5,1:6]

#过滤表达矩阵

exprSet <- exprSet[rownames(exprSet) %in% ids$probe_id,]
dim(exprSet)
exprSet[1:5,1:5]

#ids过滤探针
ids <- ids[match(rownames(exprSet),ids$probe_id),]
dim(ids)
ids[1:2,1:5]
ids <- ids[,c(2,3)]
dim(ids)
ids[1:2,1:2]
#合并表达矩阵和ids

idcombine <- function(exprSet, ids){
  tmp <- by(exprSet,
            ids$SYMBOL,
            function(x) rownames(x)[which.max(rowMeans(x))])
  probes <- as.character(tmp)
  print(dim(exprSet))
  exprSet <- exprSet[rownames(exprSet) %in% probes,]
  
  print(dim(exprSet))
  rownames(exprSet) <- ids[match(rownames(exprSet), ids$probe_id),2]
  return(exprSet)
}

new_exprSet <- idcombine(exprSet,ids)
new_exprSet[1:4,1:6]

方法二

id 转换用biomaRt包，更方便一些，只是网速支持不下来。

GPL17586平台芯片
#
rm(list = ls())
options(stringsAsFactors = F)

#加载R包

library(GEOquery)

#读入soft文件

GSE110359 <- getGEO(filename = "GSE110359_family.soft.gz",destdir = ".")
dim(GSE110359)

y <- GSE110359@gpls$GPL17586@dataTable@table
 

dim(y)

head(y)
y[1:4,1:15]
View(head(y))## you need to check this , which column do you need

probe2gene <- y[,c(2,8)] 

library(stringr)  
probe2gene$symbol=trimws(str_split(probe2gene$gene_assignment,'//',simplify = T)[,2])
plot(table(table(probe2gene$symbol)),xlim=c(1,50))
head(probe2gene)


dim(probe2gene)
View(head(probe2gene))
ids2 <- probe2gene[,c(1,3)]
View(head(ids))
ids2[1:20,1:2]#含有缺失值
save(ids2,probe2gene,file='GSE110359-probe2gene.Rdata')

load("GSE110359-probe2gene.Rdata")
#####
View(head(probe2gene))
dim(probe2gene)

ID 转换

library(biomaRt)

x <- probe2gene$probeset_id
value <- x
attr <- c("affy_hta_2_0","hgnc_symbol")

ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl") 



ids <- getBM(attributes = attr,
             filters = "affy_hta_2_0",
             values = value,
             mart = ensembl,
             useCache = F)

dim(ids)#[1] 1041    2
View(head(ids))

save(ids,file = "GPL17586_ids.Rdata")


#去重之后
table(unique(ids$hgnc_symbol))#28262

attributes <- listAttributes(ensembl)
View(attributes) # 查看转换格式

save(ids,ensembl,y,file = "ensembl.Rdata")