library(pathview)
library(TCGAbiolinks)
library(SummarizedExperiment)
library(clusterProfiler)
#下载HCC的Gene expression文件
query <- GDCquery(project = "TCGA-LIHC",
data.category = "Gene expression",
data.type = "Gene expression quantification",
platform = "Illumina HiSeq",
file.type = "results",
sample.type = c("normal"),
legacy = TRUE)
GDCdownload(query)
#获得表达文件
HCC.exp <- GDCprepare(query,
save = TRUE,
summarizedExperiment = TRUE,
save.filename = "LIHCIllumina_HiSeq.rda")
dataPrep_HCC <- TCGAanalyze_Preprocessing(object = HCC.exp,
cor.cut = 0.6,
datatype = "raw_count",
filename = "HCC_IlluminaHiSeq_RNASeqV2.png")
dataNorm <- TCGAanalyze_Normalization(tabDF = dataPrep_HCC,
geneInfo = TCGAbiolinks::geneInfo,
method = "gcContent") #18323 672
dataFilt <- TCGAanalyze_Filtering(tabDF = dataNorm,
method = "quantile",
qnt.cut = 0.25)
HCC_clin <- GDCquery_clinic(project = "TCGA-LIHC", type = "Clinical")
dataFiltHCC <- subset(dataFilt,
select = substr(colnames(dataFilt),1,12) %in% HCC_clin$bcr_patient_barcode)
#下载正常的样本
query <- GDCquery(project = "TCGA-LIHC",
data.category = "Gene expression",
data.type = "Gene expression quantification",
platform = "Illumina HiSeq",
file.type = "results",
sample.type = c("Solid Tissue Normal"),
legacy = TRUE)
GDCdownload(query)
#获得表达文件
HCC.normal.exp <- GDCprepare(query,
save = TRUE,
summarizedExperiment = TRUE,
save.filename = "LIHC_normal_Illumina_HiSeq.rda")
dataPrep_HCC_normal <- TCGAanalyze_Preprocessing(object = HCC.normal.exp,
cor.cut = 0.6,
datatype = "raw_count",
filename = "HCC_noraml_IlluminaHiSeq_RNASeqV2.png")
dataNorm_normal <- TCGAanalyze_Normalization(tabDF = dataPrep_HCC_normal,
geneInfo = TCGAbiolinks::geneInfo,
method = "gcContent")
dataFilt_normal <- TCGAanalyze_Filtering(tabDF = dataNorm_normal,
method = "quantile",
qnt.cut = 0.25)
#取正常样本与肿瘤样本中相同的gene交集
c <- intersect(row.names(dataFiltHCC),row.names(dataFilt_normal))
dim(dataFiltHCC)
class(dataFiltHCC)
dataFiltHCC <- as.data.frame(dataFiltHCC)
dataFiltHCC$geneid <- row.names(dataFiltHCC)
library(dplyr)
length(c)
dataFiltHCC1 <- dataFiltHCC %>%
filter(geneid %in% c)
row.names(dataFiltHCC1) <- dataFiltHCC1$geneid
names(dataFiltHCC1)
dataFiltHCC1 <- dataFiltHCC1[,-372]
dataFiltHCC1 <- as.matrix(dataFiltHCC1)
dataFilt_normal <- as.data.frame(dataFilt_normal)
dataFilt_normal$geneid <- row.names(dataFilt_normal)
names(dataFilt_normal)
dataFilt_normal1 <- dataFilt_normal %>%
filter(geneid %in% c)
row.names(dataFilt_normal1) <- dataFilt_normal1$geneid
colnames(dataFilt_normal1)
row.names(dataFilt_normal1)
dataFilt_normal1 <- dataFilt_normal1[,-51]
dataFilt_normal1 <- as.matrix(dataFilt_normal1)
row.names(dataFilt_normal1)
#排序
dataFilt_normal1 <- dataFilt_normal1[order(row.names(dataFilt_normal1),decreasing=FALSE),]
dataFiltHCC1 <- dataFiltHCC1[order(row.names(dataFiltHCC1),decreasing=FALSE),]
# should be TRUE,TRUE说明两个矩阵的行名一模一样
all(row.names(dataFilt_normal1) == row.names(dataFiltHCC1))
#差异分析
dataDEGs <- TCGAanalyze_DEA(mat1 = dataFiltHCC1,
mat2 = dataFilt_normal1,
Cond1type = "Tumor",
Cond2type = "Normal",
fdr.cut = 0.01 ,
logFC.cut = 1,
method = "glmLRT")
> head(dataDEGs)
logFC logCPM LR PValue FDR
155060 -2.571642 2.321918 156.36454 7.047070e-36 7.143353e-35
90288 -1.676365 1.398311 28.34992 1.012510e-07 1.675663e-07
A4GALT -1.396513 2.638181 53.84503 2.169434e-13 4.951607e-13
AACS -1.616439 3.692283 117.12599 2.693991e-27 1.477036e-26
AADAT 2.179292 3.500549 135.76657 2.244217e-31 1.638009e-30
AAGAB -1.150286 4.838394 244.67963 3.752987e-55 2.604312e-53
> dim(dataDEGs)
[1] 5461 5
dataDEGs$ID <- row.names(dataDEGs)
dataDEGs <- dataDEGs[,-6]
d <- intersect(MethylMixResults$MethylationDrivers,dataDEGs$ID )
HCC_GO <- dataDEGs %>%
dplyr::filter(ID %in% d )
HCC_GO <- data.frame(ID = HCC_GO $ID,logFC =HCC_GO $logFC)
library(org.Hs.eg.db)
library(topGO)
library(GSEABase)
library(clusterProfiler)
#将txt中的一行或一列标成一个向量
x <- HCC_GO$ID #total_RNA是一个只有一行基因标准名的txt文件
#将标准的gene转化为ENTREZID格式
eg = bitr(x, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db")
#进行GO富集
ego <- enrichGO(gene = eg$ENTREZID,
OrgDb = org.Hs.eg.db,
ont = "ALL",
pAdjustMethod = "BH",
pvalueCutoff = 0.2,
qvalueCutoff = 0.2,
readable = T)
df_ego <- as.data.frame(ego)
write.table(df_ego,"sample.csv",sep=",")
sample <- read.csv("I:/test/sample.csv", stringsAsFactors=FALSE)
#生成GOplot图
library(GOplot)
# Generate the plotting object
circ <- circle_dat(sample, HCC_GO)
#sample的列名为 :Category ID Term Genes adj_pval
#GO_log的列名为:ID logFC
#绘制GOplot
chord <- chord_dat(data = circ, genes = HCC_GO, process = sample$Term)
GOChord(chord, space = 0.02, gene.order = 'logFC',gene.space = 0.25, gene.size = 2,process.label = 6)
# DEGs TopTable
dataDEGsFiltLevel <- TCGAanalyze_LevelTab(dataDEGs,"Tumor","Normal",
dataFiltHCC1[,colnames(dataFiltHCC1)],
dataFilt_normal1[,colnames(dataFilt_normal1)])
dataDEGsFiltLevel$GeneID <- 0
> head(dataDEGsFiltLevel)
mRNA logFC FDR Tumor Normal Delta GeneID
ALB ALB 1.159244 7.042187e-16 2491918.90 5975905.3 2888743.2 0
HP HP 1.738657 1.612502e-16 640987.55 2297407.6 1114457.4 0
FTL FTL -1.144761 3.796445e-10 515336.83 250421.6 589937.3 0
GLUL GLUL -2.998764 5.611403e-14 187833.75 25261.8 563269.1 0
SPP1 SPP1 -5.972776 6.830736e-21 71733.61 1227.8 428448.7 0
APOA1 APOA1 1.226569 1.459487e-09 294893.27 740950.3 361706.8 0
dataDEGsFiltLevel$ID <- row.names(dataDEGsFiltLevel)
head(dataDEGsFiltLevel)
> head(dataDEGsFiltLevel)
mRNA logFC FDR Tumor Normal Delta GeneID
ALB ALB 1.159244 7.042187e-16 2491918.90 5975905.3 2888743.2 0
HP HP 1.738657 1.612502e-16 640987.55 2297407.6 1114457.4 0
FTL FTL -1.144761 3.796445e-10 515336.83 250421.6 589937.3 0
GLUL GLUL -2.998764 5.611403e-14 187833.75 25261.8 563269.1 0
SPP1 SPP1 -5.972776 6.830736e-21 71733.61 1227.8 428448.7 0
APOA1 APOA1 1.226569 1.459487e-09 294893.27 740950.3 361706.8 0
dataDEGsFiltLevel <- dataDEGsFiltLevel[,-8]
dataDEGsFiltLevel1 <- dataDEGsFiltLevel %>%
dplyr::filter(mRNA %in% d)
a <- c(dataDEGsFiltLevel$mRNA)
dataDEGsFiltLevel$mRNA <- a
dim(dataDEGsFiltLevel1)
# Converting Gene symbol to geneID
eg = as.data.frame(bitr(dataDEGsFiltLevel1$mRNA,
fromType="SYMBOL",
toType="ENTREZID",
OrgDb="org.Hs.eg.db"))
eg <- eg[!duplicated(eg$SYMBOL),]
dataDEGsFiltLevel1 <- dataDEGsFiltLevel1[dataDEGsFiltLevel1$mRNA %in% eg$SYMBOL,]
dataDEGsFiltLevel1 <- dataDEGsFiltLevel1[order(dataDEGsFiltLevel1$mRNA,decreasing=FALSE),]
eg <- eg[order(eg$SYMBOL,decreasing=FALSE),]
# table(eg$SYMBOL == dataDEGsFiltLevel$mRNA) should be TRUE
all(eg$SYMBOL == dataDEGsFiltLevel1$mRNA)
dataDEGsFiltLevel1$GeneID <- eg$ENTREZID
dataDEGsFiltLevel_sub <- subset(dataDEGsFiltLevel1, select = c("GeneID", "logFC"))
genelistDEGs <- as.numeric(dataDEGsFiltLevel_sub$logFC)
names(genelistDEGs) <- dataDEGsFiltLevel_sub$GeneID
library(pathview)
# pathway.id: hsa05214 is the glioma pathway
# limit: sets the limit for gene expression legend and color
> head(genelistDEGs)
4363 215 84519 54988 113622 60312
-1.731574 -1.392554 -1.294594 1.384184 -1.312951 -1.418266
hsa05225 <- pathview::pathview(gene.data = genelistDEGs,
pathway.id = "hsa05225",
species = "hsa",
limit = list(gene=as.integer(max(abs(genelistDEGs)))))
TCGA-HCC-差异分析-富集分析-pathview
最新推荐文章于 2024-08-10 08:27:02 发布