ChIPseeker软件包实现了以下功能:检索峰值附近最近的基因、注释峰值的基因组区域、估计芯片峰值数据集之间重叠重要性的统计方法,以及整合地理数据库,以便用户将自己的数据集与存储在数据库中的数据集进行比较。比较可以用来推断合作规则,因此可以用来生成假设。实现了几个可视化功能,以总结峰实验的覆盖范围、结合到TSS区域的峰的平均轮廓和热图、基因组注释、到TSS的距离以及峰或基因的重叠。
######ChIPseeker####
# if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#
# BiocManager::install("ChIPseeker")
library(ChIPseeker)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(GenomicFeatures)
library(GenomicRanges)
library(clusterProfiler)
library(ggplot2)
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
### 1. 读入chip-seq峰值的bed文件
bed_file <- "/Users/test_summits.bed"
## .bed或.bed.gz 文件,peak的位置,可以由macs2软件生成。
peak <- readPeakFile(bed_file)
class(peak) # "GRanges"
peak
### 2.作图展示peak位置
# plot peak coverage
covplot(peak, weightCol="V5")
# 画指定区域
covplot(peak, weightCol="V5", chrs=c("chr17", "chr18"), xlim=c(4.5e7, 5e7))
covplot(peak, weightCol="V5", chrs=c("chr17", "chr18"))
# 得到启动子区域
promoter <- getPromoters(TxDb=txdb, upstream=3000, downstream=3000)
class(promoter) #"GRanges"
# 计算tag矩阵
tagMatrix <- getTagMatrix(peak, windows=promoter)
# plot the heatmap of tagMatrix
tagHeatmap(tagMatrix, xlim=c(-3000, 3000), color="red")
# # 以上两行代码等同于下面一行代码
# #plot the heatmap of peaks align to flank sequences of TSS
# peakHeatmap(bed_file, TxDb=txdb, upstream=3000, downstream=3000, color="red")
# plot the profile of peaks
plotAvgProf(tagMatrix, xlim=c(-3000, 3000),
xlab="Genomic Region (5'->3')",
ylab = "Read Count Frequency")
# 加上95%置信区间
plotAvgProf(tagMatrix, xlim=c(-3000, 3000),
conf = 0.95, resample = 1000)
### 3.peaks注释
## 注释峰值Annotate peaks
peakAnno <- annotatePeak(peak, tssRegion=c(-3000, 3000),
TxDb=txdb, annoDb="org.Hs.eg.db")
class(peakAnno) #"csAnno"
# 保存文件
write.table(peakAnno, file = 'peak1.txt',sep = '\t',
quote = FALSE, row.names = FALSE)
# 统计作图
plotAnnoPie(peakAnno)
plotAnnoBar(peakAnno)
#dev.new()
vennpie(peakAnno)
# upsetplot(peakAnno, vennpie=TRUE) #wrong
plotDistToTSS(peakAnno,
title="Distribution of transcription factor-binding loci\nrelative to TSS")
### 4.富集分析
library(ReactomePA)
pathway1 <- enrichPathway(as.data.frame(peakAnno)$geneId)
head(pathway1, 2)
gene <- seq2gene(peak, tssRegion = c(-1000, 1000), flankDistance = 3000, TxDb=txdb)
pathway2 <- enrichPathway(gene)
head(pathway2, 2)
dotplot(pathway2)
### 5. 比较不同实验富集到的peaks(先注释好)
files <- getSampleFiles() #自带bed.gz文件
txdb_hg19 <- TxDb.Hsapiens.UCSC.hg19.knownGene
peakAnnoList <- lapply(files, annotatePeak, TxDb=txdb_hg19,
tssRegion=c(-3000, 3000), verbose=FALSE)
plotAnnoBar(peakAnnoList)
genes = lapply(peakAnnoList, function(i) as.data.frame(i)$geneId)
#names(genes) = sub("_", "\n", names(genes))
compKEGG <- compareCluster(geneCluster = genes,
fun = "enrichKEGG",
pvalueCutoff = 0.05,
pAdjustMethod = "BH")
# fun :One of "groupGO", "enrichGO", "enrichKEGG", "enrichDO" or "enrichPathway" .
dotplot(compKEGG, showCategory = 15, title = "KEGG Pathway Enrichment Analysis")
genes= lapply(peakAnnoList, function(i) as.data.frame(i)$geneId)
vennplot(genes)
p <- GRanges(seqnames=c("chr1", "chr3"),
ranges=IRanges(start=c(1, 100), end=c(50, 130)))
shuffle(p, TxDb=txdb_hg19)
# peaks重叠的统计学意义
# calculate overlap significant of ChIP experiments
# based on the genome coordinations
enrichPeakOverlap(queryPeak = files[[5]],
targetPeak = unlist(files[1:4]),
TxDb = txdb_hg19,
pAdjustMethod = "BH",
nShuffle = 50,
chainFile = NULL,
verbose = FALSE)