R与bioconductor--IRanges GRanges AnnotationHub Biostrings BSgenome GenomicRanges GenomicFeatures rtra

6 篇文章 0 订阅
博主自学了coursera上来自约翰霍普金斯大学<使用Bioconductor分析基因组科学数据>,很不错,推荐给大家

R基本类型(R Base Type)

library(method)
method包,里面有as()方法,相当于as.***(),用于对象数据类型的转换

Now, in Bioconductor, we often have very complicated objects and we kind of want to do the same thing but for very complicated objects. And for that, we don't have as.something function. But we have a general function that's inside the methods package.
And the function is just called as. And the way you use it is as, object and whatever you want to cast it as. So this here is very similar to the as.matrix, but it works for very general types of objects. So matrix could be some of the many different new types of objects we'll learn about in Bioconductor, and we can cast between them using the as function.
 

IRanges和 GRanges 对象

在GenomicRanges和IRanges包里
速度更快,用法稍稍复杂

IRanges
重要的function
reduce()将两个IRange合并,等价于 union()
disjoin()找出non-overlapping的部分
resize()将IRange对象变换大小,fix 参数指定位置。类似的还有shift(),flank()
最重要的function
ov<- findOverlaps()
queryHits(ov)
countOverlaps()
注释最近的基因名nearest()

GRanges
flank()
promoters()
seqinfo()
seqlengths()
seqlevels()
seqnames()
gaps()
genome()#设置GRanges的染色体名称

DataFrame()更加适合存储IRange对象
findOverlaps()
subsetByOverlaps()
makeGRangesFromDataFrame()将data.frame转化成GRanges

dropSeqlevels()
keepSeqlevels()
keepStandardChromosomes()
newStyle <- mapSeqlevels(seqlevels(gr),""NCBI)
renameSeqlevels(gr,newStyle)


AnnotationHub
# AnnotationHub可以方便访问各种在线数据库资源
library(AnnotationHub)
ahub = AnnotationHub()
ahub = subset(ahub,species =="Homo sapiens")
qhs = query(ahub,c("H3K4me3","Gm12878"))
gr1 = qhs[[4]]
qhs = query(ahub,"RefSeq")
genes = qhs[[1]]
prom = promoters(genes)



Biostrings
library(Biostrings)
dna1 = DNAString("ACGT-G")
dna2 = DNAStringSet(c("ACGCT","ACG","ACGTT"))
rev(dna2)#倒序排列
reverse(dna2)#生物学互补
reverseComplement(dna2)#生物学反向互补
translate(dna2)#翻译
alphabetFrequency(dna2)#统计单个碱基出现频率
letterFrequency(dna2,letters = "GC")
dinucleotideFrequency(dna2)#统计双个碱基出现频率
consensusMatrix(dna2)#方便用于寻找motif


BSgenome

library(BSgenome)
available.genomes()#查看所有可以下载的基因组
biocLite("BSgenome.Scerevisiae.UCSC.sacCer2")
library("BSgenome.Scerevisiae.UCSC.sacCer2")
library(BSgenome.Hsapiens.UCSC.hg19)
genome<- BSgenome.Hsapiens.UCSC.hg19
seqnames(genome)
seqlengths(genome)
letterFrequency(genome$chrI,"GC",as.prob = TRUE)
#bsapply类似于apply函数,需要param,param提供了应用的"对象"和"函数"
#bsapply再提供"函数"的参数
param = new("BSParams",X = Scerevisiae,FUN = letterFrequency)
bsapply(param,"GC")
unlist(bsapply(param,"GC"))
sum(unlist(bsapply(param,"GC")))/sum(seqlengths(genome))
unlist(bsapply(param,"GC",as.prob = TRUE))


Biostrings-Matching

library("BSgenome.Scerevisiae.UCSC.sacCer2")
dnaseq <- DNAString("ACGTACGT")
matchPattern(dnaseq,Scerevisiae$chrI)
countPattern(dnaseq,Scerevisiae$chrI)
vmatchPattern(dnaseq,Scerevisiae)#搜索全部染色体
#以下是序列比对的方法
matchPVM()
pairwiseAlignment()
trimLRPatterns()

BSgenome-Views

library("BSgenome.Scerevisiae.UCSC.sacCer2")
dnaseq <- DNAString("ACGTACGT")
vi = matchPattern(dnaseq,Scerevisiae$chrI)
vi#这个view对象底层就是IRange,所以用于IRange的方法也能用在这里
ranges(vi)
Scerevisiae$chrI[57932:57939]
alphabetFrequency(vi)
shift(vi,10)#view对象还存储了目标"染色体"对象
#多"染色体"比对
gr = vmatchPattern(dnaseq,Scerevisiae)
gr
vi2 = Views(Scerevisiae,gr)
vi2
library(AnnotationHub)
ahub = AnnotationHub()
qh = query(ahub,c("sacCer2","genes"))
genes = ahub[["AH7048"]]
prom = promoters(genes)
prom = trim(prom)#有些gene在染色体边界处,所以会有warning,使用前要trim
promView = Views(Scerevisiae,prom)
promView
gcProm = letterFrequency(promView,"GC",as.prob = TRUE)
plot(density(gcProm))


GenomicRanges-Rle
方便coverage类型的数据操作
library(GenomicRanges)
rl <- Rle(c(1,1,1,1,1,1,2,2,2,2,2,4,4,2))#相当于每个position上的coverage
rl
runLength(rl)
runValue(rl)
as.numeric(rl)
ir = IRanges(start = c(2,8),width = 4)
ir
vec = as.numeric(rl)
mean(vec[2:5])
mean(vec[8:11])
aggregate(rl,ir ,FUN = mean)#查看Rle在IRanges区域的平均覆盖度
ir = IRanges(start = 1:5,width = 3)
ir
coverage(ir)#查看覆盖度
slice(rl,2)#coverage比2高的区域
vi = Views(rl,IRanges(c(2,8),width = 2))
mean(vi)
gr <- GRanges(seqnames = "chr1",ranges = IRanges(start = 1:10,width=3))
rl <- coverage(gr)
rl
vi = Views(rl,as(GRanges("chr1",ranges = IRanges(3,7)),"RangesList"))
vi = Views(rl,GRanges("chr1",ranges = IRanges(3,7)))
vi$chr1

GenomicRanges-Lists
GenomicRanges中的Lists
library(GenomicRanges)
gr1 <- GRanges(seqnames = "chr1",ranges = IRanges(start = 1:4,width = 3))
gr2 <- GRanges(seqnames = "chr2",ranges = IRanges(start = 1:4,width = 3))
gL <- GRangesList(gr1 = gr1,gr2 = gr2)
gL#存储了GRanges对象的List
start(gL)
seqnames(gL)
elementNROWS(gL)
sapply(gL,length)
shift(gL,10)
findOverlaps(gL,gr2)


GenomicFeatures
载入常见注释信息
library(GenomicFeatures)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb = TxDb.Hsapiens.UCSC.hg19.knownGene
txdb#gene_id实际上是entrez id
gr = GRanges(seqnames = "chr1",strand = "+",ranges = IRanges(start = 11874,end = 14409))
subsetByOverlaps(genes(txdb),gr)#和该区域重叠的基因
subsetByOverlaps(genes(txdb),gr,ignore.strand =TRUE)
subsetByOverlaps(transcripts(txdb),gr)#输出三个位置相同的是pre-RNA
subsetByOverlaps(exons(txdb),gr)
subsetByOverlaps(exonsBy(txdb,by = "tx"),gr)#tx为转录本的简写
#并不是所有的基因都有CDS,并不是所有的转录本都有CDS
#很多数据库的处理方式:计算所有ORF阅读框,然后找到最长的那个作为CDS
subsetByOverlaps(cds(txdb),gr)
subsetByOverlaps(cdsBy(txdb,by = "tx"),gr)#查看哪儿个转录本有CDS
subsetByOverlaps(exonsBy(txdb,by = "tx"),gr)["2"]#可以看出来CDS两端有3'和5'非转录区
transcriptLengths()#查看某一基因的转录本长度
#bioconductor上面基因组注释比较全,转录组注释可以用以下函数自己创建
makeTxDbFromBiomart()
makeTxDbFromUCSC()

rtracklayer-Data Import
读入常见格式数据
library(rtracklayer)
#?import查看可导入的文件类型
library(AnnotationHub)
ahub = AnnotationHub()
table(ahub$rdataclass)
ahub.bw = subset(ahub,rdataclass=="BigWigFile"&species=="Homo sapiens")
bw = ahub.bw[[1]]
bw
import(bw,which=GRanges("chr22",ranges=IRanges(1,10^8)))#读入部分信息
#GRanges对象处理速度较慢
gr.chr22 = import(bw,which=GRanges("chr22",ranges=IRanges(1,10^8)))
#rle对象处理速度更快
rle.chr22 = import(bw,which=GRanges("chr22",ranges=IRanges(1,10^8)),as ="Rle")
rle.chr22$chr22
ahub.chain = subset(ahub,rdataclass == "ChainFile")
ahub.chain
ahub.chain = subset(ahub.chain,species=="Homo sapiens")
# 将不同版本,甚至人类和猴子的基因组进行转换
query(ahub.chain,c("hg18","hg19"))
chain = query(ahub.chain,c("hg18","hg19"))[[1]]
gr.hg18 = liftOver(gr.chr22,chain)
class(gr.hg18)
length(gr.hg18)
length(gr.chr22)


最后是完整的代码片段
library(Biostrings)
dna1 = DNAString("ACGT-G")
dna2 = DNAStringSet(c("ACGCT","ACG","ACGTT"))
rev(dna2)#倒序排列
reverse(dna2)#生物学互补
reverseComplement(dna2)#生物学反向互补
translate(dna2)#翻译
alphabetFrequency(dna2)#统计单个碱基出现频率
letterFrequency(dna2,letters = "GC")
dinucleotideFrequency(dna2)#统计双个碱基出现频率
consensusMatrix(dna2)#方便用于寻找motif

library(BSgenome)
available.genomes()#查看所有可以下载的基因组
# source("https://bioconductor.org/biocLite.R")
# biocLite("BSgenome.Scerevisiae.UCSC.sacCer2")
library("BSgenome.Scerevisiae.UCSC.sacCer2")
genome<- BSgenome.Scerevisiae.UCSC.sacCer2
seqnames(genome)
seqlengths(genome)
letterFrequency(genome$chrI,"GC",as.prob = TRUE)
#bsapply类似于apply函数,需要param,param提供了应用的"对象"和"函数"
#bsapply再提供"函数"的参数
param = new("BSParams",X = Scerevisiae,FUN = letterFrequency)
bsapply(param,"GC")
unlist(bsapply(param,"GC"))
sum(unlist(bsapply(param,"GC")))/sum(seqlengths(genome))
unlist(bsapply(param,"GC",as.prob = TRUE))

library("BSgenome.Scerevisiae.UCSC.sacCer2")
dnaseq <- DNAString("ACGTACGT")
matchPattern(dnaseq,Scerevisiae$chrI)
countPattern(dnaseq,Scerevisiae$chrI)
vmatchPattern(dnaseq,Scerevisiae)#搜索全部染色体
#以下是序列比对的方法
matchPVM()
pairwiseAlignment()
trimLRPatterns()

library("BSgenome.Scerevisiae.UCSC.sacCer2")
dnaseq <- DNAString("ACGTACGT")
vi = matchPattern(dnaseq,Scerevisiae$chrI)
vi#这个view对象底层就是IRange,所以用于IRange的方法也能用在这里
ranges(vi)
Scerevisiae$chrI[57932:57939]
alphabetFrequency(vi)
shift(vi,10)#view对象还存储了目标"染色体"对象
#多"染色体"比对
gr = vmatchPattern(dnaseq,Scerevisiae)
gr
vi2 = Views(Scerevisiae,gr)
vi2
library(AnnotationHub)
ahub = AnnotationHub()
qh = query(ahub,c("sacCer2","genes"))
genes = ahub[["AH7048"]]
prom = promoters(genes)
prom = trim(prom)#有些gene在染色体边界处,所以会有warning,使用前要trim
promView = Views(Scerevisiae,prom)
promView
gcProm = letterFrequency(promView,"GC",as.prob = TRUE)
plot(density(gcProm))

library(GenomicRanges)
rl <- Rle(c(1,1,1,1,1,1,2,2,2,2,2,4,4,2))#相当于每个position上的coverage
rl
runLength(rl)
runValue(rl)
as.numeric(rl)
ir = IRanges(start = c(2,8),width = 4)
ir
vec = as.numeric(rl)
mean(vec[2:5])
mean(vec[8:11])
aggregate(rl,ir ,FUN = mean)#查看Rle在IRanges区域的平均覆盖度
ir = IRanges(start = 1:5,width = 3)
ir
coverage(ir)#查看覆盖度
slice(rl,2)#coverage比2高的区域
vi = Views(rl,IRanges(c(2,8),width = 2))
mean(vi)
gr <- GRanges(seqnames = "chr1",ranges = IRanges(start = 1:10,width=3))
rl <- coverage(gr)
rl
vi = Views(rl,as(GRanges("chr1",ranges = IRanges(3,7)),"RangesList"))
vi = Views(rl,GRanges("chr1",ranges = IRanges(3,7)))
vi$chr1

library(GenomicRanges)
gr1 <- GRanges(seqnames = "chr1",ranges = IRanges(start = 1:4,width = 3))
gr2 <- GRanges(seqnames = "chr2",ranges = IRanges(start = 1:4,width = 3))
gL <- GRangesList(gr1 = gr1,gr2 = gr2)
gL#存储了GRanges对象的List
start(gL)
seqnames(gL)
elementNROWS(gL)
sapply(gL,length)
shift(gL,10)
findOverlaps(gL,gr2)

library(GenomicFeatures)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb = TxDb.Hsapiens.UCSC.hg19.knownGene
txdb#gene_id实际上是entrez id
gr = GRanges(seqnames = "chr1",strand = "+",ranges = IRanges(start = 11874,end = 14409))
subsetByOverlaps(genes(txdb),gr)#和该区域重叠的基因
subsetByOverlaps(genes(txdb),gr,ignore.strand =TRUE)
subsetByOverlaps(transcripts(txdb),gr)#输出三个位置相同的是pre-RNA
subsetByOverlaps(exons(txdb),gr)
subsetByOverlaps(exonsBy(txdb,by = "tx"),gr)#tx为转录本的简写
#并不是所有的基因都有CDS,并不是所有的转录本都有CDS
#很多数据库的处理方式:计算所有ORF阅读框,然后找到最长的那个作为CDS
subsetByOverlaps(cds(txdb),gr)
subsetByOverlaps(cdsBy(txdb,by = "tx"),gr)#查看哪儿个转录本有CDS
subsetByOverlaps(exonsBy(txdb,by = "tx"),gr)["2"]#可以看出来CDS两端有3'和5'非转录区
transcriptLengths()#查看某一基因的转录本长度
#bioconductor上面基因组注释比较全,转录组注释可以用以下函数自己创建
makeTxDbFromBiomart()
makeTxDbFromUCSC()


library(rtracklayer)
#?import查看可导入的文件类型
library(AnnotationHub)
ahub = AnnotationHub()
table(ahub$rdataclass)
ahub.bw = subset(ahub,rdataclass=="BigWigFile"&species=="Homo sapiens")
bw = ahub.bw[[1]]
bw
import(bw,which=GRanges("chr22",ranges=IRanges(1,10^8)))#读入部分信息
#GRanges对象处理速度较慢
gr.chr22 = import(bw,which=GRanges("chr22",ranges=IRanges(1,10^8)))
#rle对象处理速度更快
rle.chr22 = import(bw,which=GRanges("chr22",ranges=IRanges(1,10^8)),as ="Rle")
rle.chr22$chr22
ahub.chain = subset(ahub,rdataclass == "ChainFile")
ahub.chain
ahub.chain = subset(ahub.chain,species=="Homo sapiens")
# 将不同版本,甚至人类和猴子的基因组进行转换
query(ahub.chain,c("hg18","hg19"))
chain = query(ahub.chain,c("hg18","hg19"))[[1]]
gr.hg18 = liftOver(gr.chr22,chain)
class(gr.hg18)
length(gr.hg18)
length(gr.chr22)


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值