例 分析头颈部鳞状细胞癌组织在感染或未感染人乳头瘤病毒免疫谱系的转录特征
(一)原始数据的获取和预处理
在NCBI的GEO DataSets搜索有关人类研究的鳞状细胞癌实验研究结果,选取存储序列号为GSE139324的数据作为原始数据来源。选择其中的GSM4138110、GSM4138112、GSM4138113、GSM4138156、GSM4138158、GSM4138160样本,其中样本均来源于头颈部鳞状细胞癌患者,采样自外周血单核细胞,前三个样本人乳头瘤病毒阴性,后三个样本人乳头瘤病毒阳性。在GEO Accession页面下载各个样本的细胞代码 barcodes.tsv.gz、基因ID genes.tsv.gz和表达矩阵 matrix.mtx.gz用于后续研究。
图1 GEO数据库GSE139324数据集
最终整理成六个文件夹并且统一修改文件名,放在工作目录下
图2 整理下载数据及命名
首先,读取下载数据的R代码如下:
library(Seurat)
library(tidyverse)
library(dplyr)
library(org.Hs.eg.db)
library(clusterProfiler)
library(patchwork)
library(clustree)
library(pheatmap)
library(SingleR)
library(magrittr)
library(ggrepel)
library(reshape2)
library(GSEABase)
library(clusterProfiler)
library(msigdbr)
library(GSEA)
library(GSVA)
library(magrittr)#加载需要的R包
dir=c("HPV_NEG1/","HPV_NEG2/","HPV_NEG3/","HPV_POS1/","HPV_POS2","HPV_POS3")
names(dir)=c('NEG1','NEG2','NEG3',"POS1","POS2","POS3")
scRNAlist<-list()
for(i in 1:length(dir)){
counts<-Read10X(data.dir=dir[i])#读取单细胞数据
scRNAlist[[i]]<-CreateSeuratObject(counts,min.cells=3,min.features=300)}#创建Seurat对象
for(i in 1:length(dir)){
scRNAlist[[i]][["percent.mt"]]<-PercentageFeatureSet(scRNAlist[[i]],pattern="^MT-")#计算每个细胞的线粒体基因含量
scRNAlist[[i]][["percent.ribo"]]<-PercentageFeatureSet(scRNAlist[[i]],pattern="^RP[SL][[:digit:]]")#计算每个细胞核糖体基因含量
scRNAlist[[i]][["percent.red"]]<-PercentageFeatureSet(scRNAlist[[i]],pattern="^HB-")#计算每个细胞红细胞基因含量
scRNAlist[[i]]<-subset(scRNAlist[[i]],#设置相应的阈值过滤低质量细胞
subset=percent.mt<=20&
percent.ribo<=100&
percent.red<=20)}
for(i in 1:length(scRNAlist)){
scRNAlist[[i]]<-NormalizeData(scRNAlist[[i]])#数据归一化
scRNAlist[[i]]<-FindVariableFeatures(scRNAlist[[i]],selection.method="vst",nfeatures=3000)}#寻找高变基因
scRNA.anchors<-FindIntegrationAnchors(object.list=scRNAlist,anchor.features=3000)
scRNA1<-IntegrateData(anchorset=scRNA.anchors)#采用锚定点整合多个样本
scRNA2<-merge(scRNAlist[[1]],c(scRNAlist[[2]],scRNAlist[[3]],scRNAlist[[4]],scRNAlist[[5]],scRNAlist[[6]]))
(二)降维聚类及细胞类群注释
DefaultAssay(scRNA1)<-"integrated"#调整active.ident
scRNA1<-ScaleData(scRNA1)#数据标准化
scRNA1<-RunPCA(scRNA1,features=VariableFeatures(scRNA1))#PCA降维
plot1<-ElbowPlot(scRNA1,ndims=50,reduction="pca")#根据“肘部法则”,选取转折处的主成分数量
plot1
图3 PCA降维主成分分布比例
scRNA1<-RunPCA(scRNA1,npcs=16,verbose=T)#选择主成分为16重新降维
scRNA1<-FindNeighbors(scRNA1,reduction="pca",dims=1:10)
scRNA1<-FindClusters(scRNA1,resolution=seq(0.4,1.2,by=0.1))#调整参数进行聚类
clustree(scRNA1)#查看各参数的分组情况进行选择
图4 细胞亚群聚类参数分布
scRNA1<-FindClusters(scRNA1,resolution=0.7)#选择resolution=0.7重新聚类
scRNA1<-RunUMAP(scRNA1,reduction="pca",dims=1:10)
scRNA1<-RunTSNE(scRNA1,dims=1:10)
DefaultAssay(scRNA1)<-"RNA"
load("ref_Human_all.RData")#加载人类参考marker基因
refdata<-ref_Human_all
scRNA1<-JoinLayers(scRNA1)
testdata<-GetAssayData(scRNA1,slot="data")#提取表达矩阵
clusters<-scRNA1@meta.data$seurat_clusters
cellpred<-SingleR(test=testdata,ref=refdata,labels=refdata$label.main,
method="cluster",clusters=clusters,
assay.type.test="logcounts",assay.type.ref="logcounts")#细胞类群注释
celltype<-data.frame(ClusterID=rownames(cellpred),celltype=cellpred$labels,stringsAsFactors=FALSE)
scRNA1@meta.data$celltype="NA"
for(i in 1:nrow(celltype)){
scRNA1@meta.data[which(scRNA1@meta.data$seurat_clusters==celltype$ClusterID[i]),'celltype']<-celltype$celltype[i]}
DimPlot(scRNA1,reduction="umap",group.by="celltype",label=T)
图5 umap细胞亚群聚类
DimPlot(scRNA1,reduction="tsne",group.by="celltype",label=T)
图6 tSNE细胞亚群聚类
(三)Marker基因展示
Idents(scRNA1)=scRNA1@meta.data$celltype#调整active.ident为细胞类群
degs<-FindAllMarkers(scRNA1,logfc.threshold=0.5,
test.use="roc",
return.thresh=0.25,
min.pct=0.3,only.pos=T)#寻找marker基因
degs_sig<-degs%>%filter(pct.1>0.3&power>0.25)%>%filter(cluster!="other")%>%arrange(cluster,-power)
degs_top20<-degs_sig%>%group_by(cluster)%>%top_n(20,power)%>%top_n(20,avg_diff)%>%arrange(cluster,-power)#选取前20的marker基因
表1 marker基因展示(部分)
gene | myAUC | avg_diff | power | avg_log2FC | pct.1 | pct.2 | cluster |
CD79A | 0.9830 | 2.6721 | 0.9660 | 7.5486 | 0.9690 | 0.0220 | B_cell |
FTL | 0.9820 | 2.1252 | 0.9640 | 3.1327 | 1.0000 | 0.9750 | Monocyte |
CST3 | 0.9820 | 2.9469 | 0.9640 | 5.7309 | 0.9710 | 0.1680 | Monocyte |
CD74 | 0.9820 | 2.0616 | 0.9640 | 3.0619 | 1.0000 | 0.7900 | B_cell |
CTSS | 0.9780 | 2.3691 | 0.9560 | 4.0504 | 0.9880 | 0.4300 | Monocyte |
LYZ | 0.9750 | 3.8927 | 0.9500 | 6.5578 | 0.9760 | 0.2800 | Monocyte |
NKG7 | 0.9730 | 2.4126 | 0.9460 | 3.7102 | 0.9930 | 0.3280 | NK_cell |
FTH1 | 0.9720 | 1.6950 | 0.9440 | 2.5087 | 0.9980 | 0.9810 | Monocyte |
TYROBP | 0.9630 | 1.9013 | 0.9260 | 2.9926 | 0.9920 | 0.3910 | Monocyte |
S100A6 | 0.9630 | 1.5368 | 0.9260 | 2.3251 | 0.9970 | 0.8920 | Monocyte |
LST1 | 0.9610 | 2.5829 | 0.9220 | 5.0680 | 0.9430 | 0.1910 | Monocyte |
S100A9 | 0.9600 | 4.0490 | 0.9200 | 6.4097 | 0.9740 | 0.4570 | Monocyte |
GZMB | 0.9520 | 2.2850 | 0.9040 | 4.0168 | 0.9570 | 0.1490 | NK_cell |
CD37 | 0.9510 | 1.3778 | 0.9020 | 2.2023 | 0.9920 | 0.7600 | B_cell |
MS4A1 | 0.9500 | 2.0753 | 0.9000 | 7.7078 | 0.9050 | 0.0120 | B_cell |
CD79B | 0.9430 | 2.0574 | 0.8860 | 4.9544 | 0.9110 | 0.1020 | B_cell |
avgData<-testdata[degs_top20$gene,]%>%#计算每个基因在细胞类群的平均表达量
apply(1,function(x){
tapply(x,scRNA1$celltype,mean)#计算平均表达量
})%>%t
phData<-MinMax(scale(avgData),-2,2)#z-score标准化用于绘制热图
rownames(phData)<-degs_top20$gene
phres<-pheatmap(phData,color=colorRampPalette(c("darkblue","white","red3"))(100),
scale="row",cluster_rows=F,cluster_cols=F,show_rownames=T)
图7 marker基因初步展示
(四)细胞比例展示
x=scRNA1@meta.data$celltype
Clusters<-table(scRNA1@meta.data$celltype,scRNA1@meta.data$orig.ident)%>%melt()
colnames(Clusters)<-c("Celltype","Sample","Number")
cluster=c("B_cell","Monocyte","NK_cell","T_cells")
Clusters$Celltype<-factor(Clusters$Celltype,levels=cluster)
sample_color<-c("#fccccb","#bdb5e1","#b0d992","#f9d580")
p<-ggplot(data=Clusters,aes(x=Number,y=Sample,fill=Celltype))+
geom_bar(stat="identity",width=0.8,position="fill")+
scale_fill_manual(values=sample_color)+
theme_bw()+
theme(panel.grid=element_blank())+
labs(x="",y="Sample")+
theme(axis.text.y=element_text(size=12,colour="black"))+
theme(axis.text.x=element_text(size=12,colour="black"))
p
图8 各样本细胞比例分布
(五)各细胞类群差异分析
type=c("Monocyte","T_cells","NK_cell","B_cell")
r.deg=data.frame()
table(scRNA1@meta.data$orig.ident)
for(i in 1:4){#对各细胞类群进行差异分析
Idents(scRNA1)="celltype"
deg=FindMarkers(scRNA1,ident.1=c("NEG1","NEG2","NEG3"),ident.2=c("POS1","POS2","POS3"),group.by="orig.ident",subset.ident=type[i])
deg$celltype=type[i]
deg$unm=i-1
r.deg=rbind(deg,r.deg)}
r.deg$gene=rownames(r.deg)
r.deg<-subset(r.deg,p_val_adj<0.05&abs(avg_log2FC)>0.5)#筛选差异基因
r.deg$trend<-as.factor(ifelse(r.deg$avg_log2FC>0,'Up','Down'))#定义上调下调
r.deg$significance<-as.factor(ifelse(r.deg$p_val_adj<0.01,'Highly','Lowly'))#定义显著性
r.deg$sig_trend<-paste0(r.deg$trend," ",r.deg$significance)
r.deg$unm=r.deg$unm%>%as.vector(.)%>%as.numeric(.)
background<-r.deg%>%group_by(unm)%>%summarise(Min=min(avg_log2FC)-0.2,Max=max(avg_log2FC)+0.2)%>%as.data.frame()#绘制浅灰色背景
background$Celltype<-factor(type)
background$start<-background_position$unm-0.4
background$end<-background_position$unm+0.4
cluster_bar_position<-background_position#绘制细胞类群标注
cluster_bar_position$start<-cluster_bar_position$unm-0.5
cluster_bar_position$end<-cluster_bar_position$unm+0.5
cluster_bar_position$unm%<>%factor(.,levels=c(0:max(as.vector(.))))
cols<-c("Up Highly"="#ff0000","Down Highly"="#0000ff","Up Lowly"="black","Down Lowly"="black")
cols_cluster<-c("0"="#e3716e","1"="#eca680","2"="#7ac7e2","3"="#f7df87")
p1=ggplot()+geom_rect(data=background,aes(xmin=start,xmax=end,ymin=Min,ymax=Max),
fill="#525252",alpha=0.1)+###添加灰色背景色
geom_jitter(data=r.deg,aes(x=unm,y=avg_log2FC,colour=sig_trend),
size=2,position=position_jitter(seed=1))+
scale_x_continuous(breaks=seq(0,3,by=1),labels=c("Monocyte","T_cells","NK_cell","B_cell"))+scale_color_manual(values=cols)+
geom_rect(data=cluster_bar_position,aes(xmin=start,xmax=end,ymin=-0.4,ymax=0.4,fill=unm),color="black",alpha=1,show.legend=F)+scale_fill_manual(values=cols_cluster)+
labs(x="Cluster",y="Average log2FC")+theme_bw()+
theme(axis.text.y=element_text(colour='black',size=14),
axis.text.x=element_text(colour='black',size=12,vjust=75))
p1
图9 各细胞差异分析结果
(六)富集分析
for (i in 1:4){
i=4
Idents(scRNA1)="celltype"
deg=FindMarkers(scRNA1,ident.1=c("NEG1","NEG2","NEG3"),ident.2=c("POS1","POS2","POS3"),group.by="orig.ident",subset.ident=type[i])
deg$celltype=type[i]
deg$unm=i-1
degs.list=rownames(deg)
df<-bitr(degs.list,fromType="SYMBOL",toType=c("ENTREZID"),OrgDb=org.Hs.eg.db)
eg=enrichGO(gene=degs.list,OrgDb=org.Hs.eg.db,keyType="SYMBOL",ont="all",pvalueCutoff=0.5,qvalueCutoff=0.5)
barplot(eg,x="GeneRatio",color="p.adjust",showCategory=8,split="ONTOLOGY")+facet_grid(ONTOLOGY~.,scale='free')
dev.print(png,file=paste0(cluster[i],"-go.png"),width=2000,height=2500,res=300)
ek<-enrichKEGG(gene=df$ENTREZID,organism="hsa",pvalueCutoff=0.05,qvalueCutoff=0.05)
barplot(ek, x = "GeneRatio", color = "p.adjust",showCategory =15)
dev.print(png, file = paste0(cluster[i],"-kegg.png"), width = 2000, height = 2500, res = 300)
}
图10 各细胞GO和KEGG通路富集分析结果
左侧为GO功能注释结果,右侧为KEGG通路富集分析结果
在GSEA官网下载通路的参考数据集进行注释:
图11 GSEA官网
kegggmt2<-read.gmt("c2.all.v2023.2.Hs.symbols.gmt")
kegg_list=split(kegggmt2$gene,kegggmt2$term)
x=AverageExpression(scRNA1)
exp=x[["RNA"]]
exp1=as.matrix(exp)
m_df<-msigdbr(species="Homo sapiens",category="C2",subcategory="KEGG")
fgsea_sets<-m_df%>%split(x=.$gene_symbol,f=.$gs_name)
es.max<-gsva(exp1,fgsea_sets,mx.diff=FALSE)
pheatmap::pheatmap(es.max[1:20,],
cluster_rows=T,
cluster_cols=T,
show_colnames=T,
scale="row", color=colorRampPalette(c("#Fc8002","white","#AAAAAA","#1663a9"))(100))
图12 GSEA富集分析热图