library("stringr")
library("dplyr")
library("ggplot2")
library("readr")
# Load data (the first CSV column is turned into row names) ----

# Sequencing data: read the table, promote its first column to row names,
# drop that column, then apply a log(x + 1) transform to what remains.
dataFilt <- read_csv(file = "dataFilt.csv")
# Convert the read_csv() result to a plain data.frame so row names can be set.
dataFilt <- as.data.frame(dataFilt)
rownames(dataFilt) <- dataFilt[, 1]
# drop = FALSE keeps a data.frame even if only one column is left.
dataFilt <- dataFilt[, -1, drop = FALSE]
dataFilt <- log(dataFilt + 1) # take the logarithm; +1 keeps zeros finite

# Clinical data: same first-column-to-row-names handling, no transform.
data_cl <- read_csv(file = "data_cl_index.csv")
data_cl <- as.data.frame(data_cl)
rownames(data_cl) <- data_cl[, 1]
data_cl <- data_cl[, -1, drop = FALSE]
# Part 1: merge the clinical and sequencing data ----

# 1. Reshape dataFilt so that it can be matched against data_cl.
# Transpose: the columns of dataFilt become rows.
dataFilt_t <- as.data.frame(t(dataFilt))

# Move the row names into a leading "Sample" column, keeping only their first
# 12 characters (the patient ID, per the original author's note). This uses
# data.frame() rather than the deprecated dplyr::add_rownames();
# check.names = FALSE leaves the remaining column names untouched.
# NOTE(review): if several row names share the same first 12 characters, the
# join below returns one row per match -- confirm that this is intended.
dataFilt_t <- data.frame(
  Sample = str_sub(rownames(dataFilt_t), 1, 12),
  dataFilt_t,
  check.names = FALSE,
  stringsAsFactors = FALSE
)
rownames(dataFilt_t) <- NULL

# The first remaining column of data_cl is used as the join key.
colnames(data_cl)[1] <- "Sample"

# 2. Merge (so that any gene can be extracted later on).
# inner_join() keeps only the rows whose "Sample" value occurs in both tables;
# the columns of data_cl come first in the result.
data_m <- inner_join(data_cl, dataFilt_t, by = "Sample")
# Part 2: build a data.frame with the gene's expression and the matching
# Gleason score ----

# 1. Extract the gene and the clinical information from data_m.
Gene <- "CHAC1"
# The first ncol(data_cl) columns of data_m are the clinical columns; all_of()
# selects the gene column by the name stored in Gene.
gene_cl <- data_m %>%
  dplyr::select(seq_len(ncol(data_cl)), all_of(Gene))

# 2. Add the clinical value "gleason_sum" to gene_cl.
# substr(x, 9, 9) takes the 9th character, i.e. the digit in strings such as
# "Pattern 3".
t_gleason1 <- as.numeric(substr(data_m$primary_gleason_grade, 9, 9))
t_gleason2 <- as.numeric(substr(data_m$secondary_gleason_grade, 9, 9))
t_gleason_sum <- t_gleason1 + t_gleason2
gene_cl <- cbind(gene_cl, t_gleason_sum)

# 3. Keep only the sample ID, the gene and the Gleason sum.
# all_of() makes it explicit that Gene is an external character variable and
# not a column that happens to be called "Gene".
gene_gleason <- dplyr::select(
  gene_cl,
  all_of(c("Sample", Gene, "t_gleason_sum"))
)
# Part 3: plot with ggplot2 ----

# Treat the Gleason sum as a discrete variable: one box per score.
gene_gleason$t_gleason_sum <- as.factor(gene_gleason$t_gleason_sum)

# .data[[Gene]] looks the column up inside the plot data by the name stored in
# Gene, instead of passing an external vector into aes().
p <- ggplot(
  gene_gleason,
  aes(x = t_gleason_sum, y = .data[[Gene]], color = t_gleason_sum)
) +
  geom_boxplot() +
  xlab("gleason score") +
  ylab(Gene) +
  theme_bw() # white background
p