药物基因组信息学实验一
1. 查找抗肿瘤药物地西他滨(DECITABINE,NSC#127716)在细胞系中的敏感性数据。
# 设置路径
setwd("D:/R_Learn")
# install.packages("xlsx")
library(xlsx)
# 导入数据1
DNA_Exome_Seq <- read.xlsx("drug_gene_data/data1/DNA__Exome_Seq_AA_changing.xls",
startRow = 11, endRow = 12956, header = T, sheetName = "Results", encoding = 'UTF-8')
# 数据预处理
dna_seq <- DNA_Exome_Seq[, -(1:6)]
rownames(dna_seq) <- DNA_Exome_Seq[, 1]
dna_seq[dna_seq == "-"] <- 0
2. 基于获得的地西他滨药物敏感性数据,识别遗传变异会影响该药物敏感性的基因。
# 导入数据2
QueryDrugData <- read.xlsx("drug_gene_data/data1/QueryDrugData.xls",
startRow = 10, endRow = 21, header = T, sheetName = "Results", encoding = 'UTF-8')
# 数据预处理
drug_data <- QueryDrugData[, -(1:7)]
drug_data[drug_data == "-"] <- 0
# 选择用平均值代替最终IC50值
drug_result <- apply(drug_data, 2, FUN = function(x){sum(as.numeric(x)) / sum(as.numeric(x) != 0)})
# 创建向量p储存每个基因p值
p <- vector()
# 计算每个基因p值
for (i in 1:nrow(dna_seq)) {
x <- which(dna_seq[i, ] > 0)
y <- which(dna_seq[i, ] == 0)
p[i] <- wilcox.test(drug_result[x], drug_result[y])$p.value
}
# 找出会影响该药物敏感性的基因
gene1 <- rownames(dna_seq[which(p < 0.05), ])
gene1
3. 获取细胞系中的基因表达数据,探索这些遗传改变会影响地西他滨敏感性的基因,其表达水平是否也与该药物的敏感性相关。
# 导入数据3
RNA_Seq <- read.xlsx("drug_gene_data/data1/RNA__RNA_seq_composite_expression.xls",
startRow = 11, endRow = 23819, header = T, sheetName = "Results", encoding = 'UTF-8')
# 数据预处理
rna_seq <- RNA_Seq[, -(1:6)]
rownames(rna_seq) <- RNA_Seq[, 1]
rna_seq <- rna_seq[RNA_Seq$Gene.name.d %in% gene1, ]
# 求其表达水平该药物的敏感性相关系数
cor_result <- apply(rna_seq, 1, FUN = function(x){cor(x, drug_result)})
# 进行相关性检验
cortest_result <- apply(rna_seq, 1, FUN = function(x){cor.test(x, drug_result)$p.value})
# 找出表达水平也与该药物的敏感性相关的gene
gene2 <- rownames(rna_seq[intersect(which(cor_result > 0.3), which(cortest_result <= 0.05)), ])
gene2
药物基因组信息学实验二(ABC转运蛋白家族正则表达式练习)
# 设置路径
setwd("D:/R_Learn")
# 导入数据
Human_exon_array_GBM_expression <- read.table("drug_gene_data/data2/Human_exon_array_GBM_expression.txt")
HPRD_Release9_062910 <- read.table("drug_gene_data/data2/BINARY_PROTEIN_PROTEIN_INTERACTIONS.txt", sep = "\t")
# 找出ABC转运蛋白家族成员在不同样本中的表达值
t <- grep("\\bABC[A-G][0-9]+\\b", rownames(Human_exon_array_GBM_expression))
ABC <- Human_exon_array_GBM_expression[t, ]
dim(ABC)
# (1)找出与ABC转运蛋白家族成员互作的蛋白有哪些?
a <- grep("\\bABC[A-G][0-9]+\\b", paste(HPRD_Release9_062910[, 1], HPRD_Release9_062910[, 4]))
result1 <- HPRD_Release9_062910[a, ]
dim(result1)
# (2)找出ABC转运蛋白家族内部互作的记录?
b <- grep("\\bABC[A-G][0-9]+\\b \\bABC[A-G][0-9]+\\b", paste(HPRD_Release9_062910[, 1], HPRD_Release9_062910[, 4]))
result2 <- HPRD_Release9_062910[b, ]
dim(result2)
# (3)找出每个蛋白自身与自身互作的记录?(例如文件中第一行)
c <- grep("\\b(.+)\\b \\b\\1\\b", paste(HPRD_Release9_062910[, 1], HPRD_Release9_062910[, 4]))
result3 <- HPRD_Release9_062910[c, ]
dim(result3)
# (4)找出每个蛋白自身与自身互作的记录,且该记录被>3条参考文献支持。
d <- grep("\\b(.+)\\b \\b\\1\\b \\d+,\\d+,\\d+,\\d+",
paste(HPRD_Release9_062910[, 1], HPRD_Release9_062910[, 4], HPRD_Release9_062910[, 8]))
result4 <- HPRD_Release9_062910[d, ]
dim(result4)
药物基因组信息学实验三
1. 基于NCI-60突变数据,识别乳腺癌的治疗药物tamoxifen敏感性相关的20个基因,这些基因参与哪些功能?
# 设置路径
setwd("D:/R_Learn")
# install.packages("readxl")
library(readxl)
# 导入数据1
DNA_Exome_Seq <- read_excel("drug_gene_data/data1/DNA__Exome_Seq_AA_changing.xls",
sheet = "Results", col_names = T, na = "-", skip = 10)
# 数据预处理
DNA_Exome_Seq <- as.data.frame(DNA_Exome_Seq)
dna_seq <- DNA_Exome_Seq[, -(1:6)]
rownames(dna_seq) <- DNA_Exome_Seq$`Gene name d`
dna_seq[is.na(dna_seq)] <- 0
# 导入数据2
QueryDrugData <- read_excel("drug_gene_data/data3/QueryDrugData.xls",
sheet = "Results", col_names = T, na = "-", skip = 9)
# 数据预处理
QueryDrugData <- as.data.frame(QueryDrugData)
drug_data <- QueryDrugData[, -(1:7)]
drug_data[is.na(drug_data)] <- 0
# 选择用平均值代替最终IC50值
drug_result <- apply(drug_data, 2, FUN = function(x){sum(as.numeric(x)) / sum(as.numeric(x) != 0)})
# 导入数据3
RNA_Seq <- read_excel("drug_gene_data/data1/RNA__RNA_seq_composite_expression.xls",
sheet = "Results", col_names = T, skip = 10)
# 数据预处理
RNA_Seq <- as.data.frame(RNA_Seq)
rna_seq <- RNA_Seq[, -(1:6)]
row.names(rna_seq) <- RNA_Seq$`Gene name d`
rna_seq <- rna_seq[rownames(rna_seq) %in% rownames(dna_seq), ]
# 求其表达水平该药物的敏感性相关系数
cor_result <- apply(rna_seq, 1, FUN = function(x){cor(x, drug_result)})
# 进行相关性检验
cortest_result <- apply(rna_seq, 1, FUN = function(x){cor.test(x, drug_result)$p.value})
# 符合检验的基因
gene1 <- names(cortest_result[which(cortest_result <= 0.05)])
# 敏感性相关最大的20个基因
gene2 <- names(cor_result[order(abs(cor_result[gene1]), decreasing = T)[1:20]])
2.探索基于NCI-60数据获得的tamoxifen敏感性相关的基因在乳腺癌样本中的突变情况。
# 导入数据
data <- read.table("/drug_gene_data/data3/Data.txt", header = T)
clin <- read.table("drug_gene_data/data3/clin.txt", header = T)
# 突变基因
tb_gene <- intersect(gene2, data[, 1])
# 突变基因、样本、生存时间、生存情况整合
result <- vector()
for (i in tb_gene) {
for (j in 1:nrow(data)) {
if (i == data[j, 1]) {
sample <- data[j, 3]
num <- which(sample == clin[, 1])
survival <- clin[num, 2]
events <- clin[num, 3]
result <- rbind(result, c(i, sample, survival, events))
}
}
}
# 格式调整
colnames(result) <- c("tb_gene", "sample", "survival", "events")
result <- as.data.frame(result)
result$survival <- as.numeric(result$survival)
result$events <- as.factor(result$events)
head(result)
# 突变情况:突变的样本数/总共的样本数
tb_result <- length(table(result[, 2])) / length(table(data[, 3]))
3.这些突变基因对于乳腺癌样本的治疗效果影响如何?哪一个对于治疗效果影响最大?
mean_day <- vector( )
for ( m in tb_gene) {
t <- 0
sum <- 0
for ( n in 1 : nrow( result) ) {
if ( m == result[ n, 1 ] ) {
if ( result[ n, 4 ] == 0 ) {
temp <- result[ n, 3 ] * 1.5
t <- t + 1
} else {
temp <- result[ n, 3 ]
t <- t + 1
}
sum <- temp + sum
}
}
mean <- sum / t
mean_day <- c( mean_day, mean)
}
best_gene <- tb_gene[ which.max( mean_day) ]
worst_gene <- tb_gene[ which.min( mean_day) ]
药物基因组信息学实验四(药物应答标志物的识别及个体药物应答的预测)
1.获取疾病基因表达数据、样本药物反应数据
setwd( "D:/R_Learn" )
genes_norm <- read.table( "drug_gene_data/data4/genes.normalized.txt" , header = T)
response_data <- read.table( "drug_gene_data/data4/Response.txt" , header = T)
genes <- genes_norm[ , - ( 1 : 2 ) ]
rownames( genes) <- genes_norm$ GeneID
SID <- gsub( "\\." , "-" , substr( colnames( genes) , 1 , 7 ) )
colnames( genes) <- SID
rate <- apply( genes, 1 , FUN = function ( x) { length( x[ which( x == 0 ) ] ) / length( x) } )
genes <- genes[ - which( rate >= 0.9 ) , ]
response_0 <- response_data[ which( response_data[ , 2 ] == 0 ) , ]
response_1 <- response_data[ which( response_data[ , 2 ] == 1 ) , ]
SID_0 <- intersect( colnames( genes) , response_0$ SID)
genes_0 <- genes[ , SID_0]
SID_1 <- intersect( colnames( genes) , response_1$ SID)
genes_1 <- genes[ , SID_1]
2.利用差异表达或单因素回归初步筛选药物反应标志物
p <- apply( matrix( 1 : nrow( genes) ) , 1 , FUN = function ( x) { t.test( genes_0[ x, ] , genes_1[ x, ] ) $ p.value} )
p_fdr <- p.adjust( p, method= "fdr" )
gene_ids <- rownames( genes) [ order( p_fdr) ] [ 1 : 20 ]
3.利用多因素回归进一步筛选标志物
order_genes_0 <- genes_0[ order( p_fdr) [ 1 : 20 ] , ]
order_genes_1 <- genes_1[ order( p_fdr) [ 1 : 20 ] , ]
bind_genes <- cbind( order_genes_0, order_genes_1)
y <- matrix( nrow = 1 , ncol = ncol( bind_genes) )
rownames( y) <- c( "y" )
colnames( y) <- c( SID_0, SID_1)
y[ 1 , 1 : ncol( order_genes_0) ] <- 0
y[ 1 , ( ncol( order_genes_0) + 1 ) : ncol( bind_genes) ] <- 1
los_genes <- rbind( bind_genes, y)
los_genes <- t( los_genes)
los_genes <- as.data.frame( los_genes)
glm <- glm( y ~ ., data = los_genes, family = binomial( link = logit) )
summary( glm)
result <- summary( glm) $ coef
result <- result[ - 1 , ]
gene_ids_result <- rownames( result) [ which( result[ , 4 ] < 0.05 ) ]
4. 构建样本药物反应打分测度,并对每个样本打分
markers <- which( rownames( result) %in% gene_ids_result)
score <- matrix( nrow = nrow( los_genes) , ncol= 1 , dimnames = list( rownames( los_genes) , "score" ) )
for ( m in 1 : nrow( los_genes) ) {
t <- c( )
for ( n in 1 : length( markers) ) {
t <- c( t, los_genes[ m, markers[ n] ] * result[ markers[ n] , 4 ] )
}
score[ m, ] <- sum( t)
}
los_genes <- cbind( los_genes, score)