药物基因组信息学实验

小何同学#

已于 2024-05-19 00:14:29 修改

阅读量975

点赞数 15

文章标签： r语言

于 2023-12-23 21:00:00 首次发布

本文链接：https://blog.csdn.net/m0_69913686/article/details/135134368

版权

药物基因组信息学实验一

1. 查找抗肿瘤药物地西他滨（DECITABINE，NSC#127716）在细胞系中的敏感性数据。

# 设置路径
setwd("D:/R_Learn")

# install.packages("xlsx") 
library(xlsx)

# 导入数据1
DNA_Exome_Seq <- read.xlsx("drug_gene_data/data1/DNA__Exome_Seq_AA_changing.xls", 
                 startRow = 11, endRow = 12956, header = T, sheetName = "Results", encoding = 'UTF-8')

# 数据预处理
dna_seq <- DNA_Exome_Seq[, -(1:6)]
rownames(dna_seq) <- DNA_Exome_Seq[, 1]
dna_seq[dna_seq == "-"] <- 0

2. 基于获得的地西他滨药物敏感性数据，识别遗传变异会影响该药物敏感性的基因。

# 导入数据2
QueryDrugData <- read.xlsx("drug_gene_data/data1/QueryDrugData.xls", 
                 startRow = 10, endRow = 21, header = T, sheetName = "Results", encoding = 'UTF-8')

# 数据预处理
drug_data <- QueryDrugData[, -(1:7)]
drug_data[drug_data == "-"] <- 0
# 选择用平均值代替最终IC50值
drug_result <- apply(drug_data, 2, FUN = function(x){sum(as.numeric(x)) / sum(as.numeric(x) != 0)})

# 创建向量p储存每个基因p值
p <- vector()

# 计算每个基因p值
for (i in 1:nrow(dna_seq)) {
  x <- which(dna_seq[i, ] > 0)
  y <- which(dna_seq[i, ] == 0)
  p[i] <- wilcox.test(drug_result[x], drug_result[y])$p.value
}

# 找出会影响该药物敏感性的基因
gene1 <- rownames(dna_seq[which(p < 0.05), ])
gene1

3. 获取细胞系中的基因表达数据，探索这些遗传改变会影响地西他滨敏感性的基因，其表达水平是否也与该药物的敏感性相关。

# 导入数据3
RNA_Seq <- read.xlsx("drug_gene_data/data1/RNA__RNA_seq_composite_expression.xls", 
                 startRow = 11, endRow = 23819, header = T, sheetName = "Results", encoding = 'UTF-8')

# 数据预处理
rna_seq <- RNA_Seq[, -(1:6)]
rownames(rna_seq) <- RNA_Seq[, 1]
rna_seq <- rna_seq[RNA_Seq$Gene.name.d %in% gene1, ]

# 求其表达水平该药物的敏感性相关系数
cor_result <- apply(rna_seq, 1, FUN = function(x){cor(x, drug_result)})

# 进行相关性检验
cortest_result <- apply(rna_seq, 1, FUN = function(x){cor.test(x, drug_result)$p.value})

# 找出表达水平也与该药物的敏感性相关的gene
gene2 <- rownames(rna_seq[intersect(which(cor_result > 0.3), which(cortest_result <= 0.05)), ])
gene2

药物基因组信息学实验二（ABC转运蛋白家族正则表达式练习）

# 设置路径
setwd("D:/R_Learn")

# 导入数据
Human_exon_array_GBM_expression <- read.table("drug_gene_data/data2/Human_exon_array_GBM_expression.txt")
HPRD_Release9_062910 <- read.table("drug_gene_data/data2/BINARY_PROTEIN_PROTEIN_INTERACTIONS.txt", sep = "\t")

# 找出ABC转运蛋白家族成员在不同样本中的表达值
t <- grep("\\bABC[A-G][0-9]+\\b", rownames(Human_exon_array_GBM_expression))
ABC <- Human_exon_array_GBM_expression[t, ]
dim(ABC)

# （1）找出与ABC转运蛋白家族成员互作的蛋白有哪些？
a <- grep("\\bABC[A-G][0-9]+\\b", paste(HPRD_Release9_062910[, 1], HPRD_Release9_062910[, 4]))
result1 <- HPRD_Release9_062910[a, ]
dim(result1)

# （2）找出ABC转运蛋白家族内部互作的记录？
b <- grep("\\bABC[A-G][0-9]+\\b \\bABC[A-G][0-9]+\\b", paste(HPRD_Release9_062910[, 1], HPRD_Release9_062910[, 4]))
result2 <- HPRD_Release9_062910[b, ]
dim(result2)

# （3）找出每个蛋白自身与自身互作的记录？（例如文件中第一行）
c <- grep("\\b(.+)\\b \\b\\1\\b", paste(HPRD_Release9_062910[, 1], HPRD_Release9_062910[, 4]))
result3 <- HPRD_Release9_062910[c, ]
dim(result3)

# （4）找出每个蛋白自身与自身互作的记录，且该记录被>3条参考文献支持。
d <- grep("\\b(.+)\\b \\b\\1\\b \\d+,\\d+,\\d+,\\d+",
          paste(HPRD_Release9_062910[, 1], HPRD_Release9_062910[, 4], HPRD_Release9_062910[, 8]))
result4 <- HPRD_Release9_062910[d, ]
dim(result4)

药物基因组信息学实验三

1. 基于NCI-60突变数据，识别乳腺癌的治疗药物tamoxifen敏感性相关的20个基因，这些基因参与哪些功能？

# 设置路径
setwd("D:/R_Learn")

# install.packages("readxl")
library(readxl)
# 导入数据1
DNA_Exome_Seq <- read_excel("drug_gene_data/data1/DNA__Exome_Seq_AA_changing.xls", 
                 sheet = "Results", col_names = T, na = "-", skip = 10)

# 数据预处理
DNA_Exome_Seq <- as.data.frame(DNA_Exome_Seq)
dna_seq <- DNA_Exome_Seq[, -(1:6)]
rownames(dna_seq) <- DNA_Exome_Seq$`Gene name d`
dna_seq[is.na(dna_seq)] <- 0

# 导入数据2
QueryDrugData <- read_excel("drug_gene_data/data3/QueryDrugData.xls", 
                 sheet = "Results", col_names = T, na = "-", skip = 9)

# 数据预处理
QueryDrugData <- as.data.frame(QueryDrugData)
drug_data <- QueryDrugData[, -(1:7)]
drug_data[is.na(drug_data)] <- 0
# 选择用平均值代替最终IC50值
drug_result <- apply(drug_data, 2, FUN = function(x){sum(as.numeric(x)) / sum(as.numeric(x) != 0)})

# 导入数据3
RNA_Seq <- read_excel("drug_gene_data/data1/RNA__RNA_seq_composite_expression.xls", 
                      sheet = "Results", col_names = T, skip = 10)

# 数据预处理
RNA_Seq <- as.data.frame(RNA_Seq)
rna_seq <- RNA_Seq[, -(1:6)]
row.names(rna_seq) <- RNA_Seq$`Gene name d`
rna_seq <- rna_seq[rownames(rna_seq) %in% rownames(dna_seq), ]

# 求其表达水平该药物的敏感性相关系数
cor_result <- apply(rna_seq, 1, FUN = function(x){cor(x, drug_result)})

# 进行相关性检验
cortest_result <- apply(rna_seq, 1, FUN = function(x){cor.test(x, drug_result)$p.value})

# 符合检验的基因
gene1 <- names(cortest_result[which(cortest_result <= 0.05)])

# 敏感性相关最大的20个基因
gene2 <- names(cor_result[order(abs(cor_result[gene1]), decreasing = T)[1:20]])

2.探索基于NCI-60数据获得的tamoxifen敏感性相关的基因在乳腺癌样本中的突变情况。

# 导入数据
data <- read.table("/drug_gene_data/data3/Data.txt", header = T)
clin <- read.table("drug_gene_data/data3/clin.txt", header = T)

# 突变基因
tb_gene <- intersect(gene2, data[, 1])

# 突变基因、样本、生存时间、生存情况整合
result <- vector()
for (i in tb_gene) {
  for (j in 1:nrow(data)) {
    if (i == data[j, 1]) {
      sample <- data[j, 3]
      num <- which(sample == clin[, 1])
      survival <- clin[num, 2]
      events <- clin[num, 3]
      result <- rbind(result, c(i, sample, survival, events))
    }
  }
}

# 格式调整
colnames(result) <- c("tb_gene", "sample", "survival", "events")
result <- as.data.frame(result)
result$survival <- as.numeric(result$survival)
result$events <- as.factor(result$events)
head(result)

# 突变情况：突变的样本数/总共的样本数
tb_result <- length(table(result[, 2])) / length(table(data[, 3]))

3.这些突变基因对于乳腺癌样本的治疗效果影响如何？哪一个对于治疗效果影响最大？

# 突变基因对于乳腺癌样本的治疗效果影响（平均生存时间）
mean_day <- vector()
for (m in tb_gene) {
  t <- 0
  sum <- 0
  for (n in 1:nrow(result)) {
    if (m == result[n, 1]) {
      if (result[n, 4] == 0) {
        temp <- result[n, 3] * 1.5
        t <- t + 1
      }else{
        temp <- result[n, 3]
        t <- t + 1
      }
      sum <- temp + sum
    }
  }
  mean <- sum / t
  mean_day <- c(mean_day, mean)
}

# 治疗效果最好的突变基因
best_gene <- tb_gene[which.max(mean_day)]

# 治疗效果最差的突变基因
worst_gene <- tb_gene[which.min(mean_day)]

药物基因组信息学实验四（药物应答标志物的识别及个体药物应答的预测）

1.获取疾病基因表达数据、样本药物反应数据

# 设置路径
setwd("D:/R_Learn")

# 读取数据
genes_norm <- read.table("drug_gene_data/data4/genes.normalized.txt", header = T)
response_data <- read.table("drug_gene_data/data4/Response.txt", header = T)

# 处理行名、列名（与response的SID一致）
genes <- genes_norm[, -(1:2)]
rownames(genes) <- genes_norm$GeneID
SID <- gsub("\\.", "-", substr(colnames(genes), 1, 7))
colnames(genes) <- SID

# 去除表达值0占比大于90%的基因
rate <- apply(genes, 1, FUN = function(x){length(x[which(x == 0)]) / length(x)})
genes <- genes[- which(rate >= 0.9), ]

# 将response的0和1分成两组
response_0 <- response_data[which(response_data[, 2] == 0), ]
response_1 <- response_data[which(response_data[, 2] == 1), ]

# 根据0和1的SID给genes分组
SID_0 <- intersect(colnames(genes), response_0$SID)
genes_0 <- genes[, SID_0]
SID_1 <- intersect(colnames(genes), response_1$SID)
genes_1 <- genes[, SID_1]

2.利用差异表达或单因素回归初步筛选药物反应标志物

# 将genes_0和genes_1进行t检验
p <- apply(matrix(1:nrow(genes)), 1, FUN = function(x){t.test(genes_0[x, ], genes_1[x, ])$p.value})

# 选出矫正后p值前二十的基因
p_fdr <- p.adjust(p, method="fdr")
gene_ids <- rownames(genes)[order(p_fdr)][1:20]

3.利用多因素回归进一步筛选标志物

# 合并筛选后的0和1基因表达谱
order_genes_0 <- genes_0[order(p_fdr)[1:20], ]
order_genes_1 <- genes_1[order(p_fdr)[1:20], ]
bind_genes <- cbind(order_genes_0, order_genes_1)

# 因变量y
y <- matrix(nrow = 1, ncol = ncol(bind_genes))
rownames(y) <- c("y")
colnames(y) <- c(SID_0, SID_1)
y[1, 1:ncol(order_genes_0)] <- 0
y[1, (ncol(order_genes_0) + 1):ncol(bind_genes)] <- 1

# 合并、转置、转化为数据框格式
los_genes <- rbind(bind_genes, y)
los_genes <- t(los_genes)
los_genes <- as.data.frame(los_genes)

# 多因素logistic回归分析
glm <- glm(y ~ ., data = los_genes, family = binomial(link = logit))

# 查看并整合回归结果
summary(glm)
result <- summary(glm)$coef
result <- result[-1, ]

# 选出Pr<0.05的基因
gene_ids_result <- rownames(result)[which(result[, 4] < 0.05)]

4. 构建样本药物反应打分测度，并对每个样本打分

# 标记基因位置
markers <- which(rownames(result) %in% gene_ids_result)

# 分数矩阵
score <- matrix(nrow = nrow(los_genes), ncol=1, dimnames = list(rownames(los_genes), "score"))

# 表达值*Pr作为打分测度，进行打分
for (m in 1:nrow(los_genes)) {
  t <- c()
  for (n in 1:length(markers)) {
    t <- c(t, los_genes[m, markers[n]] * result[markers[n], 4])
  }
  score[m, ] <- sum(t)
}

# 合并打分结果
los_genes <- cbind(los_genes, score)