# Quickly extract the target genes of TCMSP herbal ingredients and annotate
# the extracted targets with gene symbols taken from UniProt symbol data.
library(rvest)
library(httr)
library(jsonlite)
library(dplyr)
library(data.table)
library(tidyverse)
# Working directory: replace the path below with the directory that contains
# this script, writing every '\' as '\\'. The relative file names used by the
# rest of the script are resolved against this directory.
setwd("E:\\code\\R")

# Lower bounds applied to the `ob` and `dl` columns of the TCMSP ingredient
# table (presumably oral bioavailability and drug-likeness -- confirm).
obThresholdValue <- 30
dlThresholdValue <- 0.18

# Herb names, one entry per herb.
names <- c(
  "Aiye",
  "Ganjiang",
  "Huanglian",
  "Wumei"
)

# TCMSP search URL of each herb, listed in the same order as `names`.
urls <- c(
  "https://tcmsp-e.com/tcmspsearch.php?qr=Folium%20Artemisiae%20Argyi&qsr=herb_en_name&token=cfc8e3eceab35ac296c1a1d56dd3d812",
  "https://tcmsp-e.com/tcmspsearch.php?qr=Zingiberis%20Rhizoma&qsr=herb_en_name&token=cfc8e3eceab35ac296c1a1d56dd3d812",
  "https://tcmsp-e.com/tcmspsearch.php?qr=Coptidis%20Rhizoma&qsr=herb_en_name&token=cfc8e3eceab35ac296c1a1d56dd3d812",
  "https://tcmsp-e.com/tcmspsearch.php?qr=Mume%20Fructus&qsr=herb_en_name&token=cfc8e3eceab35ac296c1a1d56dd3d812"
)
# UniProt lookup table used to translate protein names into gene symbols.
# Reads "uniprotkb_reviewed.tsv" from the working directory and keeps, per
# entry, the protein name, the first gene name (also exposed as `Symbol`) and
# three normalised variants of the protein name that serve as alternative
# join keys.
uniprotData <-
  tibble(fread("uniprotkb_reviewed.tsv")) %>%
  filter(!is.na(`Gene Names`)) %>%
  select(`Protein names`, `Gene Names`) %>%
  # Cut `Gene Names` at the first whitespace so only the first entry is left.
  mutate(`Gene Names` = gsub("\\s.*", "", `Gene Names`)) %>%
  # The truncated value contains no whitespace any more, so it is the symbol.
  mutate(Symbol = `Gene Names`) %>%
  # NOTE(review): "\\[a-z\\]" matches the literal text "[a-z]" because the
  # brackets are escaped, so in practice this only drops symbols containing
  # "-" or "_". If symbols with lowercase letters were meant to be dropped,
  # the character class must be unescaped -- confirm the intent.
  filter(!str_detect(Symbol, "\\[a-z\\]|\\-|\\_")) %>%
  mutate(
    # Protein name with hyphens replaced by spaces.
    protein_names_space = gsub("-", " ", `Protein names`),
    # Protein name cut before the first " (" (drops the parenthesised part).
    protein_names_first = gsub("\\s+\\(.*", "", `Protein names`),
    # Both normalisations combined.
    protein_names_first_space = gsub("-", " ", protein_names_first)
  )
# Scrape every herb page on TCMSP, keep the ingredients that pass the ob/dl
# thresholds and map the targets of those ingredients to UniProt gene symbols.
# Result: `symbol_all`, a tibble with the columns Drug, MOL_ID, molecule_name,
# target_name and Symbol, holding one row per Drug / molecule / target.

# Every herb name needs exactly one URL; a silent mismatch would label rows
# with the wrong (or an NA) herb name.
stopifnot(length(names) == length(urls))

# Position of the <script> element whose text carries the page's data arrays.
# NOTE(review): tied to the current TCMSP page layout -- re-check this value
# if scraping starts to fail.
tcmsp_script_index <- 12L

# UniProt-side columns tried as join keys against `target_name`, in priority
# order: when the same Drug / molecule / target is matched more than once,
# the row produced by the earliest key (and earliest UniProt row) is kept.
uniprot_match_keys <- c(
  "Protein names",
  "protein_names_space",
  "protein_names_first",
  "protein_names_first_space"
)

# Download one herb page and return its target/symbol rows.
#   name: herb label written to the `Drug` column.
#   url:  TCMSP search URL of that herb.
# Returns a tibble with the columns Drug, MOL_ID, molecule_name, target_name
# and Symbol. Stops with an error when the HTTP request fails or when the
# expected data arrays are not present in the page.
fetch_herb_symbols <- function(name, url) {
  print(name)

  # NOTE(review): TLS certificate verification is switched off, exactly as in
  # the original request; re-enable it if the site's certificate validates.
  response <- GET(url, config(ssl_verifypeer = FALSE))
  # Fail here with the HTTP status instead of later inside the JSON parser.
  stop_for_status(response)
  # `encoding` is an argument of read_html(); passed to GET() it is treated
  # as a URL component and has no effect on decoding.
  web <- read_html(response, encoding = "UTF-8")

  # Text of every <script> element on the page.
  scripts <- web %>% html_elements("script") %>% html_text()
  if (length(scripts) < tcmsp_script_index) {
    stop(
      "TCMSP page for '", name, "' has only ", length(scripts),
      " <script> elements; expected at least ", tcmsp_script_index,
      ". The page layout may have changed or the token may have expired.",
      call. = FALSE
    )
  }

  # All "data: [...]" arrays of the data script; the first one is used as the
  # ingredient table and the second one as the target table.
  grids <- str_extract_all(
    scripts[[tcmsp_script_index]], "data:\\s\\[.*\\]"
  )[[1]]
  if (length(grids) < 2L) {
    stop(
      "Could not find the ingredient and target tables in the TCMSP page for '",
      name, "'. The page layout may have changed or the token may have expired.",
      call. = FALSE
    )
  }

  # Ingredients that reach both thresholds (ob/dl arrive as strings).
  drug_m <- grids[1] %>%
    str_replace("data:", "") %>%
    fromJSON(simplifyVector = TRUE) %>%
    mutate(across(c(ob, dl), as.numeric)) %>%
    filter(ob >= obThresholdValue & dl >= dlThresholdValue)

  # Targets restricted to the retained ingredients, labelled with the herb.
  drug_t <- grids[2] %>%
    str_replace("data:", "") %>%
    fromJSON(simplifyVector = TRUE) %>%
    semi_join(drug_m, by = "MOL_ID") %>%
    tibble() %>%
    add_column(Drug = name, .before = "molecule_ID")

  # Match `target_name` against each UniProt name variant in turn. An inner
  # join keeps only the targets that found a UniProt entry, which is what the
  # previous left join followed by dropping unmatched rows produced.
  matches <- lapply(uniprot_match_keys, function(key) {
    drug_t %>%
      inner_join(uniprotData, by = setNames(key, "target_name")) %>%
      select(Drug, MOL_ID, molecule_name, target_name, Symbol)
  })
  bind_rows(matches)
}

# Typed empty frame so that `symbol_all` has the expected columns even when
# no herb is configured; the per-herb results are appended in input order and
# duplicates are removed once at the end (first occurrence wins).
symbol_all <- tibble(
  Drug = character(),
  MOL_ID = character(),
  molecule_name = character(),
  target_name = character(),
  Symbol = character()
) %>%
  bind_rows(Map(fetch_herb_symbols, names, urls)) %>%
  distinct(target_name, molecule_name, Drug, .keep_all = TRUE)
# Discard rows that have no gene symbol and switch to the output column names.
symbol_all <- symbol_all %>%
  drop_na(Symbol) %>%
  dplyr::rename(
    MolId = MOL_ID,
    MolName = molecule_name,
    TargetName = target_name
  )

# One-column table of the unique gene symbols.
# NOTE(review): not used by any line visible here -- confirm whether it is
# consumed elsewhere or was meant to be written to a file as well.
drug_symbol <- distinct(symbol_all, Symbol)

# Export the full herb / molecule / target / symbol table as tab-separated
# text, without row names and without quoting.
write.table(
  symbol_all,
  file = "AllDrugTargetSymbol.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)