TCMSP快速提取药物成分并添加基因对应symbol

快速提取TCMSP中药成分对应靶点基因,并根据UniProt网站对应的symbol数据,给提取出来的靶点基因添加symbol。

library(rvest)
library(httr)
library(jsonlite)
library(dplyr)
library(data.table)
library(tidyverse)

# Replace the path inside the double quotes below with the directory that
# contains this script, writing every '\' as '\\'. The relative file names used
# later in the script ("uniprotkb_reviewed.tsv", "AllDrugTargetSymbol.txt") are
# resolved against this directory.
setwd("E:\\code\\R")

# Screening cut-offs for ingredients: a row is kept only when its `ob` value is
# at least obThresholdValue and its `dl` value is at least dlThresholdValue
# (presumably oral bioavailability and drug-likeness — confirm against TCMSP).
obThresholdValue <- 30
dlThresholdValue <- 0.18


# Herb names, one per herb to process. Each name is used as the value of the
# `Drug` column for the rows scraped from the matching URL.
names <- c("Aiye", "Ganjiang", "Huanglian", "Wumei")

# TCMSP search URL for each herb, in the same order as `names` above.
# NOTE(review): the `token` query parameter looks session-specific — confirm it
# is still accepted by the site before running.
urls <- c(
  "https://tcmsp-e.com/tcmspsearch.php?qr=Folium%20Artemisiae%20Argyi&qsr=herb_en_name&token=cfc8e3eceab35ac296c1a1d56dd3d812",
  "https://tcmsp-e.com/tcmspsearch.php?qr=Zingiberis%20Rhizoma&qsr=herb_en_name&token=cfc8e3eceab35ac296c1a1d56dd3d812",
  "https://tcmsp-e.com/tcmspsearch.php?qr=Coptidis%20Rhizoma&qsr=herb_en_name&token=cfc8e3eceab35ac296c1a1d56dd3d812",
  "https://tcmsp-e.com/tcmspsearch.php?qr=Mume%20Fructus&qsr=herb_en_name&token=cfc8e3eceab35ac296c1a1d56dd3d812"
)

# Lookup table from UniProt protein names to gene symbols, read from
# "uniprotkb_reviewed.tsv" in the working directory.
#
# Columns produced:
#   `Protein names`            protein name exactly as in the file
#   `Gene Names` / Symbol      first whitespace-separated token of the gene
#                              names field
#   protein_names_space        protein name with every '-' replaced by a space
#   protein_names_first        protein name cut at the first " (" (drops the
#                              parenthesised part and everything after it)
#   protein_names_first_space  protein_names_first with '-' replaced by a space
uniprotData <-
  as_tibble(fread("uniprotkb_reviewed.tsv")) %>%
  filter(!is.na(`Gene Names`)) %>%
  select(`Protein names`, `Gene Names`) %>%
  mutate(
    `Gene Names` = gsub('\\s.*', '', `Gene Names`),
    Symbol = `Gene Names`
  ) %>%
  # fread() reads an empty character field as "" rather than NA, so entries
  # without a gene name survive the is.na() filter above; drop them here so
  # they cannot end up as blank symbols in the output.
  filter(Symbol != "") %>%
  # Drops symbols containing '-' or '_'.
  # NOTE(review): '\\[a-z\\]' matches the literal text "[a-z]", not a lowercase
  # letter, so that alternative never fires. If the intent was to drop symbols
  # containing lowercase letters the pattern should be '[a-z]|-|_'. Left
  # unchanged because that would alter which symbols are kept — confirm intent.
  filter(!str_detect(Symbol, '\\[a-z\\]|\\-|\\_')) %>%
  mutate(
    protein_names_space = gsub('-', ' ', `Protein names`),
    protein_names_first = gsub('\\s+\\(.*', '', `Protein names`),
    protein_names_first_space = gsub('-', ' ', protein_names_first)
  )

# For every herb: download its TCMSP page, keep the ingredients that pass the
# ob/dl cut-offs, attach a gene symbol from uniprotData to each of their
# targets, and finally write the combined table to "AllDrugTargetSymbol.txt".

# Herb names and URLs are paired by position, so they must line up.
stopifnot(length(names) == length(urls))

# Position of the <script> element whose "data: [...]" arrays are read.
# NOTE(review): this index is tied to the current page layout — re-check it if
# the site changes.
tcmsp_script_index <- 12

# uniprotData columns tried as the join key for target_name. Order matters:
# when a target matches through several keys, the earliest match is the one
# kept by the de-duplication after the loop.
protein_name_keys <- c(
  "Protein names",
  "protein_names_space",
  "protein_names_first",
  "protein_names_first_space"
)

drug_tables <- vector("list", length(urls))
for (i in seq_along(urls)) {
  name <- names[i]
  url <- urls[i]
  print(name)

  # NOTE(review): ssl_verifypeer = FALSE turns off TLS certificate verification
  # for this request; re-enable it if the site's certificate validates.
  response <- GET(url, encoding="UTF-8", config(ssl_verifypeer = FALSE))
  web <- read_html(response)

  # Text of every <script> element on the page.
  tcmsp <- web %>% html_elements("script") %>% html_text()
  grid_data <- unlist(
    str_extract_all(tcmsp, "data:\\s\\[.*\\]")[tcmsp_script_index]
  )
  if (length(grid_data) < 2) {
    stop(
      "Could not find the ingredient and target data in <script> #",
      tcmsp_script_index, " of the page for '", name, "' (", url, ").",
      call. = FALSE
    )
  }

  # First array: ingredients, filtered by the ob/dl cut-offs.
  drug_m <- str_replace(grid_data[1], "data:", "") %>%
    fromJSON(simplifyVector = TRUE) %>%
    mutate(across(c(ob, dl), as.numeric)) %>%
    filter(ob >= obThresholdValue, dl >= dlThresholdValue)

  # Second array: targets, restricted to the ingredients kept above.
  drug_t <- str_replace(grid_data[2], "data:", "") %>%
    fromJSON(simplifyVector = TRUE) %>%
    semi_join(drug_m, by = "MOL_ID") %>%
    tibble() %>%
    add_column(Drug = name, .before = 'molecule_ID')

  # Match target_name against each protein-name variant. Several ingredients
  # can share a target and several UniProt rows can share a name, so a
  # many-to-many match is expected here.
  matches <- lapply(protein_name_keys, function(key) {
    drug_t %>%
      inner_join(
        uniprotData,
        by = setNames(key, "target_name"),
        relationship = "many-to-many"
      ) %>%
      select(Drug, MOL_ID, molecule_name, target_name, Symbol)
  })
  drug_tables[[i]] <- bind_rows(matches)
}

# The empty typed tibble guarantees the expected columns even when no herb was
# processed; distinct() keeps the first row per (target, molecule, herb).
symbol_all <- bind_rows(
  tibble(
    Drug = character(),
    MOL_ID = character(),
    molecule_name = character(),
    target_name = character(),
    Symbol = character()
  ),
  drug_tables
) %>%
  distinct(target_name, molecule_name, Drug, .keep_all = TRUE) %>%
  filter(!is.na(Symbol)) %>%
  dplyr::rename(MolId = MOL_ID, MolName = molecule_name, TargetName = target_name)

# Unique gene symbols across all herbs (kept in memory only, not written out).
drug_symbol <- symbol_all %>% distinct(Symbol)

write.table(symbol_all, file="AllDrugTargetSymbol.txt", sep="\t", row.names=FALSE, quote = FALSE)


评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

water_wkp#

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值