Python 爬取 DrugBank

爬虫代码:

#coding:utf-8
import requests, json, random, time
from bs4 import BeautifulSoup

def _dd_text(soup, dt_id):
    """Return the stripped text of the <dd> following <dt id=dt_id>, or ''.

    An empty string is returned when the <dt> or its sibling <dd> is absent,
    so a page that lacks one field does not raise AttributeError on None.
    """
    dt = soup.find('dt', {'id': dt_id})
    if dt is None:
        return ''
    dd = dt.find_next_sibling('dd')
    return dd.text.strip() if dd is not None else ''


def dig(drugbank_accession_number="DB00460", timeout=30):
    """Fetch one DrugBank drug page and append a text summary to two files.

    The page at https://go.drugbank.com/drugs/<accession number> is
    downloaded through the local proxy, the generic name, background, type
    and chemical formula are read from its <dt>/<dd> pairs, and one JSON
    object per line is appended to:

    * ``drug_text.json``       -- ``{drug_name: drug_text}``
    * ``drug_order_name.json`` -- ``{drugbank_accession_number: drug_name}``

    Args:
        drugbank_accession_number: Identifier appended to the drug URL.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the crawl indefinitely.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout,
            or (via ``raise_for_status``) a 4xx/5xx response.
        ValueError: If the page has no generic-name entry, since there would
            be no key to store the record under.
    """
    url = "https://go.drugbank.com/drugs/" + drugbank_accession_number
    headers = {
        # The header value must be the bare UA string; the previous value
        # accidentally embedded the text "User-Agent': '" inside it.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    # Local proxy endpoint (a single address, not a pool) for both schemes.
    proxies = {
        "http": "http://127.0.0.1:7890",
        "https": "http://127.0.0.1:7890",
    }

    response = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
    # Fail with a clear HTTP error instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    drug_name = _dd_text(soup, 'generic-name')
    if not drug_name:
        raise ValueError(
            'no generic-name entry found for {}'.format(drugbank_accession_number)
        )
    background = _dd_text(soup, 'background')
    type_value = _dd_text(soup, 'type')
    chemical_formula = _dd_text(soup, 'chemical-formula')

    # Assemble the free-text description; optional parts are skipped when empty.
    drug_text = ''
    if background:
        drug_text += background + ' '
    drug_text += drug_name
    if type_value:
        drug_text += ' is of the type {}'.format(type_value)
    drug_text += ', number {}'.format(drugbank_accession_number)
    if chemical_formula:
        drug_text += ' and has the molecular formula {}.'.format(chemical_formula)

    # Append mode: each call adds one JSON object per line to each file.
    with open('drug_text.json', 'a', encoding='utf-8') as f:
        f.write(json.dumps({drug_name: drug_text}, ensure_ascii=False) + '\n')
    with open('drug_order_name.json', 'a', encoding='utf-8') as f:
        f.write(json.dumps({drugbank_accession_number: drug_name}, ensure_ascii=False) + '\n')
# dig()

def main(start=1007, delay=3, id2node_path='id2node.json'):
    """Crawl every drug listed in the id-to-accession-number mapping.

    The mapping file is a JSON object whose keys are stringified integer
    indices and whose values are passed to ``dig`` as accession numbers.
    Indices ``start`` .. ``len(mapping) - 1`` are processed in order.

    Args:
        start: First index to process. The default keeps the previously
            hard-coded resume point; pass 0 to crawl from the beginning.
        delay: Seconds to sleep after each request.
        id2node_path: Path of the JSON mapping file.

    Any exception raised by ``dig`` propagates and stops the crawl; the last
    printed index shows where to resume.
    """
    with open(id2node_path, 'r', encoding='utf-8') as f:
        id2node = json.load(f)

    for i in range(start, len(id2node)):
        drugbank_accession_number = id2node[str(i)]
        # flush so the item being fetched is visible before dig() returns
        print("{},{}".format(i, drugbank_accession_number), end='', flush=True)
        dig(drugbank_accession_number)
        print(', over.')
        time.sleep(delay)
# Start the crawl. This call is unguarded, so it also runs when the file is imported.
main()

其中,

    # Local proxy endpoint (a single address, not a pool) for HTTP and HTTPS
    proxies = {
        "http": "http://127.0.0.1:7890",
        "https": "http://127.0.0.1:7890",
    }

指的是本地的 VPN 代理。我用的是 Clash 客户端,其默认代理地址是 "http://127.0.0.1:7890"。

  • 10
    点赞
  • 8
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值