python+selenium爬虫爬取TCMSP药品成分的相关信息

最新推荐文章于 2022-07-08 16:35:25 发布

淅淅的雨声

最新推荐文章于 2022-07-08 16:35:25 发布

阅读量1.3k

点赞数 1

分类专栏：爬虫

本文链接：https://blog.csdn.net/weixin_44879975/article/details/119678994

版权

爬虫专栏收录该内容

4 篇文章 1 订阅

订阅专栏

import time

from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

class Spider():
    """Selenium crawler that triggers TCMSP table downloads for a list of herbs.

    Instantiating the class runs the whole job: it opens Chrome, waits for
    the operator to log in manually, then for every herb name searches the
    site, opens the exactly-matching result and clicks the export button on
    each of the three result tabs.  Files land in the browser's default
    download directory.
    """

    # Herb names searched when the caller does not supply a list.
    DEFAULT_GOODS = ('当归', '竹叶', '桃仁')

    # Link holding the herb name in row ``{}`` (1-based) of the result grid.
    _ROW_LINK_XPATH = ('//*[@id="grid"]/div[@class="k-grid-content k-auto-scrollable"]'
                       '/table/tbody/tr[{}]/td[2]/a')
    # Search box on the landing page (used for the first query only).
    _INDEX_SEARCH_XPATH = ('//*[@id="index-page"]/body/div[@class="container"]/form'
                           '/div[@class="row"]/div[@class="input-group col-md-6 row"]/input')
    # Search box on the pages reached after the first query.
    _RESULT_SEARCH_XPATH = ('//*[@id="search-page"]/body/div[@class="container"]/form'
                            '/div[@class="row"]/div[@class="input-group col-md-6 row"]'
                            '/input[@class="form-control"]')
    # Tab header number ``{}`` in the nav-tabs bar of a herb page.
    _TAB_XPATH = ('//*[@id="search-page"]/body/div[@class="container"]'
                  '/div[@class="row"]/ul[@class="nav nav-tabs"]/li[{}]/a')
    # Export buttons of the three tab panels, in click order.
    _DOWNLOAD_TAB1_XPATH = '//*[@id="ingredients"]/div[@class="k-header k-grid-toolbar"]/a[1]'
    _DOWNLOAD_TAB2_XPATH = ('//*[@id="t_info"]/div[@class="k-header k-grid-toolbar"]'
                            '/a[@class="k-button k-button-icontext k-grid-excel"]')
    _DOWNLOAD_TAB3_XPATH = '//*[@id="d_info"]/div[@class="k-header k-grid-toolbar"]/a[1]'

    def Login(self):
        """Open TCMSP in Chrome, navigate to the login form and wait for the user.

        The actual credentials are typed by the operator in the browser; this
        method blocks on ``input()`` until they confirm in the console.

        Returns:
            The Chrome WebDriver instance, positioned wherever the site leaves
            the user after logging in.
        """
        url = 'https://tcmsp-e.com/'

        # Evade automation detection by dropping Chrome's "enable-automation" switch.
        option = Options()
        option.add_experimental_option('excludeSwitches', ['enable-automation'])
        web = Chrome(options=option)
        web.maximize_window()
        web.get(url)
        # Open the user drop-down in the main menu.
        web.find_element(By.XPATH, '//*[@id="main-menu-user"]/li[3]/a').click()

        # Give the drop-down time to open before clicking inside it.
        time.sleep(1)

        web.find_element(
            By.XPATH,
            '//*[@id="main-menu-user"]/li[@class="dropdown user offline open"]'
            '/ul[@class="dropdown-menu pull-right"]/li[2]/a',
        ).click()
        # Block until the operator has finished logging in by hand.
        input("登陆是否完成完成?")
        return web

    @classmethod
    def _row_link_xpath(cls, num):
        """Return the XPath of the herb-name link in result row ``num`` (1-based)."""
        return cls._ROW_LINK_XPATH.format(num)

    def is_Next_elementexist(self, web, num):
        """Tell whether result row ``num`` (1-based) has a herb-name link.

        Args:
            web: WebDriver currently showing a search-result grid.
            num: 1-based index of the ``tr`` to probe.

        Returns:
            True if the link exists, False if the driver reports it missing.
            Any other WebDriver failure propagates instead of being mistaken
            for "no more rows".
        """
        try:
            web.find_element(By.XPATH, self._row_link_xpath(num))
        except NoSuchElementException:
            return False
        return True

    def __init__(self, goods_list=None):
        """Log in and immediately crawl.

        Args:
            goods_list: optional iterable of herb names; defaults to
                ``DEFAULT_GOODS`` when omitted.
        """
        web = self.Login()
        self.spider(web, goods_list)

    def _search(self, web, name, first):
        """Type ``name`` into the search box and submit it with ENTER."""
        xpath = self._INDEX_SEARCH_XPATH if first else self._RESULT_SEARCH_XPATH
        box = web.find_element(By.XPATH, xpath)
        # The box may still hold the previous query; send_keys would append to it.
        box.clear()
        box.send_keys(name, Keys.ENTER)

    def _open_exact_match(self, web, name):
        """Click the first result row whose link text equals ``name``.

        Rows are scanned top-down until a row without a link is reached.

        Returns:
            True if a matching row was clicked, False otherwise.
        """
        num = 1
        while True:
            try:
                link = web.find_element(By.XPATH, self._row_link_xpath(num))
            except NoSuchElementException:
                return False
            if link.text == name:
                link.click()
                return True
            num += 1

    def _download_all_tabs(self, web):
        """Click the export button on each of the three tabs of a herb page."""
        # First tab is shown by default; wait for it to render, then export.
        time.sleep(2)
        web.find_element(By.XPATH, self._DOWNLOAD_TAB1_XPATH).click()
        # Switch to the second tab and export it.
        web.find_element(By.XPATH, self._TAB_XPATH.format(2)).click()
        time.sleep(2)
        web.find_element(By.XPATH, self._DOWNLOAD_TAB2_XPATH).click()
        # Switch to the third tab and export it.
        web.find_element(By.XPATH, self._TAB_XPATH.format(3)).click()
        time.sleep(2)
        web.find_element(By.XPATH, self._DOWNLOAD_TAB3_XPATH).click()

    def spider(self, web, goods_list=None):
        """Search every herb and download its tables when an exact match exists.

        Args:
            web: logged-in WebDriver sitting on the TCMSP landing page.
            goods_list: optional iterable of herb names; defaults to
                ``DEFAULT_GOODS`` when omitted.  An empty iterable is a no-op.

        Herbs without an exactly-matching search result are reported on
        stdout and skipped.
        """
        names = self.DEFAULT_GOODS if goods_list is None else goods_list
        # The first query uses the landing-page box, later ones the result-page box.
        first = True
        for name in names:
            self._search(web, name, first)
            first = False

            # Let the result grid load before scanning it.
            time.sleep(1)

            if self._open_exact_match(web, name):
                self._download_all_tabs(web)
            else:
                print(name,'不存在!')

最后写个主函数自己调用。需要下载的药物名称写到goods_list列表中，如果某种药物不存在，程序会输出该药物不存在。浏览器的默认下载路径需要提前自己设置好，否则下载的文件太多会显得很凌乱。

淅淅的雨声

关注

1
点赞
踩
12

收藏

觉得还不错? 一键收藏
1
评论
python+selenium爬虫爬取TCMSP药品成分的相关信息

import time from selenium.webdriver import Chrome from selenium.webdriver.chrome.options import Options from selenium.webdriver.common.keys import Keys class Spider(): #登陆函数 def Login(self): url = 'https://tcmsp-e.com/' # 躲避检测
复制链接

扫一扫