import time

from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
class Spider():
    """Download the TCMSP result tables for a list of herb names via Chrome.

    Creating an instance immediately opens the browser, waits for the user
    to finish logging in by hand, and then runs the crawl.

    The browser is not closed afterwards. The original script never closed
    it either -- presumably so that downloads still in progress can finish;
    confirm before adding a ``quit()`` call.
    """

    # Herb names searched when the caller does not supply its own list.
    DEFAULT_GOODS = ('当归', '竹叶', '桃仁')

    _URL = 'https://tcmsp-e.com/'

    # Login navigation: user menu entry, then the login link in its dropdown.
    _USER_MENU = '//*[@id="main-menu-user"]/li[3]/a'
    _LOGIN_LINK = (
        '//*[@id="main-menu-user"]/li[@class="dropdown user offline open"]'
        '/ul[@class="dropdown-menu pull-right"]/li[2]/a'
    )

    # Search box on the landing page (first search) and on the pages shown
    # after a search has been made (every later search).
    _INDEX_SEARCH_INPUT = (
        '//*[@id="index-page"]/body/div[@class="container"]/form'
        '/div[@class="row"]/div[@class="input-group col-md-6 row"]/input'
    )
    _RESULT_SEARCH_INPUT = (
        '//*[@id="search-page"]/body/div[@class="container"]/form'
        '/div[@class="row"]/div[@class="input-group col-md-6 row"]'
        '/input[@class="form-control"]'
    )

    # Link in the second cell of the N-th row of the result grid.
    _ROW_LINK = (
        '//*[@id="grid"]/div[@class="k-grid-content k-auto-scrollable"]'
        '/table/tbody/tr[{}]/td[2]/a'
    )

    # N-th tab header on the detail page.
    _TAB_LINK = (
        '//*[@id="search-page"]/body/div[@class="container"]'
        '/div[@class="row"]/ul[@class="nav nav-tabs"]/li[{}]/a'
    )

    # (tab to open first or None, download button to click) for each table.
    _DOWNLOAD_STEPS = (
        (None, '//*[@id="ingredients"]/div[@class="k-header k-grid-toolbar"]/a[1]'),
        (2, '//*[@id="t_info"]/div[@class="k-header k-grid-toolbar"]'
            '/a[@class="k-button k-button-icontext k-grid-excel"]'),
        (3, '//*[@id="d_info"]/div[@class="k-header k-grid-toolbar"]/a[1]'),
    )

    def __init__(self, goods_list=None):
        """Log in and crawl right away.

        Args:
            goods_list: Optional iterable of herb names to download; when
                omitted, ``DEFAULT_GOODS`` is used.
        """
        web = self.Login()
        self.spider(web, goods_list)

    def Login(self):
        """Open the site, navigate to the login form and wait for the user.

        The credentials are typed by the user in the browser; this method
        blocks on a console prompt until the user presses Enter.

        Returns:
            The Chrome driver, to be passed to the other methods.
        """
        option = Options()
        # Drop the "enable-automation" switch to make automation detection
        # by the site less likely.
        option.add_experimental_option('excludeSwitches', ['enable-automation'])
        web = Chrome(options=option)
        web.maximize_window()
        web.get(self._URL)
        web.find_element(By.XPATH, self._USER_MENU).click()
        # Give the dropdown time to open before clicking inside it.
        time.sleep(1)
        web.find_element(By.XPATH, self._LOGIN_LINK).click()
        input("登陆是否完成完成?")
        return web

    @staticmethod
    def _row_link_xpath(num):
        """Return the XPath of the name link in result row ``num`` (1-based)."""
        return Spider._ROW_LINK.format(num)

    def is_Next_elementexist(self, web, num):
        """Tell whether result row ``num`` (1-based) has a name link.

        ``find_elements`` returns an empty list when nothing matches, so a
        missing row gives ``False`` while unrelated driver errors (closed
        browser, lost session) propagate instead of being silently treated
        as "no more rows".

        Args:
            web: The driver returned by ``Login``.
            num: 1-based index of the table row to probe.

        Returns:
            True if the link exists, False otherwise.
        """
        return len(web.find_elements(By.XPATH, self._row_link_xpath(num))) > 0

    def spider(self, web, goods_list=None):
        """Search every herb name and download its tables when found.

        For each name, the result rows are scanned for an exact text match.
        When found, the three tables of the detail page are downloaded;
        otherwise a "not found" line is printed.

        Args:
            web: The driver returned by ``Login``.
            goods_list: Optional iterable of herb names; defaults to
                ``DEFAULT_GOODS``.
        """
        if goods_list is None:
            goods_list = self.DEFAULT_GOODS
        for position, name in enumerate(goods_list):
            # Only the very first search starts from the landing page.
            self._search(web, name, first_search=(position == 0))
            if self._open_exact_match(web, name):
                self._download_tables(web)
            else:
                print(name, '不存在!')

    def _search(self, web, name, first_search):
        """Type ``name`` into the search box of the current page and submit."""
        if first_search:
            xpath = self._INDEX_SEARCH_INPUT
        else:
            xpath = self._RESULT_SEARCH_INPUT
        box = web.find_element(By.XPATH, xpath)
        # NOTE(review): the box may still hold the previous query on later
        # searches; clearing it avoids sending a concatenated term.
        box.clear()
        box.send_keys(name, Keys.ENTER)
        time.sleep(1)

    def _open_exact_match(self, web, name):
        """Click the first result whose text equals ``name``.

        Rows are visited in order until one has no name link.

        Returns:
            True if a matching row was clicked, False if none matched.
        """
        row = 1
        while self.is_Next_elementexist(web, row):
            link = web.find_element(By.XPATH, self._row_link_xpath(row))
            if link.text == name:
                link.click()
                return True
            row += 1
        return False

    def _download_tables(self, web):
        """Click the download button on each of the three detail tabs."""
        for tab, button in self._DOWNLOAD_STEPS:
            if tab is not None:
                web.find_element(By.XPATH, self._TAB_LINK.format(tab)).click()
            # Wait for the page/tab content to render before clicking.
            time.sleep(2)
            web.find_element(By.XPATH, button).click()
# 最后写个主函数自己调用即可。需要下载的药物名称写到 goods_list 列表中,如果某种药物不存在,会输出该药物不存在。浏览器的默认下载路径需要提前设置好,否则下载的文件太多会显得很凌乱。