# 页面全部为JS动态加载,算是练手,因为写的实在太累了,就不封装方法了,也没写多线程,这个网站封IP 一次大概封30分钟左右
# NOTE: this line was missing the leading '#', which made the file a SyntaxError.
import re
import requests
from fake_useragent import UserAgent
from lxml import etree
import sys
# --- session setup ---------------------------------------------------------
# Paste a logged-in session cookie here; the site rejects anonymous requests.
COOKIE = '此处写入登录的COOKIE'
ua = UserAgent()

# First-level search URL built from user input; pageIndex=1 is the entry page.
name = input('输入要查询的药名:')
url = f'http://www.smpaa.cn/smpaweb/pubserver/pubserver/drugSearch.action?ypcpm={name}&scqymc=&orderby=&pageIndex=1'

# Headers shared by all first-level requests; a fresh random UA each run.
# (Name kept as-is — later code references `new_herders`.)
new_herders = {
    'Connection': 'keep-alive',
    'Cookie': COOKIE,
    'Host': 'www.smpaa.cn',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': ua.random,
}
# Fetch the first result page; all data is embedded in inline JS, so we
# scrape it with regexes rather than parsing HTML.
index_html = requests.get(url=url, headers=new_herders).text
# The pager is emitted as a JS call like xxx('..','<total_pages>').
page = re.findall(r"'.*','(\d+)'", index_html)

# When the cookie is stale the page contains this literal login hint.
# BUG FIX: the original did `no_login[0]` unconditionally — findall returns
# an empty list when the hint is absent (i.e. when login is VALID), which
# raised IndexError on the success path.  Truthiness test is equivalent
# when the hint is present and safe when it is not.
no_login = re.findall('如果没有登陆信息,需要登陆', index_html)
if no_login:
    print(f'WOW!!!COOKIE过期了')
    sys.exit()

# Guard the page-count parse too: a ban page / layout change would
# otherwise crash with IndexError below.
if not page:
    print('未能解析总页数,页面结构异常或IP可能被封')
    sys.exit()
index_page = page[0]
# print(f'总页数:{page}')
if index_page != '0':
    # ---- first level: iterate result pages for the drug-name search -------
    # NOTE(review): the entry URL uses pageIndex=1 but this loop sends
    # 0..n-1; the site appears to treat both as valid — confirm indexing.
    for i in range(int(index_page)):
        print(f'一级页面共{index_page}页,正在爬取第{i + 1}页')
        params = {
            'ypcpm': name,
            'pageIndex': str(i),
        }
        url = 'http://www.smpaa.cn/smpaweb/pubserver/pubserver/drugSearch.action?'
        html = requests.get(url=url, headers=new_herders, params=params).text
        # Drug codes are embedded as javascript:showdetail('<code>') calls.
        bianma = re.findall(r"javascript:showdetail.'(\w+\d+)'", html)
        print(f'编号:{bianma}')

        # ---- second level: per-drug-code detail pages ----------------------
        for haoma in bianma:
            erji_url = f'http://www.smpaa.cn/smpaweb/pubserver/pubserver/pubserver/ypxxxxcx.action?qxmc=&jdmc=&yptbdm={haoma}&orderby=&pageIndex=0'
            erji_headers = {
                'Cookie': COOKIE,
                'Host': 'www.smpaa.cn',
                'Origin': 'http://www.smpaa.cn',
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': ua.random,
            }
            erji_html = requests.get(url=erji_url, headers=erji_headers).text
            # Total page count of the second-level listing, same JS pattern.
            erji_pages = re.findall(r"'.*','(\d+)'", erji_html)
            if not erji_pages:
                # BUG FIX: original indexed [0] unconditionally and crashed
                # with IndexError if the parse failed mid-run (e.g. IP ban).
                # Skip this code and keep crawling instead.
                print(f'编号:{haoma},二级页面解析失败,跳过')
                continue
            erji_page = erji_pages[0]

            if erji_page != '0':
                for page_no in range(int(erji_page)):
                    print(f'编号:{haoma},二级页面共{erji_page}页,正在爬取第{page_no + 1}页')
                    params = {
                        'ypcpm': name,
                        'scqymc': '',
                        'yptbdm': haoma,
                        'orderby': '',
                        'recordIP': '1',
                        # BUG FIX: the original loop never sent the page
                        # index, so every iteration re-fetched the same
                        # (first) page of results.
                        'pageIndex': str(page_no),
                    }
                    erji_post_url = 'http://www.smpaa.cn/smpaweb/pubserver/ypxxxxcx.action'
                    erji_parse_html = requests.post(url=erji_post_url, headers=erji_headers, params=params).text
                    tree = etree.HTML(erji_parse_html)
                    # Renamed from `list` (shadowed the builtin) and `i`
                    # (rebound the outer page index).
                    rows = tree.xpath('//*[@id="table1"]/tr')
                    if rows:
                        # rows[0] is the header; last two rows are pager chrome.
                        for row in rows[1:-2]:
                            result_list = []
                            cells = row.xpath('./td[@align="center"]/text()')
                            for cell in cells[:7]:
                                # Pure-whitespace layout cells arrive as tab
                                # runs; blank them so output stays aligned.
                                if '\t' in cell:
                                    cell = ' '
                                result_list.append(cell)
                            print(result_list)
                            print('*' * 20)
            else:
                print(f'编号:{haoma},不存在二级页面')
else:
    print(f'没有此类药品')
# 懒得封装了 凑乎看吧 能用就行,太累了这个页面
# 小提示: 页面错误 说明是cookie过期,去换一个新的就行,