import requests
from bs4 import BeautifulSoup
from lxml import etree
import re
from prac.prac import insert_into_data_one
class NEW_CHALLENGE:
    """Crawler for the product listings on www.yaopinnet.com.

    The crawl has two levels:

    1. ``sparse_data`` downloads the entry page (``self.url``), reads the
       table with id ``shaixuan`` and collects, per row, a category label
       and the links in the row's second cell.
    2. For every link, the page count is read from the ``fenye3`` element
       and ``request_data_item`` walks the result pages, handing each
       extracted record to ``insert_into_data_one`` (imported from
       ``prac.prac``; how it stores the record is not visible in this file).
    """

    # Seconds to wait for a single HTTP response. Without a timeout,
    # ``requests`` waits indefinitely and one stalled connection would hang
    # the whole crawl.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Set the entry URL, the link prefix and the session cookies."""
        # Entry page that lists the categories.
        self.url = "https://www.yaopinnet.com/zhaoshang/zhaoshang7.asp?k=53"
        # Prefix joined with the hrefs found on the entry page.
        self.url_split = 'https://www.yaopinnet.com'
        # Login/session cookies sent with every second-level request.
        # The values are left empty in this file.
        self.cookies = {
            'ASPSESSIONIDSWRDRASR': '',
            'ASPSESSIONIDSWSDRBSQ': '',
            'hy_mid': '',
            'hy_mid1': '',
            'yaoyuanwangid': '',
            'yaoyuanwangidcookie': ''
        }

    def _fetch_html(self, url, cookies=None):
        """Download ``url`` and return the decoded body, or None on a non-200 status.

        The response encoding is taken from ``apparent_encoding`` (detected
        from the body) instead of the HTTP header. Network errors raised by
        ``requests`` are not caught here.
        """
        response = requests.get(url=url, cookies=cookies, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            return None
        response.encoding = response.apparent_encoding
        return response.text

    @staticmethod
    def _parse_page_count(fenye_text):
        """Return the integer that follows the first '/' in ``fenye_text``, or None."""
        match = re.search(r'/(\d+)', fenye_text)
        return int(match.group(1)) if match else None

    @staticmethod
    def _page_range_end(total_pages):
        """Return the exclusive upper bound passed to ``request_data_item``.

        NOTE(review): the cap is kept exactly as originally written. A total
        of 31 yields 32 (31 pages fetched) while anything above 31 yields 31
        (30 pages fetched) -- confirm which limit is actually intended.
        """
        return 31 if total_pages > 31 else total_pages + 1

    @staticmethod
    def _build_page_url(medicine_data_url, page):
        """Return ``medicine_data_url`` with a ``pages=<page>`` query parameter.

        An existing query string is kept after the new parameter. A URL
        without one gets the empty ``k`` and ``tedian1`` parameters appended.
        """
        base, separator, query = medicine_data_url.partition('?')
        if separator and query:
            return f'{base}?pages={page}&{query}'
        return f'{base}?pages={page}&k=&tedian1='

    def get_data(self):
        """Return the HTML of the entry page, or None when the status is not 200."""
        return self._fetch_html(self.url)

    def sparse_data(self):
        """Parse the category table of the entry page and crawl every link in it."""
        # Fetch the entry page source.
        data = self.get_data()
        if not data:
            return
        soup = BeautifulSoup(data, 'html.parser')
        # Locate the table whose id is "shaixuan".
        table = soup.find('table', id='shaixuan')
        if table is None:
            return
        # Walk its <tr> rows: first cell holds the category, second the links.
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            if len(cells) < 2:
                continue
            strong = cells[0].find('strong')
            if strong is None:
                continue
            strong_text = strong.text
            # href=True skips anchors that carry no href attribute.
            links = [(a['href'], a.text) for a in cells[1].find_all('a', href=True)]
            print("links:",links)
            print(f"类别: {strong_text}")
            for href, medicine_name in links:
                self._crawl_category(strong_text, medicine_name, self.url_split + href)

    def _crawl_category(self, strong_text, medicine_name, medicine_data_url):
        """Read the page count of one category link and crawl its result pages.

        Any failure is reported and swallowed so that one broken category
        does not stop the remaining ones.
        """
        print(f"药品类:{medicine_name},url:{medicine_data_url}")
        try:
            html = self._fetch_html(medicine_data_url, cookies=self.cookies)
            if html is None:
                return
            tree = etree.HTML(html)
            page_texts = tree.xpath('//*[@id="fenye3"]/text()[1]')
            if not page_texts:
                print("没有页码")
                return
            total_pages = self._parse_page_count(page_texts[0])
            if total_pages is None:
                print("未找到匹配的数字")
                return
            number_of_pages_item = self._page_range_end(total_pages)
            print("medicine_name",medicine_name,"medicine_data_url",medicine_data_url,type(medicine_data_url),"number_of_pages_item:",number_of_pages_item)
            self.request_data_item(strong_text, medicine_name, medicine_data_url, number_of_pages_item)
        except Exception as exc:
            # Per-category boundary: report the real cause instead of
            # labelling every failure as a missing page number.
            print(f"抓取失败:{medicine_data_url} ({exc})")

    def request_data_item(self,strong_text,medicine_name1,medicine_data_url,number_of_pages_item):
        """Fetch result pages 1 .. number_of_pages_item - 1 and store their records.

        Args:
            strong_text: category label taken from the entry table row.
            medicine_name1: text of the link that led to this listing.
            medicine_data_url: absolute URL of the listing.
            number_of_pages_item: exclusive upper bound of the page range.

        Pages that do not answer with status 200 are skipped. The field lists
        of a page are combined with ``zip``, so the number of records stored
        per page equals the length of the shortest list.
        """
        for item in range(1, number_of_pages_item):
            complete_url = self._build_page_url(medicine_data_url, item)
            xml_text = self._fetch_html(complete_url, cookies=self.cookies)
            if xml_text is None:
                continue
            text = etree.HTML(xml_text)
            image_path = text.xpath('//*[@id="chanpin_tupian"]/a/img/@src')
            # Product name
            medicine_name = text.xpath('//*[@id="chanpin_gaikuang"]/strong[1]/span/a/text()')
            # Approval number
            approval_number = text.xpath('//*[@id="chanpin_gaikuang"]/text()[2]')
            # Specification
            specifications = text.xpath('//*[@id="chanpin_gaikuang"]/text()[3]')
            # Operating enterprise: prefer the linked name, else the plain text node.
            enterprise_linked = text.xpath('//*[@id="chanpin_gaikuang"]/a/text()')
            enterprise_plain = text.xpath('//*[@id="chanpin_gaikuang"]/text()[4]')
            operating_enterprise = enterprise_linked or enterprise_plain
            # Product introduction
            product_introduction = text.xpath('//*[@id="chanpin_gongneng"]/text()')
            # Update time
            update_time = text.xpath('//*[@id="chanpin_fabushijian"]/text()')
            print("药品名称:",medicine_name)
            print("批准文号:",approval_number)
            print("规格:",specifications)
            print("经营企业:",operating_enterprise)
            print("产品简介:",product_introduction)
            print("更新时间:",update_time)
            print("图片地址:",image_path)
            records = zip(
                medicine_name,
                approval_number,
                specifications,
                operating_enterprise,
                product_introduction,
                update_time,
                image_path,
            )
            for record in records:
                # Positional order expected by insert_into_data_one:
                # the seven page fields, then category and link name.
                insert_into_data_one(*record, strong_text, medicine_name1)
                print(f"{record[0]}保存到数据库中。。。。。。。。。。。。。。。。")
if __name__ == '__main__':
    # Script entry point: build the crawler and walk the whole category table.
    crawler = NEW_CHALLENGE()
    crawler.sparse_data()
# 07-15
# 3826