爬取药源网数据

paul jeorgh
于 2024-09-11 11:07:33 发布
阅读量105
点赞数 3
文章标签： python 开发语言
本文链接：https://blog.csdn.net/2302_79777012/article/details/142134511
版权

import requests
from bs4 import BeautifulSoup
from  lxml  import etree
import re
from prac.prac import insert_into_data_one
class NEW_CHALLENGE:
    """Scraper for the product listings on www.yaopinnet.com.

    ``sparse_data`` reads the filter table (``id="shaixuan"``) on the entry
    page, follows every category link found there, and hands each category
    to ``request_data_item``, which walks the paginated listing and passes
    every extracted record to ``insert_into_data_one``.
    """

    # Seconds to wait for any single HTTP request before giving up, so a
    # stalled connection cannot hang the whole crawl.
    REQUEST_TIMEOUT = 10

    # Maximum number of listing pages fetched per category.  The original
    # code fetched 30 pages for any count above 31 but 31 pages for a count
    # of exactly 31; the cap is now applied uniformly.
    MAX_PAGES = 30

    def __init__(self):
        # Entry page that contains the category filter table.
        self.url = "https://www.yaopinnet.com/zhaoshang/zhaoshang7.asp?k=53"
        # Site root, prepended to the relative hrefs found in the table.
        self.url_split = 'https://www.yaopinnet.com'
        # Login/session cookies sent with the category and listing requests.
        # The values are blank here and must be filled in by the user.
        self.cookies = {
            'ASPSESSIONIDSWRDRASR': '',
            'ASPSESSIONIDSWSDRBSQ': '',
            'hy_mid': '',
            'hy_mid1': '',
            'yaoyuanwangid': '',
            'yaoyuanwangidcookie': ''
        }

    def _fetch(self, url, cookies=None):
        """Return the decoded body of ``url``, or ``None`` on failure.

        A request counts as failed when it raises a ``requests`` exception
        (connection error, timeout, ...) or answers with a status other
        than 200.
        """
        try:
            response = requests.get(
                url=url, cookies=cookies, timeout=self.REQUEST_TIMEOUT
            )
        except requests.RequestException as exc:
            print(f"请求失败: {url} ({exc})")
            return None
        if response.status_code != 200:
            return None
        # The declared charset is not trusted; use the detected one instead.
        response.encoding = response.apparent_encoding
        return response.text

    @staticmethod
    def _parse_page_count(pager_text):
        """Return the integer following the first ``/`` in ``pager_text``.

        Returns ``None`` when the text contains no ``/<digits>`` sequence.
        """
        match = re.search(r'/(\d+)', pager_text)
        return int(match.group(1)) if match else None

    @classmethod
    def _page_range_stop(cls, total_pages):
        """Return the exclusive ``range`` stop for ``total_pages`` pages.

        The result is capped so that at most ``MAX_PAGES`` pages are fetched.
        """
        return min(total_pages, cls.MAX_PAGES) + 1

    @staticmethod
    def _build_page_url(medicine_data_url, page):
        """Return ``medicine_data_url`` with a ``pages=<page>`` parameter.

        When the URL already carries a ``?k...`` query, ``pages`` is put in
        front of the existing query; otherwise a default query is appended.
        """
        if '?k' in medicine_data_url:
            base, query = medicine_data_url.split('?', 1)
            return f'{base}?pages={page}&{query}'
        # NOTE(review): a URL whose query does not start with "k" ends up
        # with two "?" characters here, exactly as in the original code.
        # Confirm whether such URLs can occur on the site.
        return f'{medicine_data_url}?pages={page}&k=&tedian1='

    def get_data(self):
        """Fetch the entry page.

        Returns:
            The page source as text, or ``None`` when the request fails or
            the status code is not 200.
        """
        # The entry page is requested without the login cookies.
        return self._fetch(self.url)

    def sparse_data(self):
        """Walk the category table and scrape every category it links to.

        Rows that lack the expected ``<strong>`` label or a second ``<td>``
        are skipped, as are categories whose page cannot be fetched or
        whose page count cannot be determined.
        """
        data = self.get_data()
        if data is None:
            print(f"请求失败: {self.url}")
            return

        soup = BeautifulSoup(data, 'html.parser')
        # Locate the table with id "shaixuan".
        table = soup.find('table', id='shaixuan')
        if table is None:
            print("未找到id为shaixuan的table")
            return

        for row in table.find_all('tr'):
            cells = row.find_all('td')
            if len(cells) < 2:
                continue
            strong = cells[0].find('strong')
            if strong is None:
                continue
            strong_text = strong.text

            # Anchors without an href cannot be followed, so they are skipped.
            links = [(a['href'], a.text) for a in cells[1].find_all('a', href=True)]
            print("links:", links)

            print(f"类别: {strong_text}")
            for href, medicine_name in links:
                medicine_data_url = self.url_split + href

                print(f"药品类:{medicine_name},url:{medicine_data_url}")
                page_html = self._fetch(medicine_data_url, cookies=self.cookies)
                if page_html is None:
                    continue

                tree = etree.HTML(page_html)
                pager_text = tree.xpath('//*[@id="fenye3"]/text()[1]')
                total_pages = None
                if pager_text:
                    total_pages = self._parse_page_count(pager_text[0])
                    if total_pages is None:
                        print("未找到匹配的数字")
                if total_pages is None:
                    print("没有页码")
                    continue

                number_of_pages_item = self._page_range_stop(total_pages)
                print("medicine_name", medicine_name, "medicine_data_url", medicine_data_url, type(medicine_data_url), "number_of_pages_item:", number_of_pages_item)

                # One failing category must not abort the whole crawl, but
                # the real error is reported instead of "no page number".
                try:
                    self.request_data_item(strong_text, medicine_name, medicine_data_url, number_of_pages_item)
                except Exception as exc:
                    print(f"抓取失败: {medicine_data_url} ({exc})")
                    continue

    def request_data_item(self, strong_text, medicine_name1, medicine_data_url, number_of_pages_item):
        """Scrape listing pages ``1 .. number_of_pages_item - 1`` of a category.

        Args:
            strong_text: Label of the table row the category was found in.
            medicine_name1: Link text of the category.
            medicine_data_url: Absolute URL of the category's first page.
            number_of_pages_item: Exclusive upper bound of the page numbers
                to request.

        Every record extracted from a page is passed to
        ``insert_into_data_one`` together with ``strong_text`` and
        ``medicine_name1``.  Pages that cannot be fetched are skipped.
        """
        for item in range(1, number_of_pages_item):
            complete_url = self._build_page_url(medicine_data_url, item)

            page_html = self._fetch(complete_url, cookies=self.cookies)
            if page_html is None:
                continue

            text = etree.HTML(page_html)
            image_path = text.xpath('//*[@id="chanpin_tupian"]/a/img/@src')
            # Medicine name
            medicine_name = text.xpath('//*[@id="chanpin_gaikuang"]/strong[1]/span/a/text()')
            # Approval number
            Approval_number = text.xpath('//*[@id="chanpin_gaikuang"]/text()[2]')
            # Specifications
            specifications = text.xpath('//*[@id="chanpin_gaikuang"]/text()[3]')
            # Operating enterprise: prefer the linked name, fall back to the
            # plain text node when no link is present on the page.
            Operating_enterprise1 = text.xpath('//*[@id="chanpin_gaikuang"]/a/text()')
            Operating_enterprise2 = text.xpath('//*[@id="chanpin_gaikuang"]/text()[4]')
            Operating_enterprise = Operating_enterprise1 or Operating_enterprise2
            # Product introduction
            Product_Introduction = text.xpath('//*[@id="chanpin_gongneng"]/text()')
            # Update time
            update_time = text.xpath('//*[@id="chanpin_fabushijian"]/text()')

            print("药品名称:", medicine_name)
            print("批准文号:", Approval_number)
            print("规格:", specifications)
            print("经营企业:", Operating_enterprise)
            print("产品简介:", Product_Introduction)
            print("更新时间:", update_time)
            print("图片地址:", image_path)

            # zip() stops at the shortest list, so a page on which one field
            # has fewer matches than the others yields fewer records.
            records = zip(
                medicine_name,
                Approval_number,
                specifications,
                Operating_enterprise,
                Product_Introduction,
                update_time,
                image_path,
            )
            for record in records:
                # presumably persists the record to a database (the log line
                # below says so); verify against prac.prac
                insert_into_data_one(*record, strong_text, medicine_name1)
                print(f"{record[0]}保存到数据库中。。。。。。。。。。。。。。。。")






if __name__ == '__main__':
    # Script entry point: build the scraper and run the full crawl.
    scraper = NEW_CHALLENGE()
    scraper.sparse_data()