from bs4 import BeautifulSoup
import requests
import math
# Collect hospital detail-page URLs from the 14 index pages of the
# soyoung.com hospital listing (category filters are baked into the path).
url_hos = []
for i in range(1, 15):
    url_source = 'http://y.soyoung.com/hospital/0_0_0_0_0_0_415_0_0_2/{}'.format(i)
    # timeout added so a dead server cannot hang the scraper forever
    web_db = requests.get(url_source, timeout=10)
    soup = BeautifulSoup(web_db.text, 'lxml')
    # One li-level selector replaces the original per-index loop that ran
    # 8 nearly identical nth-of-type selectors per page.
    url_lists = soup.select('#bd > div.main > ul > li > div > div.name > a:nth-of-type(1)')
    for url in url_lists:
        # href starts with a leading '/'; drop it and prepend the /yy prefix.
        url_hos.append('http://y.soyoung.com/yy' + url.get('href')[1:])
# Visit each hospital page, read its online-reservation count, then page
# through its reservation list and print one record per product item.
for url_yy in url_hos:
    web_db = requests.get(url_yy, timeout=10)  # timeout: don't hang on a dead host
    soup = BeautifulSoup(web_db.text, 'lxml')

    # The header span reads like '在线预约(N)'; N drives how many list pages
    # to fetch (first page shows up to 12 items, later pages 10 each).
    yy_page = 0  # FIX: was unbound (NameError) when the span was missing/empty
    for yy in soup.select('#bd > div.abox.prior_list > div.head > span:nth-of-type(2)'):
        count_text = yy.get_text().replace('在线预约(', '').replace(')', '')
        if len(count_text) > 0:
            yy_num = int(count_text)
            yy_page = 1 if 0 <= yy_num <= 12 else math.ceil(yy_num / 10)

    for page in range(1, yy_page + 1):
        url_page = url_yy + '?page={}'.format(page)
        web_db = requests.get(url_page, timeout=10)
        soup = BeautifulSoup(web_db.text, 'lxml')

        # Hospital name from the page header.
        host_name = ''  # FIX: was unbound when the name selector matched nothing
        for host in soup.select('#hd > div.m_con_b > div.name_box > a.name'):
            host_name = host.get_text()

        # Up to 12 reservation items per page.  Fields are selected per <li>
        # so an item with a missing field is skipped by zip() rather than
        # misaligning the remaining columns.
        li_base = '#bd > div.abox.prior_list > div.list_set_box > ul > li:nth-of-type({})'
        for x in range(1, 13):
            li = li_base.format(x)
            cps = soup.select(li + ' > p.title > a')
            money1s = soup.select(li + ' > p.price > span.num')
            money2s = soup.select(li + ' > p.price > del')
            yys = soup.select(li + ' > div.end > div.line > span')
            notes = soup.select(li + ' > div.end > div:nth-of-type(2) > span')
            for cp, money1, money2, yy, note in zip(cps, money1s, money2s, yys, notes):
                data = {
                    'cp': cp.get_text(),       # product title
                    'money1': money1.get_text(),  # current price
                    'money2': money2.get_text(),  # original (struck-through) price
                    'yy': yy.get_text(),       # reservation count
                    'note': note.get_text(),
                    'host': host_name,
                }
                print(data)
# 6. A simple little scraper for SoYoung (soyoung.com)
# (Blog footer: latest recommended article published 2021-06-14 13:15:40)