6、一个简单的新氧的小爬虫

from bs4 import BeautifulSoup
import requests
import math

# Detail-page URLs of every hospital found on the listing pages; filled below
# and consumed by the per-hospital crawl further down the script.
url_hos = []

# Walk listing pages 1..14 of the hospital index.
for i in range(1, 15):
    url_source = 'http://y.soyoung.com/hospital/0_0_0_0_0_0_415_0_0_2/{}'.format(i)
    # Without a timeout requests waits forever on a stalled connection; with
    # one, a dead server surfaces as requests.exceptions.Timeout instead.
    web_db = requests.get(url_source, timeout=10)
    soup = BeautifulSoup(web_db.text, 'lxml')
    # Only the first 8 <li> positions of the listing are inspected.
    for j in range(1, 9):
        url_lists = soup.select('#bd > div.main > ul > li:nth-of-type({}) > div > div.name > a:nth-of-type(1)'.format(j))
        for url in url_lists:
            href = url.get('href')
            if not href:
                # An anchor with a missing/empty href cannot be turned into a
                # hospital URL (slicing None would raise TypeError).
                continue
            # Drop the first character of the href (presumably a leading '/'
            # -- confirm against the live markup) and graft it onto the base.
            url_hos.append('http://y.soyoung.com/yy' + href[1:])

# Visit every hospital page collected in url_hos and print one dict per
# product entry found on its paginated product list.
for url_yy in url_hos:
    # Without a timeout requests waits forever on a stalled connection.
    web_db = requests.get(url_yy, timeout=10)
    soup = BeautifulSoup(web_db.text, 'lxml')
    count_spans = soup.select('#bd > div.abox.prior_list > div.head > span:nth-of-type(2)')

    # Reset for each hospital so a page without a name anchor can neither
    # raise NameError nor silently reuse the previous hospital's name.
    host_name = None

    for count_span in count_spans:
        # The span text is expected to look like '在线预约(N)'; removing the
        # label and the closing parenthesis leaves the number N.
        count_text = count_span.get_text().replace('在线预约(', '').replace(')', '')
        try:
            yy_num = int(count_text)
        except ValueError:
            # Empty or non-numeric text: there is no count to paginate on.
            continue

        # NOTE(review): page count is derived with a divisor of 10 while 12
        # item slots are scanned per page below -- confirm the real page size.
        if 0 <= yy_num <= 12:
            yy_page = 1
        else:
            yy_page = math.ceil(yy_num / 10)

        for page in range(1, yy_page + 1):
            url_page = url_yy + '?page={}'.format(page)
            page_db = requests.get(url_page, timeout=10)
            page_soup = BeautifulSoup(page_db.text, 'lxml')

            # If several anchors match, the last one wins.
            for host in page_soup.select('#hd > div.m_con_b > div.name_box > a.name'):
                host_name = host.get_text()

            # Inspect <li> positions 1..12 of the product list.
            for x in range(1, 13):
                item_css = '#bd > div.abox.prior_list > div.list_set_box > ul > li:nth-of-type({})'.format(x)
                cps = page_soup.select(item_css + ' > p.title > a')
                money1s = page_soup.select(item_css + ' > p.price > span.num')
                money2s = page_soup.select(item_css + ' > p.price > del')
                item_yys = page_soup.select(item_css + ' > div.end > div.line > span')
                notes = page_soup.select(item_css + ' > div.end > div:nth-of-type(2) > span')

                # zip stops at the shortest list, so an item missing any one
                # of the five fields produces no record at all.
                for cp, money1, money2, item_yy, note in zip(cps, money1s, money2s, item_yys, notes):
                    data = {
                        'cp': cp.get_text(),
                        'money1': money1.get_text(),
                        'money2': money2.get_text(),
                        'yy': item_yy.get_text(),
                        'note': note.get_text(),
                        'host': host_name,
                    }
                    print(data)
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值