from bs4 import BeautifulSoup
import requests
import math
# Collect hospital detail-page URLs from the 14 index pages of the
# soyoung.com hospital listing (category filters are baked into the path).
url_hos = []
for i in range(1, 15):
    url_source = 'http://y.soyoung.com/hospital/0_0_0_0_0_0_415_0_0_2/{}'.format(i)
    # timeout added so a dead server cannot hang the scraper forever
    web_db = requests.get(url_source, timeout=10)
    soup = BeautifulSoup(web_db.text, 'lxml')
    # One li-level selector replaces the original per-index loop that ran
    # 8 nearly identical nth-of-type selectors per page.
    url_lists = soup.select('#bd > div.main > ul > li > div > div.name > a:nth-of-type(1)')
    for url in url_lists:
        # href starts with a leading '/'; drop it and prepend the /yy prefix.
        url_hos.append('http://y.soyoung.com/yy' + url.get('href')[1:])
# Visit each hospital page, read its online-reservation count, then page
# through its reservation list and print one record per product item.
for url_yy in url_hos:
    web_db = requests.get(url_yy, timeout=10)  # timeout: don't hang on a dead host
    soup = BeautifulSoup(web_db.text, 'lxml')

    # The header span reads like '在线预约(N)'; N drives how many list pages
    # to fetch (first page shows up to 12 items, later pages 10 each).
    yy_page = 0  # FIX: was unbound (NameError) when the span was missing/empty
    for yy in soup.select('#bd > div.abox.prior_list > div.head > span:nth-of-type(2)'):
        count_text = yy.get_text().replace('在线预约(', '').replace(')', '')
        if len(count_text) > 0:
            yy_num = int(count_text)
            yy_page = 1 if 0 <= yy_num <= 12 else math.ceil(yy_num / 10)

    for page in range(1, yy_page + 1):
        url_page = url_yy + '?page={}'.format(page)
        web_db = requests.get(url_page, timeout=10)
        soup = BeautifulSoup(web_db.text, 'lxml')

        # Hospital name from the page header.
        host_name = ''  # FIX: was unbound when the name selector matched nothing
        for host in soup.select('#hd > div.m_con_b > div.name_box > a.name'):
            host_name = host.get_text()

        # Up to 12 reservation items per page.  Fields are selected per <li>
        # so an item with a missing field is skipped by zip() rather than
        # misaligning the remaining columns.
        li_base = '#bd > div.abox.prior_list > div.list_set_box > ul > li:nth-of-type({})'
        for x in range(1, 13):
            li = li_base.format(x)
            cps = soup.select(li + ' > p.title > a')
            money1s = soup.select(li + ' > p.price > span.num')
            money2s = soup.select(li + ' > p.price > del')
            yys = soup.select(li + ' > div.end > div.line > span')
            notes = soup.select(li + ' > div.end > div:nth-of-type(2) > span')
            for cp, money1, money2, yy, note in zip(cps, money1s, money2s, yys, notes):
                data = {
                    'cp': cp.get_text(),       # product title
                    'money1': money1.get_text(),  # current price
                    'money2': money2.get_text(),  # original (struck-through) price
                    'yy': yy.get_text(),       # reservation count
                    'note': note.get_text(),
                    'host': host_name,
                }
                print(data)
# 6. A simple little scraper for SoYoung (soyoung.com)
# (Blog footer: latest recommended article published 2021-06-14 13:15:40)