代码如下:
from bs4 import BeautifulSoup
import requests
# Listing pages to crawl: the site home page first, then search-result
# pages 2 through 4.
urls = ["http://bj.xiaozhu.com/"] + [
    "http://bj.xiaozhu.com/search-duanzufang-p{}-0/".format(page_number)
    for page_number in range(2, 5)
]

# Upper bound on how many detail pages the main loop analyzes.
max_of_pages = 30
def get_detail_page_url(url, timeout=10):
    """Return the detail-page links found on one listing page.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled connection would block the
            crawl indefinitely.

    Returns:
        A list with the ``href`` value of every anchor matched by the CSS
        selector ``#page_list > ul > li > a``, in document order. Anchors
        that carry no ``href`` are skipped.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            times out.
    """
    wb_data = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    page_urls = soup.select("#page_list > ul > li > a")
    # ``Tag.get`` returns None for a missing attribute; drop those (and empty
    # values) so callers never receive something that is not a URL.
    hrefs = (anchor.get("href") for anchor in page_urls)
    return [href for href in hrefs if href]
# Placeholder printed for any field whose element or attribute is absent.
_MISSING_FIELD = "N/A"


def _first_text(soup, selector):
    """Return the text of the first element matching *selector*, or "N/A"."""
    node = soup.select_one(selector)
    return _MISSING_FIELD if node is None else node.get_text()


def _first_attr(soup, selector, attr):
    """Return *attr* of the first element matching *selector*, or "N/A"."""
    node = soup.select_one(selector)
    value = None if node is None else node.get(attr)
    return _MISSING_FIELD if value is None else value


def analyze_detail_page(url_str, num, timeout=10):
    """Download one listing detail page and print its key fields.

    The printed fields are the sequence number, title, address, price,
    room image URL, owner image URL, owner name and owner sex, followed by
    a separator line. A field whose element (or attribute) is not present
    in the page is printed as "N/A" instead of aborting the crawl.

    Args:
        url_str: Address of the detail page to download.
        num: Sequence number printed on the "Number" line.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled connection would block the
            crawl indefinitely.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            times out.
    """
    page_data = requests.get(url_str, timeout=timeout)
    soup = BeautifulSoup(page_data.text, 'lxml')

    info_box = "body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info"
    owner_box = "#floatRightBox > div.js_box.clearfix"

    title = _first_text(soup, info_box + " > h4 > em")
    # The address is read from the paragraph's ``title`` attribute, not its text.
    address = _first_attr(soup, info_box + " > p", "title")
    # NOTE(review): the selector targets "day_l", so this may be a per-day
    # price even though the label below says "Monthly rental" -- confirm.
    monthly_rental = _first_text(soup, "#pricePart > div.day_l > span")
    room_image = _first_attr(soup, "#curBigImage", "src")
    owner_image = _first_attr(soup, owner_box + " > div.member_pic > a > img", "src")
    owner_name = _first_text(soup, owner_box + " > div.w_240 > h6 > a")

    # A span with class "member_girl_ico" inside the owner heading marks a
    # female owner; when the heading exists without it the owner is reported
    # as male.
    owner_heading = soup.select_one(owner_box + " > div.w_240 > h6")
    if owner_heading is None:
        sex = "unknown"
    elif owner_heading.find("span", class_="member_girl_ico") is not None:
        sex = "female"
    else:
        sex = "male"

    print("Number: " + str(num))
    print("Title: {}".format(title))
    print("Address: {}".format(address))
    print("Monthly rental: {}".format(monthly_rental))
    print("Room image: {}".format(room_image))
    print("Owner image: {}".format(owner_image))
    print("Owner name: {}".format(owner_name))
    print("Owner sex: " + sex)
    print("-------------------------------------------------------")
def main():
    """Crawl the listing pages and print at most ``max_of_pages`` detail pages.

    Listing pages are visited in the order given by ``urls``. Every detail
    link found on them is analyzed and numbered from 1 until
    ``max_of_pages`` pages have been processed, at which point the whole
    crawl stops.
    """
    if max_of_pages <= 0:
        return
    count = 0
    for url in urls:
        for page_url in get_detail_page_url(url):
            count += 1
            analyze_detail_page(page_url, count)
            # Check right after each page and leave BOTH loops: once the
            # limit is reached, no further listing page should be downloaded
            # only to be thrown away.
            if count >= max_of_pages:
                return


if __name__ == '__main__':
    main()
结果如下:
/Library/Frameworks/Python.framework/Versions/3.5/bin/python3.5 /Users/reed/PycharmProjects/web01/web_parse3.py
Number: 1
Title: 北京国贸CBD 近地铁旁精品舒适两居室
Address: 北京市朝阳区百子湾路16号后现代城
Monthly rental: 456
Room image: http://image.xiaozhustatic1.com/00,800,533/8,0,32,1048,1800,1200,10b2517f.jpg
Owner image: http://image.xiaozhustatic1.com/21/8,0,48,47,334,334,d6d989f6.jpg
Owner name: 知足小家
Owner sex: female
-------------------------------------------------------
Number: 2
Title: 【两居特价】二环鼓楼大街后海鸟巢温馨两居
Address: 北京市东城区鼓楼大街安德路55号院
Monthly rental: 398
Room image: http://image.xiaozhustatic1.com/00,800,533/8,0,79,1519,1800,1200,0af68cad.jpg
Owner image: http://image.xiaozhustatic1.com/21/8,0,61,1483,375,376,8c9f4abd.jpg
Owner name: 天天Tinny
Owner sex: female
-------------------------------------------------------
Number: 3
Title: 市中心地铁旁高端LOFT 可欣赏迷人夜景
Address: 北京市西城区广安门外大街
Monthly rental: 328
Room image: http://image.xiaozhustatic1.com/00,800,533/3,0,33,4595,3000,2000,1f72b9f8.jpg
Owner image: http://image.xiaozhustatic1.com/21/2,0,10,298