引言
本篇博文是对上一篇文章中存在问题的修正,上一篇文章中使用了selenium来爬取页面,效率比较低,这篇文章中,我直接使用requests库进行爬取并且增强了程序的健壮性。
思路
上一篇文章中已经分析了,这里就不重复造轮子了,请出门,右转。
文章地址
代码
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time: 2020/2/8 9:08
# @Author: Martin
# @File: fang.py
# @Software:PyCharm
import requests
import re
import pymongo
from lxml import etree
class FangSpider(object):
    """Crawl new-house listings for every domestic city on fang.com and
    persist each listing as one document into a local MongoDB database
    ('fangtianxia')."""

    def __init__(self):
        # City index page: a single table listing every city; the overseas
        # section ('海外') comes after all domestic cities.
        self.start_url = 'https://www.fang.com/SoufunFamily.htm'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
        }
        self.client = pymongo.MongoClient(host='localhost', port=27017)
        self.db = self.client['fangtianxia']

    def run(self):
        """Entry point: download the city index page and start parsing."""
        try:
            # timeout added so a stalled connection cannot hang the spider forever
            response = requests.get(self.start_url, headers=self.headers, timeout=10)
            response.encoding = 'gbk'  # fang.com serves GBK-encoded pages
            self.parse_page(response.text)
        except Exception as e:
            # top-level boundary: report and stop instead of crashing with a traceback
            print(e)

    def parse_page(self, text):
        """Extract (city_name, city_url) pairs from the index page and hand
        them to parse_page_url().

        :param text: decoded HTML of the city index page
        """
        html = etree.HTML(text)
        a_list = html.xpath('//table[@id="senfe"]//td//a')
        china_house = []
        for a in a_list:
            city_url = "".join(a.xpath('@href'))
            city_name = "".join(a.xpath('text()'))
            # Overseas cities are listed after all domestic ones, so stop
            # collecting once the '海外' marker is reached.
            if city_name == '海外':
                break
            china_house.append((city_name, city_url))
        self.parse_page_url(china_house)

    def parse_page_url(self, china_house):
        """For each city, build its new-house listing URL and walk every
        result page from page 2 up to the last page.

        :param china_house: list of (city_name, city_url) tuples
        """
        for city_name, city_url in china_house:
            # e.g. https://cd.fang.com -> http://cd.newhouse.fang.com/house/s/
            new_house_url = 'http://' + city_url.split("//")[-1].split(".")[0] + '.newhouse.fang.com/house/s/'
            if city_name == '北京':
                # Beijing uses the bare newhouse domain instead of a city prefix.
                new_house_url = 'http://newhouse.fang.com/house/s'
            # Normalize once so page URLs never contain a double slash; both
            # next_url and end_url are built from the same base, so the
            # loop-termination comparison below stays consistent.
            base = new_house_url.rstrip('/')
            # Page 1 is parsed here; its HTML also carries the pagination links.
            html = self.parse_detail_page(new_house_url, city_name)
            try:
                end = "".join(html.xpath('//div[@class="page"]/ul/li[last()]/a[@class="last"]/@href')).strip()
                # href looks like '/house/s/b925/' -> take the 'b925' segment
                end_url = base + '/' + end.split('/')[-2]
            except (AttributeError, IndexError):
                # AttributeError: download failed (html is "") — IndexError:
                # no pagination block on the page (single-page city).
                print("未找到结束页码!")
                continue
            page = 2
            while True:
                next_url = base + "/b9" + str(page)
                page += 1
                self.parse_detail_page(next_url, city_name)
                if next_url == end_url:
                    break

    def parse_detail_page(self, url, city_name):
        """Download one listing page, save every listing found on it, and
        return the parsed lxml tree ("" on download/parse failure).

        :param url: listing page URL
        :param city_name: city the listings belong to (stored with each doc)
        :return: lxml element tree, or "" when the page could not be fetched
        """
        try:
            # timeout prevents one dead page from stalling the whole crawl
            r = requests.get(url, headers=self.headers, timeout=10)
        except Exception as e:
            print(e)
            return ""
        r.encoding = 'gbk'
        html = etree.HTML(r.text)
        if html is None:
            # Empty/unparseable body: treat exactly like a failed download so
            # callers fall into their existing error branch.
            return ""
        li_list = html.xpath('//div[@id="newhouse_loupai_list"]//ul//li')
        for li in li_list:
            name = "".join(li.xpath('.//div[@class="nlcd_name"]/a/text()')).strip()
            origin_url = "http://" + "".join(li.xpath('.//div[@class="nlcd_name"]/a/@href')).strip()
            # house_type and price contain layout whitespace/newlines — strip all of it
            house_type = re.sub(r'\s', "", "".join(li.xpath('.//div[contains(@class,"house_type")]//text()')))
            address = "".join(li.xpath('.//div[@class="address"]/a/@title')).strip()
            price = re.sub(r'\s', "", "".join(li.xpath('.//div[@class="nhouse_price"]//text()')))
            sale = "".join(li.xpath('.//div[@class="fangyuan"]/span/text()'))
            label = "".join(li.xpath('.//div[@class="fangyuan"]//a//text()'))
            house = {
                'city_name': city_name,
                'name': name,
                'house_type': house_type,
                'address': address,
                'price': price,
                'sale': sale,
                'label': label,
                'origin_url': origin_url
            }
            print(house)
            self.save(house)
        return html

    def save(self, house):
        """Insert one listing document into the MongoDB collection."""
        self.db.fangtianxia.insert_one(house)

    def close(self):
        """Release the MongoDB client connection."""
        self.client.close()
if __name__ == '__main__':
    spider = FangSpider()
    try:
        spider.run()
    finally:
        # Always release the MongoDB connection, even if run() raises.
        spider.close()