房天下的爬取本身没有技术难点,不做过多讲解。只讲一讲在爬取过程中遇到的反爬问题。
房天下设置了页面的跳转作为反爬措施,即:在访问网页时会进行多次跳转才会进入到真正的目标页面。因此我编写了find_real_url
来解决这个问题。在编写程序时,通过观察跳转页面的网页源码,可以比较容易地观察到跳转网页的源码中存在新的url,因此在爬取时通过正则表达式获取新的url并访问,直到目标页面中的目标元素出现,即:self.condition中的条件得到满足。
import requests
from bs4 import BeautifulSoup
from urllib import parse
import re
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.orm import sessionmaker, scoped_session
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.exc import SQLAlchemyError
base = declarative_base()
engine = create_engine(
'mysql+pymysql://root:1117@127.0.0.1:3306/house',
max_overflow=500,
pool_size=100,
echo=False
)
class House(base):
__tablename__ = 'House'
id = Column(Integer, primary_key=True, autoincrement=True)
house = Column(String(100))
rent = Column(String(50))
mode = Column(String(50))
house_type = Column(String(50))
area = Column(String(50))
orientation = Column(String(50))
floor = Column(String(50))
decorate = Column(String(50))
# base.metadata.create_all(engine)
class HouseInformation(object):
    """Crawl every rental listing of one sh.zu.fang.com district and persist it.

    The site's anti-crawling measure is a chain of interstitial redirect pages;
    ``find_real_url`` follows the ``location.href`` hops until the page-specific
    condition stored in ``self.condition`` reports that the real target page has
    been reached.

    NOTE: the constructor immediately starts the whole crawl (``get_url``).
    """

    def __init__(self, region_index):
        # Headers copied from a logged-in browser session; the Cookie (incl. the
        # solved Captcha token) is what gets the crawler past the redirect wall.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
            'Cookie': 'global_cookie=o89u8zrl9wdc5k7uzktd7382n1ak7ha9uks; integratecover=1; __utmc=147393320; city=sh; ASP.NET_SessionId=rkismcpoktv1clyayleoz0sf; Rent_StatLog=3d24a513-78b7-4d99-b62d-67f5aa046cbe; keyWord_recenthousesh=%5b%7b%22name%22%3a%22%e6%b5%a6%e4%b8%9c%22%2c%22detailName%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a025%2f%22%2c%22sort%22%3a1%7d%2c%7b%22name%22%3a%22%e9%95%bf%e5%ae%81%22%2c%22detailName%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a020%2f%22%2c%22sort%22%3a1%7d%2c%7b%22name%22%3a%22%e9%9d%99%e5%ae%89%22%2c%22detailName%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a021%2f%22%2c%22sort%22%3a1%7d%5d; __utma=147393320.1053454490.1583565953.1583583445.1583586280.4; __utmz=147393320.1583586280.4.4.utmcsr=search.fang.com|utmccn=(referral)|utmcmd=referral|utmcct=/captcha-43be97b647ad9f449d/redirect; Captcha=4166566F4C5A66774C765173364F68745A767155766248786D2F507365567664484D6F334E55656C77576F6C34395330324B702F70594A7A503636455250783564706B4B6B3773705972343D; __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; g_sourcepage=zf_fy%5Exq_pc; unique_cookie=U_o89u8zrl9wdc5k7uzktd7382n1ak7ha9uks*79; __utmb=147393320.12.10.1583586280',
            'Connection': 'keep_alive'}
        self.session = requests.session()
        self.session.headers = self.headers
        self.html = None        # BeautifulSoup of the most recently fetched page
        self.condition = None   # callable() -> bool: "is the real page loaded?"
        self.url = None         # URL currently being fetched/processed
        self.region = region_index
        self.info_dict = {}     # fields of the listing currently being parsed
        self.base_url = 'https://sh.zu.fang.com/house-a0'
        self.get_url()

    def get_html(self):
        """Navigate from the current result page to one listing's detail page."""
        # The list page is "real" once the result list is present.
        self.condition = lambda: len(self.html.select('div.houseList dl a')) != 0
        self.find_real_url()
        part_url = self.html.select('div.houseList dl a')[0]['href']
        self.url = parse.urljoin(self.url, part_url)
        # The detail page is "real" once the price element is present.
        self.condition = lambda: len(self.html.select('div.trl-item')) != 0
        self.find_real_url()

    def get_url(self):
        """Crawl every result page of this district, one page at a time.

        Pages share mutable state (``self.url`` / ``self.html``), so they are
        processed sequentially.  (The previous per-page ThreadPoolExecutor was
        serialized anyway: each ``with`` block waited on its single task.)
        """
        self.url = self.base_url + str(self.region)
        url_region = self.url
        max_page = self.get_max_page()
        for page in range(1, max_page + 1):
            self.url = url_region + r'/i3{}'.format(page)
            self.crawl()

    def get_max_page(self):
        """Return the number of result pages, read from the pager text."""
        self.condition = lambda: len(self.html.select('span.txt')) != 0
        self.find_real_url()
        # Pager text looks like "共N页"; strip the surrounding characters.
        max_page = int(self.html.select('span.txt')[0].text[1:-1])
        return max_page

    def find_real_url(self):
        """Follow interstitial redirect pages until ``self.condition()`` holds.

        Each interstitial embeds the next hop as ``location.href="..."`` in its
        source; we extract it with a regex and re-fetch.

        Raises:
            RuntimeError: if a non-target page contains no further redirect URL
                (e.g. the captcha cookie has expired).
        """
        self.html = BeautifulSoup(self.session.get(self.url).text, 'lxml')
        pattern = re.compile('location.href="(.*?)"', re.S)
        while not self.condition():
            redirects = pattern.findall(str(self.html))
            if not redirects:
                raise RuntimeError(
                    'no redirect URL found on page: {}'.format(self.url))
            self.url = redirects[0]
            self.html = BeautifulSoup(self.session.get(self.url).text, 'lxml')

    def get_data(self):
        """Parse the current detail page into ``self.info_dict``.

        Keys are the on-page field labels (Chinese); ``'house'`` and ``'price'``
        are filled explicitly, and index 4 is additionally stored under '楼层'
        because its on-page label does not match the column name.
        """
        def _clean(text, strip_cr=False):
            # The page text is padded with spaces/newlines (and \r in the title).
            text = text.replace(' ', '').replace('\n', '')
            return text.replace('\r', '') if strip_cr else text

        self.info_dict['house'] = _clean(
            self.html.select('div.title')[0].text, strip_cr=True)
        self.info_dict['price'] = _clean(self.html.select('div.trl-item')[0].text)
        labels = self.html.select('div.tt + div')
        for index, info in enumerate(self.html.select('div.tt')):
            if index == 4:
                self.info_dict['楼层'] = _clean(info.text)
            self.info_dict[labels[index].text] = _clean(info.text)

    def save_data(self):
        """Persist the currently parsed listing; roll back on DB errors.

        A missing field label in ``info_dict`` raises KeyError (uncaught) —
        the page layout is assumed to be uniform across listings.
        """
        sqlsession = scoped_session(sessionmaker(engine))()
        try:
            house = House(house=self.info_dict['house'],
                          rent=self.info_dict['price'],
                          mode=self.info_dict['出租方式'],
                          house_type=self.info_dict['户型'],
                          area=self.info_dict['建筑面积'],
                          orientation=self.info_dict['朝向'],
                          floor=self.info_dict['楼层'],
                          decorate=self.info_dict['装修']
                          )
            sqlsession.add(house)
            sqlsession.commit()
            print('保存成功')
        except SQLAlchemyError:
            print('保存失败')
            sqlsession.rollback()
        finally:
            # Return the connection to the pool; the original leaked sessions.
            sqlsession.close()

    def crawl(self):
        """Fetch, parse, and store one listing from the current result page."""
        self.get_html()
        self.get_data()
        self.save_data()
if __name__ == '__main__':
    # One worker process per district index; the pool caps concurrency at 3.
    # Constructing HouseInformation starts that district's crawl immediately.
    districts = range(20, 25)
    with ProcessPoolExecutor(max_workers=3) as pool:
        for district in districts:
            print('进程池已建立')
            pool.submit(HouseInformation, district)