一、前言
之前有买二手房的需求,苦于房源太多无从下手,一个一个看又太慢,所以借助 Python 批量爬取房源,筛选出符合我们条件的房源,再看的话会节省很多时间。
二、思路
某家上房源比较真实可靠,这里就拿某家作为目标站点。
首先 打开 某家-选择区域,比如 “北京”-二手房,筛选条件勾选-海淀区-500到800万-2室-集体供暖
这时候结果有30页,点下第二页,复制URL
会发现URL里多了个 pg2 应该就是page2的意思,第二页,改成几就是第几页
https://bj.*****.com/ershoufang/haidian/pg2mw1hy1l2p6/
另外就是找的房子离上班的地方近一些,路程1小时内的。
这时候就可以上高德开放服务(https://lbs.amap.com/),去申请地图应用了
注册账号后,点击应用管理-我的应用-创建应用-填入名称和应用类型,点击新建
点击添加
然后添加Key名称,服务平台选择web服务
提交后会有一个key生成
并且有一定的免费额度,对于我们使用来说完全足够了。
之后就可以上代码了
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
import json
import random
import re
import time

import demjson
import requests
from bs4 import BeautifulSoup
def getheaders(url, use='pc'):
    """Build a browser-like HTTP header dict for requests to *url*.

    The ``Referer`` header is set to the scheme+host prefix of *url*
    (e.g. ``https://bj.example.com/``) so the target site sees a plausible
    origin.  If the prefix cannot be extracted, the full *url* is used as a
    fallback instead of raising (the original lambda crashed with
    ``AttributeError`` on non-matching URLs).

    :param url: page URL the headers will be used against.
    :param use: kept for backward compatibility; currently unused
                (only a desktop ("pc") user agent is emitted).
    :return: dict of HTTP request headers.

    user_agent部分来源: https://blog.csdn.net/IT__LS/java/article/details/78880903
    正则来源: https://www.cnblogs.com/blacksonny/p/6055357.html
    """
    # Raw string avoids the invalid "\-" escape warning of the original.
    match = re.search(
        r"^((http://)|(https://))?([a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,6}(/)",
        url)
    referer = match.group() if match else url
    agent = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'
    headers = {
        'User-Agent': agent,
        'Referer': referer,
        'DNT': "1",
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Connection': 'keep-alive',
        'Accept-Language': 'zh-CN,zh;q=0.9,en-CN;q=0.8,en;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
    }
    return headers
# Request headers: use the listing page itself so the Referer looks natural.
headers = getheaders('https://bj.*****.com/ershoufang/haidian/pg2mw1hy1l2p6/ ')
print(headers)

# Optional proxy; only needed when requests become frequent enough to be throttled.
proxy = '183.*.*.3:80'
proxies = {
    # "http": "http://" + proxy + "/",
    "https": "http://" + proxy + "/",
}
# My location (office), pickable via https://lbs.amap.com/tools/picker
my_location = "116.656435,39.909946"
# AMap web-service key (was duplicated in three URL literals).
amap_key = "49e6************084540d78"

# Pages 1..30 — the original range(1, 30) silently skipped page 30.
for step_i in range(1, 31):
    # Compose the page URL ("pgN" selects page N) and fetch the listing page.
    response = requests.get(
        'https://bj.*****.com/ershoufang/haidian/pg' + str(step_i) + 'mw1hy1l2p6/ ',
        headers=headers, proxies=proxies)
    soup = BeautifulSoup(response.text, 'lxml')
    # Each <li> under ul.sellListContent is one second-hand-house listing.
    list_item = soup.find('ul', 'sellListContent').find_all('li')
    for item in list_item:
        # Community (小区) name of this listing.
        community = item.find('div', "positionInfo").find('a').string.strip()
        # Geocode the community via the AMap place-text search API (GET).
        query_community_location = (
            "https://restapi.amap.com/v3/place/text?key=" + amap_key
            + "&keywords=" + community
            + "&types=&city=北京&children=&offset=1&page=1&extensions=all")
        # stdlib json replaces third-party demjson (the payload is plain JSON).
        place_info = json.loads(requests.get(query_community_location).text)
        if place_info['status'] == "1" and place_info['count'] != '0':
            destination = str(place_info['pois'][0]['location'])
            # Integrated public-transit route from my_location to the community.
            query_transit = (
                "https://restapi.amap.com/v3/direction/transit/integrated?key=" + amap_key
                + "&origin=" + my_location + "&destination=" + destination
                + "&city=北京&strategy=2&nightflag=0")
            transit_info = json.loads(requests.get(query_transit).text)
            if transit_info['status'] == "1" and transit_info['count'] != "0":
                # Best-effort per listing: page/API structure varies, so keep the
                # crawl alive on any parsing failure.  `except Exception` replaces
                # the original bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit.
                try:
                    best = transit_info['route']['transits'][0]
                    distance = best['distance']                  # total metres
                    walking_distance = best['walking_distance']  # metres on foot
                    duration = best['duration']                  # seconds
                    price = best['cost']                         # fare in yuan
                    # Skip anything more than 70 minutes door to door.
                    if int(duration) // 60 > 70:
                        continue
                    detail_url = item.find('a', 'noresultRecommend')['href']
                    # Distance and transit summary.
                    print(community)
                    print(detail_url)
                    print(" 从我公司到-" + community + ":距离:" + str(int(distance) / 1000) + "公里_需要步行:" + str(int(walking_distance) / 1000) + "公里_共耗时:" + str(int(duration) // 60) + " 分钟_" + "花费:" + price)
                    # Subway stations within 1 km of the community.
                    query_metro = (
                        "https://restapi.amap.com/v3/place/around?key=" + amap_key
                        + "&location=" + destination
                        + "&keywords=地铁&radius=1000&offset=20&page=1&extensions=all")
                    metro_info = json.loads(requests.get(query_metro).text)
                    if metro_info['status'] == "1":
                        print(" " + community + " 附近共有 " + metro_info['count'] + " 条地铁,分别是:")
                        for poi in metro_info['pois']:
                            print(" " + poi['name'] + " " + poi['address'] + " 距离小区:" + poi['distance'] + "米")
                    # Detail page: basic house attributes live under div.base > ul.
                    detail_html = requests.get(detail_url, headers=headers, proxies=proxies).text
                    detail_soup = BeautifulSoup(detail_html, 'lxml')
                    # Hoist find_all('li') out of the prints (was re-run per field).
                    lis = detail_soup.find('div', 'base').find('ul').find_all('li')
                    print(' 户型:' + lis[0].get_text())
                    print(' 楼层:' + lis[1].get_text())
                    print(' 建面:' + lis[2].get_text())
                    print(' 实际:' + lis[4].get_text())
                    print(' 供暖:' + lis[10].get_text())
                    print(' 电梯:' + lis[11].get_text())
                    print(' 信息:' + item.find('div', 'houseInfo').get_text())
                    print(" 售价:" + item.find('div', 'totalPrice').find('span').string + "万")
                except Exception:
                    # Malformed listing or API answer — skip it and move on.
                    continue
    # time.sleep(0.1)  # uncomment to throttle between result pages
以上方法还可以筛选公交等线路信息,以此缩小范围,祝你早日找到心仪房产
以上仅限合理合法个人研究使用,因此产生的一切责任和损失本人概不负责。