目录
-
项目简介
爬取贝壳找房北京二手房信息数据,并整理成可分析数据
-
数据获取
(修正后数据链接:https://pan.baidu.com/s/1C3_eseM-wjW3mo-WUvgCGw 提取码:73iw )
1.爬虫
从贝壳找房爬取北京二手房最新数据,代码如下:
import re
import requests
import pandas as pd
import numpy as np
# Beijing districts: scrape the district URL slugs from the listing landing page.
def get_region(url):
    """Return the list of district URL slugs (e.g. 'chaoyang') found on *url*.

    Fetches the Beijing second-hand-housing landing page on bj.ke.com and
    extracts the district identifiers from the filter links via regex.

    Raises requests.HTTPError on a non-2xx response and
    requests.Timeout if the server/proxy does not answer in time.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36",
        "Referer": "https://bj.ke.com/ershoufang/"}
    # BUGFIX: requests looks up proxies by lowercase scheme key, so the
    # original {"HTTP": ...} entry was silently ignored.  The target URL is
    # https, so an "https" entry is required for the proxy to apply at all;
    # "http" is kept for any plain-http redirects.
    proxies = {"http": "http://113.108.242.36:47713",
               "https": "http://113.108.242.36:47713"}
    # timeout prevents a dead proxy from hanging the scraper forever
    r = requests.get(url, headers=headers, proxies=proxies, timeout=10)
    r.raise_for_status()  # fail loudly instead of regex-parsing an error page
    # Fragile by design: matches the exact anchor markup of the district
    # filter links; the capture group is the district slug in the href.
    region_re = re.compile(r'''<a class=" CLICKDATA" data-click-evtid="12339" data-click-event="WebModuleClick" data-action="source_type=PC小区列表筛选条件点击"
href="/ershoufang/(.*?)/" title="北京.*?在售二手房 ">''')
    region = re.findall(region_re, r.content.decode('utf-8'))
    return region
#北京区域小区价格详情
def get_data(region):
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36",
"Referer": "https://bj.ke.com/ershoufang/"}
proxies = {"HTTP": "113.108.242.36:47713"}
for i in region:
urll = url + i + '/'
position = []
house = []
price = []
unitprice = []
for j in range(1, 51):
urlll = urll + 'pg' + str(j) + '/'
rr = requests.get(urlll, headers=headers, proxies=proxies)
position_re = re.compile(r'''<span class="positionIcon">.*?<a href=".*?">(.+?)</a>''', re.S)
house_re = re.compile(r'''<div class="houseInfo">.*?</span>\n\s(.*?)</div>''', re.S)
price_re = re.compile(r'''<div class="totalPrice"><span>(.*?)</span>''', re.S)
u_re = re.compile(r'''<div class="unitPrice".*?data-hid=".*?" dat