[Part 4] Python | Getting Housing Price Data


Python | Scraping Anjuke Housing Price Data

Housing Price Data on Online Platforms

In the era of "real estate + big data", online platforms publish as many as a million real-estate news items every day, generating hundreds of millions of page views. On one hand, public interest in real estate keeps growing; on the other, the industry keeps optimizing every link of its value chain around the goal of "understanding users better", giving consumers more transparent and more multi-dimensional real-estate information.

One more point: according to statistics, 22:00-23:00 each evening is when mobile users are most inclined to read real-estate news.

Without further ado, let's take advantage of that time slot and get straight to the point!

Data Acquisition Approach

This post explains how to obtain second-hand (resale) housing prices from the Anjuke website.

Step 1: get the URL of each district's second-hand community listing page for a given city on Anjuke;

Step 2: inside a district, get the total number of listed communities and derive the number of result pages (about 30 communities per page; see the sketch after this list);

Step 3: crawl the second-hand community information page by page.
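
The page count in Step 2 can also be computed with a ceiling division instead of the fixed "+2" margin used in the script below. This is only a minimal sketch, assuming 30 communities per listing page (the same value the script assumes):

import math

def pages_needed(total_communities, per_page=30):
    """Smallest number of pages that covers all communities."""
    return math.ceil(total_communities / per_page)

# e.g. a district with 950 communities at 30 per page needs 32 pages
print(pages_needed(950))  # 32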

Note: without proxy IPs, the amount of data a single person can fetch is limited; to collect the complete dataset you will need other approaches (to be shared in a later post).
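
For reference, requests can route traffic through a proxy via its proxies parameter. The sketch below only illustrates the call shape; the proxy address is a placeholder, not a working endpoint:

import requests

# Placeholder proxy endpoint -- replace with a real one from your proxy provider.
proxies = {
    "http": "http://127.0.0.1:8888",
    "https": "http://127.0.0.1:8888",
}

resp = requests.get("https://xa.anjuke.com/community/",
                    proxies=proxies, timeout=10)
print(resp.status_code)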

Below I'll walk through how I wrote a Python script to fetch part of the data myself.

The Python script is as follows.

import requests
from lxml import etree
import pandas as pd
import random
import time
# Get the URL of each district's second-hand community listing page for a given city on Anjuke
def get_different_area_wang_zhi(url):
    headers = {"Cache-Control": "max-age=0",
               "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
               "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
               "Accept-Language": "zh-CN,zh;q=0.9"}
    response = requests.get(url, headers=headers).text
    re = etree.HTML(response)
    # The XPath returns a list of district URLs
    wang_zhi = re.xpath('//div[@class="w1180"]//div[@class="items"]/span[@class="item-title longtag"][contains(text(),"区域:")]/../span[@class="elems-l pp-mod"]/a/@href')
    return wang_zhi[1:]  # the first URL is the "all communities" page, so drop it
# Get the total number of result pages (n) for one district's second-hand communities
def get_one_area_number(url):
    headers = {"Cache-Control": "max-age=0",
               "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
               "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
               "Accept-Language": "zh-CN,zh;q=0.9"}
    response = requests.get(url, headers=headers).text
    re = etree.HTML(response)
    number = re.xpath('//div[@class="w1180"]//span[@class="tit"]/em[2]/text()')
    nu = int(number[0])  # total number of communities in this district
    n = int((nu / 30) + 2)  # ~30 communities per page; the +2 is a safety margin
    return n
# Crawl second-hand community information from Anjuke
def anjuke_new(item,n,city_name):
    mozilla = [
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)"
        ]
    headers = {"Cache-Control": "max-age=0",
               "User-Agent": "{}".format(random.choice(mozilla)),
               "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
               "Accept-Language": "zh-CN,zh;q=0.9"}
    for i in range(1,n):
        url = item + "p" +str(i)+'/'
        l = []
        try:
            print("正在爬取{}的第{}页".format(url,i))
            res = requests.get(url,headers = headers).text
            f = etree.HTML(res)
            # Extract name, price, price unit, address and construction year for every community on this page
            name = f.xpath('//div[@class="list-content"]/div[@_soj="xqlb"]/div[@class="li-info"]/h3/a/text()')
            price = f.xpath('//div[@class="list-content"]/div[@_soj="xqlb"]/div[@class="li-side"]/p/strong/text()')
            dan_wei = f.xpath('//div[@class="list-content"]/div[@_soj="xqlb"]/div[@class="li-side"]/p[1]/text()')
            address = f.xpath('//div[@class="list-content"]/div[@_soj="xqlb"]/div[@class="li-info"]/address/text()')
            year = f.xpath('//div[@class="list-content"]/div[@_soj="xqlb"]/div[@class="li-info"]/p[@class="date"]/text()')
            for j in range(len(price)):
                d = {}
                d["小区名称"] = name[j]
                d["价格"] = price[j]
                k = 2 * j
                d["价格单位"] = dan_wei[k + 1]
                d["小区地址"] = address[j]
                d["建造年代"] = year[k]
                l.append(d)
        except Exception:
            print("爬取{}的第{}页错误".format(url, i))
            with open(r"./anjuke_second_house_error.txt", "a") as err_file:
                err_file.write("{}第{}页出错\n".format(url, i))
            continue
        print("爬取{}的第{}页完成".format(url,i))
        data = pd.DataFrame(l)
        data.to_csv(r"./{0}_anjuke_second_house_zero.csv".format(city_name), mode="a", header=None, encoding="utf-8_sig")
        time.sleep(random.randint(5,10))

# Main entry point for crawling Anjuke second-hand communities
def anjuke_second_main(url,city_name):
    wang_zhi = get_different_area_wang_zhi(url)
    for item in wang_zhi:
        n = get_one_area_number(item)
        anjuke_new(item,n,city_name)
        time.sleep(random.randint(10,15))
# To crawl another city, just change the city abbreviation in the url, e.g. xa for Xi'an, xm for Xiamen; check the Anjuke site for the exact abbreviation.
if __name__ == "__main__":
    url = "https://xa.anjuke.com/community/"
    # used to name the output file
    city_name = "xian"
    anjuke_second_main(url,city_name)

Running the script above produces a data sample like the one below.

(Screenshot: sample rows of the scraped community data)
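
Because the script writes the CSV with header=None, the saved file has no header row. Here is a minimal sketch for loading it back with pandas, assuming the column order follows the dict insertion order in anjuke_new (true for modern pandas) and that to_csv wrote the DataFrame index as the first column:

import pandas as pd

# Column order matches the dict keys built in anjuke_new; the first column is
# the unnamed DataFrame index written by to_csv.
columns = ["index", "小区名称", "价格", "价格单位", "小区地址", "建造年代"]
df = pd.read_csv("./xian_anjuke_second_house_zero.csv",
                 header=None, names=columns, encoding="utf-8-sig")
print(df.head())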

交通充电圈 (WeChat ID: Transport_Circle)
