Python二手房价格预测(一)——数据获取

由于文章审核一直不通过,只能采用这种方式发布;若代码中的网址夹杂"、"等多余符号,运行前请先将其删除。
Python二手房价格预测(一)——数据获取

前言

        二手房价格预测一直是数据分析的基础入门课题,已有许多开源的房价预测数据集。这些数据集虽然经典,但时效性有所不足。因此我将在此记录用Python从0到1完成二手房房价预测的过程,先从数据获取开始。

全部代码

import requests
from bs4 import BeautifulSoup
import json
import pymongo
# Connection settings: MongoDB host and the database that receives the
# scraped listings.
MONGO_URL = 'localhost'
MONGO_DB = 'lianjia'

# One client and one database handle, shared by every function below.
client = pymongo.MongoClient(host=MONGO_URL)
db = client[MONGO_DB]

def getPage(url, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the body as text.

    Args:
        url: Address to request.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without one, a stalled connection would block the crawl
            indefinitely.

    Returns:
        ``res.text`` when the status code is 200. ``None`` when the status
        is anything else or when the request raised; the problem is printed
        in both cases.
    """
    try:
        res = requests.get(url, timeout=timeout)
        if res.status_code == 200:
            return res.text
    except Exception as e:
        # Best-effort crawl: report the failure and hand None to the caller.
        print(e)
        return None

    # Non-200 answer: say so instead of silently falling through.
    print("HTTP " + str(res.status_code) + ": " + str(url))
    return None



def getHouseId(url):
    """Collect the listing links found on one search-result page.

    Args:
        url: Address of the result page to scan.

    Returns:
        A list with the distinct ``href`` values of every
        ``<a data-el="ershoufang">`` element, in the order they first appear
        on the page. Anchors without an ``href`` are skipped. The list is
        empty when the page could not be fetched.
    """
    pageText = getPage(url)
    if pageText is None:
        # getPage already reported the failure; there is nothing to parse.
        return []

    soup = BeautifulSoup(pageText, 'lxml')
    anchors = soup.find_all(name="a", attrs={"data-el": "ershoufang"})
    hrefs = (a.get("href") for a in anchors)
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(href for href in hrefs if href))


def saveData(houseDict):
    """Store one scraped listing in the ``shenyang`` collection.

    Args:
        houseDict: Document to store. When it already carries an ``_id`` the
            stored document with that ``_id`` is replaced (or created);
            otherwise the document is inserted as a new one.
    """
    # The collection holds Shenyang listings; to store another city's data,
    # change the collection name in db['...'].
    collection = db['shenyang']

    # Collection.save() was deprecated in PyMongo 3 and removed in PyMongo 4.
    # The two calls below are its documented replacement.
    if '_id' in houseDict:
        collection.replace_one({'_id': houseDict['_id']}, houseDict, upsert=True)
    else:
        collection.insert_one(houseDict)

    # NOTE: the number printed is len(houseDict), i.e. the number of keys in
    # this single document, not a count of stored documents.
    print("存入数据库"+str(len(houseDict)) + "条数据!")


def getHouseContent(url):
    """Scrape one listing detail page and store the result with ``saveData``.

    The stored document contains, in this order: total price, unit price,
    building info, community name, district, every ``<li>`` of the "base"
    and "transaction" boxes (first four characters of the text as key, the
    rest as value), a nested dict describing the room layout, and a nested
    dict with community figures fetched from the ``housestat`` endpoint.

    Args:
        url: Address of the listing detail page.

    Returns:
        None. The document is passed to ``saveData``; an exception raised
        while saving is printed and swallowed.

    Raises:
        Exception: Any error raised while fetching or parsing (for example a
            mandatory element missing from the page) propagates to the
            caller.
    """
    houseInfoDict = {}
    pageText = getPage(url)
    soup = BeautifulSoup(pageText, 'lxml')

    # Sub-trees are searched directly with Tag.find / Tag.find_all instead of
    # being serialised and parsed a second time.

    # Total price.
    houseInfoDict['总价'] = soup.find(name="span", attrs={"class": "total"}).text

    # Unit price.
    houseInfoDict['单位价格'] = soup.find(name="span", attrs={"class": "unitPriceValue"}).text

    # Building info: the sub-line of the "area" box. The room type, floor,
    # orientation, decoration and area headline are not read here because
    # they duplicate data collected from the "base" box below.
    area = soup.find(name="div", attrs={"class": "area"})
    houseInfoDict['楼房信息'] = area.find(name="div", attrs={"class": "subInfo noHidden"}).text

    # Community name.
    community = soup.find(name="div", attrs={"class": "communityName"})
    houseInfoDict['小区'] = community.find(name="a", attrs={"class": "info"}).text

    # District: text of the first link inside the "areaName" box.
    areaDistrict = soup.find(name="div", attrs={"class": "areaName"})
    houseInfoDict['所属区县'] = areaDistrict.find_all(name="a")[0].text

    # Basic attributes. A missing box yields no entries.
    base = soup.find(name="div", attrs={"class": "base"})
    baseInfo = base.find_all(name="li") if base is not None else []
    for li in baseInfo:
        # First four characters are used as the key, the remainder as value.
        houseInfoDict[li.text[:4]] = li.text[4:]

    # Transaction attributes. A missing box yields no entries.
    transaction = soup.find(name="div", attrs={"class": "transaction"})
    transactionInfo = transaction.find_all(name="li") if transaction is not None else []
    for li in transactionInfo:
        # Trim CR/LF at both ends, then drop every newline and space.
        liText = li.text.strip("\r").strip("\n").strip('\r').strip('\n').replace('\n', '').replace(' ', '')
        houseInfoDict[liText[:4]] = liText[4:]

    # Room layout. Column 0 of each row names the room and columns 1-3 hold
    # its area, orientation and window type; further columns are ignored.
    layout = soup.find(name="div", attrs={"class": "layout"})
    roomRow = layout.find_all(name="div", attrs={"class": "row"}) if layout is not None else []
    houseInfoDict['户型分间'] = {}
    for row in roomRow:
        col = row.find_all(name="div", attrs={"class": "col"})
        if not col:
            continue
        roomName = col[0].text
        for suffix, cell in zip(("面积", "朝向", "窗型"), col[1:]):
            houseInfoDict['户型分间'][roomName + suffix] = cell.text

    # Community summary, served by a separate JSON endpoint that needs the
    # community id (rid) and the listing id (hid).
    rid = soup.find(name="div", attrs={"id": "framesdk"}).get("data-resblock-id")
    houseRecord = soup.find(name="div", attrs={"class": "houseRecord"})
    # The last two characters of the record text are dropped; presumably
    # they are a trailing label after the id -- confirm against a live page.
    hid = houseRecord.find(name="span", attrs={"class": "info"}).text[:-2]
    # This URL targets Shenyang ("sy"); change the prefix for another city.
    # The published snippet had "、" characters scattered through the URL,
    # which made the host name invalid; they are removed here.
    xiaoquInfoUrl = 'https://sy.lianjia.com/ershoufang/housestat?hid=' + str(hid) + '&rid=' + str(rid)
    xiaoquInfo = json.loads(getPage(xiaoquInfoUrl))
    resblockCard = xiaoquInfo['data']['resblockCard']

    houseInfoDict['小区简介'] = {}
    houseInfoDict['小区简介']['小区建造年份'] = resblockCard['buildYear']
    houseInfoDict['小区简介']['楼栋总数'] = resblockCard['buildNum']
    # Community average price: distinct from the listing's own unit price
    # stored above, so it is no longer bound to the same local name.
    houseInfoDict['小区简介']['小区均价'] = resblockCard['unitPrice']

    try:
        saveData(houseInfoDict)
    except Exception as e:
        # A storage failure must not abort the crawl; report and move on.
        print(e)


# Crawl result pages 1-100 and scrape every listing linked from each page.
for i in range(1, 101):
    # This URL targets Shenyang ("sy"); change the prefix for another city.
    # The published snippet had "、" characters scattered through the URL,
    # which made the host name invalid; they are removed here.
    url = 'https://sy.lianjia.com/ershoufang/pg' + str(i)
    try:
        houseIdList = getHouseId(url)
    except Exception as e:
        # One unreadable result page must not abort the remaining pages.
        print(e)
        continue
    print("第" + str(i) + "页")
    for j, houseUrl in enumerate(houseIdList):
        print(j)
        try:
            getHouseContent(houseUrl)
        except Exception as e:
            # Skip listings that fail to download or parse.
            print(e)

总结

数据获取部分先写到这里,后续会陆续推出数据处理、可视化以及二手房价格预测模型等部分。
有直接需要部分二手房数据的同学,可以来秘密基地。

  • 5
    点赞
  • 67
    收藏
    觉得还不错? 一键收藏
  • 18
    评论
评论 18
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值