1. Project Background
Housing prices in Beijing are a topic of constant public interest. This project studies second-hand housing prices in Beijing: it uses the Scrapy framework to crawl second-hand listing data from the Lianjia website, performs basic data analysis and visualization, trains a decision-tree model to predict prices, and finally plots the model's learning curve to check for overfitting. (For reference only.)
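As a preview of that final step, here is a minimal sketch of how the learning curve of a decision-tree model could be plotted with scikit-learn. The feature matrix X, the target y, and the max_depth value are placeholders, not the exact data and parameters used later in this project:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import learning_curve

def plot_learning_curve(X, y):
    # Train on growing subsets and score each with 5-fold cross-validation
    sizes, train_scores, val_scores = learning_curve(
        DecisionTreeRegressor(max_depth=10),  # illustrative depth, to be tuned
        X, y, cv=5, train_sizes=np.linspace(0.1, 1.0, 10))
    plt.plot(sizes, train_scores.mean(axis=1), 'o-', label='training score')
    plt.plot(sizes, val_scores.mean(axis=1), 'o-', label='validation score')
    plt.xlabel('training set size')
    plt.ylabel('R^2 score')
    plt.legend()
    plt.show()

A training score that stays high while the validation score plateaus well below it is the classic sign of overfitting.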
2. Scraping the Data
The second-hand housing listings on the Lianjia website are displayed as follows:
The site reports 77,049 listings in total, but only 100 pages are displayed, 30 listings per page, so at most 3,000 records can actually be collected. These pages have no anti-scraping measures and can be crawled directly. This post scrapes the 11 fields marked in the red box: description, location, layout, size, orientation, renovation status, floor information, year built, building type, total price, and unit price per square meter.
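Before building the spider, a single plain request is enough to confirm that the listing pages respond normally. This is a minimal check, assuming the pgN URL pattern used throughout this post, with an arbitrary browser User-Agent:

import requests

# Fetch page 1 of the listing results with a generic browser User-Agent
resp = requests.get('https://bj.lianjia.com/ershoufang/pg1/',
                    headers={'User-Agent': 'Mozilla/5.0'})
print(resp.status_code)  # 200 indicates the page is served without blocking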
The full spider code follows:
items.py
import scrapy

class LianjiaspiderprojectItem(scrapy.Item):
    # Define one field per scraped attribute of a listing
    Description = scrapy.Field()  # listing description
    Location = scrapy.Field()     # location
    Layout = scrapy.Field()       # layout (rooms and halls)
    Size = scrapy.Field()         # floor area
    Direction = scrapy.Field()    # orientation
    Renovation = scrapy.Field()   # renovation status
    Floorinfo = scrapy.Field()    # floor information
    Year = scrapy.Field()         # year built
    Type = scrapy.Field()         # building type
    Price = scrapy.Field()        # total price
    unitPrice = scrapy.Field()    # price per square meter
lianjia.py
import scrapy
from LianjiaSpiderProject.items import LianjiaspiderprojectItem

class LianjiaSpider(scrapy.Spider):
    name = 'lianjia'
    #allowed_domains = ['www.xxx.com']
    start_urls = ['https://bj.lianjia.com/ershoufang/pg1/']
    initial_url = "https://bj.lianjia.com/ershoufang/pg"
    current_page = 2

    def parse(self, response):
        # Each results page contains 30 listings, one <li> tag per listing
        sell_list = response.xpath('//ul[@class="sellListContent"]/li')
        # Parse every <li> tag and extract the listing fields
        for sell in sell_list:
            Houseinfo = sell.xpath('./div[1]/div[@class="address"]/div//text()').extract()[0]
            info = Houseinfo.split(' | ')
            if len(info) != 7:
                continue  # skip entries (e.g. ad cards) that lack the expected 7 parts
            Layout, Size, Direction, Renovation, Floorinfo, Year, Type = info
            Description = sell.xpath('./div[1]/div[@class="title"]/a/text()').extract()[0]
            Location = sell.xpath('./div[1]/div[@class="flood"]//text()').extract()
            # Strip whitespace, drop empty pieces, and join the rest into one string
            Location_new = "".join([x.strip() for x in Location if len(x.strip()) > 0])
            Price = sell.xpath('./div[1]/div[@class="priceInfo"]/div[1]//text()').extract()
            Price_new = "".join(Price)
            unitPrice = sell.xpath('./div[1]/div[@class="priceInfo"]/div[2]//text()')[0].extract()

            # Map the scraped values onto the item fields
            item = LianjiaspiderprojectItem()
            item['Description'] = Description
            item['Location'] = Location_new
            item['Layout'] = Layout
            item['Size'] = Size
            item['Direction'] = Direction
            item['Renovation'] = Renovation
            item['Floorinfo'] = Floorinfo
            item['Year'] = Year
            item['Type'] = Type
            item['Price'] = Price_new
            item['unitPrice'] = unitPrice
            yield item

        # Lianjia only exposes 100 pages, so stop after scheduling page 100
        if self.current_page != 101:
            new_url = self.initial_url + str(self.current_page) + '/'
            print('starting scrapy url:', new_url)
            self.current_page += 1
            yield scrapy.Request(new_url, callback=self.parse)
        else:
            print('scrapy done')
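To make the field-splitting step concrete, here is a quick offline test with a made-up info string in the ' | '-separated format the spider expects (the sample values are invented for illustration):

# Invented sample; real strings come from the address <div> of each listing
houseinfo = '2室1厅 | 89.5平米 | 南 北 | 精装 | 中楼层(共28层) | 2005年建 | 板楼'
parts = houseinfo.split(' | ')
assert len(parts) == 7
Layout, Size, Direction, Renovation, Floorinfo, Year, Type = parts
print(Layout, Size, Year)  # 2室1厅 89.5平米 2005年建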
pipelines.py
import csv

class LianjiaspiderprojectPipeline(object):
    fp = None
    writer = None

    # Called exactly once, when the spider starts
    def open_spider(self, spider):
        print('Spider started...')
        # newline='' prevents the csv module from inserting blank lines on Windows
        self.fp = open('./lianjia.csv', 'a', encoding='utf-8', newline='')
        self.writer = csv.writer(self.fp)
        self.writer.writerow(['房源描述', '位置', '布局', '面积', '朝向',
                              '装修情况', '楼层', '建造年份', '类型', '总价', '单价'])

    def process_item(self, item, spider):
        # csv.writer quotes fields that contain commas, keeping the file well-formed
        self.writer.writerow([item['Description'], item['Location'], item['Layout'],
                              item['Size'], item['Direction'], item['Renovation'],
                              item['Floorinfo'], item['Year'], item['Type'],
                              item['Price'], item['unitPrice']])
        return item

    # Called exactly once, when the spider closes
    def close_spider(self, spider):
        print('Spider finished!')
        self.fp.close()
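For yielded items to reach the pipeline, it must be registered in the project's settings.py. The excerpt below is a minimal sketch; the priority value 300 and the User-Agent string are typical placeholder choices, not taken from the original project:

# settings.py (excerpt)
BOT_NAME = 'LianjiaSpiderProject'

# Route every yielded item through the CSV pipeline
ITEM_PIPELINES = {
    'LianjiaSpiderProject.pipelines.LianjiaspiderprojectPipeline': 300,
}

# Present a browser User-Agent; robots.txt is ignored here, as in many scraping tutorials
USER_AGENT = 'Mozilla/5.0'
ROBOTSTXT_OBEY = False

The spider is then started from the project root with the command scrapy crawl lianjia, which produces lianjia.csv in the working directory.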
3. Importing the Scraped Data
# Import the required libraries
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style='darkgrid', context='notebook', font_scale=1.5)  # plot style
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels
plt.rcParams['axes.unicode_minus'] = False    # render minus signs correctly

# Load the scraped Lianjia second-hand housing dataset
lianjia = pd.read_csv('lianjia.csv')
display(lianjia.head())
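Before moving on to cleaning and visualization, a routine sanity check of the loaded table is useful (the exact output depends on the crawl):

# Rows, columns, dtypes and non-null counts in one report
lianjia.info()
# Missing values per column
print(lianjia.isnull().sum())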