This version improves on the first one in a few places, so that a single run can crawl the information for all second-hand housing listings in Hangzhou.
import requests
from fake_useragent import UserAgent
from lxml import etree
import pandas as pd
import numpy as np
import time
import json
from collections import OrderedDict  # used to build dictionaries that preserve insertion order
import re
import os
import glob
Parsing function
What changed:
- 1. Rewrote the code that parses the listing's detail fields.
# Detail fields of the listing
houseInfo = sell.xpath('div[@class="info clear"]/div[@class="address"]/div[@class="houseInfo"]/text()')[0].split('|')
# Initialize every field to np.nan first, so that a listing with a missing
# field can never inherit the value parsed from the previous listing
room = np.nan
area = np.nan
orientation = np.nan
decoration = np.nan
elevator = np.nan
# houseInfo[0] needs to be discarded
num = 0
for temp in houseInfo:
    num += 1
    if num == 2:
        room = temp
    elif num == 3:
        area = temp
    elif num == 4:
        orientation = temp
    elif num == 5:
        decoration = temp
    elif num == 6:
        elevator = temp
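To make the field order concrete, here is what the split looks like on a made-up listing string (the sample below is invented, and assumes the text node starts with a '|' separator, which is why houseInfo[0] is discarded above):

# Invented example of the houseInfo text node
sample = ' | 2室1厅 | 71.52平米 | 南 | 精装 | 有电梯'
fields = sample.split('|')
# fields[0] is the piece before the first '|'; fields[1:] map to
# room, area, orientation, decoration, elevator in that order
print([f.strip() for f in fields[1:]])
# ['2室1厅', '71.52平米', '南', '精装', '有电梯']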
- 2. Added handling for the case where the total page count cannot be parsed (i.e. the category has no search results).
if len(totalPageList) == 0:
    df = []
    totalPage = '0'
    return df, totalPage
else:
    totalPageDict = json.loads(totalPageList[0])
    totalPage = str(totalPageDict["totalPage"])  # str() so the comparisons in main() behave as intended
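For reference, page-data is an attribute whose value is a JSON string; this is a quick sketch of what json.loads returns (the numbers are invented):

import json

page_data = '{"totalPage":42,"curPage":1}'  # invented example of the @page-data value
print(json.loads(page_data)["totalPage"])   # 42, an int, hence the str() above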
At the same time, the main() function below gains a check for this no-results case:

if page == '0':
    break
Below is the full implementation of the parsing function:
def parse(text):
    selector = etree.HTML(text)
    ### Parse the total page count
    totalPageList = selector.xpath('//div[@class="page-box fr"]/div[1]/@page-data')  # a string containing a JSON dictionary
    if len(totalPageList) == 0:
        df = []
        totalPage = '0'
        return df, totalPage
    else:
        totalPageDict = json.loads(totalPageList[0])
        totalPage = str(totalPageDict["totalPage"])  # str() so the comparisons in main() behave as intended
    ### Parse the listing data
    sellList = selector.xpath('//ul[@class="sellListContent"]/li')
    house = []
    for sell in sellList:
        link = sell.xpath('a/@href')[0]
        title = sell.xpath('div[@class="info clear"]/div[@class="title"]/a/text()')[0]
        address = sell.xpath('div[@class="info clear"]/div[@class="address"]/div[@class="houseInfo"]/a/text()')[0]
        # Detail fields of the listing
        houseInfo = sell.xpath('div[@class="info clear"]/div[@class="address"]/div[@class="houseInfo"]/text()')[0].split('|')
        # Initialize every field to np.nan first, so that a listing with a missing
        # field can never inherit the value parsed from the previous listing
        room = np.nan
        area = np.nan
        orientation = np.nan
        decoration = np.nan
        elevator = np.nan
        # houseInfo[0] needs to be discarded
        num = 0
        for temp in houseInfo:
            num += 1
            if num == 2:
                room = temp
            elif num == 3:
                area = temp
            elif num == 4:
                orientation = temp
            elif num == 5:
                decoration = temp
            elif num == 6:
                elevator = temp
        # Position info of the listing
        positionIcon = sell.xpath('div[@class="info clear"]/div[@class="flood"]/div[@class="positionInfo"]/text()')[0]
        positionIconTemp = re.split(r"年建|-", positionIcon)  # split the string into several parts
        floor = positionIconTemp[0][:-4]  # floor information
        year = positionIconTemp[0][-4:]   # year built
        genre = positionIconTemp[1].strip()
        positionInfo = sell.xpath('div[@class="info clear"]/div[@class="flood"]/div[@class="positionInfo"]/a/text()')[0]
        # Follower info of the listing
        followInfo = sell.xpath('div[@class="info clear"]/div[@class="followInfo"]/text()')[0]
        followInfoTemp = followInfo.split('/')
        follower = followInfoTemp[0].split('人')[0]  # number of followers
        interestedFollower = re.split('共|次', followInfoTemp[1])[1]  # number of viewings
        datetime = followInfoTemp[2].strip()  # listing date
        # Tags
        tag = []
        tagList = sell.xpath('div[@class="info clear"]/div[@class="tag"]/span')
        for tags in tagList:
            tag.append(tags.xpath('text()')[0])
        # Prices
        totalPrice = sell.xpath('div[@class="info clear"]/div[@class="priceInfo"]/div[@class="totalPrice"]/span/text()')[0]
        unitPrice = sell.xpath('div[@class="info clear"]/div[@class="priceInfo"]/div[@class="unitPrice"]/span/text()')[0]
        # Plain dicts are unordered (before Python 3.7); to keep the keys in the
        # order they were added (e.g. when reading the CSV back), use OrderedDict
        # from the collections module:
        houseDict = OrderedDict()
        houseDict['link'] = link
        houseDict['title'] = title
        houseDict['address'] = address
        houseDict['room'] = room
        houseDict['area'] = area
        houseDict['orientation'] = orientation
        houseDict['decoration'] = decoration
        houseDict['elevator'] = elevator
        houseDict['floor'] = floor
        houseDict['year'] = year
        houseDict['genre'] = genre
        houseDict['positionInfo'] = positionInfo
        houseDict['follower'] = follower
        houseDict['interestedFollower'] = interestedFollower
        houseDict['datetime'] = datetime
        houseDict['tag'] = tag
        houseDict['totalPrice'] = totalPrice
        houseDict['unitPrice'] = unitPrice
        # Collect all listings into one list
        house.append(houseDict)
    df = pd.DataFrame(house)
    return df, totalPage
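A side note on the OrderedDict comment above: plain dicts only started preserving insertion order in Python 3.7, while OrderedDict has always guaranteed it, and pandas takes the DataFrame column order from the key order of the row dicts. A minimal check with dummy values:

from collections import OrderedDict
import pandas as pd

row = OrderedDict()
row['link'] = 'https://example.com/1'  # dummy values for illustration
row['title'] = 'demo'
row['totalPrice'] = '300'
print(pd.DataFrame([row]).columns.tolist())
# ['link', 'title', 'totalPrice']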
Page request function
This is the same as in the first version.
def getData(url, headers):
    try:
        time.sleep(1)
        response = requests.get(url, headers=headers)
        text = response.text
        return text
    except Exception as e:
        time.sleep(10)
        print(url)
        print("requests fail, retry!")
        return getData(url, headers)  # recursive retry
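One caveat worth flagging: the recursion above never gives up, so a URL that keeps failing would retry forever and eventually hit Python's recursion limit. A bounded variant is a small change; the sketch below (the function name and retry count are invented, not part of this post's code) is one way to do it:

import time
import requests

def getDataBounded(url, headers, retries=3):
    # Same idea as getData, but stops after a fixed number of attempts
    for attempt in range(retries):
        try:
            time.sleep(1)
            response = requests.get(url, headers=headers, timeout=10)
            return response.text
        except Exception:
            time.sleep(10)
            print(url)
            print("requests fail, retry!")
    return None  # the caller must handle a permanent failure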
Main function
A few words on how the Lianjia URL is constructed.
url = "https://hz.lianjia.com/ershoufang/lc{}l{}a{}/pg{}/"
where:
- lc encodes the floor height: 1-3 correspond to low, middle, and high floors
- l encodes the layout: 1-5 correspond to one bedroom up to four or more bedrooms
- a encodes the area: 1-8 correspond to under 50 m², 50-70, 70-90, 90-120, 120-140, 140-160, 160-200, and over 200 m²
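As a quick sanity check, plugging in concrete values shows how the pieces combine:

url = "https://hz.lianjia.com/ershoufang/lc{}l{}a{}/pg{}/"
# low floor (lc1), two bedrooms (l2), 70-90 m² (a3), page 1
print(url.format('1', '2', '3', '1'))
# https://hz.lianjia.com/ershoufang/lc1l2a3/pg1/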
def main():
    # Build the request headers
    ua = UserAgent()
    headers = {
        'User-Agent': ua.random,
        'Host': 'hz.lianjia.com',
        'Referer': 'https://hz.lianjia.com/ershoufang/pg1/'
    }
    url = "https://hz.lianjia.com/ershoufang/lc{}l{}a{}/pg{}/"
    os.makedirs('./data_v2', exist_ok=True)  # make sure the output directory exists
    for z in range(1, 9):
        for y in range(1, 6):
            for x in range(1, 4):
                # Fetch the first page to learn the total page count
                text = getData(url.format(str(x), str(y), str(z), '1'), headers)
                total_df, page = parse(text)
                print(page)
                # Crawl all the remaining pages
                if page == '0':
                    break
                elif page == '1':
                    pass  # a single page, already held in total_df
                else:
                    for i in range(2, int(page) + 1):
                        text = getData(url.format(str(x), str(y), str(z), str(i)), headers)
                        df, _ = parse(text)
                        total_df = pd.concat([total_df, df], axis=0)
                # Save this category to its own CSV file
                total_df.to_csv('./data_v2/House-Second-Hangzhou-lc{}-l{}-a{}-v2.csv'.format(str(x), str(y), str(z)), sep=',', header=True, index=False)

main()
Function to merge multiple CSV files
This is the same as in the first version.
def merge():
    csv_list = glob.glob('*.csv')  # list the CSV files in the current directory
    print('Found %s CSV files in total' % len(csv_list))
    print('Processing............')
    for i in csv_list:  # read each CSV file in the directory in turn
        fr = open(i, 'rb').read()
        with open('House-Second-Hangzhou-v2.csv', 'ab') as f:  # append everything to House-Second-Hangzhou-v2.csv
            f.write(fr)
    print('Merge complete!')

merge()
Found 107 CSV files in total
Processing............
Merge complete!
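A caveat about this byte-level merge: every per-category file starts with its own header row, so the headers of all files after the first end up inside the merged CSV as ordinary rows. If clean rows matter more than speed, a pandas-based merge avoids this; the sketch below assumes it runs in the directory holding the per-category files, and the output filename is made up:

import glob
import pandas as pd

# Read each per-category file properly, then concatenate
frames = [pd.read_csv(name) for name in glob.glob('House-Second-Hangzhou-lc*.csv')]
merged = pd.concat(frames, axis=0, ignore_index=True)
merged.to_csv('House-Second-Hangzhou-v2-clean.csv', index=False)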
df_read = pd.read_csv("House-Second-Hangzhou-v2.csv")
df_read.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20073 entries, 0 to 20072
Data columns (total 18 columns):
link 20073 non-null object
title 20073 non-null object
address 20073 non-null object
room 20073 non-null object
area 20073 non-null object
orientation 20073 non-null object
decoration 20071 non-null object
elevator 17022 non-null object
floor 20070 non-null object
year 20073 non-null object
genre 19212 non-null object
positionInfo 20073 non-null object
follower 20073 non-null object
interestedFollower 20073 non-null object
datetime 20073 non-null object
tag 20073 non-null object
totalPrice 20073 non-null object
unitPrice 20073 non-null object
dtypes: object(18)
memory usage: 2.8+ MB