from selenium import webdriver
import time
from lxml import etree
import requests
def iskong(a):
    """Return *a* unchanged when it is non-empty, otherwise an empty string.

    Used to normalise scraped values (strings or XPath result lists) before
    they are printed or stored.

    Args:
        a: Any sized value (string, list, ...) or ``None``.

    Returns:
        ``a`` itself when ``len(a)`` is non-zero; ``''`` when ``a`` is empty
        or ``None``.
    """
    # A missing value counts as empty instead of raising TypeError in len().
    if a is None:
        return ''
    return a if len(a) else ''
# Entry URL of the crawl.
url = 'https://bj.fang.lianjia.com/'
# Browser-like User-Agent header sent with the request below.
heard = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
}
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(url, headers=heard, timeout=10).content.decode('utf-8')
# Maps each region name found on the entry page to its absolute URL.
diquxinxi = {}
# Parse the entry page and locate the blocks that hold the region links.
tree = etree.HTML(response)
daqu = tree.xpath('//li/div[@class="city-enum fl"]')
for xiaoqu in daqu:
    difang_name = xiaoqu.xpath('./a/text()')
    difang_a = xiaoqu.xpath('./a/@href')
    # zip() pairs each link text with its href and stops at the shorter list,
    # so an <a> without text (or without href) cannot cause an IndexError.
    for name, href in zip(difang_name, difang_a):
        # The hrefs are prefixed with the scheme to form an absolute URL
        # (assumes they are protocol-relative, e.g. "//host" -- TODO confirm).
        diquxinxi[name] = 'https:' + href
# The region information is now stored in the dictionary.
# Crawl stage: for every region collected in diquxinxi, walk its paginated
# new-home listing and print each listing's summary and detail-page data.

# Seconds to wait for any single HTTP response before giving up.
_REQUEST_TIMEOUT = 10
# Host prefixed to the relative detail-page hrefs found on listing pages.
# NOTE(review): hard-coded to the "bj" host even while other regions are being
# crawled -- confirm whether the region's own URL should be used instead.
_DETAIL_BASE_URL = 'https://bj.fang.lianjia.com'
# Host prefixed to the relative album hrefs found on a detail page.
# NOTE(review): "bd" differs from the "bj" host used elsewhere -- confirm.
_ALBUM_BASE_URL = 'https://bd.fang.lianjia.com'


def _first(values, default=''):
    """Return the first element of *values*, or *default* when it is empty."""
    return values[0] if values else default


def _fetch_tree(page_url):
    """Download *page_url* with the shared headers and return its lxml tree."""
    html = requests.get(
        page_url, headers=heard, timeout=_REQUEST_TIMEOUT
    ).content.decode('utf-8')
    return etree.HTML(html)


def _print_listing_summary(listing):
    """Print the list-page fields of one listing and return its detail hrefs.

    Args:
        listing: The ``<li>`` element of one entry on a listing page.

    Returns:
        The list of relative detail-page hrefs found in the entry.
    """
    # Picture
    tupian = listing.xpath('.//a/img/@src')
    print("图片:", _first(tupian))
    # Detail page
    xiangqin = listing.xpath('.//div[@class="resblock-name"]/a/@href')
    print("详情页:", _DETAIL_BASE_URL + _first(xiangqin))
    # Name
    name = listing.xpath('.//div[@class="resblock-name"]/a/text()')
    print("楼盘名称:", _first(name))
    # District
    chengqu = listing.xpath('.//div[@class="resblock-location"]/span/text()')
    print("城区:", _first(chengqu))
    # Address
    dizhi = listing.xpath('.//div[@class="resblock-location"]/a/text()')
    print("地址:", _first(dizhi))
    # Main layouts
    zhuli = listing.xpath('.//a[@class="resblock-room"]/span/text()')
    print("主力户型:", ''.join(zhuli))
    # Floor area
    jianmian = listing.xpath('.//div[@class="resblock-area"]/span/text()')
    if jianmian:
        print("建筑面积:", iskong(jianmian[0]))
    else:
        print("建筑面积未知")
    # Price and its unit
    jiage = listing.xpath('.//div[@class="main-price"]/span[@class="number"]/text()')
    danwei = listing.xpath('.//div[@class="main-price"]/span[@class="desc"]/text()')
    if jiage and danwei:
        print("均价:", iskong(jiage[0]) + iskong(danwei[0]))
    else:
        print("均价异常")
    return xiangqin


def _collect_dynamics(detail_tree, detail_url):
    """Return the dynamics (news) entries linked from a detail page.

    When the detail page has no "more dynamics" link the result is a list
    holding a single placeholder string; otherwise the ``dongtai/`` sub-page
    is fetched and one ``{'title', 'content', 'date'}`` dict is returned per
    entry.
    """
    print("开始爬取楼盘动态")
    dongtai = []
    more_links = detail_tree.xpath('.//a[@class="dynamic-more pull-right"]/@href')
    if not more_links:
        dongtai.append("没有任何动态")
        print(dongtai[0])
        return dongtai
    # Parsed into its own tree so the detail-page tree stays available to the
    # layout / album / details sections that follow.
    dongtai_tree = _fetch_tree(detail_url + 'dongtai/')
    titles = dongtai_tree.xpath('//span[@class="a-title"]/text()')
    contents = dongtai_tree.xpath('//a[@onclick="return false"]/text()')
    dates = dongtai_tree.xpath('//span[@class="a-time"]/text()')
    # A fresh dict per entry: reusing one dict would make every list element
    # alias the last entry.
    for title, content, date in zip(titles, contents, dates):
        dongtai.append({'title': title, 'content': content, 'date': date})
    return dongtai


def _collect_layouts(detail_tree):
    """Print and return one dict per layout block found on the detail page."""
    print("开始爬取户型")
    huxing = []
    for item in detail_tree.xpath('//div[@class="houselist"]/ul'):
        pic = item.xpath('.//li/img/@src')
        print("pic:", _first(pic))
        layout = _first(item.xpath('.//li[@class="info-li"]/p[@class="p1"]/text()'))
        # The counts are read from fixed character positions 0, 2 and 4 of the
        # layout text; slices yield '' instead of raising on a short string.
        shi = layout[0:1]
        ting = layout[2:3]
        ce = layout[4:5]
        print("室:", iskong(shi))
        print("厅:", iskong(ting))
        print("厕所:", iskong(ce))
        area = _first(item.xpath('.//li[@class="info-li"]/p[@class="p1"]/span/text()'))
        print("面积", iskong(area))
        price = item.xpath('.//li[@class="info-li"]/p[@class="p2"]/span/text()')
        print("价格", iskong(price))
        date = item.xpath('.//li[@class="info-li"]/p[@class="p2"]/span[@class="p2-time"]/text()')
        print("时间:", iskong(date))
        type_desc = item.xpath('.//li[@class="info-li"]/p[@class="p3"]/span/text()')
        print("解读:", iskong(type_desc))
        # Append so that every layout is kept (not only the last one).
        huxing.append({
            "pic": pic,
            "room": iskong(shi),  # rooms
            "ting": iskong(ting),  # living rooms
            "wei": iskong(ce),  # bathrooms
            "area": iskong(area),  # floor area
            "price": iskong(price),
            "date": iskong(date),
            "type_desc": iskong(type_desc),  # layout commentary
        })
    return huxing


def _collect_albums(detail_tree):
    """Print and return the album link lists found on the detail page.

    The first album block is stored under "效果图" and the second under
    "实景图"; a missing block is reported and its key is left out.
    """
    print("正在爬取图片相册")
    pic = {}
    albums = detail_tree.xpath('.//div[@class="album-list-item pull-left"]')
    if len(albums) == 0:
        print("效果图不存在")
    else:
        pic["效果图"] = [
            _ALBUM_BASE_URL + href for href in albums[0].xpath('.//a/@href')
        ]
    # The second album needs at least two blocks; a single block must not be
    # indexed at position 1.
    if len(albums) < 2:
        print("实景图不存在")
    else:
        pic["实景图"] = [
            _ALBUM_BASE_URL + href for href in albums[1].xpath('.//a/@href')
        ]
    print(pic)
    return pic


def _pair_details(spans):
    """Turn an alternating ``[key, value, key, value, ...]`` list into a dict.

    Values have spaces and newlines removed. A trailing key without a value
    maps to ``''``; when a key repeats, its first value is kept.
    """
    details = {}
    for pos in range(0, len(spans), 2):
        value = spans[pos + 1] if pos + 1 < len(spans) else ''
        details.setdefault(spans[pos], value.replace(" ", "").replace("\n", ""))
    return details


def _crawl_detail(detail_url):
    """Fetch one detail page, print its sections and return the collected data.

    Returns:
        A dict with the keys "动态" (dynamics), "户型" (layouts) and
        "楼盘详情" (property details).
    """
    detail_tree = _fetch_tree(detail_url)
    title = detail_tree.xpath('//h1/text()')
    print("正在爬取", _first(title), "的界面")
    fangwuxinxi = {"动态": _collect_dynamics(detail_tree, detail_url)}
    fangwuxinxi["户型"] = _collect_layouts(detail_tree)
    # NOTE(review): the album links are only printed, never stored in the
    # result -- confirm whether they should be added to fangwuxinxi.
    _collect_albums(detail_tree)
    print("正在爬取楼盘详情")
    p_list = detail_tree.xpath('.//p[@class="desc-p clear"]/span/text()')
    lp_dic = _pair_details(p_list)
    print(lp_dic)
    fangwuxinxi["楼盘详情"] = lp_dic
    return fangwuxinxi


for region_name, region_url in diquxinxi.items():
    print("开始获取" + region_name + "楼盘的网页")
    region_tree = _fetch_tree(region_url)
    xinfang_links = region_tree.xpath('//li/a[@data-other-action=2]/@href')
    if not xinfang_links:
        # Without a listing link there is nothing to page through.
        print(region_name + "没有新房入口,跳过")
        continue
    xinfang = xinfang_links[0]
    # Walk the listing pages (pg1, pg2, ...) until one has no results.
    n = 1
    while True:
        loupan = region_url + xinfang + 'pg' + str(n)
        page_tree = _fetch_tree(loupan)
        print("爬取成功-----正在加载")
        fangwu = page_tree.xpath('//ul[@class="resblock-list-wrapper"]/li[@class="resblock-list post_ulog_exposure_scroll has-results"]')
        if not fangwu:
            print(str(region_name) + '楼盘爬取结束')
            break
        # Every listing on the page is processed; the page number is not a
        # valid index into the page's listing elements.
        for listing in fangwu:
            for href in _print_listing_summary(listing):
                fangwuxinxi = _crawl_detail(_DETAIL_BASE_URL + href)
                print("一个房屋的所有信息时:", fangwuxinxi)
        n += 1