本文介绍如何用 requests 和 XPath(lxml)简单地爬取链家租房数据,并存入 CSV 文件。
代码如下:
# _*_ coding:utf-8 _*_
import requests
from lxml import etree
import csv
class LianjiaSpider(object):
    """Scrape Lianjia (Hangzhou) rental listings into two CSV files.

    Workflow: collect detail-page links from the paginated rental list,
    fetch each detail page, extract the basic listing info and the
    appliance/facility info, and append one row per listing to
    '链家基本信息.csv' and '链家配套设施.csv' respectively.
    """

    def __init__(self, num):
        """Store the request headers, list-page URL prefix and page count.

        :param num: number of list pages to crawl.
        """
        # Browser-like User-Agent so the site serves regular pages.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
        # List-page URL prefix; page number and filter suffix are appended.
        self.base_url = 'https://hz.lianjia.com/zufang/pg'
        # Number of list pages to crawl.
        self.num = num

    @staticmethod
    def _first(html_doc, xpath, index=0, strip=False, suffix=''):
        """Return one text node matched by ``xpath``, or None if absent.

        Replaces the thirteen copy-pasted ``try/except:`` blocks of the
        original; catches only IndexError instead of a bare ``except:`` so
        real bugs are no longer silently swallowed.

        :param html_doc: parsed lxml document (anything with ``.xpath``).
        :param xpath: XPath expression returning text/attribute nodes.
        :param index: which match to take.
        :param strip: whether to strip surrounding whitespace.
        :param suffix: literal text appended to the value (e.g. '元/月').
        """
        try:
            value = html_doc.xpath(xpath)[index]
        except IndexError:
            # Field missing on this listing page — record it as None.
            return None
        if strip:
            value = value.strip()
        value = value + suffix
        print(value)
        return value

    def get_list(self):
        """Collect detail-page links from the first ``num`` list pages.

        :return: de-duplicated list of relative detail URLs, first-seen order.
        """
        detail_href_list = []
        # Bug fix: the original ``range(1, self.num)`` crawled one page
        # fewer than the user asked for.
        for page in range(1, self.num + 1):
            url = self.base_url + str(page) + 'rt200600000002'
            response = requests.get(url=url, headers=self.headers)
            html_doc = etree.HTML(response.text)
            # Each list-item title links to the listing's detail page.
            detail_href_list.extend(html_doc.xpath(
                "//p[@class='content__list--item--title twoline']/a/@href"))
        # Bug fix: the original called set()/list() without assigning the
        # result, so duplicates were never removed.  dict.fromkeys()
        # de-duplicates while preserving first-seen order.
        return list(dict.fromkeys(detail_href_list))

    def get_detail(self, phref):
        """Fetch one detail page and return its HTML text."""
        response = requests.get(url=phref, headers=self.headers).text
        return response

    def detail_message(self, response):
        """Extract the basic listing info from a detail-page HTML string.

        :param response: detail-page HTML text.
        :return: dict of listing fields (missing scalar fields are None;
                 list-valued fields may be empty lists).
        """
        html_doc = etree.HTML(response)
        # The aside list holds rental way / layout / area / orientation
        # in fixed positions 0..3.
        aside = "//ul[@class='content__aside__list']/p/span/text()"
        # List-valued fields: the original kept the whole node list.
        room_list = html_doc.xpath(
            "//div[@class='content__aside fr']/p[@class='content__aside--tags']/i/text()")
        print(room_list)
        room_message = html_doc.xpath(
            "//div[@class='content__article__info']/ul/li/text()")
        print(room_message)
        room_describ = html_doc.xpath(
            "//div[@class='content__article__info3']//p[@data-el='houseComment']/text()")
        print(room_describ)
        dict_new = {
            'room_name': self._first(html_doc, "//div[@class='content clear w1150']/p/text()"),
            'room_shelf_time': self._first(html_doc, "//div[@class='content__subtitle']/text()", index=1, strip=True),
            'house_code': self._first(html_doc, "//div[@class='content__subtitle']/i[@class='house_code']/text()"),
            'price_num': self._first(html_doc, "//div[@class='content__aside fr']/p[@class='content__aside--title']/span/text()", suffix='元/月'),
            'room_list': room_list,
            'rental_way': self._first(html_doc, aside, index=0),
            'room_style': self._first(html_doc, aside, index=1),
            'area': self._first(html_doc, aside, index=2),
            'toward': self._first(html_doc, aside, index=3),
            'house_manage': self._first(html_doc, "//div[@class='content__aside__list--title oneline']/span/@title"),
            'housemanage_phone': self._first(html_doc, "//p[@class='content__aside__list--bottom oneline']/text()"),
            'room_message': room_message,
            'room_describ': room_describ,
        }
        return dict_new

    def supporting_facilities(self, response):
        """Extract the appliance/facility info from a detail-page HTML string.

        :param response: detail-page HTML text.
        :return: dict of facility fields (missing fields are None).
        """
        html_doc = etree.HTML(response)
        # Each appliance is a <li> whose class names it; note the trailing
        # space inside the class attribute and that the 'tv' column uses
        # the CSS class 'television'.
        li_tpl = "//ul[@class='content__article__info2']/li[@class='fl oneline %s ']/text()"
        dict_supporting_facilities = {
            'room_name': self._first(html_doc, "//div[@class='content clear w1150']/p/text()"),
            'price_num': self._first(html_doc, "//div[@class='content__aside fr']/p[@class='content__aside--title']/span/text()", suffix='元/月'),
        }
        for key, css_class in (('tv', 'television'),
                               ('refrigerator', 'refrigerator'),
                               ('washing_machine', 'washing_machine'),
                               ('air_conditioner', 'air_conditioner'),
                               ('water_heater', 'water_heater'),
                               ('bed', 'bed'),
                               ('heating', 'heating'),
                               ('wifi', 'wifi'),
                               ('wardrobe', 'wardrobe'),
                               ('natural_gas', 'natural_gas')):
            dict_supporting_facilities[key] = self._first(html_doc, li_tpl % css_class)
        return dict_supporting_facilities

    def save_csv(self, csv_writer, dict_new, csv_sfwriter, dict_supporting_facilities):
        """Append one data row to each of the two CSV writers."""
        csv_writer.writerow(dict_new.values())
        csv_sfwriter.writerow(dict_supporting_facilities.values())

    def start_work(self):
        """Crawl every collected detail page and append rows to the CSVs."""
        detail_href_list = self.get_list()
        # Bug fix: the original reopened both files for every listing and
        # never closed them (fd leak), and wrote the header row only when a
        # hard-coded listing URL happened to be crawled.  Open each file
        # once (newline='' per the csv module docs, explicit UTF-8) and
        # write the headers before the first data row instead.
        with open('链家基本信息.csv', 'a+', newline='', encoding='utf-8') as csv_file, \
                open('链家配套设施.csv', 'a+', newline='', encoding='utf-8') as csv_sf_file:
            csv_writer = csv.writer(csv_file)
            csv_sfwriter = csv.writer(csv_sf_file)
            wrote_header = False
            for href in detail_href_list:
                # Listing links are relative; prepend the site root.
                phref = 'https://hz.lianjia.com' + href
                response = self.get_detail(phref=phref)
                dict_new = self.detail_message(response=response)
                dict_supporting_facilities = self.supporting_facilities(response=response)
                if not wrote_header:
                    csv_writer.writerow(dict_new.keys())
                    csv_sfwriter.writerow(dict_supporting_facilities.keys())
                    wrote_header = True
                self.save_csv(dict_new=dict_new,
                              dict_supporting_facilities=dict_supporting_facilities,
                              csv_sfwriter=csv_sfwriter,
                              csv_writer=csv_writer)
if __name__ == '__main__':
    # Ask how many list pages to crawl, then run the spider end-to-end.
    page_count = int(input('请输入爬取页面数量:'))
    spider = LianjiaSpider(page_count)
    spider.start_work()