爬取q房网房源信息

爬取房源信息,保存到CSV文件,比较简单,没有什么反爬虫。

# -*- encoding: utf-8 -*-
"""
@File    : qfang.py
@Time    : 2020/6/11 14:44
@Author  : ligang
@WeChat   : 18233275213
@Software: PyCharm
"""

import requests
import time
from lxml import etree
import csv


def spider_page(url):
    """Fetch one qfang.com listing page and return its HTML as text.

    Args:
        url: Absolute URL of a sale-listing page.

    Returns:
        The decoded response body (str).
    """
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) \
                    AppleWebKit/537.36 (KHTML, like Gecko) \
                    Chrome/70.0.3538.110 Safari/537.36',
               'upgrade-insecure-requests': '1',
               'cookie': 'cookieId=4a8744d7-42ab-4567-bb86-3eaea4fab2e1; sid=eacd9f8d-3a3a-4bb0-8a15-8ce9adba2886; qchatid=2ba17dcd-b976-46c6-b02b-3c12323002fd; language=SIMPLIFIED; JSESSIONID=aaavyjKCV6-9H5phIDEkx; cookieId=03a9dc9f-40c1-4d34-890b-0ef91d91d713; cookieId=91044341-5ea6-45fc-95e6-e22c7160c570; CITY_NAME=SHENZHEN; Hm_lvt_4d7fad96f5f1077431b1e8d8d8b0f1ab=1591858478; Hm_lpvt_4d7fad96f5f1077431b1e8d8d8b0f1ab=1591858478; Hm_lvt_de678bd934b065f76f05705d4e7b662c=1591858479; Hm_lpvt_de678bd934b065f76f05705d4e7b662c=1591858479; WINDOW_DEVICE_PIXEL_RATIO=1; _jzqa=1.1115870713157731100.1591858479.1591858479.1591858479.1; _jzqc=1; _jzqckmp=1; _qzja=1.1207994965.1591858478700.1591858478700.1591858478700.1591858478700.1591858478700.0.0.0.1.1; _qzjc=1; _qzjto=1.1.0; _ga=GA1.3.1531234912.1591858479; _gid=GA1.3.676679335.1591858479; _dc_gtm_UA-47416713-1=1; _jzqb=1.1.10.1591858479.1; _qzjb=1.1591858478700.1.0.0.0'}

    # Without a timeout, requests.get can block forever on a stalled
    # connection; 30 s is generous for a single listing page.
    response = requests.get(url, headers=headers, timeout=30)
    time.sleep(2)  # Throttle: pause two seconds between requests to be polite.
    return response.text

def csv_data(item, filename='fangwo_info.csv'):
    """Append a single row to a CSV file.

    Args:
        item: Sequence of field values written as one CSV row.
        filename: Path of the CSV file to append to. Defaults to the
            original hard-coded name so existing callers are unaffected.
    """
    # newline='' prevents csv.writer from inserting extra blank lines
    # between rows on Windows.
    with open(filename, 'a+', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(item)

def paser_info(url):
    """Parse one listing page and append each house's fields to the CSV.

    Extracts, per listing: name, layout, area, decoration, floor,
    orientation, unit price, total price, and the full title line.

    Args:
        url: URL of a qfang.com sale-listing page.
    """
    html = spider_page(url)
    # Build an element tree so we can query the page with XPath.
    selector = etree.HTML(html)

    def _first(node, xp):
        # First matched text node, or '' when the node is missing, so a
        # partially-rendered listing does not abort the whole page.
        matches = node.xpath(xp)
        return matches[0] if matches else ''

    # Iterate the result items actually present instead of assuming
    # exactly 30 per page; the original range(1, 31) raised IndexError
    # on any shorter (e.g. final) page.
    for li in selector.xpath('//*[@class="list-result"]/ul/li'):
        # Full title line; the house name is its first space-separated token.
        xiangq = _first(li, 'div[2]/div[1]/a/text()')
        name = xiangq.split(' ')[0]
        # Each <p> holds "value unit ..."; keep only the leading token.
        style = _first(li, 'div[2]/div[2]/p[1]/text()').split(' ', 1)[0]
        area = _first(li, 'div[2]/div[2]/p[2]/text()').split(' ', 1)[0]
        decotored = _first(li, 'div[2]/div[2]/p[3]/text()').split(' ', 1)[0]
        louceng = _first(li, 'div[2]/div[2]/p[4]/text()').split(' ', 1)[0]
        chaoxiang = _first(li, 'div[2]/div[2]/p[5]/text()').split(' ', 1)[0]
        total = _first(li, 'div[3]/p[1]/span[1]/text()').split(' ', 1)[0]
        price = _first(li, 'div[3]/p[2]/text()').split(' ', 1)[0]
        info = [name, style, area, decotored, louceng, chaoxiang, price, total, xiangq]
        csv_data(info)
        print("正在爬取", name)

def main():
    """Entry point: write the CSV header row, then scrape pages 1-10."""
    # Header row for the output CSV. NOTE(review): the file is opened in
    # append mode, so re-running the script adds a second header — confirm
    # whether that is acceptable or the file should be truncated first.
    info_title = ["名称", "户型", "面积", "装修", "楼层", "朝向", "售价", "总价/万", "详情"]
    csv_data(info_title)
    # Scrape 10 pages. The original range(1, 10) stopped at page 9
    # despite the "10 pages" comment; range(1, 11) covers pages 1-10.
    urls = ['https://shenzhen.qfang.com/sale/f%s' % x for x in range(1, 11)]
    for url in urls:
        paser_info(url)

if __name__ == '__main__':
    main()

结果:

在这里插入图片描述

评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Python图像识别

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值