Scrapy crawler for Fang.com (房天下)

Requirements

1. Get the URL of every city from: http://www.fang.com/SoufunFamily.htm
2. Build the new-house URL for every city
例如 Example: Hangzhou: http://hz.fang.com/
Hangzhou new houses: https://hz.newhouse.fang.com/house/s/
(the city URL obtained in step 1 has to be split apart first; see the sketch after this list)
3. Build the second-hand-house (esf) URL for every city
例如 Example: Hangzhou: http://hz.fang.com/
Hangzhou second-hand houses: https://hz.esf.fang.com/
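The URL rewriting in steps 2 and 3 boils down to splitting the city URL on the dots and re-inserting a newhouse. or esf. segment. A minimal standalone sketch of that transformation, mirroring the logic used in the spider below (the helper name build_city_urls is only for illustration and is not part of the project):

import re

def build_city_urls(city_url):
    # e.g. "http://hz.fang.com/" -> ["http://hz", "fang", "com/"]
    scheme_city, domain, tail = re.split(r"\.", city_url)
    newhouse_url = scheme_city + ".newhouse." + domain + "." + tail + "house/s/"
    esf_url = scheme_city + ".esf." + domain + "." + tail
    return newhouse_url, esf_url

print(build_city_urls("http://hz.fang.com/"))
# ('http://hz.newhouse.fang.com/house/s/', 'http://hz.esf.fang.com/')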

Code directory
(screenshot of the project directory omitted)
middlewares.py

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

import random

class UserAgentDownloadMiddleware(object):
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.155 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36'
    ]

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        user_agent = random.choice(self.USER_AGENTS)
        request.headers['User-Agent'] = user_agent
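For the random User-Agent middleware to take effect it still has to be registered in settings.py. A minimal sketch, assuming the Scrapy project package is named fang (as the imports in the spider suggest) and keeping the priority value 543 from the default project template:

# settings.py (excerpt)
DOWNLOADER_MIDDLEWARES = {
    'fang.middlewares.UserAgentDownloadMiddleware': 543,
}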

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy.exporters import JsonLinesItemExporter

from fang.items import NewHouseItem, ESFHouseItem

class FangPipeline(object):
    def __init__(self):
        self.newhouse_fp = open('newhouse.json', 'wb')
        self.esfhouse_fp = open('esfhouse.json', 'wb')
        self.newhouse_exporter = JsonLinesItemExporter(
            self.newhouse_fp, ensure_ascii=False
        )
        self.esfhouse_exporter = JsonLinesItemExporter(
            self.esfhouse_fp, ensure_ascii=False
        )

    def process_item(self, item, spider):
        # Route each item to the exporter that matches its type so the two
        # JSON files do not get mixed together.
        if isinstance(item, NewHouseItem):
            self.newhouse_exporter.export_item(item)
        elif isinstance(item, ESFHouseItem):
            self.esfhouse_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # close_spider (not close_item) is the hook Scrapy calls when the
        # spider finishes; close both files here.
        self.newhouse_fp.close()
        self.esfhouse_fp.close()
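The pipeline also has to be enabled in settings.py; the priority 300 below is simply the value from the default Scrapy template, not something this project requires:

# settings.py (excerpt)
ITEM_PIPELINES = {
    'fang.pipelines.FangPipeline': 300,
}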

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

# New houses
class NewHouseItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # province
    province = scrapy.Field()
    # city
    city = scrapy.Field()
    # name of the residential complex
    name = scrapy.Field()
    # price
    price = scrapy.Field()
    # number of rooms (stored as a list)
    rooms = scrapy.Field()
    # floor area
    area = scrapy.Field()
    # address
    address = scrapy.Field()
    # administrative district
    district = scrapy.Field()
    # whether it is on sale
    sale = scrapy.Field()
    # URL of the detail page on Fang.com
    origin_url = scrapy.Field()


# Second-hand houses
class ESFHouseItem(scrapy.Item):
    # province
    province = scrapy.Field()
    # city
    city = scrapy.Field()
    # name of the residential complex
    name = scrapy.Field()
    # layout, e.g. "3室2厅"
    rooms = scrapy.Field()
    # floor
    floor = scrapy.Field()
    # orientation
    toward = scrapy.Field()
    # year built
    year = scrapy.Field()
    # address
    address = scrapy.Field()
    # built-up area
    area = scrapy.Field()
    # total price
    price = scrapy.Field()
    # unit price
    unit = scrapy.Field()
    # detail page URL
    origin_url = scrapy.Field()
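Both item classes behave like dictionaries, which is how the spider below fills them: fields can be passed as keyword arguments or assigned one by one. A quick standalone check (the values are only placeholders):

from fang.items import ESFHouseItem

item = ESFHouseItem(province='浙江', city='杭州')   # keyword arguments fill declared fields
item['name'] = '某小区'                              # fields can also be assigned dict-style, as in parse_esf
print(item)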

sfw.py

# -*- coding: utf-8 -*-
import scrapy
import re
from fang.items import NewHouseItem, ESFHouseItem


class SfwSpider(scrapy.Spider):
    name = 'sfw'
    allowed_domains = ['fang.com']
    start_urls = ['https://www.fang.com/SoufunFamily.htm']

    def parse(self, response):
        trs = response.xpath("//div[@class='outCont']//tr")
        province = None
        for tr in trs:
            # Each tr holds several td tags; skip the ones that carry a class
            # attribute (they contain only whitespace) and keep the rest.
            tds = tr.xpath(".//td[not(@class)]")
            province_td = tds[0]
            province_text = province_td.xpath(".//text()").get()   # province field
            province_text = re.sub(r"\s", "", province_text)       # strip whitespace
            # If this row carries a province name, remember it; if the cell is
            # empty, the cities in this row belong to the previous province.
            if province_text:
                province = province_text
            # Skip the "其它" (other) row, i.e. cities outside China
            if province == '其它':
                continue
            city_td = tds[1]   # the cell that holds the links of several cities
            city_links = city_td.xpath(".//a")
            for city in city_links:
                city_name = city.xpath("./text()").get()
                city_url = city.xpath("./@href").get()

                # Build the new-house URL:
                # split http://hz.fang.com/ into "http://hz", "fang", "com/"
                url_module = re.split(r"\.", city_url)
                url0 = url_module[0]
                url1 = url_module[1]
                url2 = url_module[2]
                # reassemble into http://hz.newhouse.fang.com/house/s/
                newhouse_url = url0 + '.' + 'newhouse.' + url1 + '.' + url2 + 'house/s/'

                # Build the second-hand-house URL:
                # http://hz.esf.fang.com/
                esf_url = url0 + '.' + 'esf.' + url1 + '.' + url2

                yield scrapy.Request(url=newhouse_url, callback=self.parse_newhouse, meta={"info": (province, city_name)})
                yield scrapy.Request(url=esf_url, callback=self.parse_esf, meta={"info": (province, city_name)})
            #     break
            # break


    # Parse a new-house listing page
    def parse_newhouse(self, response):
        province, city = response.meta.get('info')   # unpack the tuple
        lis = response.xpath("//div[@class='nl_con clearfix']/ul/li")    # or simply //li
        for li in lis:

            # name of the complex
            name = li.xpath(".//div[@class='nlcd_name']/a/text()").get()
            if name is None:     # name can be None, which cannot be stripped, so check first
                continue
            name = name.strip()  # remove surrounding whitespace

            # layout of the complex (a list such as ["2居", "3居"])
            rooms = li.xpath(".//div[contains(@class,'house_type')]/a/text()").getall()
            # keep only the strings ending with "居" (e.g. "2居") and drop
            # other text such as project names like '亦庄·臻珑府价格待定'
            rooms = list(filter(lambda x: x.endswith("居"), rooms))

            # floor area; get() would only return the whitespace before the value,
            # so use getall() and join the list into one string
            area = "".join(li.xpath(".//div[@class='house_type clearfix']/text()").getall())
            area = re.sub(r"\s|-|/", "", area)   # strip whitespace, "-" and "/"

            # address, taken from the title attribute: <a title="..." target="...">
            address = li.xpath(".//div[@class='address']/a/@title").get()

            # administrative district: collect all the address text, then pull
            # out the part inside the square brackets; group(1) is the captured
            # text, whereas group() would be the whole match
            district_text = "".join(li.xpath(".//div[@class='address']/a//text()").getall())
            district = re.search(r".*\[(.+)\].*", district_text).group(1)
            # Not taking the span directly, because not every entry has the
            # span tag and that would raise on NoneType:
            # district = li.xpath(".//div[@class='address']/a/span/text()").get().strip()

            # sale status
            # sale = li.xpath(".//div[@class='fangyuan pr']/span/text()").get()   # may return None, not reliable
            # sale = li.xpath(".//div[contains(@class,'fangyuan')]/span/@class").get()   # English label: <span class="inSale">在售</span>
            sale = li.xpath(".//div[contains(@class,'fangyuan')]/span/text()").get()    # Chinese label

            # price; use //text() and getall() so both the number and the unit are kept
            # strip() could also be replaced with re.sub(r"\s", "", price)
            price = "".join(li.xpath(".//div[@class='nhouse_price']//text()").getall()).strip()

            # detail page URL, e.g. https://beidaihekongquecheng.fang.com/
            origin_url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()
            origin_url = 'https:' + origin_url     # the href is scheme-relative, so prepend the scheme

            item = NewHouseItem(province=province, city=city, name=name, rooms=rooms, area=area, address=address, district=district,
                                sale=sale, price=price, origin_url=origin_url)
            yield item

        # pagination: <a class="next" href="...">下一页</a>
        next_url = response.xpath("//div[@class='page']//a[@class='next']/@href").get()
        if next_url:
            # the href may be relative, so join it with the current URL and
            # request the next listing page with the same callback
            yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse_newhouse,
                                 meta={"info": (province, city)})



    # Parse a second-hand-house listing page
    def parse_esf(self, response):
        province, city = response.meta.get('info')  # unpack the tuple
        dls = response.xpath("//div[contains(@class,'shop_list ')]/dl")
        for dl in dls:
            item = ESFHouseItem(province=province, city=city)
            # name of the complex
            item['name'] = dl.xpath(".//p/a/@title").get()

            # layout, floor, orientation, area and year are mixed into one block,
            # so collect everything with getall() and sort it out afterwards
            infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
            infos = list(map(lambda x: re.sub(r"\s", "", x), infos))
            for info in infos:
                # layout, e.g. "3室2厅"
                if '厅' in info:
                    item['rooms'] = info
                # floor
                elif '层' in info:
                    item['floor'] = info
                # area
                elif '㎡' in info:
                    item['area'] = info
                # orientation
                elif '向' in info:
                    item['toward'] = info
                # year built
                elif '年' in info:
                    item['year'] = info
                    # item['year'] = info.replace("建筑年代:", "")    # keep only the number, e.g. 2019

            # address
            item['address'] = dl.xpath(".//p[@class='add_shop']/span/text()").get()

            # total price (the number and the unit live in separate tags)
            num = dl.xpath(".//span[@class='red']/b/text()").get()
            num_danwei = dl.xpath(".//dd[@class='price_right']/span/text()").get()
            # either part can be None and None cannot be concatenated, so only
            # build the price when both are present, e.g. "444万"
            if num and num_danwei:
                item['price'] = num + num_danwei

            # unit price
            item['unit'] = dl.xpath(".//dd[@class='price_right']/span[2]/text()").get()

            # detail page URL; the href is relative and can be None
            detail_url = dl.xpath(".//h4[@class='clearfix']/a/@href").get()
            if detail_url:
                item['origin_url'] = response.urljoin(detail_url)

            yield item

        # pagination: follow the next listing page with the same callback
        next_url = response.xpath("//div[@class='page_al']//p/a/@href").get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse_esf, meta={'info': (province, city)})
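To run the spider and produce newhouse.json and esfhouse.json, the usual Scrapy command line is enough (scrapy crawl sfw from the project root). Alternatively, a small launcher script makes it easy to start from an IDE; the file name start.py is just a convention, not something generated by the template:

# start.py (optional helper, assumed to sit next to scrapy.cfg)
from scrapy import cmdline

# equivalent to running "scrapy crawl sfw" in a terminal
cmdline.execute("scrapy crawl sfw".split())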