要求
1.获取所有城市的url: https://www.fang.com/SoufunFamily.htm
2.获取所有城市的新房url
例如:杭州:http://hz.fang.com/
杭州新房: https://hz.newhouse.fang.com/house/s/
(需要将获得的url先进行拆分)
3.获取所有城市二手房url链接
例如:杭州:http://hz.fang.com/
杭州二手房:https://hz.esf.fang.com/
代码目录
middlewares.py
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
import random
class UserAgentDownloadMiddleware(object):
    """Downloader middleware that stamps every outgoing request with a
    randomly chosen User-Agent header to make the crawler look less
    uniform to the target site."""

    # Pool of desktop Chrome User-Agent strings to rotate through.
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/44.0.2403.155 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36'
    ]

    def process_request(self, request, spider):
        """Scrapy downloader-middleware hook, called for each request.

        Mutates the request's User-Agent header in place and returns
        None so the request continues through the remaining middleware
        chain (per the Scrapy process_request contract).
        """
        request.headers['User-Agent'] = random.choice(self.USER_AGENTS)
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exporters import JsonLinesItemExporter
class FangPipeline(object):
    """Item pipeline writing new-house and second-hand-house items to
    separate JSON-lines files.

    Fixes over the original:
    - items are routed by type (the original exported EVERY item to
      both files, duplicating all records);
    - the shutdown hook is ``close_spider`` (the original defined
      ``close_item``, which Scrapy never calls, so the files were
      never closed).
    """

    def __init__(self):
        # Exporters require binary file handles; ensure_ascii=False
        # keeps the Chinese text human-readable in the output.
        self.newhouse_fp = open('newhouse.json', 'wb')
        self.esfhouse_fp = open('esfhouse.json', 'wb')
        self.newhouse_exporter = JsonLinesItemExporter(
            self.newhouse_fp, ensure_ascii=False
        )
        self.esfhouse_exporter = JsonLinesItemExporter(
            self.esfhouse_fp, ensure_ascii=False
        )

    def process_item(self, item, spider):
        """Export the item to the file matching its type and pass it on."""
        # Route by class name so this module does not need to import
        # the items module (avoids a circular-import risk).
        if type(item).__name__ == 'NewHouseItem':
            self.newhouse_exporter.export_item(item)
        else:
            self.esfhouse_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        """Scrapy shutdown hook — flush and close both output files."""
        self.newhouse_fp.close()
        self.esfhouse_fp.close()

    def close_item(self, item, spider):
        # Kept for backward compatibility with the original (misnamed)
        # method; delegates to the real hook.
        self.close_spider(spider)
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
# New-house (新房) listing item.
class NewHouseItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Province the city belongs to.
    province = scrapy.Field()
    # City name.
    city = scrapy.Field()
    # Residential-complex name.
    name = scrapy.Field()
    # Price text (number plus unit, scraped as one string).
    price = scrapy.Field()
    # Room layouts such as "2居"; stored as a list.
    rooms = scrapy.Field()
    # Floor area.
    area = scrapy.Field()
    # Street address.
    address = scrapy.Field()
    # Administrative district.
    district = scrapy.Field()
    # Sale status (e.g. "在售" / on sale).
    sale = scrapy.Field()
    # Detail-page URL on fang.com.
    origin_url = scrapy.Field()
# Second-hand-house (二手房) listing item.
class ESFHouseItem(scrapy.Item):
    # Province the city belongs to.
    province = scrapy.Field()
    # City name.
    city = scrapy.Field()
    # Residential-complex name.
    name = scrapy.Field()
    # Layout, e.g. "3室2厅" (rooms and halls).
    rooms = scrapy.Field()
    # Floor information.
    floor = scrapy.Field()
    # Orientation (which way the unit faces).
    toward = scrapy.Field()
    # Construction year.
    year = scrapy.Field()
    # Street address.
    address = scrapy.Field()
    # Built area.
    area = scrapy.Field()
    # Total price (number plus unit).
    price = scrapy.Field()
    # Price per square metre.
    unit = scrapy.Field()
    # Detail-page URL.
    origin_url = scrapy.Field()
sfw.py文件
# -*- coding: utf-8 -*-
import scrapy
import re
from fang.items import NewHouseItem, ESFHouseItem
class SfwSpider(scrapy.Spider):
    """fang.com spider.

    Reads the SoufunFamily city directory, derives each city's
    new-house (``<city>.newhouse.fang.com/house/s/``) and second-hand
    (``<city>.esf.fang.com``) listing URLs, then crawls both listing
    types with pagination.

    Fixes over the original:
    - pagination callback was ``self.parse_newhouse()`` (the method was
      CALLED, raising TypeError) instead of the method object;
    - ``parse_esf`` followed the next-page link unconditionally, so the
      last page yielded a request built from ``None``;
    - the ``[district]`` regex match and the city-URL split are now
      guarded instead of raising on unexpected markup.
    """
    name = 'sfw'
    allowed_domains = ['fang.com']
    start_urls = ['https://www.fang.com/SoufunFamily.htm']

    def parse(self, response):
        """Parse the city directory table and emit one new-house and one
        second-hand-house request per city."""
        trs = response.xpath("//div[@class = 'outCont']//tr")
        province = None
        for tr in trs:
            # Each tr holds several td cells; the ones carrying a class
            # attribute contain only whitespace, so keep the rest.
            tds = tr.xpath(".//td[not(@class)]")
            province_td = tds[0]
            province_text = province_td.xpath(".//text()").get()
            # Remove all whitespace; an empty result means this row
            # continues the previous row's province.
            province_text = re.sub(r"\s", "", province_text)
            if province_text:
                province = province_text
            # Skip the "其它" (other/overseas) bucket.
            if province == '其它':
                continue
            city_td = tds[1]  # second cell lists this province's cities
            for city in city_td.xpath(".//a"):
                city_name = city.xpath("./text()").get()
                city_url = city.xpath("./@href").get()
                # Split e.g. http://hz.fang.com/ into
                # ['http://hz', 'fang', 'com/'] so the subdomain can be
                # rewritten.
                url_module = city_url.split(".")
                if len(url_module) < 3:
                    # Unexpected URL shape — skip instead of crashing
                    # on the indexing below.
                    continue
                scheme_city, domain, tail = url_module[0], url_module[1], url_module[2]
                # e.g. https://hz.newhouse.fang.com/house/s/
                newhouse_url = scheme_city + '.newhouse.' + domain + '.' + tail + 'house/s/'
                # e.g. https://hz.esf.fang.com/
                esf_url = scheme_city + '.esf.' + domain + '.' + tail
                yield scrapy.Request(url=newhouse_url, callback=self.parse_newhouse,
                                     meta={"info": (province, city_name)})
                yield scrapy.Request(url=esf_url, callback=self.parse_esf,
                                     meta={"info": (province, city_name)})

    def parse_newhouse(self, response):
        """Parse one page of a city's new-house listing, then follow
        pagination."""
        province, city = response.meta.get('info')  # (province, city) tuple
        lis = response.xpath("//div[@class='nl_con clearfix']/ul/li")
        for li in lis:
            name = li.xpath(".//div[@class='nlcd_name']/a/text()").get()
            if name is None:  # filler/ad li without a complex name
                continue
            name = name.strip()
            # Keep only layout entries ending in "居" (e.g. "2居"),
            # dropping unrelated anchor texts mixed into the same div.
            rooms = li.xpath(".//div[contains(@class,'house_type')]/a/text()").getall()
            rooms = [r for r in rooms if r.endswith("居")]
            # get() would return only leading whitespace here, so join
            # all text nodes, then strip whitespace and -/ separators.
            area = "".join(li.xpath(".//div[@class='house_type clearfix']/text()").getall())
            area = re.sub(r"\s|-|/", "", area)
            # <a title="address" ...>
            address = li.xpath(".//div[@class='address']/a/@title").get()
            # The district is embedded as "[district]" in the anchor
            # text; not every entry has one, so guard the match.
            district_text = "".join(li.xpath(".//div[@class='address']/a//text()").getall())
            district_match = re.search(r".*\[(.+)\].*", district_text)
            district = district_match.group(1) if district_match else None
            # Sale status text, e.g. 在售; the span position varies so
            # match the div by class substring.
            sale = li.xpath(".//div[contains(@class,'fangyuan')]/span/text()").get()
            # Price value and unit are separate text nodes — join then strip.
            price = "".join(li.xpath(".//div[@class='nhouse_price']//text()").getall()).strip()
            # Detail href is scheme-relative (//xxx.fang.com/) — prepend https:.
            origin_url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()
            if origin_url:
                origin_url = 'https:' + origin_url
            yield NewHouseItem(province=province, city=city, name=name, rooms=rooms,
                               area=area, address=address, district=district,
                               sale=sale, price=price, origin_url=origin_url)
        # <a class='next' href='...'>下一页 — follow pagination if present.
        next_url = response.xpath("//div[@class='page']//a[@class='next']/@href").get()
        if next_url:
            # Pass the callback as a method object (the original called
            # it: callback=self.parse_newhouse()).
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_newhouse,
                                 meta={"info": (province, city)})

    def parse_esf(self, response):
        """Parse one page of a city's second-hand-house listing, then
        follow pagination."""
        province, city = response.meta.get('info')
        dls = response.xpath("//div[contains(@class,'shop_list ')]/dl")
        for dl in dls:
            item = ESFHouseItem(province=province, city=city)
            item['name'] = dl.xpath(".//p/a/@title").get()
            # tel_shop holds several "/"-separated fragments; strip
            # whitespace, then classify each one by a keyword.
            infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
            infos = [re.sub(r"\s", "", info) for info in infos]
            for info in infos:
                if '厅' in info:        # layout, e.g. 3室2厅
                    item['rooms'] = info
                elif '层' in info:      # floor
                    item['floor'] = info
                elif '㎡' in info:      # built area
                    item['area'] = info
                elif '向' in info:      # orientation
                    item['toward'] = info
                elif '年' in info:      # construction year
                    item['year'] = info
            item['address'] = dl.xpath(".//p[@class='add_shop']/span/text()").get()
            # Total price: number and unit live in separate nodes and
            # either may be missing, so only concatenate when both exist.
            num = dl.xpath(".//span[@class='red']/b/text()").get()
            num_danwei = dl.xpath(".//dd[@class='price_right']/span/text()").get()
            if num and num_danwei:
                item['price'] = num + num_danwei
            # Price per square metre.
            item['unit'] = dl.xpath(".//dd[@class='price_right']/span[2]/text()").get()
            # Detail href may be missing or relative — guard and resolve.
            detail_url = dl.xpath(".//h4[@class='clearfix']/a/@href").get()
            if detail_url:
                item['origin_url'] = response.urljoin(detail_url)
            yield item
        # Follow the next page only when a link exists (the original
        # yielded a request built from None on the last page).
        next_url = response.xpath("//div[@class='page_al']//p/a/@href").get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_esf,
                                 meta={'info': (province, city)})