房天下爬虫

fang.py

# -*- coding: utf-8 -*-
import json
import re

import scrapy

from myspider_fangtianxia.items import MyspiderFangtianxiaItem


class FangSpider(scrapy.Spider):
    """Crawl new-house listings on newhouse.gz.fang.com and yield one item
    per floor-plan image (community name, room layout, image URL)."""

    name = 'fang'
    allowed_domains = ['fang.com']
    start_urls = ['http://newhouse.gz.fang.com/house/s/']

    def parse(self, response):
        """Result page: follow each community detail link, then enqueue the
        remaining result pages (b92 .. b927).

        Re-enqueueing the page list from every result page produces duplicate
        requests, but Scrapy's built-in dupefilter drops them.
        """
        for href in response.xpath('//div[@class="nlcd_name"]/a/@href'):
            yield scrapy.Request(href.extract(), callback=self.parse_xiaoqu)
        for url in ('http://newhouse.gz.fang.com/house/s/b9{}'.format(i)
                    for i in range(2, 28)):
            yield scrapy.Request(url, callback=self.parse)

    def parse_xiaoqu(self, response):
        """Community detail page: extract the community name and the internal
        ``newcode`` id, then request the floor-plan AJAX endpoint for room
        counts 1..6."""
        names = response.xpath('//div[@class="inf_left1 "]//strong/text()').extract()
        # The newcode id sits in an inline <script>; flatten whitespace so the
        # regex can match across what were originally several lines.
        html = response.text.replace('\r', '').replace('\t', '').replace('\n', '')
        # Raw strings: same regexes as before, without invalid-escape warnings.
        newcode = re.findall(r"'http://newhouse.gz.fang.com';var newcode = '(.*?)';", html)
        if not (names and newcode):
            return
        # Strip punctuation/whitespace so the name is safe as a directory name.
        xiaoqu = re.sub(r"[\s+\.\!\/_,$%^*+\"\']+|[+——!,。?、~@#¥%……&*()]+", "", names[0])

        item = MyspiderFangtianxiaItem()
        item['xiaoqu'] = xiaoqu

        ajax_url = ('http://newhouse.gz.fang.com/house/ajaxrequest/'
                    'householdlist_get.php?newcode={}&room={}')
        for room in range(1, 7):
            yield scrapy.Request(ajax_url.format(newcode[0], room),
                                 meta={'meta_1': item},
                                 callback=self.parse_image)

    def parse_image(self, response):
        """AJAX response: a JSON list of floor plans; yield one item per image.

        BUG FIX: the original mutated the single shared Item carried in
        ``response.meta`` and yielded it once per row, so items still in
        flight could all end up holding the last row's values.  A fresh copy
        is made for every row instead.
        """
        rows = json.loads(response.text)
        if not rows:
            return
        base = response.meta['meta_1']
        for row in rows:
            item = MyspiderFangtianxiaItem(base)  # independent copy per image
            item['room'] = row['room']
            # Drop the 12-char thumbnail suffix and the '/viewimage' path
            # segment to obtain the full-size image URL.
            url = row['houseimageurl'][:-12] + '.jpg'
            item['url'] = url.replace('/viewimage', '')
            yield item

items.py

import scrapy


class MyspiderFangtianxiaItem(scrapy.Item):
    """One floor-plan image scraped from newhouse.gz.fang.com."""
    # Community (xiaoqu) name, stripped of punctuation by the spider so it
    # can be used as a directory name.
    xiaoqu = scrapy.Field()
    # Room-layout label as returned by the householdlist AJAX endpoint.
    room = scrapy.Field()
    # Full-size floor-plan image URL.
    url = scrapy.Field()

 

pipelines.py

import uuid

import os
import requests


class MyspiderFangtianxiaPipeline(object):
    """Download each item's floor-plan image to <IMAGES_ROOT>/<xiaoqu>/<room>/."""

    # Root directory for downloaded images.  NOTE(review): hard-coded Windows
    # path kept for backward compatibility; consider moving it to settings.py.
    IMAGES_ROOT = 'F:\\images'

    def process_item(self, item, spider):
        """Fetch ``item['url']`` and save it under the per-community/per-room
        directory with a uuid1-based filename (avoids collisions).

        Returns the item unchanged so any later pipelines still receive it.
        """
        # os.path.join instead of string '+' concatenation: no mixed
        # separators, no reliance on the original's invalid '\i' escape.
        directory = os.path.join(self.IMAGES_ROOT, item['xiaoqu'], item['room'])
        # exist_ok avoids the check-then-create race of the original code.
        os.makedirs(directory, exist_ok=True)

        fname = os.path.join(directory, '{}.jpg'.format(uuid.uuid1()))

        # Timeout added so a stalled server cannot hang the pipeline forever.
        rep = requests.get(item['url'], timeout=30)
        if rep.status_code == 200:
            with open(fname, 'wb') as f:
                f.write(rep.content)
        return item

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值