fang.py
# -*- coding: utf-8 -*-
import json
import re
import scrapy
from myspider_fangtianxia.items import MyspiderFangtianxiaItem
class FangSpider(scrapy.Spider):
    """Crawl Guangzhou new-house listings on fang.com for floor-plan images.

    Flow:
      1. ``parse`` walks the listing pages and follows every community link.
      2. ``parse_xiaoqu`` reads the community name and its ``newcode`` from the
         detail page, then requests the household-list endpoint for room
         values 1 through 6.
      3. ``parse_image`` turns every entry of the JSON reply into one item
         carrying the community name, the room label and the image URL.
    """

    name = 'fang'
    allowed_domains = ['fang.com']
    start_urls = ['http://newhouse.gz.fang.com/house/s/']

    # Listing pages 2..27; page 1 is the start URL itself.
    _LISTING_PAGE_URL = 'http://newhouse.gz.fang.com/house/s/b9{}'
    # AJAX endpoint returning the floor plans of one community for one room value.
    _HOUSEHOLD_URL = (
        'http://newhouse.gz.fang.com/house/ajaxrequest/'
        'householdlist_get.php?newcode={}&room={}'
    )
    # The community's "newcode" is embedded in an inline script of the detail page.
    _NEWCODE_RE = re.compile(
        "'http://newhouse.gz.fang.com';var newcode = '(.*?)';")
    # Whitespace plus ASCII / full-width punctuation stripped from community
    # names.  Raw string: the original literal relied on invalid escape
    # sequences such as "\s" and "\/", which newer Python versions warn about.
    _PUNCTUATION_RE = re.compile(
        r"[\s+\.\!\/_,$%^*+\"\']+|[+——!,。?、~@#¥%……&*()]+")

    def parse(self, response):
        """Follow every community link on a listing page and queue the other pages.

        :param response: a listing page response.
        :return: generator of ``scrapy.Request`` objects.
        """
        for href in response.xpath('//div[@class="nlcd_name"]/a/@href').extract():
            # hrefs may be relative or scheme-less; Request() rejects those,
            # so resolve them against the page URL first.
            yield scrapy.Request(response.urljoin(href.strip()),
                                 callback=self.parse_xiaoqu)
        # Every listing page re-queues the same page URLs; requests are not
        # marked dont_filter, so Scrapy's duplicate filter applies to them.
        for page in range(2, 28):
            yield scrapy.Request(self._LISTING_PAGE_URL.format(page),
                                 callback=self.parse)

    def parse_xiaoqu(self, response):
        """Extract name and newcode of a community and request its floor plans.

        Nothing is yielded when either the name or the newcode is missing.

        :param response: a community detail page response.
        :return: generator of ``scrapy.Request`` objects, one per room value 1..6.
        """
        names = response.xpath('//div[@class="inf_left1 "]//strong/text()').extract()
        # Strip CR, tab and LF so the newcode pattern is matched against a
        # single unbroken line.
        flat_html = response.text.replace('\r', '').replace('\t', '').replace('\n', '')
        codes = self._NEWCODE_RE.findall(flat_html)
        if not (names and codes):
            return

        xiaoqu = self._PUNCTUATION_RE.sub('', names[0])
        newcode = codes[0]
        for room in range(1, 7):
            # A fresh item per request: sharing one mutable item between
            # concurrent callbacks lets them overwrite each other's fields.
            item = MyspiderFangtianxiaItem()
            item['xiaoqu'] = xiaoqu
            yield scrapy.Request(self._HOUSEHOLD_URL.format(newcode, room),
                                 meta={'meta_1': item},
                                 callback=self.parse_image)

    def parse_image(self, response):
        """Yield one item per floor-plan entry of the household-list reply.

        :param response: reply of the household-list endpoint; its
            ``meta['meta_1']`` holds the item pre-filled with ``xiaoqu``.
        :return: generator of ``MyspiderFangtianxiaItem``.
        """
        try:
            entries = json.loads(response.text)
        except ValueError:
            # json.JSONDecodeError is a ValueError: empty or non-JSON body.
            self.logger.warning('Household list is not valid JSON: %s', response.url)
            return
        if not entries:
            return

        base_item = response.meta['meta_1']
        for entry in entries:
            # Copy so that every yielded item keeps its own room/url instead
            # of all of them aliasing (and mutating) one object.
            item = base_item.copy()
            # Drop the last 12 characters of the thumbnail URL and re-append
            # ".jpg" (presumably a size suffix plus extension -- confirm),
            # then remove the "/viewimage" path component.
            image_url = entry['houseimageurl'][:-12] + '.jpg'
            item['room'] = entry['room']
            item['url'] = image_url.replace('/viewimage', '')
            yield item
items.py
import scrapy
class MyspiderFangtianxiaItem(scrapy.Item):
    """One floor-plan image record scraped from fang.com."""

    # Community name with whitespace and punctuation stripped.
    xiaoqu = scrapy.Field()

    # Room label taken from the "room" key of a household-list entry.
    room = scrapy.Field()

    # URL of the floor-plan image to download.
    url = scrapy.Field()
pipelines.py
import uuid
import os
import requests
class MyspiderFangtianxiaPipeline(object):
    """Download each item's image into ``<base_dir>/<xiaoqu>/<room>/``.

    Files are named with a fresh UUID, so repeated downloads never overwrite
    one another.
    """

    # Root folder for downloaded images.  Raw string: the original literal
    # 'F:\images/' depended on "\i" being an invalid (and therefore
    # preserved) escape sequence.
    base_dir = r'F:\images'
    # Seconds to wait for the image server; without a timeout a stalled
    # connection would block the crawl indefinitely.
    download_timeout = 30

    def process_item(self, item, spider):
        """Fetch ``item['url']`` and store the body as a ``.jpg`` file.

        The target directory is created even when the download fails.  The
        file is only written for an HTTP 200 reply; any other status or a
        network error leaves no file behind.

        :param item: item providing ``xiaoqu``, ``room`` and ``url``.
        :param spider: the spider that produced the item (used for logging).
        :return: the unmodified item, so later pipelines still receive it.
        """
        # str() guards against non-string values coming out of the JSON reply.
        folder = os.path.join(self.base_dir, str(item['xiaoqu']), str(item['room']))
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(folder, exist_ok=True)
        target = os.path.join(folder, str(uuid.uuid1()) + '.jpg')

        url = item['url']
        try:
            rep = requests.get(url, timeout=self.download_timeout)
        except requests.RequestException as exc:
            spider.logger.warning('Image download failed for %s: %s', url, exc)
            return item

        if rep.status_code == 200:
            with open(target, 'wb') as f:
                f.write(rep.content)
        return item