Crawling Tuchong landscape photos with the Scrapy framework

The spider pages through Tuchong's free-stock search API, follows each result's detail page to pull out the image URL, and an item pipeline downloads the files.

tuchong.py code

# -*- coding: utf-8 -*-
import scrapy
import json

from ..items import TuchongItem


class TuchongSpider(scrapy.Spider):
    name = 'tuchong'
    allowed_domains = ['tuchong.com']
    # Free-stock search API; term=%E9%A3%8E%E6%99%AF is the URL-encoded keyword "风景" (landscape).
    # start_urls only kicks off parse(); the actual page range is set there.
    start_urls = ['https://stock.tuchong.com/api/free/search/?term=%E9%A3%8E%E6%99%AF&size=100&page=2']
    # Browser-facing search page for reference:
    # https://stock.tuchong.com/search?term=%E6%B0%B4%E6%9E%9C&page=2

    def parse(self, response):
        # Adjust this range to set how many pages you want to crawl.
        for page in range(1, 3):
            next_url = ('https://stock.tuchong.com/api/free/search/'
                        '?term=%E9%A3%8E%E6%99%AF&size=100&page={}').format(page)
            yield scrapy.Request(next_url, callback=self.parse_xq)

    def parse_xq(self, response):
        # The API answers with JSON; every hit carries an imageId.
        # Iterating the hits list directly avoids skipping index 0 and avoids
        # an IndexError when a page returns fewer than 100 results.
        data = json.loads(response.text)
        for hit in data['data']['hits']:
            image_id = hit['imageId']
            # Detail page, e.g.
            # https://stock.tuchong.com/free/image/?imageId=198471649997357066&term=%E9%A3%8E%E6%99%AF
            img_url = ('https://stock.tuchong.com/free/image/'
                       '?imageId={}&term=%E9%A3%8E%E6%99%AF').format(image_id)
            yield scrapy.Request(img_url, callback=self.parse_img)

    def parse_img(self, response):
        # Grab the preview image's src: //*[@id="image-detail"]/div/div[2]/img
        item = TuchongItem()
        img = response.xpath('//*[@id="image-detail"]/div/div[2]/img/@src').extract()
        item['img'] = ''.join(img)
        yield item
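
Before wiring everything into Scrapy, it can help to sanity-check the API's JSON shape with a one-off requests call; a minimal sketch, assuming the endpoint still returns the data → hits → imageId structure the spider relies on:

import requests

# One-off check of the search API's JSON shape (same endpoint as the spider;
# the field names below come from the spider's code, not from official docs).
url = ('https://stock.tuchong.com/api/free/search/'
       '?term=%E9%A3%8E%E6%99%AF&size=100&page=1')
data = requests.get(url).json()
hits = data['data']['hits']
print(len(hits))           # number of results on this page (up to size=100)
print(hits[0]['imageId'])  # the id parse_xq() turns into a detail-page URL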


items.py code

import scrapy


class TuchongItem(scrapy.Item):
    # Direct URL of the image file, filled in by the spider's parse_img().
    img = scrapy.Field()
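
Declared fields are the only keys an Item accepts: scrapy.Item instances otherwise behave like dicts, which is how the spider writes img and the pipeline reads it back. A quick illustration (the value is hypothetical, and the commented line only makes the constraint visible):

item = TuchongItem()
item['img'] = '//example.com/pic.webp'  # hypothetical value, for illustration only
print(item['img'])
# item['title'] = 'x'  # would raise KeyError: 'title' is not a declared Field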

pipelines.py code

import os

import requests


class TuchongProjectPipeline(object):
    def process_item(self, item, spider):
        # Create the target directory on first use.
        os.makedirs('download', exist_ok=True)

        # The scraped src is protocol-relative ("//host/path..."), so prepend a scheme.
        if item['img'].startswith('//'):
            item['img'] = 'http:' + item['img']

        # Name the file after the last path segment of the URL.
        filename = os.path.join('download', item['img'].split('/')[-1])
        response = requests.get(item['img'])
        with open(filename, 'wb') as f:
            f.write(response.content)
        return item
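
For the pipeline to fire at all, it has to be registered in settings.py. A minimal sketch, assuming the project package is called tuchong_project (substitute your own module path); disabling robots.txt checks is also common when hitting an API endpoint like this one:

# settings.py (excerpt) -- module path is hypothetical, adjust to your project
ITEM_PIPELINES = {
    'tuchong_project.pipelines.TuchongProjectPipeline': 300,
}
ROBOTSTXT_OBEY = False  # the search API is not a regular page

After that, scrapy crawl tuchong runs the whole chain, and the images land in the download/ directory created by the pipeline.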
