Crawling Tuchong landscape photos with the Scrapy framework

The spider pages through Tuchong's free-stock search API, follows each result's detail page to pull out the image URL, and an item pipeline downloads the files.

tuchong.py code

# -*- coding: utf-8 -*-
import scrapy
import json

from ..items import TuchongItem


class TuchongSpider(scrapy.Spider):
    name = 'tuchong'
    allowed_domains = ['tuchong.com']
    # Free-stock search API; term=%E9%A3%8E%E6%99%AF is the URL-encoded keyword "风景" (landscape).
    # start_urls only kicks off parse(); the actual page range is set there.
    start_urls = ['https://stock.tuchong.com/api/free/search/?term=%E9%A3%8E%E6%99%AF&size=100&page=2']
    # Browser-facing search page for reference:
    # https://stock.tuchong.com/search?term=%E6%B0%B4%E6%9E%9C&page=2

    def parse(self, response):
        # Adjust this range to set how many pages you want to crawl.
        for page in range(1, 3):
            next_url = ('https://stock.tuchong.com/api/free/search/'
                        '?term=%E9%A3%8E%E6%99%AF&size=100&page={}').format(page)
            yield scrapy.Request(next_url, callback=self.parse_xq)

    def parse_xq(self, response):
        # The API answers with JSON; every hit carries an imageId.
        # Iterating the hits list directly avoids skipping index 0 and avoids
        # an IndexError when a page returns fewer than 100 results.
        data = json.loads(response.text)
        for hit in data['data']['hits']:
            image_id = hit['imageId']
            # Detail page, e.g.
            # https://stock.tuchong.com/free/image/?imageId=198471649997357066&term=%E9%A3%8E%E6%99%AF
            img_url = ('https://stock.tuchong.com/free/image/'
                       '?imageId={}&term=%E9%A3%8E%E6%99%AF').format(image_id)
            yield scrapy.Request(img_url, callback=self.parse_img)

    def parse_img(self, response):
        # Grab the preview image's src: //*[@id="image-detail"]/div/div[2]/img
        item = TuchongItem()
        img = response.xpath('//*[@id="image-detail"]/div/div[2]/img/@src').extract()
        item['img'] = ''.join(img)
        yield item
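
Before wiring everything into Scrapy, it can help to sanity-check the API's JSON shape with a one-off requests call; a minimal sketch, assuming the endpoint still returns the data → hits → imageId structure the spider relies on:

import requests

# One-off check of the search API's JSON shape (same endpoint as the spider;
# the field names below come from the spider's code, not from official docs).
url = ('https://stock.tuchong.com/api/free/search/'
       '?term=%E9%A3%8E%E6%99%AF&size=100&page=1')
data = requests.get(url).json()
hits = data['data']['hits']
print(len(hits))           # number of results on this page (up to size=100)
print(hits[0]['imageId'])  # the id parse_xq() turns into a detail-page URL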


items.py code

import scrapy


class TuchongItem(scrapy.Item):
    # Direct URL of the image file, filled in by the spider's parse_img().
    img = scrapy.Field()
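
Declared fields are the only keys an Item accepts: scrapy.Item instances otherwise behave like dicts, which is how the spider writes img and the pipeline reads it back. A quick illustration (the value is hypothetical, and the commented line only makes the constraint visible):

item = TuchongItem()
item['img'] = '//example.com/pic.webp'  # hypothetical value, for illustration only
print(item['img'])
# item['title'] = 'x'  # would raise KeyError: 'title' is not a declared Field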

pipelines.py code

import os

import requests


class TuchongProjectPipeline(object):
    def process_item(self, item, spider):
        # Create the target directory on first use.
        os.makedirs('download', exist_ok=True)

        # The scraped src is protocol-relative ("//host/path..."), so prepend a scheme.
        if item['img'].startswith('//'):
            item['img'] = 'http:' + item['img']

        # Name the file after the last path segment of the URL.
        filename = os.path.join('download', item['img'].split('/')[-1])
        response = requests.get(item['img'])
        with open(filename, 'wb') as f:
            f.write(response.content)
        return item
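
For the pipeline to fire at all, it has to be registered in settings.py. A minimal sketch, assuming the project package is called tuchong_project (substitute your own module path); disabling robots.txt checks is also common when hitting an API endpoint like this one:

# settings.py (excerpt) -- module path is hypothetical, adjust to your project
ITEM_PIPELINES = {
    'tuchong_project.pipelines.TuchongProjectPipeline': 300,
}
ROBOTSTXT_OBEY = False  # the search API is not a regular page

After that, scrapy crawl tuchong runs the whole chain, and the images land in the download/ directory created by the pipeline.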
