tuchong.py代码
# -*- coding: utf-8 -*-
import scrapy
import json
import re
from ..items import TuchongItem
class TuchongSpider(scrapy.Spider):
    """Collect image URLs from the stock.tuchong.com free-image search.

    Request flow:

    1. ``parse`` schedules one search-API request per result page.
    2. ``parse_xq`` reads the image ids out of each JSON result page and
       schedules a request for every image's detail page.
    3. ``parse_img`` extracts the ``<img src>`` of the detail page and
       yields it as a ``TuchongItem``.
    """

    name = 'tuchong'
    allowed_domains = ['tuchong.com']
    # The response to this URL only triggers ``parse``; the result pages that
    # are actually processed are requested from there.
    start_urls = ['https://stock.tuchong.com/api/free/search/?term=%E9%A3%8E%E6%99%AF&size=100&page=2']

    # Percent-encoded search term sent to the API and to the detail pages.
    search_term = '%E9%A3%8E%E6%99%AF'
    # Number of hits requested per result page.
    page_size = 100
    # Inclusive range of result pages to crawl.
    first_page = 1
    last_page = 2

    search_api = 'https://stock.tuchong.com/api/free/search/?term={term}&size={size}&page={page}'
    detail_url = 'https://stock.tuchong.com/free/image/?imageId={image_id}&term={term}'

    def parse(self, response):
        """Schedule a search-API request for every configured result page.

        The start response itself is not inspected.
        """
        for page in range(self.first_page, self.last_page + 1):
            page_url = self.search_api.format(
                term=self.search_term, size=self.page_size, page=page)
            yield scrapy.Request(page_url, self.parse_xq)

    def parse_xq(self, response):
        """Schedule a detail-page request for every hit of one result page.

        The body is expected to be JSON shaped like
        ``{"data": {"hits": [{"imageId": ...}, ...]}}``. Pages that cannot
        be decoded or contain no hits are logged and skipped.
        """
        try:
            payload = json.loads(response.text)
        except ValueError:
            self.logger.error('Search API returned a non-JSON body: %s', response.url)
            return

        data = payload.get('data') if isinstance(payload, dict) else None
        hits = data.get('hits') if isinstance(data, dict) else None
        if not hits:
            self.logger.warning('No hits in search result: %s', response.url)
            return

        # Iterate over the hits that are really there: a fixed index range
        # would skip the first hit and fail on a short (e.g. last) page.
        for hit in hits:
            image_id = hit.get('imageId') if isinstance(hit, dict) else None
            if not image_id:
                continue
            self.logger.debug('Found image id %s', image_id)
            img_url = self.detail_url.format(image_id=image_id, term=self.search_term)
            yield scrapy.Request(img_url, self.parse_img)

    def parse_img(self, response):
        """Yield a ``TuchongItem`` holding the detail page's image source.

        The ``src`` value is passed on exactly as found in the page. Pages
        without a matching ``<img>`` are logged and produce no item.
        """
        # Take the first match only; concatenating several matches would
        # produce an unusable URL.
        src = response.xpath('//*[@id="image-detail"]/div/div[2]/img/@src').extract_first()
        if not src:
            self.logger.warning('No image found on detail page: %s', response.url)
            return

        item = TuchongItem()
        item['img'] = src
        yield item
items.py文件代码
class TuchongItem(scrapy.Item):
    """Scraped record for a single image."""

    # Source URL of the image.
    img = scrapy.Field()
pipelines.py文件代码
from .mysqlhelper import MysqlHelper
import os
class TuchongProjectPipeline(object):
    """Item pipeline that saves each item's image to a local directory."""

    # Directory (relative to the working directory) the images are saved in.
    download_dir = 'download'
    # Seconds to wait for the image server before giving up.
    request_timeout = 30

    @staticmethod
    def _absolute_url(src):
        """Return ``src`` as an absolute URL.

        Protocol-relative sources (``//host/path``) get the ``http:``
        scheme, sources that already carry a scheme are returned unchanged,
        and anything else is treated as ``host/path``.
        """
        if src.startswith('//'):
            return 'http:' + src
        if '://' in src:
            return src
        return 'http://' + src

    @staticmethod
    def _filename_from_url(url):
        """Return the last path segment of ``url`` without query/fragment."""
        path = url.split('?', 1)[0].split('#', 1)[0]
        return path.rstrip('/').split('/')[-1]

    def process_item(self, item, spider):
        """Download ``item['img']`` and store it under ``download_dir``.

        ``item['img']`` is replaced by the absolute URL that was requested.
        Items without a usable image URL and failed downloads are logged
        and passed on untouched, so one bad image does not stop the crawl.

        Returns:
            The item, so that later pipeline stages keep receiving it.
        """
        src = item.get('img')
        if not src:
            spider.logger.warning('Item has no image URL; nothing to download')
            return item

        url = self._absolute_url(src)
        filename = self._filename_from_url(url)
        if not filename:
            spider.logger.warning('Cannot derive a file name from %s', url)
            return item
        item['img'] = url

        # Imported lazily so that items without an image never need it.
        # NOTE: requests blocks while downloading; Scrapy's built-in
        # ImagesPipeline is the non-blocking alternative.
        import requests

        try:
            response = requests.get(url, timeout=self.request_timeout)
            # Do not save an HTTP error page as if it were the image.
            response.raise_for_status()
        except requests.RequestException as exc:
            spider.logger.error('Failed to download %s: %s', url, exc)
            return item

        if not os.path.isdir(self.download_dir):
            os.makedirs(self.download_dir)
        with open(os.path.join(self.download_dir, filename), 'wb') as f:
            f.write(response.content)
        return item