首先创建一个爬虫文件dgrds.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy_redis.spiders import RedisSpider
# Distributed spider for douguo.com recipe pages, driven by scrapy-redis:
# start URLs are popped from a shared Redis list instead of start_urls.
class DgrdsSpider(RedisSpider):
# Spider name used by `scrapy crawl dgrds`.
name = 'dgrds'
# Redis list key that seeds the crawl (LPUSH urls here to start it).
redis_key = 'dgrds:start_urls'
def parse(self, response):
    """Fan out one detail-page request per recipe id in a fixed range.

    The seed response itself is not inspected; this callback only
    schedules cookbook detail pages 2499930..2499939, each handled
    by :meth:`parse2`.
    """
    for recipe_id in range(2499930, 2499940):
        detail_url = f'https://www.douguo.com/cookbook/{recipe_id}.html'
        yield scrapy.Request(detail_url, callback=self.parse2)
def parse2(self, response):
if (response.status == 200):
title = response.css('.rinfo h1.title::text').get('')
view_nums = response.css('.vcnum span:first-of-type::text').get('')
collection_nums = response.css('.vcnum .collectnum::text').get('')
user_name = response.css('.author-info .nickname::text').get('')
user_image = response.css('.author-img img::attr(src)').get('')
tags = ''
tag_arr = response.css('.fenlei span')
if tag_arr is not None:
for tg in tag_arr:
tags += ';' + tg.css('a::text').get('')
basic_url = ''
youku = ''
id = 0
isvideo = response.css('#banner + a')
if isvideo is not None:
next_url = response.css('#banner + a::attr(href)').get('')
id = next_url.replace('/recipevideo/', '')
basic_url = 'https://www.douguo.com/cookbook/' + id + '.html'
item = {
'cate': '',
'title': title,
'view_nums': view_nums,
'collection_nums': collection_nums,
'user_name': user_name,
'user_image': user_image,
'tags': tags,
'basic_url