Persistent crawling of Jitashe (吉他社) guitar tabs with scrapy-redis

I built a navigation site (the domain is the full pinyin of 挂路灯: gualudeng.com) that curates sites for movies, anime, tech tricks, practical tools, and funny content. Feel free to click through and take a look.

1. Spider file

# -*- coding: utf-8 -*-
import scrapy
import copy

from gtshe.items import GtsheItem

class MusicSpider(scrapy.Spider):
	name = 'music'
	allowed_domains = ['jitashe.org']
	start_urls = ["https://www.jitashe.org"]
	# Cookie string copied from the browser after logging in; parsed into a dict for scrapy.Request
	cookie = "yGhj_40fe_saltkey=IDLlBPKk; yGhj_40fe_lastvisit=1551500410; yGhj_40fe_auth=e9f8%2FuHsl%2BbAhP%2BGint%2FUgLktBBjjf3EvlF0TXj4ZXWHe4Z%2Bcbge1LRi21zG6TL19UdsJLoP8sZmZAE%2B3iutAYxvfGg; yGhj_40fe_lastcheckfeed=644660%7C1551504015; yGhj_40fe_lip=60.176.42.168%2C1551504015; yGhj_40fe_pushuid=8430419; yGhj_40fe_pushgid=49982; yGhj_40fe_connect_is_bind=1; yGhj_40fe_st_p=644660%7C1551540151%7C13551060efe5a9679df4c4e9e02ed7a1; yGhj_40fe_viewid=tid_1336137; yGhj_40fe_ulastactivity=1551587332%7C0; yGhj_40fe_checkpm=1; yGhj_40fe_noticeTitle=1; Hm_lvt_4ad169a3774e8f5be3c7945513632bde=1551504009,1551515651,1551522942,1551587351; Hm_lpvt_4ad169a3774e8f5be3c7945513632bde=1551587351; yGhj_40fe_lastact=1551587332%09misc.php%09patch"
	cookies = {i.split("=", 1)[0]: i.split("=", 1)[1] for i in cookie.split("; ")}
	def start_requests(self):
		yield scrapy.Request(
			self.start_urls[0],
			callback = self.parse,
			cookies=self.cookies
		)

	def parse(self, response):
		# two listing pages: newest tabs and hottest tabs
		new = "https://www.jitashe.org/guide/newtab/t1/"
		hot = "https://www.jitashe.org/guide/hottab/t1/"
		item = GtsheItem()
		item['cat'] = "new"
		yield scrapy.Request(
			new,
			callback=self.parse1,
			meta={'item': copy.deepcopy(item)},
			cookies=self.cookies
		)

		item['cat'] = "hot"
		yield scrapy.Request(
			hot,
			callback=self.parse1,
			meta={'item': copy.deepcopy(item)},
			cookies=self.cookies  # cookies must be carried on every request so a resumed crawl stays logged in
		)
	
	
	def parse1(self, response):
		url_list = ["https://www.jitashe.org" + i for i in response.xpath("//a[@class='title']/@href").extract()]
		name_list = response.xpath("//a[@class='title']/text()").extract()
		next_item = copy.deepcopy(response.meta['item'])
		print(url_list)
		for index, url in enumerate(url_list):
			item = response.meta['item']
			item['name'] = name_list[index]
			yield scrapy.Request(
				url=url,
				meta={'item': copy.deepcopy(item)},
				callback=self.parse2,
				cookies=self.cookies  # cookies must be sent with every request so the persisted crawl stays logged in
			)

		# follow the pagination link to the next listing page
		next_url = response.xpath('//a[@class="nxt"]/@href').extract()
		if len(next_url) != 0:
			yield scrapy.Request(
				url="https://www.jitashe.org/" + next_url[0],
				meta={'item': copy.deepcopy(next_item)},
				callback=self.parse1,
				cookies=self.cookies
			)
		
	def parse2(self, response):
		url = response.xpath("//a[@id='gtp_download']/@href").extract_first()
		print(url)
		if url is not None:
			gtp_url = "https://www.jitashe.org" + url
			item = response.meta['item']
			item['url'] = gtp_url
			print(gtp_url)
			print("Start downloading: " + item['name'])
			# the yielded item is picked up by GtshePipeline (a FilesPipeline), which downloads item['url']
			yield item
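
With the scrapy-redis scheduler configured below, both the pending request queue and the set of seen request fingerprints live in Redis, so the crawl can be stopped and resumed at any point. Here is a minimal sketch for checking that persisted state with redis-py; it assumes the default scrapy-redis key names ("<spider name>:requests" and "<spider name>:dupefilter") and the REDIS_URL from the settings file:

# inspect_state.py - a sketch, assuming the default scrapy-redis key names
import redis

r = redis.Redis.from_url("redis://127.0.0.1:6379")

# pending requests are kept in a sorted set named "<spider name>:requests"
print("pending requests:", r.zcard("music:requests"))
# fingerprints of already-seen requests are kept in a set named "<spider name>:dupefilter"
print("seen fingerprints:", r.scard("music:dupefilter"))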
	

2. Settings file

# -*- coding: utf-8 -*-
# scrapy-redis settings: route scheduling and dedup fingerprints through Redis

DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True  # keep the queue and fingerprints in Redis when the spider closes, so the crawl can resume
REDIS_URL = "redis://127.0.0.1:6379"


BOT_NAME = 'gtshe'
SPIDER_MODULES = ['gtshe.spiders']
NEWSPIDER_MODULE = 'gtshe.spiders'
#COOKIES_DEBUG=True
#LOG_LEVEL="WARNING"
USER_AGENT_LIST = [
	"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
	"Opera/8.0 (Windows NT 5.1; U; en)",
	"Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
	"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
	"Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
	"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
	"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
	"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
	"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
	"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
	"Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
	"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
	"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
	"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
	"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
	"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
	"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
	"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
	"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
	"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36"
]

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'gtshe (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# local directory where the FilesPipeline (GtshePipeline) stores the downloaded tab files
FILES_STORE = "G:/Eclipse_p/scrapy/gtshe/gtp_forum"
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 1


# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
   'Accept-Language': 'en',
}



DOWNLOADER_MIDDLEWARES = {
    'gtshe.middlewares.GtsheDownloaderMiddleware': 543,
}

ITEM_PIPELINES = {
    'gtshe.pipelines.GtshePipeline': 300,
}
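
Because SCHEDULER_PERSIST is True, the Redis queue and fingerprint set are kept when the spider stops, so re-running the crawl picks up where it left off (a single Ctrl+C lets Scrapy finish in-flight requests and shut down cleanly). A small launcher sketch using Scrapy's cmdline helper; the spider name "music" matches the spider above:

# run.py - convenience launcher; re-running it after an interruption resumes from the Redis queue
from scrapy.cmdline import execute

execute(["scrapy", "crawl", "music"])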

3. Item file

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class GtsheItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()   # tab title
    url = scrapy.Field()    # direct download link for the GTP file
    cat = scrapy.Field()    # listing category: "new" or "hot"
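
The item carries these three pieces of state along the request chain. As an illustration, this is roughly what one populated item looks like by the time it reaches the pipeline (the field values below are made up):

# illustrative only - the values are not real data
from gtshe.items import GtsheItem

item = GtsheItem()
item['cat'] = "new"                                  # set in parse()
item['name'] = "Example tab title"                   # set in parse1()
item['url'] = "https://www.jitashe.org/example-url"  # set in parse2()
print(dict(item))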

4. Downloader middleware

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
import random


class GtsheDownloaderMiddleware(object):
    # Only process_request is needed here: attach a random User-Agent to every outgoing request.
    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(spider.settings.get('USER_AGENT_LIST'))
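
A quick standalone check of what process_request does to a request; the short list here is a stand-in for the USER_AGENT_LIST in settings.py:

# check_ua.py - a sketch of the middleware's effect on a single request
import random
from scrapy.http import Request

USER_AGENT_LIST = ["UA-1", "UA-2", "UA-3"]  # stand-in for the real list in settings.py

req = Request("https://www.jitashe.org")
req.headers['User-Agent'] = random.choice(USER_AGENT_LIST)
print(req.headers.get('User-Agent'))  # prints one of the stand-in strings (as bytes)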
        

5. Pipeline file

# -*- coding: utf-8 -*-

import scrapy
import os
from scrapy.utils.misc import md5sum
from scrapy.pipelines.files import FilesPipeline
try:
    from cStringIO import StringIO as BytesIO
except ImportError:
    from io import BytesIO


class GtshePipeline(FilesPipeline):

	def get_media_requests(self, item, spider):
		# request the download link collected by the spider; pass the item along in meta
		yield scrapy.Request(item['url'], meta={'item': item})

	def file_path(self, request, response=None, info=None):
		# store each file under a sub-directory named after its category ("new" or "hot");
		# the actual file name is appended in file_downloaded below
		item = request.meta['item']
		return item['cat'] + "/"


	# The download URL does not reveal the file name, so it is read from the
	# Content-Disposition response header instead.
	def file_downloaded(self, response, request, info):
		path = self.file_path(request, response=response, info=info)
		file_name = response.headers.get('Content-Disposition')
		if file_name is None:
			# no Content-Disposition header: shut down the whole process
			print("Shutting down the crawler: no Content-Disposition header")
			os._exit(0)

		path = path + str(file_name, 'utf-8').split("\"")[1]
		buf = BytesIO(response.body)
		checksum = md5sum(buf)
		buf.seek(0)
		self.store.persist_file(path, buf, info)
		return checksum
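
For reference, the split("\"")[1] above relies on the header having the usual attachment form; a small sketch of the extraction with a made-up header value:

# sketch of the filename extraction used in file_downloaded (header value is made up)
header = b'attachment; filename="example_song.gp5"'

file_name = str(header, 'utf-8').split("\"")[1]
print(file_name)  # example_song.gp5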
		
