Scrapy: Common Techniques

Middleware in Brief

  • middlewares.py
  • DOWNLOADER_MIDDLEWARES
  • Middleware sits between the Spider and the Downloader
from scrapy import signals
from fake_useragent import UserAgent

class RandomUserAgentMiddleware(object):
    def __init__(self, user_agent):
        self.user_agent = user_agent
        self.ua = UserAgent()

    @classmethod
    def from_crawler(cls, crawler):
        # Build the middleware from the crawler settings
        s = cls(crawler.settings['USER_AGENT'])
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

    def process_request(self, request, spider):
        # Before the request is sent: set a random User-Agent
        request.headers['User-Agent'] = self.ua.random
        # Route the request through a proxy
        request.meta['proxy'] = 'http://127.0.0.1:9743'

    def process_response(self, request, response, spider):
        # After the download: tweak the status code, then pass the response on to the spider
        response.status = 201
        return response  # must return a Response (or a Request)

    def process_exception(self, request, exception, spider):
        # Called when the download raises an exception
        print('======>', repr(exception))
        return None  # or a Response / Request
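
To take effect, a downloader middleware has to be registered under DOWNLOADER_MIDDLEWARES in settings.py. A minimal sketch is shown below; the module path 'myproject.middlewares' and the priority value are assumptions, so adjust them to your project.

# settings.py -- enable the downloader middleware (module path and priority are assumptions)
USER_AGENT = 'Mozilla/5.0'
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomUserAgentMiddleware': 543,
}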
  • middlewares.py
  • SPIDER_MIDDLEWARES
from scrapy.exceptions import CloseSpider

class Close_spider(object):
    def process_spider_input(self, response, spider):
        # Called before the response is handed to the spider
        if not 200 <= response.status < 300:
            print('failed url(%s) status(%s)' % (response.url, response.status))
            # raise CloseSpider('%s spider error, shutting down!' % response.url)
        return None

    def process_spider_output(self, response, result, spider):
        # Called after the spider has processed the response and returned its results
        for res in result:
            yield res  # pass each Request / Item through unchanged

    def process_spider_exception(self, response, exception, spider):
        # Called when the spider or another middleware raises an exception
        print('======>', repr(exception))
        return None  # or an iterable of Request / Item objects
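
Likewise, a spider middleware is registered under SPIDER_MIDDLEWARES in settings.py. A minimal sketch, with the module path and priority as assumptions:

# settings.py -- enable the spider middleware (module path and priority are assumptions)
SPIDER_MIDDLEWARES = {
    'myproject.middlewares.Close_spider': 543,
}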

Database Connections

  • pipelines.py
  • ITEM_PIPELINES

MySQL

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

class Pipeline_MySql(object):
    def __init__(self, user, password, port, database, charset):
        self.user = user
        self.password = password
        self.port = port
        self.database = database
        self.charset = charset

    # Build the pipeline from the settings
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
            port=crawler.settings.get('MYSQL_PORT'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            charset=crawler.settings.get('MYSQL_CHARSET')
        )

    # Called when the spider opens: build the engine and connect to the database
    def open_spider(self, spider):
        conn_str = 'mysql+pymysql://{user}:{password}@127.0.0.1:{port}/{database}?charset={charset}'
        self.engine = create_engine(conn_str.format(
            user=self.user,
            password=self.password,
            port=self.port,
            database=self.database,
            charset=self.charset)
        )
        self.session = sessionmaker(bind=self.engine)()

    # Called when the spider closes: disconnect from the database
    def close_spider(self, spider):
        self.session.close()

    # Process each item: append it to the table and return it
    def process_item(self, item, spider):
        # wrap the item in a one-row DataFrame so to_sql() can append it
        pd.DataFrame([dict(item)]).to_sql('tbname', con=self.engine, if_exists='append', index=False)
        return item
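
The pipeline above reads its connection details from settings.py and, like any pipeline, must be registered under ITEM_PIPELINES. A minimal sketch; all values and the module path 'myproject.pipelines' are assumptions:

# settings.py -- example values are assumptions
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'secret'
MYSQL_PORT = 3306
MYSQL_DATABASE = 'scrapy_db'
MYSQL_CHARSET = 'utf8mb4'

ITEM_PIPELINES = {
    'myproject.pipelines.Pipeline_MySql': 300,
}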

MongoDB

import pymongo

class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    # Build the pipeline from the settings
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        # Called when the spider starts
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        # Called just before the spider closes
        self.client.close()

    def process_item(self, item, spider):
        # Handle a single scraped item; item.collection names the target collection
        self.db[item.collection].insert_one(dict(item))
        return item
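
For reference, a minimal sketch of the matching settings and of an item class carrying the collection attribute the pipeline reads; the values, the module path, and the item/field names are assumptions:

# settings.py -- example values are assumptions
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DB = 'scrapy_db'
ITEM_PIPELINES = {
    'myproject.pipelines.MongoPipeline': 300,
}

# items.py -- the pipeline uses item.collection to pick the collection
import scrapy

class ProductItem(scrapy.Item):
    collection = 'products'  # hypothetical collection name (plain class attribute, not a Field)
    name = scrapy.Field()
    price = scrapy.Field()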

Selenium Integration

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from scrapy.http import HtmlResponse
from logging import getLogger

class SeleniumMiddleware(object):
    def __init__(self, timeout=None):
        self.logger = getLogger(__name__)
        self.timeout = timeout
        self.bw = webdriver.Chrome()
        self.bw.set_window_size(1400, 700)
        self.bw.set_page_load_timeout(self.timeout)
        self.wait = WebDriverWait(self.bw, self.timeout)

    def __del__(self):
        self.bw.close()

    @classmethod
    def from_crawler(cls, crawler):
        return cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'))

    def process_request(self, request, spider):
        self.logger.debug('Chrome is starting')
        try:
            # Render the page in Chrome and wait until the element with id 'q' is present
            self.bw.get(request.url)
            self.wait.until(EC.presence_of_element_located((By.ID, 'q')))
            return HtmlResponse(url=request.url, body=self.bw.page_source, request=request, encoding='utf-8', status=200)
        except TimeoutException as e:
            print(repr(e))
            return HtmlResponse(url=request.url, status=500, request=request)
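
The middleware above pulls its timeout from a SELENIUM_TIMEOUT setting and, as a downloader middleware, is enabled via DOWNLOADER_MIDDLEWARES. A minimal sketch; the timeout value, module path, and priority are assumptions:

# settings.py -- enable the Selenium middleware (values and module path are assumptions)
SELENIUM_TIMEOUT = 20
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.SeleniumMiddleware': 543,
}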

Docker Integration
