Fenghuo Programming -- Using the Scrapy Crawler Framework

Basic usage of Scrapy

Installing Scrapy

pip install pyopenssl
pip install Twisted
pip install scrapy

Creating a project

Open a shell in the directory where the project should live:

scrapy startproject <project_name>
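
For reference, startproject generates a layout roughly like the following (everything except <project_name> is the standard set of files Scrapy creates):

<project_name>/
    scrapy.cfg            # deployment configuration
    <project_name>/
        __init__.py
        items.py          # Item definitions
        middlewares.py    # downloader / spider middlewares
        pipelines.py      # item pipelines
        settings.py       # project settings
        spiders/          # spider modules go here
            __init__.py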

Creating a spider

Change into the project directory:

cd <project_name>
scrapy genspider <spider_name> "<target_domain>"
scrapy genspider -t crawl <spider_name> "<target_domain>"
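
As a rough sketch, the basic template produces something like the following (the class name, domain, and start URL depend on the arguments you passed to genspider):

import scrapy

class ExampleSpider(scrapy.Spider):
    name = "example"                      # the <spider_name> passed to genspider
    allowed_domains = ["example.com"]     # the target domain
    start_urls = ["http://example.com/"]  # first requests are built from these URLs

    def parse(self, response):
        # parse the response and yield items or follow-up requests here
        pass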

Launching with a manager script

Add a manager.py module in the directory that has the same name as the project:

from scrapy.cmdline import execute

if __name__ == '__main__':
    execute('scrapy crawl <spider_name>'.split())

Closing the spider from inside the spider

spider.crawler.engine.close_spider(spider, "closing the spider proactively")
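
As a minimal sketch of where that call might live (inside a parse callback, assuming the spider decides the page has nothing left worth scraping):

import scrapy

class ExampleSpider(scrapy.Spider):
    name = "example"

    def parse(self, response):
        if not response.xpath('//li'):
            # nothing useful on the page, shut this spider down
            self.crawler.engine.close_spider(self, "closing the spider proactively")
            return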

Settings module configuration

USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"
ROBOTSTXT_OBEY = False  # do not obey robots.txt
SCHEDULER_PERSIST = True  # allow pausing/resuming (keep the request queue)
DOWNLOAD_DELAY = 3  # download delay in seconds
# distributed dupe filter (scrapy-redis)
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# distributed scheduler (scrapy-redis)
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
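
To actually run the pipelines shown below, they must be registered in ITEM_PIPELINES (lower numbers run earlier); when scrapy-redis is used, the Redis connection is usually configured here as well. A minimal sketch, with illustrative class paths and priority numbers (the project name FxtDataAcquisition is taken from the spider example later in this post):

ITEM_PIPELINES = {
    'FxtDataAcquisition.pipelines.DataSourcePipeline': 100,  # runs first
    'FxtDataAcquisition.pipelines.JsonPipeline': 300,
    'FxtDataAcquisition.pipelines.MongoPipeline1': 400,
}

# connection used by the scrapy-redis scheduler / dupe filter
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379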

Pipelines

from datetime import datetime
import json

import pymongo
import redis
from scrapy.exporters import CsvItemExporter


class DataSourcePipeline(object):
    """Pipeline that adds data-source information; it should run first."""
    def process_item(self, item, spider):
        item['data_source'] = spider.name
        item['data_time'] = str(datetime.utcnow())
        return item
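
The pipelines above and below assume an Item that declares the fields being written. A minimal sketch of such an items.py (field names collected from the pipelines and spiders in this post):

import scrapy

class Item(scrapy.Item):
    city_name = scrapy.Field()
    date = scrapy.Field()
    data_source = scrapy.Field()
    data_time = scrapy.Field()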


class JsonPipeline(object):
    """存json文件的管道"""
    def open_spider(self, spider):
        self.file = open("aqi.json", 'w')

    def process_item(self, item, spider):
        str_item = json.dumps(dict(item)) + '\n'
        self.file.write(str_item)
        return item

    def close_spider(self, spider):
        self.file.close()

class CsvPipeline(object):
    """Pipeline that writes items to a CSV file."""
    def open_spider(self, spider):
        # CsvItemExporter expects a file opened in binary mode
        self.file = open("aqi.csv", 'wb')
        self.csv_writer = CsvItemExporter(self.file)
        self.csv_writer.start_exporting()

    def process_item(self, item, spider):
        self.csv_writer.export_item(item)
        return item

    def close_spider(self, spider):
        # finish exporting before closing the underlying file
        self.csv_writer.finish_exporting()
        self.file.close()

class RedisPipeline(object):
    """Pipeline that pushes items into a Redis list."""
    def open_spider(self, spider):
        self.client = redis.Redis("127.0.0.1", 6379)

    def process_item(self, item, spider):
        # Redis cannot store a dict directly, so serialize it first
        self.client.lpush("AQI_List", json.dumps(dict(item)))
        return item

class MongoPipeline1(object):
    """Pipeline that writes items to MongoDB."""
    def open_spider(self, spider):
        self.client = pymongo.MongoClient("127.0.0.1", 27017)
        self.db = self.client['MongoAQI']
        self.collections = self.db['aqi']

    def process_item(self, item, spider):
        self.collections.insert_one(dict(item))
        return item

class MongoPipeline2(object):
    """Pipeline that upserts items into MongoDB, configured from settings."""
    collection_name = 'users'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # replace the document if it exists, insert it otherwise (upsert)
        self.db[self.collection_name].replace_one(
            {'url_token': item['url_token']}, dict(item), upsert=True)
        return item
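
MongoPipeline2 reads its connection parameters from settings.py; a minimal sketch of the corresponding entries (the URI and database name are placeholders):

MONGO_URI = 'mongodb://127.0.0.1:27017'
MONGO_DATABASE = 'MongoAQI'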

The from_crawler(cls, crawler) method

Initializes the pipeline from values in the settings; the values are passed straight into __init__.

def __init__(self, mongo_uri, mongo_db):
    self.mongo_uri = mongo_uri
    self.db = mongo_db

@classmethod
def from_crawler(cls, crawler):
    return cls(
        mongo_uri=crawler.settings.get("MONGO_URI"),
        mongo_db=crawler.settings.get("DB", "items_{}".format(datetime.now().date()))
    )

The process_item(self, item, spider) method

Intercepts each item before it reaches the storage pipelines so it can be processed first, e.g. data cleaning or deduplication.
It must return the item (or raise DropItem to discard it).
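
A minimal sketch of such a cleaning/deduplication pipeline (the city_name key and the in-memory set are assumptions for illustration):

from scrapy.exceptions import DropItem


class DedupPipeline(object):
    """Drop items whose city_name has already been seen (illustrative only)."""
    def __init__(self):
        self.seen = set()

    def process_item(self, item, spider):
        key = item.get('city_name')
        if key in self.seen:
            raise DropItem("duplicate item: {}".format(key))
        self.seen.add(key)
        return item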

Middlewares

import time

import scrapy
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
from selenium import webdriver


class ChromeMiddleware(UserAgentMiddleware):
    def __init__(self, user_agent):
        super().__init__()
        self.user_agent = user_agent

    @classmethod
    def from_crawler(cls, crawler):
        return cls(user_agent=crawler.settings.get('USER_AGENT_LIST'))

    def process_request(self, request, spider):
        # render JavaScript-heavy pages with Chrome instead of the default downloader
        url = request.url
        if url == "https://www.aqistudy.cn/historydata/":
            driver = webdriver.Chrome()
            driver.get(url)
            time.sleep(2)
            data = driver.page_source
            driver.quit()
            # returning an HtmlResponse short-circuits the normal download
            return scrapy.http.HtmlResponse(
                url=url, body=data.encode("utf-8"), encoding="utf-8", request=request)

process_exception(self, request, exception, spider)

Catches download exceptions so they can be handled, e.g. by attaching a proxy and re-scheduling the request:

def process_exception(self, request, exception, spider):
    request.meta["proxy"] = "http://ip:port"
    return request  # the returned request goes back into the scheduler

Serial spider

import scrapy
from FxtDataAcquisition.items import Item


class AqiSpider(scrapy.Spider):
    name = 'aqi'
    allowed_domains = []
    start_urls = []
    # pipelines specific to this spider
    custom_settings = {
        'ITEM_PIPELINES': {
            'FxtDataAcquisition.pipelines.CityPipeline': 300,
        }
    }

    def parse(self, response):
        names_list = response.xpath('//li/text()').extract()
        links_list = response.xpath('//li/a/@href').extract()

        for link, name in zip(links_list, names_list):
            # create a fresh item per request so the requests do not share one instance
            item = Item()
            item['city_name'] = name
            url = 'https://www.aqistudy.cn/historydata/' + link
            yield scrapy.Request(url=url, meta={"itemkey": item}, callback=self.detail_parse)

    def detail_parse(self, response):
        # pull the item back out of the request meta
        item = response.meta['itemkey']

        # iterate over all tr rows
        tr_list = response.xpath('//tr')
        for tr in tr_list:
            item['date'] = tr.xpath('./td[1]/text()').extract_first()
            yield item

Parallel CrawlSpider

# coding: utf-8
import scrapy
from Xxx.items import Item
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class XxxSpider(CrawlSpider):
    name = ''
    allowed_domains = []
    start_urls = []
    # items cannot be passed along between rules, so they cannot be matched up;
    # usually only the last level of pages is parsed
    # by default, follow=False once a callback is given
    rules = (
        # extract the second-level URLs
        Rule(LinkExtractor(allow=r"month\.php\?city=")),

        # extract the third-level URLs
        Rule(LinkExtractor(allow=r"day\.php\?city="), callback="day_parse", follow=False),
    )

    def day_parse(self, response):
        item = Item()
        tr_list = response.xpath('//tr')
        for tr in tr_list:
            item['date'] = tr.xpath('./td[1]/text()').extract_first()
            yield item

make_requests_from_url(self, url)

In older Scrapy versions this spider method builds the Request for each start URL; it can be overridden to customize those requests, for example to set a per-request timeout:

# set a per-request timeout
yield scrapy.Request(url=url, meta={"download_timeout": 10})

Variables that need to be accessed globally can be defined in the spider's __init__ method and then reached through the spider object,

e.g. a browser instance (see the sketch below).
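
A minimal sketch of that pattern, assuming a Selenium Chrome driver is the shared resource (middlewares can then reach it via spider.browser):

import scrapy
from selenium import webdriver


class BrowserSpider(scrapy.Spider):
    name = 'browser_spider'
    start_urls = []

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # shared browser instance, reachable wherever the spider object is available
        self.browser = webdriver.Chrome()

    def closed(self, reason):
        # called when the spider closes; release the browser
        self.browser.quit()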

Content parsing

XPath and CSS parsing reference:

https://blog.csdn.net/mouday/article/details/80455560
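
For quick orientation, a minimal sketch of the two selector styles on a Scrapy response (the selectors are illustrative):

# XPath: text of every <li>, and href of every <a> inside an <li>
names = response.xpath('//li/text()').extract()
links = response.xpath('//li/a/@href').extract()

# CSS: the same data expressed with CSS selectors
names = response.css('li::text').extract()
links = response.css('li a::attr(href)').extract()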

webdriver actions and parsing:

https://blog.csdn.net/qq_38284543/article/details/75267168
https://blog.csdn.net/fkew2009/article/details/83501991
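
A minimal sketch of common webdriver actions (Selenium 4 API; the selector used here is a placeholder):

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://www.aqistudy.cn/historydata/")

# locate an element, interact with it, then read the rendered HTML
first_link = driver.find_element(By.CSS_SELECTOR, "li a")  # placeholder selector
first_link.click()
html = driver.page_source  # can be fed into scrapy.Selector(text=html)

driver.quit()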

Deduplication in Scrapy

The URL is first canonicalized:

from w3lib.url import canonicalize_url
url = canonicalize_url(url)

Check whether the URL's fingerprint is already in the set of completed URLs;
if it is not, build a Request object and push it onto the request queue.

from hashlib import md5
from w3lib.url import canonicalize_url

def get_fingerprint(url):
    standard_url = canonicalize_url(url)
    md = md5()
    md.update(standard_url.encode('utf8'))
    finger = md.hexdigest()
    return finger

Once the URL has been fetched successfully, its fingerprint is added to the set;
if the fingerprint is already present, the URL is a duplicate and is skipped.
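
A minimal sketch of that flow with an in-memory set (scrapy-redis uses a Redis set instead, but the logic is the same; get_fingerprint is the helper defined above):

import scrapy

seen_fingerprints = set()

def schedule_if_new(url):
    """Build a Request only if the URL's fingerprint has not been completed yet."""
    if get_fingerprint(url) in seen_fingerprints:
        return None  # duplicate, skip it
    return scrapy.Request(url=url, callback=mark_done)

def mark_done(response):
    # the request succeeded, so record its fingerprint as completed
    seen_fingerprints.add(get_fingerprint(response.url))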
