1. Applicability

CrawlSpider can automatically crawl a website whether or not its URLs follow a regular pattern.
2. Code Walkthrough
(1) Create the Scrapy project

```
E:\myweb>scrapy startproject mycwpjt
New Scrapy project 'mycwpjt', using template directory
'd:\\python35\\lib\\site-packages\\scrapy\\templates\\project', created in:
    D:\Python35\myweb\part16\mycwpjt

You can start your first spider with:
    cd mycwpjt
    scrapy genspider example example.com
```
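For reference, the generated project should look roughly like this (names taken from the default Scrapy project template; the exact files can vary slightly between Scrapy versions):

```
mycwpjt/
    scrapy.cfg            # deploy configuration
    mycwpjt/
        __init__.py
        items.py          # item definitions, step (3)
        pipelines.py      # item pipelines, step (4)
        settings.py       # project settings, step (5)
        spiders/
            __init__.py   # the spider from step (6) will live here
```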
(2) Create the crawler

```
E:\myweb>scrapy genspider -t crawl weisuen sohu.com
Created spider 'weisuen' using template 'crawl' in module:
    mycwpjt.spiders.weisuen
```
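If you are unsure which template to pass to -t, you can list them first; with a default Scrapy installation the output is roughly:

```
E:\myweb\mycwpjt>scrapy genspider -l
Available templates:
  basic
  crawl
  csvfeed
  xmlfeed
```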
(3) Writing the item

```python
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class MycwpjtItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()  # news page title
    link = scrapy.Field()  # news page URL
```
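A scrapy.Item behaves like a dict restricted to the declared fields, so a quick sanity check (a hypothetical snippet, not one of the project files) could look like this:

```python
from mycwpjt.items import MycwpjtItem

item = MycwpjtItem()
# XPath .extract() returns lists, so the crawled values will be lists too
item["name"] = ["Example news title"]
item["link"] = ["http://news.sohu.com/20160926/n469167364.shtml"]
print(item["name"], item["link"])
```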
(4) Writing the pipeline

```python
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class MycwpjtPipeline(object):
    def process_item(self, item, spider):
        # Print the title and link of every crawled news page
        print(item["name"])
        print(item["link"])
        return item
```
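If you would rather keep the results than just print them, a pipeline along the following lines (my own sketch, not part of the original post) appends each item to a JSON-lines file; it would also need to be registered in ITEM_PIPELINES:

```python
# -*- coding: utf-8 -*-
import codecs
import json


class JsonWriterPipeline(object):
    """Hypothetical alternative pipeline: write each item as one JSON line."""

    def open_spider(self, spider):
        self.file = codecs.open("items.jl", "a", encoding="utf-8")

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item
```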
(5) Settings

Enable the pipeline in settings.py:

```python
ITEM_PIPELINES = {
    'mycwpjt.pipelines.MycwpjtPipeline': 300,
}
```
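The number 300 is a priority: when several pipelines are enabled, items pass through them in ascending order of this value (0–1000). For example, to run the hypothetical JsonWriterPipeline from step (4) after the printing pipeline:

```python
ITEM_PIPELINES = {
    'mycwpjt.pipelines.MycwpjtPipeline': 300,     # runs first (lower number)
    'mycwpjt.pipelines.JsonWriterPipeline': 800,  # runs second (hypothetical pipeline)
}
```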
(6) Writing the spider

```python
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from mycwpjt.items import MycwpjtItem

# List the available templates:       scrapy genspider -l
# Generate a CrawlSpider skeleton:    scrapy genspider -t crawl weisun sohu.com
# Start crawling:                     scrapy crawl weisun --nolog


class WeisunSpider(CrawlSpider):
    name = 'weisun'
    allowed_domains = ['sohu.com']
    start_urls = ['http://sohu.com/']

    rules = (
        # News article URLs look like
        # "http://news.sohu.com/20160926/n469167364.shtml",
        # so the regular expression for extraction is '.*?/n.*?shtml'
        Rule(LinkExtractor(allow=('.*?/n.*?shtml'), allow_domains=('sohu.com')),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        i = MycwpjtItem()
        # i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        # Extract the news page title with an XPath expression
        i["name"] = response.xpath("/html/head/title/text()").extract()
        # Extract the link of the current news page with an XPath expression
        i["link"] = response.xpath("//link[@rel='canonical']/@href").extract()
        return i
```
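Before launching the full crawl, the two XPath expressions can be checked interactively in the Scrapy shell against the sample article URL from the comments (the actual output depends on the live page):

```
E:\myweb\mycwpjt>scrapy shell "http://news.sohu.com/20160926/n469167364.shtml"
>>> response.xpath("/html/head/title/text()").extract()
>>> response.xpath("//link[@rel='canonical']/@href").extract()
```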
CrawlSpider is the spider commonly used to crawl sites whose pages follow some rule; it is based on Spider and adds a few attributes of its own:
- rules: a collection of Rule objects, used to match the links you want on the target site and filter out the rest.
- parse_start_url: handles the responses to the start URLs; it must return an Item or a Request (see the sketch below).
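As a hedged illustration (not part of the project above), overriding parse_start_url lets the spider also produce an item for the start page itself, since the rules only act on links extracted from downloaded responses:

```python
# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider
from mycwpjt.items import MycwpjtItem


class StartPageSpider(CrawlSpider):
    """Hypothetical sketch: also record the title of the start page."""
    name = 'startdemo'
    allowed_domains = ['sohu.com']
    start_urls = ['http://sohu.com/']

    def parse_start_url(self, response):
        # Must return an Item or a Request (or an iterable of them)
        i = MycwpjtItem()
        i["name"] = response.xpath("/html/head/title/text()").extract()
        i["link"] = [response.url]
        return i
```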
Since rules is a collection of Rule objects, Rule deserves a quick introduction. Its parameters are link_extractor, callback=None, cb_kwargs=None, follow=None, process_links=None and process_request=None; a small example follows.
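A hedged example of building a Rule with several of these parameters (the drop_print_links helper and the cb_kwargs value are made up for illustration):

```python
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule


def drop_print_links(links):
    # process_links: filter or rewrite the extracted Link objects before requests are made
    return [link for link in links if 'print' not in link.url]


rule = Rule(
    LinkExtractor(allow=('.*?/n.*?shtml',), allow_domains=('sohu.com',)),
    callback='parse_item',          # name of the spider method that handles matched pages
    cb_kwargs={'source': 'sohu'},   # extra keyword arguments passed to that callback
    follow=True,                    # keep following links found on matched pages
    process_links=drop_print_links,
)
```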
The link_extractor can either be one you define yourself or an instance of the existing LinkExtractor class, whose main parameters are:
- allow: only URLs matching this regular expression (or list of regular expressions) are extracted; if empty, everything matches.
- deny: URLs matching this regular expression (or list of regular expressions) are never extracted.
- allow_domains: the domains from which links will be extracted.
- deny_domains: the domains from which links will never be extracted.
- restrict_xpaths: XPath expressions that, together with allow, restrict the parts of the page from which links are extracted (see the sketch after this list).
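Putting a few of these parameters together, a hedged LinkExtractor sketch (the deny pattern, deny_domains value and restrict_xpaths expression here are illustrative, not from the original post):

```python
from scrapy.linkextractors import LinkExtractor

link_extractor = LinkExtractor(
    allow=('.*?/n.*?shtml',),                   # only news-article URLs
    deny=('.*?/comment/.*',),                   # never follow comment pages (hypothetical pattern)
    allow_domains=('sohu.com',),                # stay on sohu.com
    deny_domains=('passport.sohu.com',),        # skip the login subdomain (hypothetical)
    restrict_xpaths=('//div[@class="news"]',),  # only look inside this part of the page (hypothetical)
)
# link_extractor.extract_links(response) returns the list of matching Link objects
```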