Scrapy Framework (3)

CrawlSpider

Creating a CrawlSpider

  • Command: scrapy genspider -t crawl hr.tencent hr.tencent.com

  • The url argument is the address you want to crawl

  • Note: when analyzing a local file, be sure to include its path, since scrapy shell treats a bare argument as a URL by default

  • The generated hr_tencent.py file contains the following code:

      import scrapy
      from scrapy.linkextractors import LinkExtractor
      from scrapy.spiders import CrawlSpider, Rule

      class HrTencentSpider(CrawlSpider):
          name = 'hr.tencent'
          allowed_domains = ['hr.tencent.com']
          start_urls = ['http://hr.tencent.com/']
          rules = (   # one or more Rule objects, written as a list or tuple
              Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
          )   # each rule describes a specific crawling action for the site

          def parse_item(self, response):   # the method name must match the callback string
              i = {}
              #i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
              #i['name'] = response.xpath('//div[@id="name"]').extract()
              #i['description'] = response.xpath('//div[@id="description"]').extract()
              return i
    

Rule

  • Rule defines a crawling rule for the CrawlSpider
  • Parameters:
    • link_extractor: a LinkExtractor object that defines how links are extracted from each crawled page
    • callback: the callback function invoked for each response whose URL matches the rule
    • cb_kwargs: a dict of keyword arguments passed on to the callback function
    • follow: whether links should also be followed from each response extracted by this rule
    • process_links: a callable used to filter the extracted links
    • process_request: a callable used to filter or modify the generated requests (see the sketch after this list)
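A minimal sketch of how these parameters can be combined in practice (the spider, URL pattern, and helper function below are illustrative assumptions, not part of the tutorial project):

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

def drop_logout_links(links):   # process_links: filter the Link objects extracted by the rule
    return [link for link in links if 'logout' not in link.url]

class ExampleSpider(CrawlSpider):   # hypothetical spider, for illustration only
    name = 'example'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/']
    rules = (
        Rule(
            LinkExtractor(allow=r'list\?page=\d+'),
            callback='parse_page',
            cb_kwargs={'source': 'list'},    # extra keyword arguments handed to the callback
            follow=True,                     # keep following links found on the matched pages
            process_links=drop_logout_links,
        ),
    )

    def parse_page(self, response, source):
        self.logger.info('parsed %s page: %s', source, response.url)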

LinkExtractor

  • LinkExtractor is also a class defined by the Scrapy framework
  • Its sole purpose is to extract, from web pages, the links that will eventually be followed
  • We can also define our own link extractor; it only needs to provide a method named extract_links that receives a Response object and returns a list of scrapy.link.Link objects (a rough sketch follows below)
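As a rough sketch of what such a custom extractor could look like (the class name and the pagination-only selector are assumptions for illustration, not Scrapy's built-in implementation):

from scrapy.link import Link

class PaginationLinkExtractor:
    """Hypothetical extractor that only keeps pagination links such as '...?start=10'."""
    def extract_links(self, response):
        links = []
        for href in response.xpath('//a[contains(@href, "start=")]/@href').extract():
            links.append(Link(url=response.urljoin(href)))
        return links

An instance of this class can then be passed as the link_extractor argument of a Rule, since CrawlSpider only relies on its extract_links(response) method.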

Code (a simplified excerpt of the Rule and CrawlSpider source):

class Rule(object):
    def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None,
                 process_links=None, process_request=identity):
        self.link_extractor = link_extractor
        self.callback = callback
        self.cb_kwargs = cb_kwargs or {}
        self.process_links = process_links
        self.process_request = process_request
        if follow is None:
            # default: follow links only when the rule has no callback
            self.follow = False if callback else True
        else:
            self.follow = follow

class CrawlSpider(Spider):
    rules = ()

    def __init__(self, *args, **kwargs):
        super(CrawlSpider, self).__init__(*args, **kwargs)
        self._compile_rules()

    def parse(self, response):
        return self._parse_response(response, self.parse_start_url, cb_kwargs={}, follow=True)

    def parse_start_url(self, response):
        return []
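The branch self.follow = False if callback else True above means that, when follow is not specified, a rule with a callback does not follow links by default, while a rule without one does. A quick illustration with two hypothetical rules:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule

list_rule = Rule(LinkExtractor(allow=r'start=\d+'))                      # no callback  -> follow defaults to True
item_rule = Rule(LinkExtractor(allow=r'id=\d+'), callback='parse_item')  # has callback -> follow defaults to False
print(list_rule.follow, item_rule.follow)   # True False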

Case Study

hr_tencent.py code

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from tencent.items import TencentItem,DetailItem
class HrTencentSpider(CrawlSpider):   # CrawlSpider implements automatic link following through its parse method, so do not define your own parse method
    name = 'hr.tencent'
    allowed_domains = ['hr.tencent.com']
    start_urls = ['https://hr.tencent.com/position.php']
    rules = (   # one or more Rule objects, written as a list or tuple
        Rule(LinkExtractor(allow=r'start=\d+'), callback='parse_item', follow=True),    # follow=True means links keep being extracted from the matched pages
        # URLs matching this rule are handled by parse_item and followed, so their responses keep being matched against the rules
        Rule(LinkExtractor(allow=r'position_detail\.php\?id=\d+'), callback='parse_detail_item', follow=False),
    )   # responses matching the second rule are handed to parse_detail_item
    def parse_item(self, response):
        """
        Parse the job listing rows; links from these pages continue to be followed.
        :param response:
        :return:
        """
        for tr in response.xpath('//tr[@class="even"]|//tr[@class="odd"]'):
            item = TencentItem()
            item['job_name'] = tr.xpath('./td[1]/a/text()').extract_first()
            item['job_type'] = tr.xpath('./td[2]/text()').extract_first()
            item['job_num'] = tr.xpath('./td[3]/text()').extract_first()
            item['job_addr'] = tr.xpath('./td[4]/text()').extract_first()
            item['job_time'] = tr.xpath('./td[5]/text()').extract_first()
            yield item
    def parse_detail_item(self, response):
        """
        Parse the detail page of a single job; links from these pages are not followed.
        :param response:
        :return:
        """
        item = DetailItem()
        item['detail_content'] = " ".join(response.xpath('//ul[@class="squareli"]/li/text()').extract())
        yield item
        # i = {}
        # #i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        # #i['name'] = response.xpath('//div[@id="name"]').extract()
        # #i['description'] = response.xpath('//div[@id="description"]').extract()
        # return i

items.py

# -*- coding: utf-8 -*-
# Define here the models for your scraped items
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class TencentItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # pass
    job_name = scrapy.Field()   # the fields must be declared here; assigning an undeclared field raises an error
    job_type = scrapy.Field()
    job_num = scrapy.Field()
    job_addr = scrapy.Field()
    job_time = scrapy.Field()
class DetailItem(scrapy.Item):
    detail_content = scrapy.Field()

pipelines.py

# -*- coding: utf-8 -*-
# Define your item pipelines here
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from tencent.items import DetailItem
from tencent.items import TencentItem
class TencentPipeline(object):
    def open_spider(self,spider):
        # runs when the spider is opened
        # open the file handles for the two output files
        self.f = open('tencent_job.json','w',encoding='utf-8')
        self.f2 = open('detail_job.json','w',encoding='utf-8')
    def process_item(self, item, spider):
        if isinstance(item,TencentItem):
            content = json.dumps(dict(item),ensure_ascii=False) + "\n"
            self.f.write(content)
        elif isinstance(item,DetailItem):
            content = json.dumps(dict(item),ensure_ascii=False) + "\n"
            self.f2.write(content)
        return item     # hand the item to the next pipeline stage
    def close_spider(self,spider):
        # runs when the spider is closed
        self.f.close()
        self.f2.close()
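
For this pipeline to run it must be enabled in settings.py. A minimal sketch, assuming the project package is named tencent as the imports above suggest:

# settings.py (excerpt)
ITEM_PIPELINES = {
    'tencent.pipelines.TencentPipeline': 300,   # lower numbers run earlier in the pipeline chain
}

With the pipeline enabled, running scrapy crawl hr.tencent should produce tencent_job.json and detail_job.json.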