CrawlSpider
Creating a CrawlSpider
- Command: scrapy genspider -t crawl hr.tencent hr.tencent.com
- The url is the address of the site you want to crawl
- Note: when pointing scrapy shell at a local file, always include the path; otherwise scrapy shell treats the argument as a URL (see the example below)
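For example, to inspect a locally saved page (the file name position.html is an assumption for illustration):

scrapy shell ./position.html            # a relative path must start with ./
scrapy shell file:///tmp/position.html  # or pass an absolute file:// URL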
The generated hr_tencent.py looks like this:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class HrTencentSpider(CrawlSpider):
    name = 'hr.tencent'
    allowed_domains = ['hr.tencent.com']
    start_urls = ['http://hr.tencent.com/']

    rules = (  # one or more Rule objects, written as a list or tuple
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):  # the method name must match the callback string
        # perform site-specific extraction on each crawled page
        i = {}
        #i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        #i['name'] = response.xpath('//div[@id="name"]').extract()
        #i['description'] = response.xpath('//div[@id="description"]').extract()
        return i
Rule
- Rule defines one crawling rule for a CrawlSpider
- Parameters (a usage sketch follows this list):
    - link_extractor: a Link Extractor object that defines how links are extracted from each crawled page
    - callback: the callback function invoked for each response matched by this rule
    - cb_kwargs: a dict of keyword arguments to pass to the callback function
    - follow: specifies whether links should be followed from each response extracted with this rule
    - process_links: a callback used to filter the extracted links
    - process_request: a callback used to filter the generated requests
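A minimal usage sketch tying these parameters together (the allow pattern, the cb_kwargs value, and the drop_logout_links filter are illustrative assumptions, not part of the example project):

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule

def drop_logout_links(links):
    # hypothetical process_links filter: discard any logout links
    return [link for link in links if 'logout' not in link.url]

rule = Rule(
    LinkExtractor(allow=r'position\.php'),   # which links to extract
    callback='parse_item',                   # spider method that handles matched responses
    cb_kwargs={'source': 'list'},            # extra keyword args passed to the callback
    follow=True,                             # keep extracting links from matched pages
    process_links=drop_logout_links,         # filter links before requests are made
)

With cb_kwargs set this way, the callback must accept the extra argument: def parse_item(self, response, source).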
LinkExtractor
- LinkExtractor is likewise a class defined by the Scrapy framework
- Its sole purpose is to extract links from web pages that will eventually be followed
- We can also define our own link extractor: it only needs to provide a method named extract_links that receives a Response object and returns a list of scrapy.link.Link objects (a sketch follows this list)
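A minimal sketch of such a custom extractor (the class name and the XPath are assumptions for illustration):

from scrapy.link import Link

class TableLinkExtractor:
    # hypothetical extractor: collects the links found in table cells
    def extract_links(self, response):
        hrefs = response.xpath('//tr/td/a/@href').extract()
        return [Link(response.urljoin(href)) for href in hrefs]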
Source (slightly simplified from Scrapy's own code):
def identity(x):  # Scrapy's module-level no-op default for process_request
    return x


class Rule(object):
    def __init__(self, link_extractor, callback=None, cb_kwargs=None,
                 follow=None, process_links=None, process_request=identity):
        self.link_extractor = link_extractor
        self.callback = callback
        self.cb_kwargs = cb_kwargs or {}
        self.process_links = process_links
        self.process_request = process_request
        if follow is None:
            # default: don't follow links when a callback is set, follow otherwise
            self.follow = False if callback else True
        else:
            self.follow = follow


class CrawlSpider(Spider):
    rules = ()

    def __init__(self, *args, **kwargs):
        super(CrawlSpider, self).__init__(*args, **kwargs)
        self._compile_rules()

    def parse(self, response):
        return self._parse_response(response, self.parse_start_url, cb_kwargs={})

    def parse_start_url(self, response):
        return []
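For reference, _parse_response roughly does the following (a simplified sketch of Scrapy 1.x internals; details differ between versions): it invokes the callback, yields its results, and then follows the links extracted by the rules.

def _parse_response(self, response, callback, cb_kwargs, follow=True):
    # 1. invoke the callback (parse_start_url or a Rule's callback) and yield its results
    if callback:
        cb_res = callback(response, **cb_kwargs) or ()
        for item_or_request in cb_res:
            yield item_or_request
    # 2. if following is enabled, extract links with the rules and yield new requests
    if follow and self._follow_links:
        for request in self._requests_to_follow(response):
            yield request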
Worked example
hr_tencent.py:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from tencent.items import TencentItem, DetailItem


class HrTencentSpider(CrawlSpider):
    # CrawlSpider implements automatic link following in its parse method,
    # so do not define your own parse method in this class
    name = 'hr.tencent'
    allowed_domains = ['hr.tencent.com']
    start_urls = ['https://hr.tencent.com/position.php']

    rules = (  # one or more Rule objects, written as a list or tuple
        # URLs matching this rule are handled by parse_item; follow=True means crawling
        # continues: each response is passed back through the rules to match further URLs
        Rule(LinkExtractor(allow=r'start=\d+'), callback='parse_item', follow=True),
        # responses matching this rule go to parse_detail_item and are not followed further
        Rule(LinkExtractor(allow=r'position_detail\.php\?id=\d+'), callback='parse_detail_item', follow=False),
    )

    def parse_item(self, response):
        """
        Parse the job listing; crawling continues after parsing.
        :param response:
        :return:
        """
        for tr in response.xpath('//tr[@class="even"]|//tr[@class="odd"]'):
            item = TencentItem()
            item['job_name'] = tr.xpath('./td[1]/a/text()').extract_first()
            item['job_type'] = tr.xpath('./td[2]/text()').extract_first()
            item['job_num'] = tr.xpath('./td[3]/text()').extract_first()
            item['job_addr'] = tr.xpath('./td[4]/text()').extract_first()
            item['job_time'] = tr.xpath('./td[5]/text()').extract_first()
            yield item

    def parse_detail_item(self, response):
        """
        Parse a single job's detail page; these pages are not followed further.
        :param response:
        :return:
        """
        item = DetailItem()
        item['detail_content'] = " ".join(response.xpath('//ul[@class="squareli"]/li/text()').extract())
        yield item
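With the items and pipeline modules below in place, the spider is started from the project root:

scrapy crawl hr.tencent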
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class TencentItem(scrapy.Item):
    # every field assigned in the spider must be declared here;
    # assigning to an undeclared field raises a KeyError
    job_name = scrapy.Field()
    job_type = scrapy.Field()
    job_num = scrapy.Field()
    job_addr = scrapy.Field()
    job_time = scrapy.Field()


class DetailItem(scrapy.Item):
    detail_content = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json

from tencent.items import DetailItem
from tencent.items import TencentItem


class TencentPipeline(object):
    def open_spider(self, spider):
        # runs once when the spider is opened: open the file handles
        self.f = open('tencent_job.json', 'w', encoding='utf-8')
        self.f2 = open('detail_job.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # write each item type to its own file
        if isinstance(item, TencentItem):
            content = json.dumps(dict(item), ensure_ascii=False) + "\n"
            self.f.write(content)
        elif isinstance(item, DetailItem):
            content = json.dumps(dict(item), ensure_ascii=False) + "\n"
            self.f2.write(content)
        return item  # pass the item on to the next pipeline stage

    def close_spider(self, spider):
        # runs once when the spider is closed
        self.f.close()
        self.f2.close()
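As the header comment notes, process_item is only called once the pipeline is registered in ITEM_PIPELINES; a minimal settings.py entry (300 is an arbitrary priority between 0 and 1000):

# settings.py
ITEM_PIPELINES = {
    'tencent.pipelines.TencentPipeline': 300,
}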