Combining Selenium with Scrapy generally takes three steps: (1) create the browser object; (2) use it in a downloader middleware to fetch and render pages; (3) close the browser when the spider finishes. (In the project below the browser is created and held by the downloader middleware itself, but the division of work is the same.)
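As a minimal sketch of those three steps (the names ExampleSpider and ExampleSeleniumMiddleware are illustrative only, not the project files below): the spider owns the browser, the middleware borrows it through the spider argument, and the spider's closed() hook shuts it down.

# Minimal sketch of the three-step pattern (illustrative names, not the project code below)
import scrapy
from selenium import webdriver
from scrapy.http import HtmlResponse

class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['https://www.zhipin.com/']

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.driver = webdriver.Chrome()    # step 1: create the browser in the spider

    def parse(self, response):
        pass

    def closed(self, reason):
        self.driver.quit()                  # step 3: close it when the spider ends

class ExampleSeleniumMiddleware:
    def process_request(self, request, spider):
        spider.driver.get(request.url)      # step 2: use the browser in the middleware
        return HtmlResponse(url=spider.driver.current_url,
                            body=spider.driver.page_source,
                            encoding='utf-8', request=request)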
The spider's .py file:
# -*- coding: utf-8 -*-
import scrapy
import logging

from BossScrapy.items import BossscrapyItem


class BossscrapySpider(scrapy.Spider):
    name = 'bossscrapy'
    allowed_domains = ['zhipin.com']
    prefix_url = 'http://www.zhipin.com/'
    start_urls = [
        'http://www.zhipin.com/'
    ]

    def __init__(self, name=None, **kwargs):
        self.url = 'https://www.zhipin.com/c101190400/?page={pageNo}&ka=page-{pageNo}&query=数据分析'
        self.now_page = 1
        super().__init__(name, **kwargs)

    # @classmethod
    # def from_crawler(cls, crawler, *args, **kwargs):
    #     return super().from_crawler(crawler, *args, **kwargs)

    def start_requests(self):
        yield scrapy.Request(self.url.format(pageNo=self.now_page), callback=self.parse)

    def parse(self, response):
        logging.info(response)
        # If the job list is missing, the page was not rendered properly (cookie expired / blocked)
        if not response.xpath("//div[@class='job-list']"):
            logging.info('Cookie has expired')
            return
        for info in response.xpath("//div[@class='job-list']/ul//li"):
            item = BossscrapyItem()
            url = self.prefix_url + ''.join(info.xpath(".//span[@class='job-name']/a/@href").extract())
            jobs = ''.join(info.xpath(".//span[@class='job-name']/a/text()").extract())
            work_address = info.xpath(".//span[@class='job-area']/text()").get()
            scalary = info.xpath(".//div[@class='job-limit clearfix']/span/text()").get()
            # Experience and degree share one <p>; some listings carry only one of the two
            experiences = degree = ''
            limit_texts = info.xpath(".//div[@class='job-limit clearfix']/p/text()").extract()
            if len(limit_texts) == 2:
                experiences, degree = limit_texts
            else:
                experiences = ''.join(limit_texts)
            company = ''.join(info.xpath(".//div[@class='company-text']/h3/a/text()").extract())
            # Financing round and company size also share one <p>
            financing_condition = company_persion = ''
            company_texts = info.xpath(".//div[@class='company-text']/p/text()").extract()
            if len(company_texts) == 2:
                financing_condition, company_persion = company_texts
            else:
                financing_condition = ''.join(company_texts)
            item['url'] = url
            item['jobs'] = jobs
            item['work_address'] = work_address
            item['scalary'] = scalary
            item['experiences'] = experiences
            item['degree'] = degree
            item['company'] = company
            item['financing_condition'] = financing_condition
            item['company_persion'] = company_persion
            yield item
        try:
            page = response.xpath("//div[@class='page']/a[last()]/@href").get()
            if not page:
                logging.info("Crawl finished, stopping the spider")
                return
            next_url = self.prefix_url + page
            logging.info("Next page URL: {}".format(next_url))
            yield scrapy.Request(next_url)
        except Exception as e:
            logging.info('Spider exception, stopping the spider...{}'.format(e))
            return
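The spider imports BossscrapyItem from BossScrapy.items; that file is not shown here, but judging from the fields assigned above (and read back in the pipeline), a matching definition would look roughly like the sketch below. The real items.py may differ.

import scrapy

class BossscrapyItem(scrapy.Item):
    # one Field per value the spider assigns and the pipeline reads
    url = scrapy.Field()
    jobs = scrapy.Field()
    work_address = scrapy.Field()
    scalary = scrapy.Field()
    experiences = scrapy.Field()
    degree = scrapy.Field()
    company = scrapy.Field()
    financing_condition = scrapy.Field()
    company_persion = scrapy.Field()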
pipelines.py:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import logging

from openpyxl import Workbook


class BossscrapyPipeline:
    def __init__(self) -> None:
        # Prepare the workbook and write the header row
        self.wb = Workbook()
        self.ws = self.wb.active
        self.ws.append(
            [
                '公司名称', '薪资',
                '工作岗位', '要求工作经验', '要求学历',
                '公司地址', '公司人数',
                '融资情况', '招聘链接'
            ]
        )
        self.file_name = "bossInfo.xlsx"

    def process_item(self, item, spider):
        logging.info('Writing the returned item to Excel')
        line = [
            item['company'], item['scalary'], item['jobs'], item['experiences'], item['degree'], item['work_address'],
            item['company_persion'], item['financing_condition'], item['url']
        ]
        self.ws.append(line)
        self.wb.save(self.file_name)
        return item

    def close_spider(self, spider):
        # Close the workbook when the spider finishes
        self.wb.close()
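Calling wb.save() inside process_item rewrites the whole .xlsx file for every row, which is fine for small crawls but slow for large ones. A sketch of a variant (the class name ExcelOnceSavePipeline is hypothetical, not part of the project) that defers the save to close_spider:

import logging
from openpyxl import Workbook

class ExcelOnceSavePipeline:
    # Variant of BossscrapyPipeline: write the file only once, when the spider closes.
    def __init__(self) -> None:
        self.wb = Workbook()
        self.ws = self.wb.active
        # header row omitted for brevity; append it here as in BossscrapyPipeline
        self.file_name = "bossInfo.xlsx"

    def process_item(self, item, spider):
        self.ws.append([
            item['company'], item['scalary'], item['jobs'], item['experiences'], item['degree'],
            item['work_address'], item['company_persion'], item['financing_condition'], item['url']
        ])
        return item

    def close_spider(self, spider):
        logging.info('Saving the Excel file')
        self.wb.save(self.file_name)
        self.wb.close()

If you use this variant, register it in ITEM_PIPELINES instead of BossscrapyPipeline.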
middlewares.py, with a custom downloader middleware:
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import time
import logging

from selenium import webdriver
from scrapy import signals
from scrapy.http import HtmlResponse


class BossscrapyDownloaderMiddleware:
    def __init__(self) -> None:
        logging.info("Initializing the browser")
        self.driver = webdriver.Chrome()

    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        # Make sure the browser is closed when the spider finishes (step 3)
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    def process_request(self, request, spider):
        # Let Selenium fetch and render the page, then hand the HTML back to Scrapy
        self.driver.get(request.url)
        time.sleep(5)
        source = self.driver.page_source
        response = HtmlResponse(url=self.driver.current_url, body=source, request=request, encoding='utf-8')
        return response

    def process_response(self, request, response, spider):
        return response

    def process_exception(self, request, exception, spider):
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

    def spider_closed(self, spider):
        spider.logger.info('Spider closed: %s' % spider.name)
        self.driver.quit()
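If you don't want a visible browser window during the crawl, Chrome can run headless. A sketch of how the middleware's __init__ could build the driver (the exact flag may vary by Selenium/Chrome version, e.g. '--headless=new' on recent Chrome):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def make_headless_driver():
    # sketch: a headless Chrome instance for the middleware's __init__
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--window-size=1920,1080')
    return webdriver.Chrome(options=options)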
settings.py, which activates the downloader middleware and the pipeline:
BOT_NAME = 'BossScrapy'
SPIDER_MODULES = ['BossScrapy.spiders']
NEWSPIDER_MODULE = 'BossScrapy.spiders'
LOG_LEVEL = 'ERROR'   # note: the logging.info(...) messages above are suppressed at this level; set to 'INFO' to see them
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3
COOKIES_ENABLED = False
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
# 'cookie':'lastCity=101190400; wd_guid=e0afac60-aa63-4f98-8495-d938bb72f550; historyState=state; _bl_uid=7zlp721hh9ChyLdzzsU5ng82bk6v; wt2=D6yLXYtmO5DN-Xkvy-ji4CpukgADINuY4wvFwSkhizXjohqwolD9FQwFpRv7P7AO3Mk_b1fQIaZVooyEqGZAk4A~~; wbg=0; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1653700494,1655595662,1655602859; __zp_seo_uuid__=00c42b1e-81d7-4c17-8530-72be852488d2; __g=-; acw_tc=0b6e703216556228747703283e016e76be2a670acaa87f08a3fa1d9f165ca1; geek_zp_token=V1RNsiEOz-0lxjVtRvyhgeKy-w6DzTxCo~; __zp_stoken__=7f3adPDQ1F11%2FWzFpWmF%2BIHp%2Fa38%2Fcm04IkcrZSg6aWBiTWFKGD5AW0BWXAN%2FWF5iHmMJJWVWcXclbSFVPH1VXyE8cjZsdBhpLDEAGgtKR3ZFBBtvbwwTLAMSdVpKOAN%2FXFo7W2BffFxtXSU%3D; __c=1655602859; __l=r=https%3A%2F%2Fcn.bing.com%2F&l=%2Fwww.zhipin.com%2Fc101190400%2F%3Fquery%3D%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590%26page%3D2&s=3&g=&friend_source=0&s=3&friend_source=0; __a=16256098.1651058103.1655595662.1655602859.264.6.154.247; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1655623508',
'referer': 'https://www.zhipin.com/',
'sec-ch-ua-platform': "Windows",
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
}
DOWNLOADER_MIDDLEWARES = {
'BossScrapy.middlewares.BossscrapyDownloaderMiddleware': 543,
}
ITEM_PIPELINES = {
'BossScrapy.pipelines.BossscrapyPipeline': 300,
}
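With the downloader middleware and pipeline registered, start the crawl from the project root with: scrapy crawl bossscrapy. The pipeline writes the scraped rows to bossInfo.xlsx.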