scrapy框架下的两个爬虫分工合作爬取智联招聘所有职位信息。

最新推荐文章于 2024-08-02 09:46:03 发布

天心有情

最新推荐文章于 2024-08-02 09:46:03 发布

阅读量3.1k

点赞数 3

分类专栏： scrapy 文章标签：爬虫爬虫合作 scrapy

本文链接：https://blog.csdn.net/sinat_35045195/article/details/75305753

版权

scrapy 专栏收录该内容

2 篇文章 0 订阅

订阅专栏

爬虫一

本次爬取使用两个爬虫：第一个爬虫爬取需要访问的URL并存储到文本文件中，第二个爬虫读取第一个爬虫保存的URL，然后依次爬取各URL下的内容。先运行第一个爬虫，再运行第二个爬虫，即可完成爬取。

本帖仅供学习交流使用，请不要胡乱尝试以免影响网站正常运转

spiders文件夹下的spander.py文件内容

# -*- coding:utf-8 -*-
import scrapy
from ..items import ZhilianFistItem

class zhilian_url(scrapy.Spider):
    """Collect link targets from the zhaopin jobs landing page.

    Every ``href`` matched by the XPath in :meth:`parse` is emitted as a
    ``ZhilianFistItem``; no filtering happens in the spider itself.
    """

    name = 'zhilian_url'
    start_urls = ['http://jobs.zhaopin.com/']

    def parse(self, response):
        """Yield one ``ZhilianFistItem`` per matched anchor ``href``.

        Args:
            response: the downloaded page to extract links from.

        Yields:
            ZhilianFistItem: an item whose ``url`` field holds one href.
        """
        hrefs = response.xpath(
            '/html/body/div/div/div/a[@target="_blank"]/@href'
        ).extract()
        for href in hrefs:
            # Build a fresh item per link. Re-yielding a single mutated
            # instance means every consumer holds the same object, whose
            # 'url' silently changes on the next loop iteration.
            link = ZhilianFistItem()
            link['url'] = href
            yield link

items.py文件

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy import Item,Field

class ZhilianFistItem(Item):
    """Item with a single declared field, ``url``."""

    # The only field on this item: one link string.
    url = Field()

middlewares.py文件

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class ZhilianFistSpiderMiddleware(object):
    """Pass-through spider middleware.

    Not all methods need to be defined. If a method is not defined,
    scrapy acts as if the spider middleware does not modify the
    passed objects.

    The hook methods take ``self`` so they can be called on the instance
    built by :meth:`from_crawler`; without it, calling them on an instance
    raises ``TypeError``.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to ``spider_opened``."""
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        """Accept every response unchanged.

        Called for each response that goes through the spider
        middleware and into the spider.

        Returns:
            None: always; nothing is rejected.
        """
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        """Re-yield everything the spider produced, unmodified.

        Called with the results returned from the Spider, after
        it has processed the response.
        """
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        """Do nothing and return ``None``.

        Called when a spider or process_spider_input() method
        (from other spider middleware) raises an exception.
        """
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        return None

    def process_start_requests(self, start_requests, spider):
        """Re-yield the spider's start requests, unmodified.

        Called with the start requests of the spider, and works
        similarly to the process_spider_output() method, except
        that it doesn't have a response associated.
        """
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        """Log the name of the spider that was opened."""
        spider.logger.info('Spider opened: %s' % spider.name)

pipelines.py文件

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
# import  xlsxwriter

class ZhilianFistPipeline(object):
    """Write accepted listing URLs, one per line, to a text file.

    A URL is accepted when it contains ``http://jobs.zhaopin.com/`` and
    does not contain ``.htm``; every other item is dropped.
    """

    # Destination file. The default is the file name that used to be
    # hard-coded in open_spider; override it on the class or instance
    # to write somewhere else.
    output_path = 'myurls'

    def open_spider(self, spider):
        """Open (and truncate) the output file when the spider starts."""
        print('++++++++++++             ++++++++++++')
        print('++++++++++++    start    ++++++++++++')
        self.fp = open(self.output_path, 'w')
        print('++++++++++++      ok     ++++++++++++')

    def process_item(self, item, spider):
        """Store the item's URL if it passes the filter.

        Args:
            item: mapping with a ``url`` key.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item`` when its URL was written.

        Raises:
            scrapy.exceptions.DropItem: when the URL is filtered out.
        """
        url = item['url']
        if '.htm' in url or 'http://jobs.zhaopin.com/' not in url:
            # A pipeline must return the item or raise DropItem; falling
            # through returned None, which Scrapy would treat as the item.
            # Imported here so the module-level imports stay untouched.
            from scrapy.exceptions import DropItem
            raise DropItem('filtered url: %s' % url)

        print('++++++++++++             ++++++++++++')
        print('++++++++++++    存储中    ++++++++++++')
        # write(), not writelines(): the payload is a single string.
        self.fp.write(url + "\n")
        print('++++++++++++     ok      ++++++++++++')
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes.

        Scrapy's pipeline hook is named ``close_spider``. The previous
        ``spider_closed`` method was never invoked, so the file was never
        closed explicitly.
        """
        print('++++++++++++           ++++++++++++')
        print('++++++++++++    结束    ++++++++++++')
        self.fp.close()
        print('++++++++++++    ok    ++++++++++++')

    # Old method name kept as an alias for any code that called it directly.
    spider_closed = close_spider

settings.py文件

# -*- coding: utf-8 -*-


# Bot name and the package(s) holding the spiders.
BOT_NAME = 'zhilian_fist'
SPIDER_MODULES = ['zhilian_fist.spiders']
NEWSPIDER_MODULE = 'zhilian_fist.spiders'

# robots.txt rules are not obeyed.
ROBOTSTXT_OBEY = False

# Default request headers: a fixed Host and a desktop Firefox User-Agent.
DEFAULT_REQUEST_HEADERS = {
    'Host': 'jobs.zhaopin.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0',
}

# Enabled item pipelines, mapped to their order value.
ITEM_PIPELINES = {'zhilian_fist.pipelines.ZhilianFistPipeline': 300}

第二个爬虫，zhilian_second

spander.py文件

# -*- coding:utf-8 -*-
import scrapy
from ..items import ZhilianSecondItem
from scrapy import Request
from bs4 import BeautifulSoup
class spider(scrapy.Spider):
    """Crawl every listing URL from a text file and emit job rows.

    Start URLs are read, one per line, from a text file. Each response is
    parsed into ``ZhilianSecondItem`` objects and, when the page has a
    "next page" link, that link is followed with the same callback.
    """

    name = 'zhilian_second'
    start_urls = []
    # Default location of the URL list; identical to the path that used to
    # be hard-coded in __init__.
    default_url_file = 'E:/PythonWorkStation/zhilian_fist/myurls'

    def __init__(self, *args, url_file=None, **kwargs):
        """Load the start URLs from ``url_file``.

        Args:
            url_file: path of the text file holding one URL per line;
                defaults to ``default_url_file``. Being a keyword
                argument, it can be supplied with ``-a url_file=...``.
            *args, **kwargs: forwarded to ``scrapy.Spider.__init__``.
        """
        super().__init__(*args, **kwargs)
        path = url_file if url_file is not None else self.default_url_file
        with open(path) as links:
            # Strip the line terminator (a URL ending in a newline cannot
            # be requested) and skip blank lines. strip() is used rather
            # than line[:-1] so a final line without a trailing newline
            # does not lose its last character.
            # Assigned on the instance so the class-level list is not
            # mutated and shared between instances.
            self.start_urls = [line.strip() for line in links if line.strip()]

    def parse(self, response):
        """Yield the job rows on this page, then a request for the next page.

        Args:
            response: a listing page.

        Yields:
            ZhilianSecondItem: one per row (title, company, salary,
                address, release).
            Request: for the next page, when a next-page link exists.
        """
        title_list = response.xpath('//div/span[@class="post"]/a/text()').extract()
        company_list = response.xpath('//div/span[@class="company_name"]/a/text()').extract()
        salary_list = response.xpath('//div/span[@class="salary"]/text()').extract()
        address_list = response.xpath('//div/span[@class="address"]/text()').extract()
        release_list = response.xpath('//div/span[@class="release_time"]/text()').extract()

        rows = zip(title_list, company_list, salary_list, address_list, release_list)
        for title, company, salary, address, release in rows:
            # A new item per row; re-yielding one mutated instance would
            # leave every consumer holding the same object.
            item = ZhilianSecondItem()
            item['title'] = title
            item['company'] = company
            item['salary'] = salary
            item['address'] = address
            item['release'] = release
            yield item

        # extract() always returns a list, so comparing it to None never
        # detected the last page and indexing [0] raised IndexError there.
        # extract_first() returns None when there is no next-page link.
        next_href = response.xpath(
            '//span[@class="search_page_next"]/a/@href'
        ).extract_first()
        if next_href:
            # Resolve the href against the current URL instead of slicing
            # characters off response.url by hand.
            yield Request(response.urljoin(next_href))

items.py文件

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ZhilianSecondItem(scrapy.Item):
    """Item with five declared fields describing one job posting."""

    # Field names only; NOTE(review): meanings follow the names and the
    # column list in the accompanying article - confirm against the spider.
    title = scrapy.Field()
    company = scrapy.Field()
    salary = scrapy.Field()
    address = scrapy.Field()
    release = scrapy.Field()

middlewares.py文件

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class ZhilianSecondSpiderMiddleware(object):
    """Pass-through spider middleware.

    Not all methods need to be defined. If a method is not defined,
    scrapy acts as if the spider middleware does not modify the
    passed objects.

    The hook methods take ``self`` so they can be called on the instance
    built by :meth:`from_crawler`; without it, calling them on an instance
    raises ``TypeError``.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to ``spider_opened``."""
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        """Accept every response unchanged.

        Called for each response that goes through the spider
        middleware and into the spider.

        Returns:
            None: always; nothing is rejected.
        """
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        """Re-yield everything the spider produced, unmodified.

        Called with the results returned from the Spider, after
        it has processed the response.
        """
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        """Do nothing and return ``None``.

        Called when a spider or process_spider_input() method
        (from other spider middleware) raises an exception.
        """
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        return None

    def process_start_requests(self, start_requests, spider):
        """Re-yield the spider's start requests, unmodified.

        Called with the start requests of the spider, and works
        similarly to the process_spider_output() method, except
        that it doesn't have a response associated.
        """
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        """Log the name of the spider that was opened."""
        spider.logger.info('Spider opened: %s' % spider.name)

pipelines.py文件

# -*- coding: utf-8 -*-

class ZhilianSecondPipeline(object):
    """Write each job item as one comma-separated line of a UTF-8 file.

    Column order: title, company, salary, address, release.
    """

    # Destination file. The default is the path that used to be hard-coded
    # in open_spider; override it on the class or instance to write
    # somewhere else.
    output_path = 'E:/招聘岗位.txt'

    def open_spider(self, spider):
        """Open (and truncate) the output file when the spider starts."""
        self.file = open(self.output_path, 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Append one line for ``item`` and pass the item on.

        Args:
            item: mapping with string values under ``title``, ``company``,
                ``salary``, ``address`` and ``release``.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, unchanged.
        """
        columns = (
            item['title'],
            item['company'],
            item['salary'],
            item['address'],
            item['release'],
        )
        self.file.write(",".join(columns) + '\n')
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes.

        Scrapy's pipeline hook is named ``close_spider``. The previous
        ``spoder_closed`` method was a misspelling that was never invoked,
        so the file was never closed explicitly.
        """
        self.file.close()

    # Misspelled old name kept as an alias for any code that called it.
    spoder_closed = close_spider

settings.py文件

# -*- coding: utf-8 -*-
# Bot name and the package(s) holding the spiders.
BOT_NAME = 'zhilian_second'
SPIDER_MODULES = ['zhilian_second.spiders']
NEWSPIDER_MODULE = 'zhilian_second.spiders'

# robots.txt rules are not obeyed.
ROBOTSTXT_OBEY = False

# Enabled item pipelines, mapped to their order value.
ITEM_PIPELINES = {'zhilian_second.pipelines.ZhilianSecondPipeline': 300}

# Log level for the crawl.
LOG_LEVEL = 'INFO'

由于需要爬取的内容太多、等待时间过长，本人在程序运行结束之前就终止了运行，但依然爬取到了数十万条岗位信息，如下图所示。

爬取的内容分割如下

（职位，公司名称，工资介绍，地址，发布日期。）