# ------ 爬取51job招聘网站 (Crawl the 51job recruitment website)


import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from myproject.items import JobItem
# from w3lib.html import remove_tags
import datetime
import hashlib
import re,math
from datetime import timedelta
from w3lib.html  import remove_tags

class JobSpider(CrawlSpider):
    """CrawlSpider that starts from the 51job.com home page and follows links.

    Links matching the search-site pattern are followed for further link
    discovery; links matching the job-detail pattern are handed to the
    ``parse_item`` callback (its definition is not visible in this part of
    the file).
    """

    # Spider identifier and crawl scope.
    name = 'job'
    allowed_domains = ['51job.com']
    start_urls = ['http://51job.com/']

    # Per-spider settings: replaces the default request headers with a fixed
    # Connection / Cookie / User-Agent set.
    custom_settings = {
         'DEFAULT_REQUEST_HEADERS':{
            # "HOST":"www.51job.com",
            "Connection":"keep-alive",
            # NOTE(review): hard-coded cookie string, apparently captured from a
            # browser session — it may expire; confirm it is still required.
            "Cookie": "partner=baidupz;51job=cenglish%3D0%26%7C%26;guid=15239472266864690059; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; search=jobarea%7E%60010000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%601%A1%FB%A1%FA010000%2C00%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA2%A1%FB%A1%FA%A1%FB%A1%FA-1%A1%FB%A1%FA1523947245%A1%FB%A1%FA0%A1%FB%A1%FA%A1%FB%A1%FA%7C%21; adv=adsnew%3D0%26%7C%26adsresume%3D1%26%7C%26adsfrom%3Dhttps%253A%252F%252Fwww.baidu.com%252Fs%253Fie%253Dutf-8%2526f%253D3%2526rsv_bp%253D1%2526rsv_idx%253D1%2526tn%253Dbaidu%2526wd%253D51job%2526oq%253D%252525E6%25252599%252525BA%252525E8%25252581%25252594%252525E6%2525258B%2525259B%252525E8%25252581%25252598%2526rsv_pq%253Dac5ea6250006502c%2526rsv_t%253Da442Kq379pCcmdG2lH0k2AbNMT%25252Fwkd1JPVOZO3ZVCAhl4kRGPfTviTDFhTQ%2526rqlang%253Dcn%2526rsv_enter%253D1%2526inputT%253D3836%2526rsv_sug3%253D18%2526rsv_sug1%253D8%2526rsv_sug7%253D100%2526rsv_sug2%253D0%2526rsv_sug4%253D4509%26%7C%26adsnum%3D789233;nolife=fromdomain%3Dwww",
            "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
        },
        # "ITEM_PIPELINES" : {
        #    'myproject.pipelines.MyprojectPipeline': 1,
        # }

    }
    # Pre-compiled pattern matching one or more consecutive digits; its use
    # is not visible in this part of the file.
    num_pattern = re.compile(r'\d+')

    # Crawl rules:
    #   1. Search-site links: followed, no callback.
    #   2. Job-detail links (".../<digits>.html"): not followed; the response
    #      goes to ``parse_item`` and each request is first passed through
    #      ``process_request`` (both given by name as strings).
    # NOTE(review): the dots in both ``allow`` patterns are unescaped, so they
    # match any character (e.g. "\d+.html" also matches "123xhtml") — confirm
    # whether r'\.' was intended.
    rules = (
        Rule(LinkExtractor(allow=r'https://search.51job.com/'), follow=True),
        Rule(LinkExtractor(allow=r'https://jobs.51job.com/.*/\d+.html'), follow=False,callback='parse_item',process_request='process_request'),

    )

    def process_request(self,
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值