import scrapy from scrapy.linkextractors import LinkExtractor from scrapy.spiders import CrawlSpider, Rule from myproject.items import JobItem # from w3lib.html import remove_tags import datetime import hashlib import re,math from datetime import timedelta from w3lib.html import remove_tags class JobSpider(CrawlSpider): name = 'job' allowed_domains = ['51job.com'] start_urls = ['http://51job.com/'] custom_settings = { 'DEFAULT_REQUEST_HEADERS':{ # "HOST":"www.51job.com", "Connection":"keep-alive", "Cookie": "partner=baidupz;51job=cenglish%3D0%26%7C%26;guid=15239472266864690059; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; search=jobarea%7E%60010000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%601%A1%FB%A1%FA010000%2C00%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA2%A1%FB%A1%FA%A1%FB%A1%FA-1%A1%FB%A1%FA1523947245%A1%FB%A1%FA0%A1%FB%A1%FA%A1%FB%A1%FA%7C%21; adv=adsnew%3D0%26%7C%26adsresume%3D1%26%7C%26adsfrom%3Dhttps%253A%252F%252Fwww.baidu.com%252Fs%253Fie%253Dutf-8%2526f%253D3%2526rsv_bp%253D1%2526rsv_idx%253D1%2526tn%253Dbaidu%2526wd%253D51job%2526oq%253D%252525E6%25252599%252525BA%252525E8%25252581%25252594%252525E6%2525258B%2525259B%252525E8%25252581%25252598%2526rsv_pq%253Dac5ea6250006502c%2526rsv_t%253Da442Kq379pCcmdG2lH0k2AbNMT%25252Fwkd1JPVOZO3ZVCAhl4kRGPfTviTDFhTQ%2526rqlang%253Dcn%2526rsv_enter%253D1%2526inputT%253D3836%2526rsv_sug3%253D18%2526rsv_sug1%253D8%2526rsv_sug7%253D100%2526rsv_sug2%253D0%2526rsv_sug4%253D4509%26%7C%26adsnum%3D789233;nolife=fromdomain%3Dwww", "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36" }, # "ITEM_PIPELINES" : { # 'myproject.pipelines.MyprojectPipeline': 1, # } } num_pattern = re.compile(r'\d+') rules = ( Rule(LinkExtractor(allow=r'https://search.51job.com/'), follow=True), Rule(LinkExtractor(allow=r'https://jobs.51job.com/.*/\d+.html'), follow=False,callback='parse_item',process_request='process_request'), ) def process_request(self,
------爬取51job招聘网站
最新推荐文章于 2024-05-14 02:41:07 发布