本文的代码建立在爬取伯乐在线的项目基础之上,因此与之重复的代码不再列出。
lagou.py 的代码如下:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ArticleSpider.utils.common import get_md5
from selenium import webdriver
import time
import pickle
from ArticleSpider.items import LagouJobItemLoader, LagouJobItem
from datetime import datetime
class LagouSpider(CrawlSpider):
# Crawl configuration: the spider's name, the domain the crawl is limited
# to, and the page the crawl starts from.
name = 'lagou'
allowed_domains = ['www.lagou.com']
start_urls = ['https://www.lagou.com/']

# Custom request headers, currently disabled and kept only for reference.
# headers = {
#     "HOST": "www.lagou.com",
#     "Referer": 'https://www.lagou.com',
#     'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36"
# }

# Link-following rules. Every ``allow`` value is a regular expression that
# is matched against extracted URLs, so the dot in front of "html" is
# escaped to match a literal "." instead of any character.
rules = (
    # URLs of the form gongsi/j/<digits>.html: followed, no callback.
    Rule(LinkExtractor(allow=r'gongsi/j/\d+\.html'), follow=True),
    # Any URL containing zhaopin/: followed, no callback.
    Rule(LinkExtractor(allow=r'zhaopin/.*'), follow=True),
    # URLs of the form jobs/<digits>.html: passed to parse_job, and the
    # links found on those pages are followed as well.
    Rule(
        LinkExtractor(allow=r'jobs/\d+\.html'),
        callback='parse_job',
        follow=True,
    ),
)
def parse_job(self, response):
#解析拉勾网的职位
item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)