Scrapy crawler: Lagou.com (for learning and exchange)

2020-5-29
This tutorial walks through scraping job data from Lagou.com, making use of the requests library along the way. I know plenty of people would rather not wade through walls of text, so I'll get straight to the approach, screenshots included. (A programmer's romance: straight to the point.)
(I'm new at this, so if anything here is wrong, corrections from the pros are welcome.)

 
Lagou's job listings can be scraped without much effort, but the job detail pages are another story. As shown below:
[screenshot: the request to a job detail page gets blocked]

So Lagou has an anti-crawling mechanism on its job detail pages, and ordinary requests can't get through. Let's try adding a random request header. In middlewares.py, add:


import random


class UserAgentDownloadMiddleware(object):
    # a pool of User-Agent strings to rotate through
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.155 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
        'Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0',
        'Mozilla/5.0 (X11; Linux i586; rv:63.0) Gecko/20100101 Firefox/63.0',
        'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.10; rv:62.0) Gecko/20100101 Firefox/62.0',
        'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0',
    ]

    def process_request(self, request, spider):
        # pick a random User-Agent for every outgoing request
        user_agent = random.choice(self.USER_AGENTS)
        request.headers['User-Agent'] = user_agent

Then enable it in settings.py. Since the middleware implements process_request, it is a downloader middleware and belongs in DOWNLOADER_MIDDLEWARES (not SPIDER_MIDDLEWARES):

DOWNLOADER_MIDDLEWARES = {
    'csdn.middlewares.UserAgentDownloadMiddleware': 2,
}

The result: (it didn't help one bit)
[screenshot: the detail page request is still blocked]

Time to analyze the page. Open any job detail page and you'll see that every request to it carries cookies, so a reasonable inference is that the site validates visitors via cookies.
[screenshot: request headers of a job detail page, cookies included]

So the anti-crawling mechanism has been found, but the cookies can't simply be forged. My trick: take the cookies from the search page and test with those.
Let's try it with the requests library:
The idea:
1. Get the cookies from a search page.
2. Carry those cookies into the request for the job detail page.
Testing it:
[screenshot: with the search-page cookies attached, the detail page returns data]

Emmm, I'm a little genius. Sure enough, my guess was confirmed. From here on it's easy.

# test code

# -*- encoding: utf-8 -*-

'''
@File     : test.py
@Time     : 2020/5/29 21:56
@Author   : c-cc
@Software : PyCharm
'''

import requests

if __name__ == "__main__":

    # search page url
    search_page = 'https://www.lagou.com/zhaopin/Java/?labelWords=label'
    # a job detail page url
    job_page = 'https://www.lagou.com/jobs/6843953.html?show=175b5cccafa247288f3bdfde988594e2'
    # forged request headers
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Length': '25',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Host': 'www.lagou.com',
        'Origin': 'https://www.lagou.com',
        'Referer': search_page,
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
        'X-Anit-Forge-Code': '0',
        'X-Anit-Forge-Token': 'None',
        'X-Requested-With': 'XMLHttpRequest'
    }
    # create a session object so cookies persist across requests
    session = requests.Session()
    # form data for the request
    data = {
        'first': 'true',
        'pn': '1',
        'kd': 'Python'
    }
    # step 1: request the search page, which stores Lagou's cookies on the session
    session.get(search_page)
    # step 2: request the job detail page; the session sends those cookies along
    result = session.post(job_page, headers=headers, data=data, allow_redirects=False)
    print(result.text)
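Why this works: requests.Session keeps any cookies a response sets and sends them back on subsequent requests, so the GET to the search page quietly collects the validation cookies that the POST then carries. A quick way to see them (a minimal sketch; which cookie names Lagou actually sets may vary over time):

import requests

session = requests.Session()
session.get('https://www.lagou.com/zhaopin/Java/?labelWords=label')
# the cookies the search page handed us; these are what the
# detail-page request needs in order to pass validation
print(session.cookies.get_dict())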

Next, just integrate this requests trick into Scrapy. Straight to the spider code:

# -*- coding: utf-8 -*-
import scrapy
import requests
from lxml import etree
from lagou.items import LagouItem


class GetSpider(scrapy.Spider):
    name = 'get'
    allowed_domains = ['lagou.com']
    start_urls = ['https://lagou.com/']

    def parse(self, response):
        first_categorys = response.xpath('//div[@id="sidebar"]//div[@class="menu_box"]')
        for first_category in first_categorys:
            # first-level category title
            first_category_title = first_category.xpath('.//h2/text()').getall()

            second_categorys = first_category.xpath('.//div[@class="menu_sub dn"]')
            for second_category in second_categorys:
                # second-level job category and job titles
                job_category = second_category.xpath('.//span/text()').getall()
                job = second_category.xpath('.//a/h3/text()').getall()

                # job list page urls
                job_urls = second_category.xpath('.//a/@href').getall()
                for job_url in job_urls:
                    # carry the category info in meta rather than on a shared item,
                    # otherwise concurrent callbacks would overwrite each other
                    yield scrapy.Request(url=job_url, callback=self.job_parse,
                                         meta={'job_all_category': ",".join(job),
                                               'job_first_category': job_category})
                    # requesting this url gives us the cookies needed to crawl the content

    def job_parse(self, response):
        # first job detail page linked from this list page
        job_pages_url = response.xpath('//div[@id="s_position_list"]//div[@class="position"]//a/@href').get()
        headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Content-Length': '25',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'www.lagou.com',
            'Origin': 'https://www.lagou.com',
            'Referer': response.url,
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-origin',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
            'X-Anit-Forge-Code': '0',
            'X-Anit-Forge-Token': 'None',
            'X-Requested-With': 'XMLHttpRequest'
        }
        # create a session object
        session = requests.Session()
        # form data for the request
        data = {
            'first': 'true',
            'pn': '1',
            'kd': 'Python'
        }
        # GET the list page first to collect the cookies
        session.get(response.url)
        # then fetch the detail page with those cookies attached
        result = session.post(job_pages_url, headers=headers, data=data, allow_redirects=False)
        # parse the returned HTML with lxml so we can keep using xpath
        html = etree.HTML(result.content)
        describe = html.xpath('//dl[@id="job_detail"]//div[@class="job-detail"]/p/text()')
        salary = html.xpath('//dd[@class="job_request"]//span[1]/text()')
        job_expresion = html.xpath('//dd[@class="job_request"]//span[3]/text()')
        city = html.xpath('//dd[@class="job_request"]//span[2]/text()')
        job_company = html.xpath('//*[@id="job_company"]/dt/a/img/@alt')
        job_title = html.xpath('//h1[2]/text()')

        item = LagouItem()
        item['job_all_category'] = response.meta['job_all_category']
        item['job_first_category'] = response.meta['job_first_category']
        item['job_describe'] = describe
        item['job_title'] = job_title
        item["job_expresion"] = job_expresion
        item["job_url"] = job_pages_url  # the detail page we actually scraped
        item["job_salary"] = salary
        item["job_city"] = city
        item["job_company"] = job_company
        yield item
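The spider imports LagouItem, but the items file isn't shown above. As a minimal sketch, lagou/items.py would need to declare the fields the spider assigns (the names, including the job_expresion spelling, simply mirror the spider code):

# -*- coding: utf-8 -*-
import scrapy


class LagouItem(scrapy.Item):
    job_all_category = scrapy.Field()
    job_first_category = scrapy.Field()
    job_title = scrapy.Field()
    job_describe = scrapy.Field()
    job_expresion = scrapy.Field()  # experience requirement; spelling kept from the spider
    job_url = scrapy.Field()
    job_salary = scrapy.Field()
    job_city = scrapy.Field()
    job_company = scrapy.Field()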

I only grabbed the job category, title, description, salary, location, and a few other fields. This approach does get the data, but your IP gets banned very easily. A tale of blood and tears: in the end I finished the crawl by freeloading Zhima Proxy's free half hour of proxy IPs per day (tough guy in tears).

 
Add the usual anti-anti-crawl middlewares: random request headers and an IP proxy pool.

# IP proxy middleware. These IPs have already expired; buy your own,
# or grab Zhima Proxy's free half hour per day =-=
import random


class IpProxyMidleware(object):
    Proxy = [
        '58.218.92.90:2816',
        '58.218.92.94:7269',
        '58.218.92.89:4889',
        '58.218.92.94:8008',
        '58.218.92.91:9840',
        '58.218.92.86:5329',
        '58.218.92.94:3912',
        '58.218.92.89:4628',
        '58.218.92.89:3800',
        '58.218.92.90:4672',
        '58.218.92.89:5858',
        '58.218.92.94:8525',
        '58.218.92.90:9764',
        '58.218.92.94:9876',
        '58.218.92.90:9799',
        '58.218.92.91:3280',
        '58.218.92.89:5410',
        '58.218.92.89:4320',
        '58.218.92.89:8018',
        '58.218.92.91:4889',
    ]

    def process_request(self, request, spider):
        # route each request through a random proxy from the pool
        request.meta['proxy'] = 'https://' + random.choice(self.Proxy)


(Plus the UserAgentDownloadMiddleware shown earlier, unchanged.)

With that, the problem of getting no data out of Lagou is basically solved.

 
 
I've put the source code on my Gitee: https://gitee.com/hou_cc/woyulagoudedouzhidouyong
Please credit the source when reposting.
