A simple Python crawler for Lagou

For a personal project, I scraped the data of data-mining related positions from Lagou.

First, open the Lagou home page and search for 数据挖掘 (data mining) to get the list of matching positions. Press F12 and inspect the HTML response in the Network tab: the job list is not in the HTML itself.


So the list must be loaded asynchronously via XHR. Switching to the XHR filter shows four requests; opening them, one of the requests contains the information we need.
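As a quick sketch, that request is a POST to positionAjax.json with the search keyword and page number as form fields (headers are omitted here; without the browser headers saved later, Lagou usually rejects the request):

import requests

# Minimal illustration of the listing XHR; the real crawler below also sends the saved browser headers.
url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
data = {'first': 'true', 'kd': '数据挖掘', 'pn': 1}  # kd = search keyword, pn = page number
resp = requests.post(url, data=data)
print(resp.status_code, resp.text[:200])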


Next, open a specific job posting and look at its URL: it contains a numeric ID.

Jumping back to the XHR response just captured, each entry has a matching positionId, so we first collect the positionId values and then use them to fetch the detail page of each position.
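As a sketch of that step (field names are taken from the parsing code below; the live response may differ), the positionId values can be pulled out of the XHR JSON like this:

import json

def extract_position_ids(xhr_text):
    # xhr_text is the raw JSON string returned by positionAjax.json
    data = json.loads(xhr_text)
    return [item['positionId'] for item in data['content']['positionResult']['result']]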

Save the request headers of the job-detail page and of the XHR request to text files, and send them as the headers of our own requests. The full code is below:

# -*- coding: utf-8 -*-
"""
Created on Wed Apr 25 20:10:19 2018

@author: cy
"""

'''Launch / coordinate the crawler'''
import requests
import json
from bs4 import BeautifulSoup as bs
import re
import time

XHR_HEAD_PATH = r'C:\Users\cy\Desktop\lagou\xhr_head.txt'  # headers used for the XHR request
XHR_URL = r'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
POS_HEAD_PATH = r'C:\Users\cy\Desktop\lagou\pos_head.txt'  # headers used for the job list / job-detail pages
PAGE_NUMBER = 29  # number of pages in the search result list
SAVE_PATH = r'C:\Users\cy\Desktop\lagou\save.txt'  # path where the scraped data is saved

# Fetch the listing XHR; returns the raw JSON text, or None on failure
def get_xhr(head, num):
    data = {'first': 'true', 'kd': '数据挖掘', 'pn': num}  # kd = search keyword, pn = page number
    resp = requests.post(XHR_URL, headers=head, data=data)  # 'resp' rather than 're', to avoid shadowing the re module
    if resp.status_code == 200:
        re_text = resp.text
    else:
        re_text = None
        print('page %d request failed' % num)
    return re_text
        

# Parse the XHR JSON and collect the positionId of every posting on the page
def get_posID(xhr_text):
    if xhr_text is None:
        return None
    xhr_json = json.loads(xhr_text)
    result = xhr_json['content']['positionResult']['result']
    id_list = []
    for item in result:  # iterate over the actual results; the last page may hold fewer than 15
        id_list.append(item['positionId'])
    return id_list

# Fetch the HTML of one job-detail page
def get_posInfo(pos_id, head):
    if pos_id is None:
        print('empty position id')
        return None

    url = r'https://www.lagou.com/jobs/' + str(pos_id) + '.html'
    print(url)
    resp = requests.get(url, headers=head)
    if resp.status_code != 200:
        return None
    pos_html = resp.text
    return pos_html
            
            

# Parse one job-detail page: job title, requirements (salary/city/experience/...), description
def analysis_pos(pos_html):
    if pos_html is None:
        print('job page is empty')
        return None
    soup = bs(pos_html, 'html.parser')
    job_name = soup.find_all('span', class_='name')[0].text

    dd = soup.find_all('dd', class_='job_request')
    pattern = re.compile(r'>(.*?)</span>')  # pull the text of each <span> in the requirements block
    result = pattern.findall(str(dd))
    for i in range(len(result)):
        result[i] = result[i].replace('/', '').strip()

    job_info = soup.find_all('dd', class_='job_bt')
    job_infos = str(job_info)
    return job_name, result, job_infos
    


def run():
    with open(XHR_HEAD_PATH, 'r') as file:  # build the headers for the XHR request
        xhr_text = file.read()
        xhr_sp = xhr_text.split('\n')
        xhr_head = {}
        n = len(xhr_sp)
        for i in range(n//2):  # the file alternates: header name on one line, its value on the next
            xhr_head[xhr_sp[i*2].strip()] = xhr_sp[i*2+1].strip()
        print('xhr_head loaded')

    with open(POS_HEAD_PATH, 'r') as file:  # build the headers for the job-detail pages
        pos_text = file.read()
        pos_sp = pos_text.split('\n')
        pos_head = {}
        n = len(pos_sp)
        for i in range(n//2):
            pos_head[pos_sp[i*2].strip()] = pos_sp[i*2+1].strip()
        print('pos_head loaded')

    with open(SAVE_PATH, 'w+', encoding='utf-8') as file:
        for num in range(PAGE_NUMBER):
            xhr_json = get_xhr(xhr_head, num+1)  # JSON text returned by the listing XHR
            posIDs = get_posID(xhr_json)         # positionIds of every posting on this page
            time.sleep(10)
            if posIDs is None:
                continue
            for i in posIDs:
                posInfoHtml = get_posInfo(i, pos_head)  # fetch the detail page for this positionId
                print('fetched page %d' % i)
                parsed = analysis_pos(posInfoHtml)  # parse the fetched HTML
                time.sleep(10)
                if parsed is None:
                    continue
                job_name, result, job_info = parsed
                line = job_name + '0000' + str(result) + '0000' + job_info  # '0000' as a crude field separator
                file.write(line + '\n')
                print('page %d saved' % i)

if __name__ == '__main__':
    run()
    print('done')
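The run() function above reads the two header files as alternating lines: one header name, then its value on the next line. A hypothetical xhr_head.txt might look like this (copy the real values from the request headers shown in the browser's developer tools):

User-Agent
Mozilla/5.0 ...
Cookie
<paste the cookie string copied from the browser>
Referer
<the URL of the search-results page>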


This crawler is crude, very crude; I will improve it when I get the chance.