使用python爬虫实现访问51job的想要查找的工作岗位信息并形成一个excel表

前言:小编也是现学现卖,方便自己记忆,写的不好的地方还请包涵,也欢迎各位大佬多多批评指正。

话不多说直接上代码,
1.首先编写的是基本的excel表格内容,使其形成函数方便调用。

import requests
import bs4
import openpyxl

# Browser-like headers so 51job serves the real page instead of blocking the bot.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) "
        "Gecko/20100101 Firefox/77.0"
    )
}
def get_html(url, headers, timeout=10):
    """Download *url* and return its body decoded as GBK text.

    Args:
        url: Address of the page to fetch.
        headers: HTTP headers dict (must carry a browser User-Agent,
            otherwise 51job tends to reject the request).
        timeout: Seconds to wait for the server before giving up.
            New, defaulted parameter — existing two-argument callers
            keep working, but a dead host can no longer hang forever.

    Returns:
        The page HTML as a str.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status,
            so we never silently parse an error page as a job posting.
        requests.Timeout: if no response arrives within *timeout* seconds.
    """
    res = requests.get(url, headers=headers, timeout=timeout)
    res.raise_for_status()
    # 51job pages are GBK-encoded; without this requests may guess wrong
    # and the Chinese text comes out garbled.
    res.encoding = "gbk"
    return res.text

def get_text(html, url):
    """Extract the job-posting fields from a 51job detail-page *html*.

    Args:
        html: Raw HTML of one job-detail page (as returned by get_html).
        url: The page's own address, appended as the last column.

    Returns:
        A flat list in the column order expected by save_info():
        [job name, company, salary, requirements, perks,
         job description, company info, url].

    Raises:
        AttributeError: if the page layout changes and one of the
            expected elements is missing (find() returns None).
    """
    soup = bs4.BeautifulSoup(html, "html.parser")

    # Job title is the page's single <h1>.
    jobname = [soup.find("h1").text]

    # Company name link.
    company = [soup.find("a", class_="catn").text]

    # Salary sits in the <strong> inside the header <div class="cn">.
    money = [soup.find("div", class_="cn").strong.text]

    # Requirements line; drop the non-breaking spaces 51job uses as separators.
    jobneed = [soup.find("p", class_="msg ltype").text.replace("\xa0", "")]

    # Perk tags are a series of <span class="sp4"> elements.
    # BUG FIX: `stra` was previously used without being initialised,
    # so every call raised NameError at the first perk tag.
    stra = ""
    for tag in soup.find_all("span", class_="sp4"):
        stra += tag.text + " "
    jobtemptation = [stra]

    # Job-description paragraphs, joined with spaces.
    strb = ""
    for p in soup.find("div", class_="bmsg job_msg inbox").find_all("p"):
        strb += p.text + " "
    jobinfo = [strb]

    # Company introduction block.
    cpinfo = [soup.find("div", class_="tmsg inbox").text]

    all_list = (jobname + company + money + jobneed
                + jobtemptation + jobinfo + cpinfo)
    all_list.append(url)
    return all_list

def save_info(all_list, path="c:\\users\\spring\\desktop\\nz2001.xlsx"):
    """Write a single job record to a fresh Excel workbook.

    Args:
        all_list: One row of values, in the order produced by get_text().
        path: Destination .xlsx file. New, defaulted parameter — the
            default keeps the original hard-coded location, so existing
            one-argument callers are unaffected.
    """
    wb = openpyxl.Workbook()
    ws = wb.active
    # Header row must stay in sync with the field order of get_text().
    ws.append(["岗位名称","公司名称","薪资","岗位要求","职位诱惑","职位信息","公司信息","职位网站"])
    ws.append(all_list)
    wb.save(path)

if __name__ == "__main__":
    # Alternative job pages, kept for quick manual testing:
    #url = "https://jobs.51job.com/changsha-ylq/82873008.html?s=01&t=0"
    #url = "https://jobs.51job.com/hunansheng/122942156.html?s=01&t=0"  # demo: scrape a different posting
    url = "https://jobs.51job.com/changsha-ylq/117480590.html?s=01&t=0"
   
    # Pipeline: download page -> extract fields -> write one-row Excel file.
    html = get_html(url,headers)
    all_list = get_text(html,url)
    save_info(all_list)

2.调用上方编写好的模块,我这里模块名为 job51page(与下方 import 的名称一致;模块名不能以数字开头)。

import job51page  #调用之前编写好的模块
import re     #调用正则
import openpyxl  #调用编写excel的表格

def get_joblist(reruler, html):
    """Return every job-detail URL found in *html* by the regex *reruler*."""
    return re.findall(reruler, html)

def save_info(joblist):
    """Scrape every job URL in *joblist* and write all rows to one workbook."""
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    # Header row: must match the field order produced by job51page.get_text().
    sheet.append(["岗位名称","公司名称","薪资","岗位要求","职位诱惑","职位信息","公司信息","职位网站"])
    for joburl in joblist:
        # Fetch each posting's page, then reduce it to one row of text fields.
        page_html = job51page.get_html(joburl, job51page.headers)
        sheet.append(job51page.get_text(page_html, joburl))
    workbook.save("c:\\users\\spring\\desktop\\nz2001.xlsx")

if __name__ == "__main__":
    # Search-results page for the query "渗透测试工程师" (penetration-test
    # engineer); the path segments encode region/category filters.
    url = "https://search.51job.com/list/190000%252C090200,\
000000,0000,00,9,99,渗透测试工程师,2,1.html"  
    # Pattern for job-detail links on the results page.
    # NOTE(review): `.*` is greedy and `.` before "html" is unescaped —
    # presumably each link sits on its own line so this works in practice;
    # verify against the live page markup.
    reruler = "https://jobs.51job.com/.*/\d{9}.html"   
    # Download the search-results page with the shared browser headers.
    html = job51page.get_html(url,job51page.headers)
    # Collect every job-detail URL matched by the pattern.
    joblist = get_joblist(reruler,html)
    save_info(joblist)

3.如想搜索别的岗位的信息,直接更换搜索名称即可
(此处原有运行结果截图,转载时图片已丢失)
注:我这里是以湖南和四川为例,具体城市可更换url进行搜索。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值