Python 爬虫：使用 requests 爬取 51job 职位信息

最新推荐文章于 2024-05-12 05:03:45 发布

FlenceXu

最新推荐文章于 2024-05-12 05:03:45 发布

阅读量642

点赞数 3

分类专栏： 1010

本文链接：https://blog.csdn.net/qq_40243365/article/details/83003197

版权

1010 专栏收录该内容

7 篇文章 0 订阅

订阅专栏


#从  文件内   导入   函数
from getHtml import getHtml
from bs4 import BeautifulSoup
from openpyxl import Workbook
import time

#获取职位信息的函数
def getData(url):
    """Scrape one search-result page and append its job rows to ``dataList``.

    The page is downloaded with ``getHtml`` and parsed with BeautifulSoup.
    Every usable row becomes ``[job name, company, location, salary,
    publish date]`` and is appended to the module-level ``dataList``.

    Args:
        url: Address of the search-result page to scrape.

    Returns:
        None. Results are accumulated in ``dataList``.
    """
    html = getHtml(url)
    soup = BeautifulSoup(html, 'html.parser')

    # Work from the outside in: locate the result container first so that
    # 'el' divs elsewhere on the page cannot leak into the results.
    parent = soup.find(id='resultList')
    if parent is None:
        # find() returns None when nothing matches (layout change, block
        # page, empty response). Report it and skip the page instead of
        # crashing with AttributeError on the next line.
        print("未找到职位列表，跳过该页：" + url)
        return

    divs = parent.find_all('div', class_='el')

    # The first 'el' div is not a job row (it has no <p> with a job title),
    # so it is dropped. Slicing, unlike pop(0), is also safe on an empty list.
    for each in divs[1:]:
        title = each.find('p')
        # Only direct child spans hold the columns; look them up once per row.
        spans = each.find_all('span', recursive=False)
        if title is None or len(spans) < 4:
            # Not shaped like a job row; skip it rather than abort the page.
            continue

        jobName = title.get_text().strip()
        # Column order: company, work location, salary, publish date.
        company, address, salary, publish_time = (
            span.get_text().strip() for span in spans[:4]
        )
        dataList.append([jobName, company, address, salary, publish_time])

# Module-level accumulator: getData appends one list per job row here and
# saveToExcel writes the collected rows out.
dataList = []


def saveToExcel(excelname):
    """Write the header row and every row in ``dataList`` to an .xlsx file.

    Args:
        excelname: Base name; the sheet is titled ``excelname + '职位信息'``
            and the workbook is saved as ``excelname + '.xlsx'``.
    """
    workbook = Workbook()
    sheet = workbook.active
    sheet.title = excelname + '职位信息'

    header = ['职位名称', '公司名称', '工作地点', '薪资', '发布时间']
    # Header first, then the scraped rows in the order they were collected.
    for row in [header, *dataList]:
        sheet.append(row)

    workbook.save(excelname + '.xlsx')

#创建主函数，可以爬取多页
def main(jobname, n, excelname, delay=1):
    """Scrape result pages 1..n for a keyword and save the rows to Excel.

    Each page URL is built from ``jobname`` and the page number, passed to
    ``getData``, and followed by a pause. After the last page the collected
    rows are written out with ``saveToExcel``.

    Args:
        jobname: Search keyword, inserted verbatim into the URL path.
            NOTE(review): no percent-encoding is applied here; non-ASCII
            keywords rely on the HTTP layer to encode them - confirm.
        n: Number of result pages to fetch, starting at page 1.
        excelname: Base name forwarded to ``saveToExcel``.
        delay: Seconds to sleep after each page request. Defaults to 1,
            the pause that was previously hard-coded.
    """
    # The fixed parts of the search URL do not change between pages, so
    # they are built once outside the loop.
    url_prefix = 'https://search.51job.com/list/000000,000000,0000,00,9,99,'
    url_suffix = (
        '.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99'
        '&degreefrom=99&jobterm=99&companysize=99&providesalary=99'
        '&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType='
        '&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
    )

    for index in range(1, n + 1):
        url = url_prefix + jobname + ',2,' + str(index) + url_suffix
        print("正在爬取第" + str(index) + "页")
        getData(url)
        # Pause between requests to avoid hammering the site.
        time.sleep(delay)
    saveToExcel(excelname)


# Script entry: scrape the first 3 result pages for the keyword and save
# them to '数据分析师.xlsx'.
main(jobname='数据分析师', n=3, excelname='数据分析师')

FlenceXu

关注

3
点赞
踩
1

收藏

觉得还不错? 一键收藏
2
评论
Python 爬虫：使用 requests 爬取 51job 职位信息

#导入模块from getHtml import getHtmlfrom bs4 import BeautifulSoupdataList = []#准备网址url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE%25E5%...
复制链接

扫一扫