2021-07-24

超简单抓取51job网站招聘数据,代码拿来就能用

我用的是:pycharm
必要的三步
1.下载一个selenium的包
2. 如图查看你的谷歌版本

[在这个链接里下载你对应版本的驱动,记住下载的位置,等下要用](http://chromedriver.storage.googleapis.com/index.html)
在下面代码executable_path="D:\QQ\chromedriver.exe"改成你下载的驱动的路径就可以了
在这里查看你的谷歌版本

三步完成,复制代码即可运行

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# FIX: Options lives in selenium.webdriver.chrome.options, not chrome.webdriver.
from selenium.webdriver.chrome.options import Options
import pymysql
from time import sleep
import time
import re

chrome_options = Options()
# FIX: original had "errrors"; the real Chrome switch is
# --ignore-certificate-errors-spki-list.
chrome_options.add_argument('--ignore-certificate-errors-spki-list')
# Hide ChromeDriver's "DevTools listening ..." console noise.
chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
# Change executable_path below to wherever you saved the matching
# chromedriver build for your installed Chrome version.
driver = webdriver.Chrome(executable_path=r"D:\QQ\chromedriver.exe",
                          options=chrome_options)
# 51job city codes to iterate over (one search per city per keyword).
# FIX: the original list was missing a comma between "190200" and "150200",
# so Python's implicit string concatenation produced the bogus single code
# "190200150200".  The comma is restored here.
# NOTE(review): "01000" (5 digits) and "01" (2 digits) do not match the
# six-digit pattern of the other codes -- TODO confirm against 51job.
citys = [
    "01000", "020000", "030200", "040000", "180200", "200200", "080200",
    "070200", "090200", "060000", "030800", "230300", "230200", "070300",
    "250200", "190200", "150200", "080300", "170200", "050000", "120300",
    "220200", "240200", "110200", "01",
]
# Search terms; edit this list to query different job keywords.
keywords = [
    'hadoop', 'spark', 'flink', 'hive', 'java', 'python爬虫',
    '大数据分析', '数据挖掘', '数据仓库', '机器学习', '自动化测试',
    'ETL工程师', '人工智能',
]
# Crawl every (city, keyword) combination: load page 1 of the search results
# to read the total page count, then walk each result page and print one
# record (list of fields) per job card.
for city in citys:
    for keyword in keywords:
        # URL suffix ",2,1.html" = sort mode 2, page 1.
        url = ("https://search.51job.com/list/" + str(city)
               + ",000000,0000,00,9,99," + keyword + ",2,1.html")
        driver.get(url)
        wait = WebDriverWait(driver, 2000)
        divs1 = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR,
             'body > div:nth-child(4) > div.j_result > div > div.leftbox > div:nth-child(4)')))
        # The pagination label's first number is the total page count.
        pages = divs1.find_element(
            By.CSS_SELECTOR,
            'div.j_page > div > div > div > span:nth-child(1)').text
        nums = re.findall(r"\d+\.?\d*", pages)
        page = int(nums[0])
        for i in range(1, page + 1):
            url = ("https://search.51job.com/list/" + str(city)
                   + ",000000,0000,00,9,99," + keyword + ",2," + str(i) + ".html")
            driver.get(url)
            sleep(1)
            wait = WebDriverWait(driver, 2000)
            divs = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR,
                 'body>div:nth-child(4) > div.j_result > div > div.leftbox > div:nth-child(4) > div.j_joblist')))
            lists = divs.find_elements(By.CSS_SELECTOR, 'div.e')
            for div in lists:
                # FIX: reset all optional fields for every card.  The original
                # never initialized company_nature / staff_num, so the very
                # first card without company info raised NameError, and later
                # cards silently inherited stale values from previous cards.
                post_years = ''
                post_education = ''
                peoples = ''
                company_nature = ''
                staff_num = ''

                # Job title.
                post_name = div.find_element(
                    By.CLASS_NAME, 'jname.at').get_attribute('textContent')
                # Posting date.
                deploy_date = div.find_element(
                    By.CSS_SELECTOR, ' a > p.t > span.time').text
                # Link to the job's detail page.
                deploy_company_href = div.find_element(
                    By.CSS_SELECTOR, 'a').get_attribute('href')
                # Salary range.
                salary_range = div.find_element(
                    By.CSS_SELECTOR, ' a > p.info > span.sal').text
                # Benefits tag; not every card has one.
                try:
                    post_welfare = div.find_element(
                        By.CSS_SELECTOR, 'a > p.tags > span').text
                except Exception:  # FIX: was a bare except
                    post_welfare = ''

                # Search keyword used for this query.
                search_words = keyword
                # City code searched.
                search_area = city
                # Result-page URL the card was scraped from.
                post_ex_url = url

                # The "info" line is "area | experience | education | headcount",
                # but any middle segment may be missing.
                area1 = div.find_element(
                    By.CSS_SELECTOR, 'a > p.info > span.d.at')
                ss = area1.get_attribute('innerHTML').split('|')
                post_area = ss[0]
                if len(ss) == 4:
                    post_years = ss[1]
                    post_education = ss[2]
                    peoples = ss[3]
                elif len(ss) == 3:
                    # The middle segment is either experience or education;
                    # experience strings contain "经验" or "在".
                    if ss[1].find('经验') > -1 or ss[1].find('在') > -1:
                        post_years = ss[1]
                    else:
                        post_education = ss[1]
                    peoples = ss[2]
                elif len(ss) == 2:
                    peoples = ss[1]
                # FIX: the original's final else unconditionally read ss[1],
                # which raised IndexError when the line had a single segment;
                # with len(ss) == 1 we now keep the '' defaults.

                # Company name.
                try:
                    deploy_company = div.find_element(
                        By.CSS_SELECTOR, 'div.er > a').text
                except Exception:  # FIX: was a bare except
                    deploy_company = ''

                # Company line: "nature | staff size" (either part may be missing).
                try:
                    Company_Financing_situation = div.find_element(
                        By.CSS_SELECTOR, 'div.er > p.dc.at')
                    dd = Company_Financing_situation.get_attribute(
                        'innerHTML').split('|')
                    if len(dd) == 2:
                        company_nature = dd[0]
                        staff_num = dd[1]
                    else:
                        # A lone segment containing "人" is a staff count,
                        # otherwise it is the company nature.
                        if dd[0].find('人') > -1:
                            staff_num = dd[0]
                        else:
                            company_nature = dd[0]
                except Exception:  # FIX: was a bare except
                    Company_Financing_situation = ''

                # Company category / industry.
                try:
                    Company_Category = div.find_element(
                        By.CSS_SELECTOR, 'div.er > p.int.at').text
                except Exception:  # FIX: was a bare except
                    Company_Category = ''

                # Provenance metadata.
                from_website = "51Job"
                scrape_page = i
                scrape_person = "zxq"

                listsg = [post_name, deploy_date, deploy_company_href,
                          salary_range, post_welfare, search_words,
                          post_ex_url, post_area, post_years, post_education,
                          peoples, deploy_company, company_nature, staff_num,
                          Company_Category, from_website, scrape_person,
                          search_area, scrape_page]
                # Print the scraped record.
                print(listsg)

如果还想存进数据库,做数据分析与可视化,可以私聊博主,如有问题欢迎指正,谢谢!

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值