Super-simple scraping of 51job job postings - code you can use as-is
IDE used: PyCharm
Three required steps
1. Install the selenium package (e.g. pip install selenium).
2. Check your Chrome version (open the browser's About page or chrome://version).
3. Download the ChromeDriver matching your Chrome version from this link and remember where you save it - you will need the path in a moment: http://chromedriver.storage.googleapis.com/index.html
Then change executable_path="D:\QQ\chromedriver.exe" in the code below to the path of the driver you downloaded.
With those three steps done, copy the code and it should run.
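Before running the full script, here is a minimal sanity check you can run first to confirm that the selenium install and the driver path work. It assumes the same Selenium 3-style Chrome(executable_path=...) call used in the main script, and the driver path is only the placeholder from step 3 - substitute your own.

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

opts = Options()
opts.add_experimental_option('excludeSwitches', ['enable-logging'])  # silence DevTools log noise
# Replace with the path where you saved chromedriver.exe
driver = webdriver.Chrome(executable_path=r"D:\QQ\chromedriver.exe", options=opts)
driver.get("https://www.51job.com")
print(driver.title)  # if a page title prints and the browser closes cleanly, the setup is ready
driver.quit()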
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import pymysql  # only needed if you later write the results to MySQL
from time import sleep
import re

chrome_options = Options()
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
# Change executable_path below to wherever you saved chromedriver.exe
driver = webdriver.Chrome(executable_path=r"D:\QQ\chromedriver.exe", options=chrome_options)
# 51job city codes to loop over
citys = ["010000", "020000", "030200", "040000", "180200", "200200", "080200",
         "070200", "090200", "060000", "030800", "230300", "230200", "070300", "250200", "190200",
         "150200", "080300", "170200", "050000", "120300", "220200", "240200", "110200"]
# Keywords to search for - swap in whatever terms you want
keywords = ['hadoop', 'spark', 'flink', 'hive', 'java', 'python爬虫', '大数据分析', '数据挖掘', '数据仓库',
            '机器学习', '自动化测试', 'ETL工程师', '人工智能']
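# The search URL built in the loop below follows this pattern (my reading of the concatenation):
#   https://search.51job.com/list/{city_code},000000,0000,00,9,99,{keyword},2,{page}.html
# where {city_code} is one of the codes above, {keyword} is the search term,
# and {page} is the 1-based page number within the result list.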
for city in citys:
    for keyword in keywords:
        # Load the first result page to find out how many pages exist for this city/keyword pair
        url = "https://search.51job.com/list/" + str(city) + ",000000,0000,00,9,99," + keyword + ",2,1.html"
        driver.get(url)
        wait = WebDriverWait(driver, 2000)  # timeout is in seconds
        divs1 = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'body > div:nth-child(4) > div.j_result > div > div.leftbox > div:nth-child(4)')))
        # The pager text contains the total page count; pull out the first number
        pages = divs1.find_element(By.CSS_SELECTOR, 'div.j_page > div > div > div > span:nth-child(1)').text
        a = re.findall(r"\d+\.?\d*", pages)
        page = int(a[0])
        for i in range(1, page + 1):
            url = "https://search.51job.com/list/" + str(city) + ",000000,0000,00,9,99," + keyword + ",2," + str(i) + ".html"
            driver.get(url)
            sleep(1)
            wait = WebDriverWait(driver, 2000)
            divs = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,
                'body > div:nth-child(4) > div.j_result > div > div.leftbox > div:nth-child(4) > div.j_joblist')))
            lists = divs.find_elements(By.CSS_SELECTOR, 'div.e')
            for div in lists:
                # Job title
                post_name = div.find_element(By.CLASS_NAME, 'jname.at').get_attribute('textContent')
                # Posting date
                deploy_date = div.find_element(By.CSS_SELECTOR, 'a > p.t > span.time').text
                # Link to the job detail page
                deploy_company_href = div.find_element(By.CSS_SELECTOR, 'a').get_attribute('href')
                # Salary range
                salary_range = div.find_element(By.CSS_SELECTOR, 'a > p.info > span.sal').text
                # Company benefits (not every listing has them)
                try:
                    post_welfare = div.find_element(By.CSS_SELECTOR, 'a > p.tags > span').text
                except:
                    post_welfare = ''
                # Search keyword
                search_words = keyword
                # Search city code
                search_area = city
                # URL of the listing page this record came from
                post_ex_url = url
                # Location / experience / education / headcount share one element, separated by '|'
                area1 = div.find_element(By.CSS_SELECTOR, 'a > p.info > span.d.at')
                ss = area1.get_attribute('innerHTML').split('|')
                if len(ss) == 4:
                    post_area = ss[0]       # location
                    post_years = ss[1]      # experience required
                    post_education = ss[2]  # education required
                    peoples = ss[3]         # number of openings
                elif len(ss) == 3:
                    post_area = ss[0]       # location
                    # The middle field is either experience (contains '经验' or '在') or education
                    if ss[1].find('经验') > -1 or ss[1].find('在') > -1:
                        post_years = ss[1]
                        post_education = ''
                    else:
                        post_years = ''
                        post_education = ss[1]
                    peoples = ss[2]         # number of openings
                else:
                    # Only location and headcount are present
                    post_area = ss[0]
                    post_years = ''
                    post_education = ''
                    peoples = ss[1]
                # Company name
                try:
                    deploy_company = div.find_element(By.CSS_SELECTOR, 'div.er > a').text
                except:
                    deploy_company = ''
                # Company nature and staff size share one element (div.er > p.dc.at), separated by '|'
                company_nature = ''
                staff_num = ''
                try:
                    Company_Financing_situation = div.find_element(By.CSS_SELECTOR, 'div.er > p.dc.at')
                    dd = Company_Financing_situation.get_attribute('innerHTML').split('|')
                    if len(dd) == 2:
                        company_nature = dd[0]  # company nature
                        staff_num = dd[1]       # staff size
                    else:
                        # Only one field present: treat it as staff size if it mentions '人', otherwise as nature
                        if dd[0].find('人') > -1:
                            staff_num = dd[0]
                        else:
                            company_nature = dd[0]
                except:
                    pass
                # Company industry/category
                try:
                    Company_Category = div.find_element(By.CSS_SELECTOR, 'div.er > p.int.at').text
                except:
                    Company_Category = ''
                # Source website
                from_website = "51Job"
                # Page number this record was scraped from
                scrape_page = i
                # Person doing the scraping
                scrape_person = "zxq"
                listsg = [post_name, deploy_date, deploy_company_href, salary_range, post_welfare, search_words,
                          post_ex_url, post_area, post_years, post_education, peoples, deploy_company, company_nature,
                          staff_num, Company_Category, from_website, scrape_person, search_area, scrape_page]
                # Print the scraped record
                print(listsg)
If you also want to store the results in a database and do analysis and visualization, feel free to message me. Corrections are welcome - thank you!
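As a starting point for that database step, the sketch below writes one scraped record (the listsg list built in the loop above) into MySQL with pymysql, which the script already imports. The connection settings, the database name job_data, and the table/column names are assumptions for illustration only - create a matching table or adapt the SQL to your own schema.

import pymysql

# Hypothetical connection settings - adjust host/user/password/database to your own MySQL setup
conn = pymysql.connect(host='localhost', user='root', password='your_password',
                       database='job_data', charset='utf8mb4')

def save_record(record):
    """Insert one scraped row (the 19-element listsg list) into an assumed jobs_51job table."""
    sql = ("INSERT INTO jobs_51job (post_name, deploy_date, deploy_company_href, salary_range, "
           "post_welfare, search_words, post_ex_url, post_area, post_years, post_education, "
           "peoples, deploy_company, company_nature, staff_num, company_category, from_website, "
           "scrape_person, search_area, scrape_page) "
           "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
    with conn.cursor() as cursor:
        cursor.execute(sql, record)
    conn.commit()

# Usage: in the innermost loop of the script above, call save_record(listsg) right after print(listsg).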