学习教程-获取51job数据
- 需要的环境为:
- 虚拟环境:Mac为conda(本文所使用的),Windows为anaconda
- Python环境:3.9以上
- 编程环境:jupyter notebook 或者是 Pycharm
- 浏览器驱动器:根据个人的浏览器版本对应去官网找
1、虚拟环境创建和使用
虚拟环境创建
conda create -n your_env_name python=X.X #建议python版本为3.9
激活虚拟环境
conda activate your_env_name  # 新版conda推荐;旧版可用 source activate your_env_name
同时,本文所使用的是selenium对51job进行爬取。对此,在完成虚拟环境创建后,到conda所在目录,进入到envs目录中对应的虚拟环境名称目录下,再进入bin目录中,把浏览器驱动copy到该目录下。这样保证了运行selenium代码时,浏览器能被自动化调用。
2、使用jupyter notebook
安装jupyter notebook
建议直接用以下命令
pip install jupyter -i https://pypi.tuna.tsinghua.edu.cn/simple
使用办法
在虚拟环境下,先切换到目标工作目录再输入以下命令;启动时所在的目录会成为 jupyter notebook 的默认工作目录
jupyter notebook
3、获取51job数据
根据职位搜索信息
from selenium import webdriver
from selenium.webdriver.common.by import By
import numpy as np
import pandas as pd

# Launch Edge (the matching msedgedriver must be reachable, e.g. copied into
# the conda env's bin directory) and open the 51job home page.
driver = webdriver.Edge()
driver.get("https://www.51job.com")

# Keyword input box (#kwdselectid): focus it and type the search term.
search_box = driver.find_element(
    By.XPATH, "/html/body/div[3]/div/div[1]/div/div/p[1]/input")
search_box.click()
search_box.send_keys("大数据")

# Submit the search via the button next to the keyword box.
search_button = driver.find_element(
    By.XPATH, "/html/body/div[3]/div/div[1]/div/button")
search_button.click()
# Search submitted; the result list page is now loading.
获取每行数据的简介,并实现数据清洗
import time as tm
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Result-list XPath layout (discovered by inspection):
#   row container: .../div[4]/div[1]/div[i]      -> i-th job card
#   a/p[1]/span[1] = title,  a/p[1]/span[2] = post date,
#   a/p[2]/span[1] = salary, a/p[2]/span[2] = place|experience|education
_CELL_XPATH = ("/html/body/div[2]/div[3]/div/div[2]/div[4]"
               "/div[1]/div[{row}]/a/p[{p}]/span[{span}]")


def _parse_salary(raw):
    """Return the midpoint of a '2-2.5万/月' salary range as a float.

    An empty string (salary missing on the card) yields 0.0; a single
    value such as '2万/月' yields that value.  Only the '万/月' unit is
    handled — other formats fall back to 0.0 rather than crashing.
    (Fixes the original int() parse, which mis-averaged decimal ranges
    and could hit an unbound variable on malformed input.)
    """
    if raw == "":
        return 0.0
    parts = raw.replace("万/月", "").split("-")
    try:
        values = [float(p) for p in parts]
    except ValueError:
        return 0.0  # unexpected unit/format -> treat as missing
    return sum(values) / len(values)


def _parse_place(raw):
    """'上海-徐汇区' -> '上海'; a plain city name is returned unchanged."""
    return raw.replace(" ", "").split("-")[0]


def _parse_experience(raw):
    """Map an experience label to the minimum years required, as an int.

    '无需经验' and '在校生/应届生' -> 0; '3-4年经验' -> 3;
    '10年以上经验' -> 10.  Unparseable labels yield 0 instead of
    aborting the crawl.  (Fixes the original code, which appended a
    *string* for the '年以上' case and could raise inside a bare except.)
    """
    text = raw.replace(" ", "")
    if text in ("无需经验", "在校生/应届生"):
        return 0
    text = text.replace("年以上经验", "").replace("年经验", "")
    try:
        return int(text.split("-")[0])
    except ValueError:
        return 0


def _cell_text(row, p, span):
    """Text of one cell of the result card at position `row` (1-based)."""
    return driver.find_element(
        By.XPATH, _CELL_XPATH.format(row=row, p=p, span=span)).text


# Accumulators for the cleaned columns, consumed later by pd.DataFrame.
joblist = list()
timelist = list()
salarylist = list()
placelist = list()
experiencelist = list()
educationlist = list()

tm.sleep(5)  # let the first result page finish rendering
for page in range(1, 11):        # 10 result pages
    for row in range(1, 51):     # 50 cards per page
        tm.sleep(1)  # throttle so each card's content has loaded
        job = _cell_text(row, 1, 1)
        joblist.append(job)
        # '06-07发布' -> '2022-06-07' (year hard-coded to the crawl year;
        # TODO: derive from the current date instead)
        posted = "2022-" + _cell_text(row, 1, 2).split("发布")[0]
        timelist.append(posted)
        # Salary cell may be empty on some cards.
        salary = _parse_salary(_cell_text(row, 2, 1))
        salarylist.append(salary)
        # info = 'place | experience | education'; education is sometimes absent.
        info = _cell_text(row, 2, 2).split("|")
        place = _parse_place(info[0])
        placelist.append(place)
        experience = _parse_experience(info[1])
        experiencelist.append(experience)
        education = info[2].replace(" ", "") if len(info) > 2 else "无学历要求"
        educationlist.append(education)
        print("正在写入第{}页,第{}列".format(page, row),
              [job, posted, salary, place, experience, education])
    # Advance to the next result page (li[8] is the pager's 'next' control).
    next_button = driver.find_element(
        By.XPATH,
        "/html/body/div[2]/div[3]/div/div[2]/div[4]/div[2]/div/div/div/ul/li[8]/a")
    next_button.click()
将数据写入到excel中
# Assemble the cleaned columns into a DataFrame and persist it to Excel.
columns = {
    "job": joblist,
    "time": timelist,
    "avg-salary": salarylist,
    "place": placelist,
    "low-experience": experiencelist,
    "education": educationlist,
}
df = pd.DataFrame(columns)
print(df)
df.to_excel('databases.xlsx')