from selenium import webdriver
import time
import re
import pandas as pd
import os
在爬取的过程中可能会出现登录弹窗,因此要先定义一个处理弹窗的函数。
def close_windows():
    """Dismiss the login pop-up if one is currently displayed.

    Pauses for half a second, then looks up the element with class
    ``closeIcon`` inside the element with class ``jconfirm`` on the
    module-level driver ``dr`` and clicks it.

    This is best-effort: any ordinary failure while locating or clicking
    the close icon is reported with ``print`` and otherwise ignored, so
    the caller can keep scraping. ``KeyboardInterrupt`` and ``SystemExit``
    are deliberately NOT swallowed, so the scraper can still be stopped.
    """
    # Short pause before probing for the dialog.
    time.sleep(0.5)
    try:
        # Look the close icon up once. The lookup raises when nothing
        # matches (see the Selenium docs), so a truthiness check on the
        # result adds nothing.
        close_icon = dr.find_element_by_class_name("jconfirm").find_element_by_class_name("closeIcon")
        close_icon.click()
    except Exception as e:
        # Most likely there is simply no pop-up on the page right now.
        print('close_windows,没有弹窗', e)
第二部分是爬取部分,这里共爬取11个字段(列),基本上涵盖了职位的大部分信息。
def get_current_region_job(k_index):
flag = 0
# page_num_set=0#每区获取多少条数据,对30取整
df_empty = pd.DataFrame(columns=['岗位', '地点', '薪资', '工作经验', '学历', '公司名称', '技能','工作福利','工作类型','融资情况','公司规模'])
while (flag == 0):
# while (page_num_set<151)&(flag == 0):#每次只能获取150条信息
time.sleep(0.5)
close_windows()
job_list = dr.find_elements_by_class_name("job-primary")
for job in job_list:#获取当前页的职位30条
job_name = job.find_element_by_class_name("job-name").text
# print(job_name)
job_area = job.find_element_by_class_name("job-area").text
# salary = job.find_element_by_class_name("red").get_attribute("textContent") # 获取薪资
salary_raw = job.find_element_by_class_name("red").get_attribute("textContent") # 获取薪资
salary_split = salary_raw.split('·') # 根据·分割
salary = salary_split[0] # 只取薪资,去掉多少薪
# if re.search(r'天', salary):
# continue
experience_education = job.find_element_by_class_name("job-limit").find_element_by_tag_name(
"p").get_attribute("innerHTML")
# experience_education_raw = '1-3年<em class="vline"></em>本科'
experience_education_raw = experience_education

最低0.47元/天 解锁文章
2011

被折叠的 条评论
为什么被折叠?



