import re
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
# ---------------------------------------------------------------------------
# Scrape the 51job search results for the keyword typed below, restricted to
# the "上海" location, and dump one '|'-joined text line per matched element
# into salaries.txt.
#
# Element lookups use find_element(By.…) / find_elements(By.…): the older
# find_element_by_* helpers were removed in Selenium 4.3.
# ---------------------------------------------------------------------------
driver = webdriver.Chrome()
job_collection = []
try:
    # Implicit wait: every element lookup polls for up to 10 s before failing.
    driver.implicitly_wait(10)
    driver.get("https://www.51job.com/")
    driver.find_element(By.ID, 'kwdselectid').send_keys('自动化测试')

    # Open the work-location picker.
    driver.find_element(By.ID, 'work_position_click').click()
    time.sleep(1)

    # Click every entry in the "selected" list (presumably this deselects the
    # default locations — verify against the live page), then pick 上海.
    eles = driver.find_elements(
        By.CSS_SELECTOR, '#work_position_click_multiple_selected>span')
    for ele in eles:
        ele.click()
    driver.find_element(
        By.XPATH, '//td[@class="js_more"]/em[text()="上海"]').click()
    time.sleep(1)
    driver.find_element(By.ID, 'work_position_click_bottom_save').click()
    time.sleep(2)
    driver.find_element(By.XPATH, '//button[text()="搜索"]').click()

    # The first integer in the pager text is taken as the total page count.
    pagemsg = driver.find_element(By.XPATH, '//div[@class="p_in"]/span').text
    pagenum = int(re.findall(r'\d+', pagemsg)[0])
    print(f'一个有{pagenum}页')

    # NOTE(review): only the first (pagenum - 30) pages are visited, so nothing
    # is scraped when pagenum <= 30 — confirm this cap is intentional.
    for i in range((pagenum - 30)):
        print("开始获取第%s页信息" % (i + 1))
        time.sleep(2)
        jobs = driver.find_elements(
            By.XPATH, '//div[@class="j_joblist"]//p[@class="info"]//span[1]')
        time.sleep(3)
        for job in jobs:
            # Collapse multi-line element text into a single '|'-joined line.
            job_collection.append('|'.join(job.text.split('\n')))
        if i < pagenum - 1:
            # The last <li> of the pager is clicked to advance one page.
            driver.find_element(
                By.CSS_SELECTOR, '.p_in li:nth-last-child(1)').click()
            time.sleep(1)
            # Looked up only for its implicit wait: blocks until the result
            # container is present again.
            driver.find_element(By.CSS_SELECTOR, '.j_result')
    print("总共获取%s职位信息" % (len(job_collection)))
finally:
    # Always close the browser, even when a lookup or click above fails.
    driver.quit()

with open('salaries.txt', 'w', encoding='utf-8') as f:
    f.writelines(job + '\n' for job in job_collection)
import re
def convert_data(path='salaries.txt'):
    """Parse scraped salary strings into monthly ranges in thousands (K).

    Each non-blank line is searched for its first ``low-high`` numeric range,
    which is then normalised according to the unit found on the line:

    * ``万/月`` (ten-thousands per month): multiplied by 10
    * ``千/月`` (thousands per month): used unchanged
    * ``万/年`` (ten-thousands per year): divided by 1.2 (x10, / 12 months)

    Lines without one of these units, or with a unit but no ``low-high``
    range, are reported on stdout and skipped instead of raising.
    Blank lines (such as the one after the file's trailing newline) are
    ignored.

    Args:
        path: UTF-8 text file holding one salary string per line.

    Returns:
        A list of ``[low, high]`` float pairs, one per parsed line.
    """
    with open(path, 'r', encoding='utf-8') as f:
        salaries = f.read()

    sass = []
    for sa in salaries.split('\n'):
        if not sa.strip():
            continue

        found = re.search(r'\d+\.?\d*-\d+\.?\d*', sa)
        has_unit = "万/月" in sa or '千/月' in sa or '万/年' in sa
        if found is None or not has_unit:
            print("垃圾数据---%s" % sa)
            continue

        bounds = [float(one) for one in found.group(0).split('-')]
        # Unit precedence: 万/月, then 千/月, then 万/年.
        if "万/月" in sa:
            sass.append([value * 10 for value in bounds])
        elif '千/月' in sa:
            sass.append(bounds)
        else:
            sass.append([value / 1.2 for value in bounds])
    return sass
def analysis_data(sass):
    """Group salary ranges into 5K-wide bands and print the grouping.

    A range is added to a band when either of its endpoints falls inside the
    band's half-open interval ``[lower, upper)``; it is added to ``'30K+'``
    when either endpoint is at least 30. One range can therefore appear in
    two bands. Ranges whose endpoints are both below 5 match no band.

    Comparisons are numeric rather than ``x in range(a, b)``, because range
    membership is never true for a non-integral float such as 7.5.

    Args:
        sass: Iterable of ``[low, high]`` pairs, in thousands per month.

    Returns:
        The dict mapping each band label to the list of matching pairs
        (the same dict that is printed).
    """
    bands = (
        ('5-10K', 5, 10),
        ('10-15K', 10, 15),
        ('15-20K', 15, 20),
        ('20-25K', 20, 25),
        ('25-30K', 25, 30),
    )
    data_dict = {label: [] for label, _, _ in bands}
    data_dict['30K+'] = []

    for sa in sass:
        low, high = sa[0], sa[1]
        for label, lower, upper in bands:
            if lower <= low < upper or lower <= high < upper:
                data_dict[label].append(sa)
        if low >= 30 or high >= 30:
            data_dict['30K+'].append(sa)

    print(data_dict)
    return data_dict
if __name__ == '__main__':
    # Feed the parsed ranges straight into the analysis; the name `sass`
    # is local to the two functions and does not exist at module level.
    analysis_data(convert_data())