1.熟悉chromedriver
https://blog.csdn.net/weixin_41098099/article/details/107128566
2.查看网页的页数、分析
进入https://www.51job.com/,搜索任意一个岗位,例如这里是‘大数据’,如图:
分析网页地址:
第一页:
https://search.51job.com/list/000000,000000,0000,00,9,99,大数据,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
第二页:
https://search.51job.com/list/000000,000000,0000,00,9,99,大数据,2,2.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
第三页:
https://search.51job.com/list/000000,000000,0000,00,9,99,大数据,2,3.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
可以发现其的搜索页面大概的格式:关键字+页数
因此,代码可以定为:
'https://search.51job.com/list/000000,000000,0000,00,9,99,' + key + ',2,{}.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='.format(i)
3.岗位页面分析
可以发现,每条具体岗位的url地址,格式为:
//*[@class="j_joblist"]/div/a/@href
进入到具体岗位信息后:
同理,我们想获取岗位名或者工资同样可以锁定
例如
大数据架构师(JS)(岗位名):
//*[@class="tHeader tHjob"]/div/div/h1/@title
1.5-3万/月(工资):
//*[@class="tHeader tHjob"]/div/div/strong/text()
4.具体代码
from lxml import etree
import csv
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
"""创建csv文件,输入头标题"""
fp = open('51job.csv', 'wt', newline='', encoding='GBK', errors='ignore')
writer = csv.writer(fp)
writer.writerow(('职位', '薪资', '公司', '公司信息', '公司地址', '地区', '工作经验', '学历', '人数', '时间', '岗位信息'))
j = 1 # 记录爬取条数
def parseInfo(url):
    """Scrape one job-detail page and append a row to the global CSV writer.

    Renders *url* in a headless Chrome instance, extracts the job fields via
    XPath, prints a progress banner, and writes one row via the module-level
    ``writer``.  Increments the global record counter ``j``.

    :param url: absolute URL of a 51job job-detail page.
    """
    global j

    def _text(selector, path, sep=''):
        # Join all text/attribute nodes matched by `path` and strip the
        # non-breaking-space padding the site uses.  This replaces the old
        # str(list)[2:-2] hack, which injected "', '" garbage whenever an
        # XPath matched more than one node.
        return sep.join(selector.xpath(path)).replace('\xa0', '')

    # Headless Chrome so no browser window pops up during the crawl.
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        # Parse the rendered DOM with lxml for XPath extraction.
        selector = etree.HTML(driver.page_source)
        header = '//*[@class="tHeader tHjob"]/div/div'
        title = _text(selector, header + '/h1/@title')
        salary = _text(selector, header + '/strong/text()')
        company = _text(selector, '//*[@class="tCompany_sidebar"]/div/div/a/p/text()')
        # Several company attributes (size, industry, ...) joined with '|'.
        companyinfo = _text(selector, '//*[@class="tCompany_sidebar"]/div/div/p/@title', sep='|')
        companyplace = _text(selector, '//*[@class="tCompany_main"]/div[2]/div/p/text()')
        # The header's second <p> holds location / experience / education /
        # headcount / posting date as successive text nodes.
        place = _text(selector, header + '/p[2]/text()[1]')
        exp = _text(selector, header + '/p[2]/text()[2]')
        edu = _text(selector, header + '/p[2]/text()[3]')
        num = _text(selector, header + '/p[2]/text()[4]')
        time = _text(selector, header + '/p[2]/text()[5]')
        info = _text(selector, '//*[@class="tCompany_main"]/div[1]/div/p/text()')

        print("*" * 100)
        print("-" * 50)
        print("第" + str(j) + "条")
        j += 1
        print("-" * 50)
        print(title, salary, company, companyinfo, companyplace, place, exp, edu, num, time, info)
        print("-" * 100)
        writer.writerow((title, salary, company, companyinfo, companyplace, place, exp, edu, num, time, info))
    finally:
        # Always release the browser, even if extraction or the CSV write
        # fails -- otherwise each failed page leaks a Chrome process.
        driver.quit()
def getUrl(url):
    """Collect every job-detail URL from one search-result page and scrape each.

    Renders the search-result page *url* in headless Chrome, extracts the
    per-job links from the result list, then passes each link to
    :func:`parseInfo`.

    :param url: absolute URL of a 51job search-result page.
    """
    # Headless Chrome: crawl without opening a browser window.
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        selector = etree.HTML(driver.page_source)
        # Each entry in the result list links to its job-detail page.
        job_urls = selector.xpath('//*[@class="j_joblist"]/div/a/@href')
    finally:
        # Release the browser even if the request or the parse fails.
        driver.quit()
    for job_url in job_urls:  # renamed: the old loop shadowed the `url` param
        print(job_url)
        parseInfo(job_url)
if __name__ == '__main__':
    key = '大数据'
    # Search-result URL template: ',2,{page}.html' selects the page number.
    # Fix: the query parameter is '&degreefrom' -- the original string had
    # '&deg' mangled into the '°' character by HTML-entity decoding, so the
    # request carried a corrupted parameter.
    base = ('https://search.51job.com/list/000000,000000,0000,00,9,99,' + key +
            ',2,{}.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99'
            '&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1'
            '&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line='
            '&specialarea=00&from=&welfare=')
    urls = [base.format(page) for page in range(1, 6)]  # pages 1..5 to crawl
    # enumerate replaces the hand-maintained `i` counter.
    for page, url in enumerate(urls, start=1):
        print("-" * 50)
        print("第" + str(page) + "页")
        print("-" * 50)
        getUrl(url)
5.代码运行效果
运行结果:
生成的csv文件:
(注:因为不是每个网页的格式都完全一样,会出现某些数据不完整,需要调整)