Scraping Boss Zhipin with Selenium, No Login Required

Boss Zhipin (BOSS直聘) was one of the first sites I scraped, back when plain requests was enough. Revisiting that old code recently, I found the site has added anti-scraping measures. If you request a URL like https://www.zhipin.com/c101210100/b_西湖区/?query=数据分析杭州 directly, you get bounced through a secondary redirect, and even fetching the redirected URL fails, because the cookies include a __zp_stoken__ value generated by obfuscated JavaScript. After a fair amount of effort I concluded it was impractical to crack, so I reached straight for the heavy artillery: Selenium plus a rotating random User-Agent, which solved it easily.
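
You can watch the bounce happen with a plain requests call (a minimal sketch; the exact status code and redirect target may vary):

import requests

# Request the listing page directly and stop at the first redirect
url = 'https://www.zhipin.com/c101210100/b_西湖区/?query=数据分析杭州'
r = requests.get(url, allow_redirects=False)
print(r.status_code)              # a 3xx instead of the listing page
print(r.headers.get('Location'))  # the intermediate verification URL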

Code last updated: 2020-03-18

The idea: open the target page with Selenium, then fetch the data efficiently using random User-Agent headers. My random user agents are stored in a local file, headers.csv, which you can grab here if needed: https://pan.baidu.com/s/11lBIclOHvVpBdgp3NyY0nA (extraction code: ar5z)
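
Before running the full scraper, a quick sanity check (a minimal sketch, using the same UserAgent arguments as the script below) confirms the local pool loads and rotates:

import os
import fake_useragent

# Point fake_useragent at the local headers.csv instead of its online source
ua = fake_useragent.UserAgent(
	path=os.path.join(os.getcwd(), 'headers.csv'),
	verify_ssl=False,
	use_cache_server=False,
)
for _ in range(3):
	print(ua.random)  # prints a (likely different) user-agent string each time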

# -*- coding: utf-8 -*-
import json
import os
import re
from urllib.parse import urlencode
import fake_useragent
from scrapy.selector import Selector
import requests
import time
from lxml import etree
from selenium import webdriver
import pandas as pd


'''
Scrape BOSS Zhipin job listings
'''
# Option 2: load user agents from the local headers.csv (instead of fake_useragent's online source)
location = os.path.join(os.getcwd(), 'headers.csv')
ka = fake_useragent.UserAgent(path=location, verify_ssl=False, use_cache_server=False)

# Build the request headers; the User-Agent is drawn at random from the local pool
headers = {
		'accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
		'accept-encoding':'gzip, deflate, br',
		'accept-language':'zh-CN,zh;q=0.9',
		'cache-control': 'max-age=0',
		'referer':'https://www.zhipin.com/',
		'sec-fetch-mode':'navigate',
		'sec-fetch-site':'same-origin',
		'sec-fetch-user':'?1',
		'upgrade-insecure-requests':'1',
		'user-agent': ka.random,
		'X-Requested-With': 'XMLHttpRequest'
		}
data_my = []  # accumulates one dict per job posting
# Ajax endpoint that returns the job-description card as JSON
get_url = 'https://www.zhipin.com/wapi/zpgeek/view/job/card.json?'


def main():
	# Listing URL pattern: https://www.zhipin.com/c101210100/b_西湖区/?query=数据分析杭州

	# Hangzhou districts to iterate over
	area_list = {'西湖区', '余杭区', '滨江区', '江干区', '萧山区', '拱墅区', '下城区', '上城区'}

	chromedriver_path = 'C:/Users/machenike/Anaconda3/Scripts/chromedriver.exe'
	# Important: exclude the "enable-automation" switch so sites are less likely to detect Selenium
	options = webdriver.ChromeOptions()
	options.add_experimental_option('excludeSwitches', ['enable-automation'])
	driver = webdriver.Chrome(executable_path=chromedriver_path, options=options)
	driver.maximize_window()

	for area in area_list:

		loginurl = 'https://www.zhipin.com/c101210100/b_' + area + '/?query=数据分析杭州'
		driver.get(loginurl)
		time.sleep(3.5)
		# Selenium's get_cookies() returns the current session cookies (including __zp_stoken__)
		cookies = driver.get_cookies()
		jsonCookies = json.dumps(cookies)
		# Save the cookies locally
		with open('bossCookies.json', 'w') as f:
			f.write(jsonCookies)

		# Scrape the listings for this district (follows pagination internally)
		get_detail(driver,area)
	# Write the accumulated results to a local CSV file
	df = pd.DataFrame(data_my)
	df.to_csv('./shuju.csv', index=None, encoding='utf-8-sig', mode='a')
	time.sleep(0.5)
	print('Saved the data to the local file shuju.csv')
	driver.close()

def get_detail(driver, area):
	source = etree.HTML(driver.page_source)
	node_list = source.xpath("//div[@class='job-list']/ul/li")
	# Each <li> is one job card; collect its fields into an item dict

	for node in node_list:
		item = {}
		# xpath() returns a list of matches; [0] takes the first, .text its string content
		item['链接'] = node.xpath(".//div[@class='info-primary']/div[@class='primary-wrapper']/a/@href")[0]  # job link
		item['职位'] = node.xpath(".//div[@class='info-primary']/div[@class='primary-wrapper']/a/div[@class='job-title']/span[1]")[0].text  # job title
		item['薪资'] = node.xpath(".//div[@class='info-primary']/div[@class='primary-wrapper']/a/div[@class='job-limit clearfix']/span[1]")[0].text  # salary
		item['工作地点'] = area  # district
		item['工作经验'] = node.xpath(".//div[@class='info-primary']/div[@class='primary-wrapper']/a/div[@class='job-limit clearfix']//p/text()[1]")[0]  # experience requirement
		item['公司名称'] = node.xpath(".//div[@class='info-primary']/div[@class='info-company']/div[@class='company-text']/h3/a")[0].text  # company name
		item['所处行业'] = node.xpath(".//div[@class='info-primary']/div[@class='info-company']/div[@class='company-text']//p/text()[1]")[0]  # industry
		# text()[2] is either the funding round or the staff size ("人" marks a headcount)
		rong = node.xpath(".//div[@class='info-primary']/div[@class='info-company']/div[@class='company-text']//p/text()[2]")[0]
		if '人' in rong:
			item['融资轮'] = ''  # no funding round listed
		else:
			item['融资轮'] = rong
		try:
			# When a funding round is present, the staff size sits at text()[3]
			item['规模'] = node.xpath(".//div[@class='info-primary']/div[@class='info-company']/div[@class='company-text']//p/text()[3]")[0]
		except IndexError:
			# Otherwise the staff size is back at text()[2]
			gui = node.xpath(".//div[@class='info-primary']/div[@class='info-company']/div[@class='company-text']//p/text()[2]")[0]
			if '人' in gui:
				item['规模'] = gui  # staff size
			else:
				item['规模'] = ''



		# jid/lid identify this posting for the ajax card request below
		item['jid'] = node.xpath(".//div[@class='info-primary']/div[@class='primary-wrapper']/a/@data-jid")[0]
		item['lid'] = node.xpath(".//div[@class='info-primary']/div[@class='primary-wrapper']/a/@data-lid")[0]
		ajson = get_info(item['jid'], item['lid'])
		item['岗位职责'] = get_json(ajson)  # job description
		print(item)
		data_my.append(item)

	# Pagination: if a "next" link exists, click it via JS and recurse into the new page
	if source.xpath('//*[@id="main"]/div/div[3]/div[3]//a[@class="next"]'):
		next_page = driver.find_element_by_xpath('//*[@id="main"]/div/div[3]/div[3]//a[@class="next"]')
		driver.execute_script("arguments[0].click();", next_page)
		time.sleep(3.5)
		# Refresh the saved cookies after the page change and store them locally
		cookies = driver.get_cookies()
		jsonCookies = json.dumps(cookies)
		with open('bossCookies.json', 'w') as f:
			f.write(jsonCookies)
		get_detail(driver,area)

def get_info(jid, lid):
	params = {
		'jid': jid,
		'lid': lid
	}

	# Load the cookies that Selenium saved
	with open('bossCookies.json', 'r', encoding='utf-8') as f:
		listcookies = json.loads(f.read())

	# Convert the cookie list into a dict; only each cookie's name and value are needed
	cookies_dict = dict()
	for cookie in listcookies:
		cookies_dict[cookie['name']] = cookie['value']

	requests.adapters.DEFAULT_RETRIES = 5
	s = requests.session()
	# Don't keep connections alive between requests
	s.keep_alive = False
	# Call the ajax endpoint for the job-description card (through the session, so the settings above apply)
	resp = s.get(get_url + urlencode(params), headers=headers, cookies=cookies_dict)
	time.sleep(0.2)
	if resp.status_code == 200:
		vjson = resp.json()
		return vjson
	else:
		print("Request failed")


def get_json(js):
	# The job description comes back as JSON wrapping an HTML fragment; extract its plain text
	if js:
		json_content = js.get('zpData').get('html')
		content = Selector(text=json_content)
		content_text = content.css(".detail-bottom-text::text").re("[\u4e00-\u9fa5_a-zA-Z0-9]+")
		cont = ''.join(content_text)
		return cont
	else:
		print("No data returned")

if __name__ == '__main__':
	main()

	print("结束---------------------------------")

Here's the final result:

[screenshot of the scraped results]

Supplement: if the page just hangs on "loading", your webdriver version is probably newer and something is being detected. In that case, open the page as follows: launch your local Chrome with remote debugging enabled and have Selenium attach to it. Remember to first add the folder containing chrome.exe to the PATH environment variable, changing the path below to wherever chrome.exe lives on your machine. Tested and working.

import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Launch local Chrome with remote debugging. !!!!!!!! Requires the chrome.exe folder on PATH first
os.system('cd "C:\\Program Files (x86)\\Google\\Chrome\\Application"&start chrome.exe --remote-debugging-port=9999 --user-data-dir="C:\\selenum\\AutomationProfile" "https://www.zhipin.com/job_detail/?ka=header-job"')
chrome_debug_port = 9999
chrome_options = Options()
chrome_options.add_experimental_option("debuggerAddress", f"127.0.0.1:{chrome_debug_port}")
# Attach Selenium to the already-running Chrome instance
driver = webdriver.Chrome(options=chrome_options)
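
Once Selenium is attached through debuggerAddress, the driver behaves like a normally launched one: driver.get(), get_cookies(), and the rest of the script above work unchanged, so this driver can be swapped in for the webdriver.Chrome(executable_path=...) call in main().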