这两天我一直在爬取百度百聘这个招聘网站,里面的工作还是很多的,都是从其他招聘网站上获取下来了 下面我就给大家详细分析一下这次我在百度百聘爬取时的思路和遇到的问题 和 解决办法提供大家参考:
爬取数据的思路:
1.找到存有数据的链接
2. 分析获取数据的链接 需要 城市 数据起始位置 token
3. 获取所有的城市
4. 解密token 获取
5. 进行模拟登录 然后保持session会话
6. 获取所有的数据
首先我们进入百度百聘的网站主页:
https://zhaopin.baidu.com/quanzhi?city=%E5%8C%97%E4%BA%AC
然后打开检查模式:
我们发现他里面的数据是通过json数据返回给页面展示出来的 而且是通过滑动来不断的获取数据 我们滑动数据来找出来他返回数据的链接:
通过分析:pn是返回数据的起始位置,rn是每次返回的数据个数
接下来我们的目标就是获取 json数据 然后从json数据中获取我们需要的有用数据
我们来分析请求json数据的链接 :
是通过https://zhaopin.baidu.com/api/qzasync?+
这几个数据拼成的链接请求出来的
现在的思路是通过改变城市和pn的起始位置,拼接链接来请求所有的json数据:
1:获取所有的城市:
# Excerpt from __init__: launch Chrome and load the Baipin home page.
# NOTE(review): webdriver.get() returns None, so self.html_all would be
# None here and has no .xpath() method — all_city() below would fail;
# the page source presumably needs to be parsed instead. TODO confirm.
self.driver = webdriver.Chrome()
self.html_all = self.driver.get('https://zhaopin.baidu.com/')

# Collect every distinct city name linked on the home page.
def all_city(self):
    city = self.html_all.xpath(r'//a[@class="city"]/text()')
    city_list = [i for i in set(city)]  # de-duplicate
    print(city_list)
    self.wither(city_list)

# Persist the city list to city.txt as one '-'-separated string.
def wither(self, data):
    with open('city.txt', 'w') as f:
        f.write('-'.join(data))
2.遍历所有城市,拼接链接来请求json数据:
def read(self):
    """Load the '-'-separated city list previously saved to city.txt."""
    with open('city.txt', 'r') as f:
        contents = f.read()
    return contents.split('-')
def data_dict(self, city, num, token='==QmS/6qT2apUZInspZZtV5ZemWlJaoaiFnZkF2mshmZ'):
    """Build the query parameters for the qzasync JSON API.

    Fixes two defects in the original: the token literal used full-width
    (curly) quotes, which is a SyntaxError, and the ``token`` parameter
    was accepted but ignored in favour of that hard-coded value.  The
    captured browser token is now only the default, so old 2-argument
    callers keep working.
    """
    data = {
        "query": "python",
        "city": city,
        "is_adq": "1",
        "pcmod": "1",
        "token": token,
        "pn": num,   # result offset
        "rn": "20"   # page size
    }
    return data
def request_html(self):
    """Page through the JSON API for every city, 20 records at a time.

    Fixes two defects in the pasted original: the request line was
    garbled (``ti meo5,`` plus a stray trailing ``d`` — intended as
    ``timeout=5``), and ``data_dict`` was called with only two
    arguments while its definition requires a token (TypeError).  At
    this stage of the walkthrough the token is still the fixed one
    captured from the browser.
    """
    for city in self.read():
        while True:
            f = self.data_dict(city, self.num, '==QmS/6qT2apUZInspZZtV5ZemWlJaoaiFnZkF2mshmZ')
            h = parse.urlencode(f)
            l = self.url + h
            print(l)
            # timeout=5 makes a hung request raise instead of blocking forever
            data = requests.get(l, timeout=5).content.decode('utf-8')
            if len(data) < 5000:
                # A short body means no more records (or a login page):
                # move on to the next city.
                break
            data_json = json.loads(data)
            for i in range(20):
                try:
                    item = data_json['data']['disp_data'][i]
                    company_name = item['company']
                    start = item['requirements']
                    number = item['number']
                    try:
                        good1 = item['welfare']
                        good = '-'.join(good1)
                    except Exception as e:
                        good = '目前没有优势'
                    day = item['lastmod']
                    manny = item['salary']
                    url = item['pc_url']
                    name = item['title']
                    # mess = item['companydescription']
                    direction = item['first_level_label']
                    companytype = item['employertype']
                    city = item['location']
                    # NOTE(review): strip('要求: ') strips any of those
                    # characters from both ends, not the literal prefix.
                    print(company_name, start.strip('要求: '), direction)
                    self.count += 1
                    list2 = [url, name, company_name, manny, city, number, start.strip('要求: '), companytype,
                             direction, good + day]
                    self.writ_csv(list2)
                except Exception as e:
                    # Fewer than 20 records, or a missing field: skip it.
                    print(e)
                    continue
            self.num += 20
        self.num = 0  # reset the offset for the next city
在请求了几次json数据后我发现后面请求回来的不是json数据 而是百度的登录页面:
这也就意味着我们要想持续地获取数据,就必须进行模拟登录,然后保持session:
# Drive the Baidu passport login form with Selenium.
# NOTE(review): `url` is not defined in this excerpt — the caller is
# expected to have the passport login page URL in scope; see get_session().
def selenum_login(self):
    self.driver.get(url)
    time.sleep(2)  # wait for the page to render
    # Switch from the QR-code pane to username/password login.
    self.driver.find_element_by_id('TANGRAM__PSP_3__footerULoginBtn').click()
    time.sleep(3)
    self.driver.find_element_by_name('userName').send_keys('18191042297')
    self.driver.find_element_by_name('password').send_keys('Fwm802311')
    self.driver.find_element_by_id('TANGRAM__PSP_3__submit').click()
    time.sleep(3)  # give the post-login redirect time to finish
# Log in through the browser, then return the cookies Baidu set for the
# zhaopin domain so they can be copied into a requests session.
def get_session(self):
    self.html_selenium(
        'https://passport.baidu.com/v2/?login&sms=1&u=https%3A%2F%2Fzhaopin.baidu.com%2Fquanzhi%3Fcity%3D%25E5'
        '%258C%2597%25E4%25BA%25AC%26query%3Dpython')
    self.selenum_login()
    time.sleep(60)  # leave time for any manual verification (captcha/SMS)
    # f = self.data_dict('北京', self.num, '==QmQztqVO70FyFlrpZbbaWawdpaVhIlq9WZmpJaYa2k')
    # h = parse.urlencode(f)
    # self.driver.get(self.url + h)
    # Visit a listing page so its cookies are present before capture.
    self.driver.get('https://zhaopin.baidu.com/quanzhi?city=%E5%8C%97%E4%BA%AC')
    cookies = self.driver.get_cookies()
    return cookies
# If a session was saved before, load it from file; otherwise this is the
# first run: log in via the browser, copy its cookies into both requests
# sessions, and persist them with pickle.
def use_session(self):
    if not os.path.exists('session.txt'):
        # Fresh login: drop any default headers, then copy browser cookies.
        self.session.headers.clear()
        self.session_city.headers.clear()
        for cook in self.get_session():
            self.session.cookies.set(cook['name'], cook['value'])
        for cook_city in self.get_session_city():
            self.session_city.cookies.set(cook_city['name'], cook_city['value'])
        self.save_session(self.session, 'session')
        self.save_session(self.session_city, 'session_city')
    else:
        # Reuse the pickled sessions from a previous run.
        self.session = self.load_session('session')
        self.session_city = self.load_session('session_city')
    self.driver.close()
3. 没错,我又遇到了问题:我发现每次请求数据的json链接中的token在每个城市都不同,而且过一会儿时间就会改变,我们现在就要破解token:
就是在你登录请求的的第一个页面里的下面的值
data["nekot"] = "lGSYaphoYW1jl4hVapdwaWaaapVvmFyF07OVqtzQmQ==";
进行了如下变换
说白了就是在Python中将字符串倒序输出了
解决思路:
每次请求这个网址https://zhaopin.baidu.com/quanzhi?city=城市,也就是说请求每个城市的职位时,获取这个网页中的这个值 data["nekot"] = "lGSYaphoYW1jl4hVapdwaWaaapVvmFyF07OVqtzQmQ==";
然后对这个值进行倒序输出,转换成token值
最后拼接参数发送请求
最后我们解决了所有问题可以开始爬取数据了:
全代码:
import csv
import json
import os
import pickle
import re
import time
from urllib import parse
import threading
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from selenium import webdriver
class Baipin(object):
    """Scraper for Baidu Baipin (zhaopin.baidu.com) job listings.

    Flow: log in once through Selenium to capture cookies, pickle the
    requests sessions for reuse on later runs, then page through the
    qzasync JSON API city by city and append each job record to test.csv.
    """

    def __init__(self):
        self.session = requests.Session()       # session for the JSON API
        self.session_city = requests.Session()  # session for the token pages
        self.html_all = ''
        self.url = 'https://zhaopin.baidu.com/api/qzasync?'  # JSON endpoint
        self.url1 = 'https://zhaopin.baidu.com/quanzhi?'     # HTML page carrying the token
        self.driver = webdriver.Chrome()
        self.headers = {'User-Agent': UserAgent().random}
        self.all_headers = {
            'Cookie': 'ZP_FLAG=12; Hm_lvt_da3258e243c3132f66f0f3c247b48473=1583858066,1583858672,1583859396,'
                      '1583860201; Hm_lpvt_da3258e243c3132f66f0f3c247b48473=1583860201; '
                      'BAIDUID=22233072050588D1C8EA78ACBFEAFDEE:FG=1; PSTM=1583861696; '
                      'BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; '
                      'BDUSS=Vd2c09ZT2w3Y3Y0LTZkd2d0YVY4RHYyUTU2RU1nWTNoTUtsNWFOQ34weG1'
                      '-bzllRUFBQUFBJCQAAAAAAQAAAAEAAACZEq8CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGZxaF5mcWhec; BIDUPSID=22233072050588D1C8EA78ACBFEAFDEE; delPer=0; PSINO=7; H_PS_PSSID=30970_1455_21111_30841_30794_30998_30824_26350',
            "User-Agent": UserAgent().random
        }
        self.num = 0    # current pn (result offset)
        self.count = 0  # number of records scraped so far
        self.proxies = {
            'https': 'https://113.194.31.123:9999'
        }

    def __call__(self, *args, **kwargs):
        # Entry point: restore (or create) the login session, then scrape.
        self.use_session()
        print('开始获取数据')
        self.request_html()

    def save_session(self, session, name):
        """Pickle a requests session to <name>.txt."""
        with open(f'{name}.txt', 'wb') as f:
            pickle.dump(session, f)
        print('写入session成功')

    def load_session(self, name):
        """Load a previously pickled session from <name>.txt.

        NOTE(review): pickle.load on an untrusted file can execute
        arbitrary code; only load session files this script wrote itself.
        """
        with open(f'{name}.txt', 'rb') as f:
            s = pickle.load(f)
        return s

    def get_session(self):
        """Log in via the browser and return the resulting cookies."""
        self.html_selenium(
            'https://passport.baidu.com/v2/?login&sms=1&u=https%3A%2F%2Fzhaopin.baidu.com%2Fquanzhi%3Fcity%3D%25E5'
            '%258C%2597%25E4%25BA%25AC%26query%3Dpython')
        self.selenum_login()
        time.sleep(60)  # leave time for any manual verification (captcha/SMS)
        # Visit a listing page so its cookies are present before capture.
        self.driver.get('https://zhaopin.baidu.com/quanzhi?city=%E5%8C%97%E4%BA%AC')
        cookies = self.driver.get_cookies()
        return cookies

    def get_session_city(self):
        """Visit a listing page and return its cookies (token session)."""
        self.html_selenium('https://zhaopin.baidu.com/quanzhi?query=&city=%E5%8C%97%E4%BA%AC')
        cookies = self.driver.get_cookies()
        return cookies

    def use_session(self):
        """Reuse cached sessions when available, else log in and cache them."""
        if not os.path.exists('session.txt'):
            # First run: copy browser cookies into both requests sessions.
            self.session.headers.clear()
            self.session_city.headers.clear()
            for cook in self.get_session():
                self.session.cookies.set(cook['name'], cook['value'])
            for cook_city in self.get_session_city():
                self.session_city.cookies.set(cook_city['name'], cook_city['value'])
            self.save_session(self.session, 'session')
            self.save_session(self.session_city, 'session_city')
        else:
            self.session = self.load_session('session')
            self.session_city = self.load_session('session_city')
        self.driver.close()

    def session_json(self, url):
        """GET a URL with the logged-in session; return the decoded body."""
        data = self.session.get(url).content.decode('utf-8')
        return data

    def html_selenium(self, url):
        """Navigate the browser to url and wait for it to render."""
        self.driver.get(url)
        time.sleep(2)

    def selenum_login(self):
        """Fill in and submit the Baidu passport username/password form."""
        # Switch from the QR-code pane to username/password login.
        self.driver.find_element_by_id('TANGRAM__PSP_3__footerULoginBtn').click()
        time.sleep(3)
        self.driver.find_element_by_name('userName').send_keys('18191042297')
        self.driver.find_element_by_name('password').send_keys('Fwm802311')
        self.driver.find_element_by_id('TANGRAM__PSP_3__submit').click()
        time.sleep(3)  # give the post-login redirect time to finish

    def get_token(self, city):
        """Fetch a city's listing page and derive the API token.

        The page embeds data["nekot"] whose value is the token reversed.
        """
        data_dict = {
            "city": city
        }
        # Fix: query data for a GET belongs in the URL (params=), not the
        # request body (data=) — with data= the city never reached the
        # server, so the token was always for the default city.
        data = self.session_city.get(url=self.url1, params=data_dict, timeout=5, stream=True).content.decode('utf-8')
        token1 = re.findall(r'"nekot"] = "([\s\S]+?)";', data)[0]
        token = token1[::-1]  # "nekot" is "token" reversed — so is its value
        return token

    def selenum_data(self, url):
        """Load a JSON URL in the browser and extract the <pre> payload."""
        try:
            self.driver.get(url)
            data = self.driver.page_source
            # The JSON body is wrapped in a <pre> tag inside the HTML shell.
            soup = BeautifulSoup(data, 'lxml')
            cc = soup.select('pre')[0]
            h = cc.text
        except Exception as e:
            print(e)
            h = 'hello'  # sentinel: caller receives a non-JSON marker on failure
        return h

    def html_request(self, url):
        """Debug helper: fetch url with the hard-coded cookie headers."""
        data = requests.get(url=url, headers=self.all_headers).content.decode('utf-8')
        print(data)

    def request_html(self):
        """Page through the JSON API for every city; write rows to CSV."""
        for city in self.read():
            while True:
                time.sleep(3)  # throttle to avoid triggering anti-bot checks
                try:
                    token = self.get_token(city)
                    f = self.data_dict(city, self.num, token)
                    h = parse.urlencode(f)
                    l = self.url + h
                    print(l)
                    data = self.session.get(l, timeout=5).content.decode('utf-8')
                except Exception as e:
                    # Timeout or token failure: retry the same page.
                    print(e)
                    continue
                if len(data) < 5000:
                    # Short body = no more data (or a login page); next city.
                    break
                try:
                    data_json = json.loads(data)
                except Exception as e:
                    print(e)
                    continue
                for i in range(20):
                    try:
                        # Hoisted: one lookup of the i-th record instead of
                        # repeating data_json['data']['disp_data'][i] per field.
                        item = data_json['data']['disp_data'][i]
                        company_name = item['company']
                        start = item['requirements']
                        number = item['number']
                        try:
                            good1 = item['welfare']
                            good = '-'.join(good1)
                        except Exception as e:
                            good = '目前没有优势'
                        day = item['lastmod']
                        manny = item['salary']
                        url = item['pc_url']
                        name = item['title']
                        direction = item['first_level_label']
                        companytype = item['employertype']
                        city = item['location']
                        # NOTE(review): strip('要求: ') strips any of those
                        # characters from both ends, not the literal prefix.
                        print(company_name, start.strip('要求: '), direction)
                        self.count += 1
                        list2 = [url, name, company_name, manny, city, number, start.strip('要求: '), companytype,
                                 direction, good + day]
                        self.writ_csv(list2)
                    except Exception as e:
                        # Fewer than 20 records or a missing field: skip it.
                        print(e)
                        continue
                self.num += 20
            self.num = 0  # reset the offset for the next city

    def all_city(self):
        """Collect distinct city names from the loaded home page.

        NOTE(review): self.html_all is initialised to '' and never set to
        a parsed document in this class, so .xpath would fail — confirm
        where html_all is supposed to be assigned.
        """
        city = self.html_all.xpath(r'//a[@class="city"]/text()')
        city_list = [i for i in set(city)]  # de-duplicate
        print(city_list)
        self.wither(city_list)

    def wither(self, data):
        """Write the city list to city.txt joined by '-'."""
        with open('city.txt', 'w') as f:
            f.write('-'.join(data))

    def test(self):
        # NOTE(review): parse.urlencode expects a mapping or a sequence of
        # pairs; calling it with a bare string raises TypeError.
        h = parse.urlencode('上海')
        print(h)

    def data_dict(self, city, num, token):
        """Build the query-string parameters for the qzasync JSON API."""
        data = {
            "query": "python",
            "city": city,
            "is_adq": "1",
            "pcmod": "1",
            "token": token,
            "pn": num,   # result offset
            "rn": "20"   # page size
        }
        return data

    def renren(self):
        # Leftover experiment: credentials for a direct passport POST;
        # never sent anywhere.
        url = 'https://passport.baidu.com/v2/?login&sms=1&u=https%3A%2F%2Fzhaopin.baidu.com%2F'
        head = {
            'username': '18191042297',
            'password': 'Fwm802311'
        }

    def writ_csv(self, list1):
        """Append one record row to test.csv."""
        # Fix: newline='' prevents the csv module from emitting blank
        # lines between rows on Windows.
        with open("test.csv", "a+", encoding='GB18030', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(list1)

    def read(self):
        """Return the first 48 cities from city.txt."""
        with open('city.txt', 'r') as f:
            h = f.read()
        j = h.split('-')
        return j[:48]
# class Tencent(threading.Thread, Baipin):
# def __init__(self, city):
# threading.Thread.__init__(self)
# self.city = city
if __name__ == '__main__':
    # Script entry point: build the scraper and run it via __call__.
    print('开始获取数据')
    scraper = Baipin()
    scraper()
在这次爬取过程中我还遇到了许多的小问题:
- 在请求数据时有时候就卡在那里一直不动,请求不下来数据
解决办法:我们给每次请求加上超时时间,如果请求超过了时间就报错,然后我们用异常处理来捕捉,如果捕捉到超时,我们就重新请求
try:
    # timeout=5 makes a hung request raise instead of blocking forever
    data = self.session.get(l, timeout=5,).content.decode('utf-8')
except Exception as e:
    # on timeout (or any request failure) log it and retry this page
    print(e)
    continue
- 在我们用selenium请求json数据的时候,返回来的数据不是纯json数据,而是整个网页的html文件,json数据包含在其中,我们取出来很麻烦
解决办法:
from bs4 import BeautifulSoup
self.driver.get(url)
# data is the full page source wrapping the JSON payload
data = self.driver.page_source
# data1=json.loads(data)
# da=data.e
# print(data)
# The JSON body sits inside a <pre> tag, so pull it out with BeautifulSoup.
soup = BeautifulSoup(data, 'lxml')
cc = soup.select('pre')[0]
h = cc.text
# h is the extracted JSON text
- url需要转译才可以请求:
from urllib import parse
# Build the query parameters for the qzasync JSON API.
def data_dict(self, city, num, token):
    data = {
        "query": "python",
        "city": city,
        "is_adq": "1",
        "pcmod": "1",
        "token": token,
        "pn": num,   # result offset
        "rn": "20"   # page size
    }
    return data

# URL-encode the dict so non-ASCII values (city names) are escaped.
h = parse.urlencode(data)
# h is the encoded query string