最近在看拉钩招聘的游客访问
发现cookie有三个字段
user_trace_token
X_HTTP_TOKEN
__lg_stoken__
但是前面两个都是在错误请求的时候会返回的。只有第三个是生成的。
放弃了?
不。干他!
浏览器进入拉钩城市列表页面 https://www.lagou.com/jobs/allCity.html。然后F12>Application>Storage>清理缓存
然后刷新一下页面,发现第一个请求是302,然后多次请求别的URL,最后才请求成功
点开请求成功的那个,发现里面是有cookie的。而在第一个请求里面是有设置前面两个cookie的
也就是说,目前就差 __lg_stoken__ 这个参数了。这时候我想到的是进入后面请求的调用栈找JS文件,但是我发现调用栈点不进去。。。不知道是不是我的问题
于是我换了一种方法,用hook
设置方法如下
新建一个hook文件夹(任意位置)
然后里面新建 inject.js 和 manifest.json
inject.js 如下
// Hook document.cookie so DevTools breaks the moment the page writes
// "__lg_stoken__", exposing the call stack that generates the token.
//
// Bug fixed vs. the original: it called
//   document.cookie.__lookupSetter__('cookie')
// i.e. looked the setter up on the cookie *string* (always undefined),
// and its replacement getter returned only the last raw set-string,
// breaking the page's real cookie jar. Here we capture the native
// accessor pair from Document.prototype and forward to it, so cookies
// keep working normally while we still get the breakpoint.
var code = function () {
    var native = Object.getOwnPropertyDescriptor(Document.prototype, 'cookie');
    Object.defineProperty(document, 'cookie', {
        configurable: true,
        get: function () {
            return native.get.call(document);
        },
        set: function (value) {
            if (value.indexOf('__lg_stoken__') > -1) {
                debugger; // pause exactly when the target cookie is written
            }
            native.set.call(document, value); // preserve normal behaviour
        }
    });
};
// Inject into the page context (content scripts run in an isolated world,
// so the hook must be added via a <script> element to see page writes).
var script = document.createElement('script');
script.textContent = '(' + code + ')()';
(document.head || document.documentElement).appendChild(script);
script.parentNode.removeChild(script);
manifest.json 如下
{
"name": "Injection",
"version": "2.0",
"description": "RequestHeader钩子",
"manifest_version": 2,
"permissions": [
"tabs"
],
"content_scripts": [
{
"matches": [
"<all_urls>"
],
"js": [
"inject.js"
],
"all_frames": true,
"run_at": "document_start"
}
]
}
然后在Chrome浏览器,扩展程序,加载已解压的扩展程序。选择刚才建的hook文件夹即可。
关于使用hook这一部分。我借鉴了一位大佬的文章
ok
再清一下缓存,然后刷新页面。发现会自动打一个断点。调用栈有一个set方法,很明显就是生成cookie的地方,我们追进去。
大概是这个意思
a = __lg_stoken__
b =5c61477188b602904e02a2b702…
f = a + '=' + encodeURIComponent(b)
f = __lg_stoken__=5c61477188b602904e02a2b702…
可以看到,这个 f 就是最后缺失的cookie
而现在的缺失值就是 b 了
从调用栈可以看到传入 b 的位置
可以看到 d 是通过 (new g()).a(); 来获取的,然后我们看一下g这个方法。
鼠标选中,然后悬停,可以跟进去
格式化之后发现这是一个新的JS文件,而且很明显是混淆后的JS代码
我在这里找了好久,发现很多方法都是构造函数生成的,也就是说不同的情境下,一个方法的含义是不同的,困难很大。
到最后,我直接给整个JS代码复制下来了,然后在本地运行,也算是跑通了,加上代理就可以快乐的爬虫了。
python代码如下。JS代码比较多,就不放了。可以去我个人主页下载
from gevent import monkey; monkey.patch_all()
import gevent.pool
import json
import random
import re
from lxml import etree
import execjs
import requests
from sns_spider.config.settings import USER_AGENTS
import pymongo
class LG(object):
    """拉钩 js逆向 -- Lagou job-listing crawler (guest-cookie JS reverse engineering).

    Flow: fetch a 302 endpoint to harvest ``user_trace_token`` and
    ``X_HTTP_TOKEN`` from ``Set-Cookie``, run the site's deobfuscated JS
    locally (``lg.js``) to compute ``__lg_stoken__``, then page through the
    ``positionAjax`` endpoint per city district and store results in MongoDB.
    """

    def __init__(self):
        # MongoDB sink: database "demo", collection "lagou"; _id = positionId
        # so re-runs naturally deduplicate.
        self.client = pymongo.MongoClient(host='localhost', port=27017)
        self.mongo_col = self.client['demo']['lagou']
        # Deobfuscated site JS that computes __lg_stoken__.
        # (Fix: the original leaked the file handle via a bare open().)
        with open('lg.js', encoding='utf8') as fp:
            self.js_file = fp.read()
        self._headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
            'referer': 'https://www.lagou.com/jobs/list_java/p-city_3?px=default',
        }
        self.token = ''          # __lg_stoken__ value produced by lg.js
        self.proxies = dict()    # requests-style proxy mapping (http/https)
        self.set_proxies()
        self.get_token()
        self.city_info = dict()  # city name -> city landing-page URL

    def set_proxies(self):
        """设置代理 -- rebuild the proxy mapping from a freshly obtained IP."""
        ip = "获取到代理IP"  # placeholder: plug your proxy pool in here
        self.proxies = {
            'http': 'http://{}'.format(ip),
            'https': 'http://{}'.format(ip),
        }

    def get_response(self, url, params=None, data=None, method='GET'):
        """Fetch *url*, rotating proxy and guest token until a response arrives.

        :param url: target URL.
        :param params: optional query-string dict.
        :param data: optional form body (POST only).
        :param method: 'GET' or anything else for POST.
        :returns: the successful ``requests.Response`` (encoding normalised).
        """
        while True:
            try:
                if method == 'GET':
                    response = requests.get(url, params=params, headers=self._headers, proxies=self.proxies)
                else:
                    response = requests.post(url, params=params, data=data, headers=self._headers, proxies=self.proxies)
                response.encoding = response.apparent_encoding
                return response
            # Fix: was a bare ``except:`` -- only network-level failures are
            # retryable; programming errors should propagate, not loop forever.
            except requests.RequestException:
                self.set_proxies()
                self.get_token()

    def get_token(self):
        """获取到游客cookie -- assemble the three guest-cookie fields.

        The endpoint answers with a 302 whose ``Set-Cookie`` carries
        ``user_trace_token`` and ``X_HTTP_TOKEN``; the ``Location`` query
        string feeds the local lg.js to compute ``__lg_stoken__``.
        Retries with a fresh proxy until all three pieces are obtained.
        """
        url = 'https://www.lagou.com/gongsi/allCity.html'
        while True:
            headers = {'user-agent': random.choice(USER_AGENTS)}
            try:
                response = requests.get(url, headers=headers, allow_redirects=False, proxies=self.proxies, timeout=10)
                response.encoding = response.apparent_encoding
                set_cookie = response.headers['Set-Cookie']
                user_trace_token = re.findall(r'user_trace_token=(.*?);', set_cookie)[0]
                x_http_token = re.findall(r'X_HTTP_TOKEN=(.*?);', set_cookie)[0]
                # .../check.html?<query> -- the part after check.html is the
                # "search" argument expected by the site's token JS.
                href = response.headers['Location']
                ctx = execjs.compile(self.js_file, cwd='/opt/homebrew/Cellar/node/16.3.0/bin/')
                self.token = ctx.call('window.gt.prototype.a',
                                      json.dumps({"href": href, "search": href.split('check.html')[1]}))
                self._headers['cookie'] = 'user_trace_token={};X_HTTP_TOKEN={};__lg_stoken__={}'.format(
                    user_trace_token, x_http_token, self.token)
                return
            except Exception as e:
                print('获取token失败\tproxies:{}\te:{}'.format(self.proxies, e))
                self.set_proxies()

    def get_city_info(self):
        """获取城市信息 -- build the {city name: landing URL} mapping."""
        url = 'https://www.lagou.com/jobs/allCity.html'
        html = etree.HTML(self.get_response(url).text)
        city_url = html.xpath('//ul[@class="city_list"]/li/a/@href')
        city_name = html.xpath('//ul[@class="city_list"]/li/a/text()')
        # zip() pairs the two node lists without index bookkeeping (and
        # cannot raise IndexError if the lists ever differ in length).
        self.city_info = dict(zip(city_name, city_url))

    def get_job_info(self, input_item):
        """获取职位信息 -- page through positionAjax for one city district.

        Stops at the first empty result page or at page 30 (site limit).
        Fix: when the anti-crawler rejects the request (no 'success' key),
        the original looped forever with the same identity; now the proxy
        and guest token are renewed before retrying the same page.

        :param input_item: dict with 'city_name', 'district', 'keyword'.
        """
        url = 'https://www.lagou.com/jobs/positionAjax.json'
        params = {
            "px": "default",
            "city": input_item['city_name'],
            "district": input_item['district'],
            "needAddtionalResult": "false",
        }
        sid = ''
        page = 1
        while True:
            data = {
                "first": "true",
                "pn": page,
                "kd": input_item['keyword'],
                "sid": sid,
            }
            job_info = self.get_response(url, params=params, data=data, method='POST').json()
            if 'success' not in job_info:
                # Rejected by the anti-crawler: renew identity, retry page.
                self.set_proxies()
                self.get_token()
                continue
            sid = job_info['content']['showId']  # must be echoed on later pages
            job_info = job_info['content']['positionResult']['result']
            if not job_info or page == 30:
                break
            self.parse_info(job_info, input_item)
            print('{}\t页码:{}\t数据量:{}'.format(input_item, page, len(job_info)))
            page += 1

    def parse_info(self, job_info, input_item):
        """解析内容 -- normalise raw position dicts and bulk-insert into MongoDB.

        :param job_info: list of raw position dicts from positionAjax.
        :param input_item: the task dict (unused here; kept for interface).
        """
        items = list()
        for info in job_info:
            item = {
                '_id': info['positionId'],
                'job_name': info['positionName'],
                'job_url': 'https://www.lagou.com/jobs/{}.html'.format(info['positionId']),
                'company_name': info['companyFullName'],
                'company_size': info['companySize'],
                'industry_field': info['industryField'],
                'finance_stage': info['financeStage'],
                'company_label': ','.join(info['companyLabelList']).rstrip(','),
                'skill_label': ','.join(info['skillLables']).rstrip(','),
                'position_label': ','.join(info['positionLables']).rstrip(','),
                'create_time': info['createTime'],
                'city': info['city'],
                'district': info['district'],
                'salary': info['salary'],
                'work_year': info['workYear'],
                'job_nature': info['jobNature'],
                'education': info['education'],
                'position_advantage': info['positionAdvantage'],
                'position_detail': info['positionDetail'],
                'position_address': info['positionAddress']
            }
            items.append(item)
        # Fix: ordered=False keeps inserting past duplicate-_id errors in one
        # round trip, replacing the original bare-except one-by-one fallback.
        try:
            self.mongo_col.insert_many(items, ordered=False)
        except pymongo.errors.BulkWriteError:
            pass  # duplicate _id values are expected on re-runs

    def run(self):
        """启动函数 -- crawl python jobs per district for the chosen cities."""
        self.get_city_info()
        for city_name in ['郑州', '北京', '上海', '广州', '深圳']:
            city_url = self.city_info[city_name]
            # District filters only exist on the "-zhaopin" flavour of the URL.
            if '-zhaopin' not in city_url:
                city_url = city_url.rstrip('/') + '-zhaopin/'
            response = self.get_response(url=city_url, method='GET')
            html = etree.HTML(response.text)
            # position()>1 skips the leading "不限" (no filter) link.
            district_name = html.xpath('//div[@data-type="district"]/a[position()>1]/text()')
            item = [{'city_name': city_name, 'district': name, 'keyword': 'python'} for name in district_name]
            print(item)
            pool = gevent.pool.Pool(size=1)
            pool.map(self.get_job_info, item)
if __name__ == '__main__':
    # Entry point: build the crawler (fetches proxy + guest token) and run it.
    spider = LG()
    spider.run()