利用 js2py 这个库，以未登录状态下的百度首页为例：
import js2py
import requests
from lxml import etree
# Page whose inline script is downloaded and evaluated below.
url = 'https://www.baidu.com'

# Request headers sent with the page fetch. They imitate a desktop Chrome 87
# browser (see the User-Agent value) navigating within www.baidu.com.
headers = {
    # Connection handling / caching
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    # Browser identity and accepted content
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/87.0.4280.141 Safari/537.36'
    ),
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/avif,image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.9'
    ),
    # Fetch-metadata headers describing a user-initiated document navigation
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Dest': 'document',
    'Referer': 'https://www.baidu.com/',
    'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-TW;q=0.6',
}
def get_js_value(url, *, script_xpath='/html/head/script[3]/text()', timeout=10):
    """Download a page and evaluate one of its inline scripts with js2py.

    The request is sent with the module-level ``headers``.

    Args:
        url: Address of the page to fetch.
        script_xpath: XPath selecting the text of the inline ``<script>``
            to execute. The default picks the third ``<script>`` inside
            ``<head>``; this is position-based, so it breaks if the page
            layout changes.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely.

    Returns:
        The ``js2py.EvalJs`` context after the script has run. Globals
        defined by the script can be read from it as attributes.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx HTTP status.
        IndexError: If ``script_xpath`` matches nothing in the page.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail fast on an error status instead of parsing an error page.
    response.raise_for_status()
    page_source = response.content.decode('utf8')

    selector = etree.HTML(page_source)
    scripts = selector.xpath(script_xpath)
    if not scripts:
        # Same exception type as the bare ``[0]`` lookup would raise, but
        # with enough context to diagnose a changed page layout.
        raise IndexError(
            'no script matched %r in page %s' % (script_xpath, url)
        )

    # NOTE(review): this executes JavaScript served by a remote site.
    # js2py should not be treated as a security sandbox; only point this
    # function at pages you trust.
    context = js2py.EvalJs()
    context.execute(scripts[0])
    return context
if __name__ == '__main__':
    # Evaluate the page's inline script and read the `_manCard` global
    # from the resulting js2py context.
    js_context = get_js_value(url)
    man_card_js = js_context._manCard
    print(type(man_card_js))
    print(man_card_js)

    # to_dict() turns the js2py wrapper into a native Python mapping, so
    # its entries can be accessed with ordinary subscripting.
    man_card = man_card_js.to_dict()
    print(type(man_card))

    asyn_js = man_card['asynJs']
    print(type(asyn_js))