学习目标:
python学习二十九—简单数据抓取九
学习内容:
1、抓取中医智库古籍的所有书籍的内容
2、编写一个js文件实现中医智库的解密
1、抓取中医智库古籍的所有书籍的内容
- 中医智库返回的正文经过了加密:需要先把响应解析为 JSON 并取出其中的 data 字段,对它进行 base64 解码,再用 AES(CFB 模式)解密,最后用 zlib 解压缩,得到的 JSON 中的 text 字段才是书籍正文
import requests
from lxml.html import etree
import json
import base64
import pyaes
import zlib
# pip install pycryptodome
from Crypto.Cipher import AES
from binascii import b2a_hex, a2b_hex
# Root of the site; hrefs scraped from the pages are appended to it.
BASE_URL = 'https://www.zk120.com'
# Seconds to wait for each HTTP response before giving up, so that one stalled
# connection cannot hang the whole crawl.
REQUEST_TIMEOUT = 10
# 24-byte key (AES-192) and 16-byte IV used to decrypt the book payloads.
AES_KEY = b"61581af471b166682a37efe6"
AES_IV = b"c8f203fca312aaab"


def decrypt_book_text(data):
    """Return the plain text hidden in one encrypted ``data`` payload.

    The payload is Base64 text; once decoded it is treated as AES-CFB
    ciphertext, and the decrypted bytes are zlib-decompressed and parsed as a
    JSON document whose ``"text"`` entry is returned.

    Args:
        data: The ``"data"`` string taken from a content response.

    Returns:
        The book text as ``str``; anything that cannot be encoded as UTF-8
        is dropped.
    """
    ciphertext = base64.b64decode(data.encode('utf-8'))
    # segment_size=128 selects full-block feedback; PyCryptodome's CFB default
    # is 8 bits.  A fresh cipher object is built for every payload.
    cipher = AES.new(AES_KEY, AES.MODE_CFB, AES_IV, segment_size=128)
    book = json.loads(zlib.decompress(cipher.decrypt(ciphertext)))
    return book.get("text").encode("utf-8", "ignore").decode("utf-8")


# Index page that lists the book groups.
url = 'https://www.zk120.com/ji/group/?nav=ahz'
response = requests.get(url, timeout=REQUEST_TIMEOUT)
html = etree.HTML(response.text)
name = html.xpath("//a[@class='ellipsis']/@href")
for i in name:
    # Only hrefs containing 'group' are followed.
    if 'group' not in i:
        continue
    group_response = requests.get(BASE_URL + i, timeout=REQUEST_TIMEOUT)
    group_html = etree.HTML(group_response.text)
    urls = group_html.xpath("//a[@class='mr5 native_read to_reader_url']/@href")
    for u in urls:
        # The reader href is turned into the content href by swapping the
        # path word, e.g. '/ji/read/529?nav=ahz&uid=None'
        # -> '/ji/content/529?nav=ahz&uid=None'.
        content_url = BASE_URL + u.replace('read', 'content')
        content_response = requests.get(content_url, timeout=REQUEST_TIMEOUT)
        # The response body is JSON; the encrypted book sits under "data".
        con = json.loads(content_response.text)
        print(decrypt_book_text(con['data']))
        # To keep the books instead of printing them, append each decrypted
        # text to a file opened with mode 'a+' and encoding='utf-8'
        # (e.g. 'zhongyi.txt').
2、编写一个js文件实现中医智库的解密(初步构思,未实现)
- js文件:
CryptoJS = require("crypto-js")
pako = require("pako")
// const jsdom = require('jsdom');
// const {JSDOM} = jsdom;
// const {document} = (new JSDOM('<!doctype html><html><body></body></html>')).window;
// global.document = document;
// const window = document.defaultView;
// const $ = require('jquery')(window);
// function hello(name) {
// return 'hello!!!'+name
// }
/**
 * Decrypt one ciphertext string with AES in CFB mode and no padding.
 *
 * The key and IV are fixed UTF-8 strings. No decompression is done here:
 * the decrypted bytes are returned as-is, one Latin1 character per byte.
 *
 * @param {string} data - Ciphertext passed straight to CryptoJS.AES.decrypt.
 * @returns {string} The decrypted bytes rendered as a Latin1 string.
 */
function decryptData(data) {
    var key = CryptoJS.enc.Utf8.parse("61581af471b166682a37efe6");
    var options = {
        mode: CryptoJS.mode.CFB,
        iv: CryptoJS.enc.Utf8.parse('c8f203fca312aaab'),
        padding: CryptoJS.pad.NoPadding
    };
    var plainWords = CryptoJS.AES.decrypt(data, key, options);
    return plainWords.toString(CryptoJS.enc.Latin1);
}
- 中医智库.py驱动文件:
import execjs
import requests
# Show the name of the JavaScript runtime that execjs.get() returned.
js_runtime = execjs.get()
print(js_runtime.name)
def get_js():
    """Return the full text of ``JS.js`` from the current working directory.

    Returns:
        The file content decoded as UTF-8 (``''`` for an empty file).

    Raises:
        OSError: If ``JS.js`` cannot be opened.
    """
    # The context manager closes the handle even if reading fails.
    with open("JS.js", 'r', encoding='utf8') as f:
        return f.read()
def get_des_psswd(e):
    """Run the JavaScript ``decryptData`` function on ``e`` and return its result.

    Args:
        e: The value handed to ``decryptData`` as its only argument.

    Returns:
        Whatever ``decryptData`` returns, as converted by execjs.
    """
    # Compile the JS source text so its functions can be called.
    runtime_ctx = execjs.compile(get_js())
    # First argument is the JS function name; the rest are its arguments.
    decrypted = runtime_ctx.call('decryptData', e)
    return decrypted
if __name__ == '__main__':
    # Fetch one content response and take the encrypted "data" field.
    # The timeout keeps a stalled connection from hanging the script forever.
    reply = requests.get(
        'https://www.zk120.com/ji/content/529?uid=None&_=1615037333731',
        timeout=10,
    )
    source = reply.json()['data']
    print(get_des_psswd(e=source))