爬虫逆向实例小记——某数据知识管理网站-DES-ECB模式

aHR0cHM6Ly9rZC5uc2ZjLmNuL2ZpbmFsUHJvamVjdEluaXQ=

注意:本文逆向部分比较少,主要是为了把流程走通,仅限于代码搬运。

第一步:分析页面

对此网站发起请求并查看响应,可以看出响应内容为加密内容。

第二步:判断加密类型

在 XHR 断点处添加请求 URL 片段,清空 cookie 后重新请求。可以看到在 send 处命中断点,开始进行调试。

经过调试可以看出,从请求到页面呈现数据,中间经过了对原始数据的解密过程(网站极其不稳定,tlj),加密方式为 DES-ECB、PKCS7 填充,因此可以在此处设置断点。原断点可以去除,然后重新请求。

第三步:重新调试请求

在第二步确定断点后重新请求,可以看出在断点处可以将密文解密。后续一步一步调试即可获取页面解析后的数据。但根据下图可以知道是 DES 加密,因此直接在网上找相关 JS 或 Python 的对应方法即可。(我就没有再继续调试了,哈哈哈哈)

第四步: 下载图片进行OCR识别

获取详情页中在线 PDF 的内容,需要先下载图片,再进行 OCR 识别。

第五步:上代码(建议模块化)

# -*- coding:utf-8 -*-
# @Time : 2024/5/22 17:12
# @Author: 水兵没月
# @File : 某网站.py
# @Software: PyCharm
import json
from urllib import parse
from urllib.parse import urljoin

import execjs
import requests
import pytesseract
from PIL import Image
from a import get_proxy  # 没有代理可以忽略
import base64
from Crypto.Cipher import DES
from Crypto.Util.Padding import unpad
import json

# Shared HTTP session; it is passed to every request helper call in the script below.
s = requests.session()
def req_payload(s, url, headers, data):
    """POST ``data`` serialized as a JSON string and return the response.

    Args:
        s: Session-like object exposing ``post``.
        url: Target URL.
        headers: Request headers.
        data: JSON-serializable payload; encoded with ``json.dumps``.

    Returns:
        Whatever ``s.post`` returns.

    Note:
        Certificate verification is turned off (``verify=False``).
    """
    json_body = json.dumps(data)
    return s.post(url=url, data=json_body, verify=False, headers=headers)

def req_post(s, url, headers, data):
    """POST ``data`` unchanged (not JSON-encoded) and return the response.

    Args:
        s: Session-like object exposing ``post``.
        url: Target URL.
        headers: Request headers.
        data: Request body, handed to ``s.post`` as-is.

    Returns:
        Whatever ``s.post`` returns.

    Note:
        Certificate verification is turned off (``verify=False``).
    """
    request_kwargs = {
        "url": url,
        "data": data,
        "headers": headers,
        "verify": False,
    }
    return s.post(**request_kwargs)

def req_get(s, url, headers):
    """Issue a GET request and return the response.

    Args:
        s: Session-like object exposing ``get``.
        url: Target URL.
        headers: Request headers.

    Returns:
        Whatever ``s.get`` returns.

    Note:
        Certificate verification is turned off (``verify=False``).
    """
    request_kwargs = {"url": url, "headers": headers, "verify": False}
    return s.get(**request_kwargs)

def save_img(res, _id):
    """Stream a downloaded response body to ``./某网站_img/<_id>.jpg``.

    The target directory is created when it does not exist yet, so a first
    run does not fail with ``FileNotFoundError``.

    Args:
        res: Response-like object exposing ``iter_content(chunk_size=...)``.
        _id: Identifier used as the file name (without extension).
    """
    import os  # local import keeps this block self-contained

    img_dir = "./某网站_img"
    os.makedirs(img_dir, exist_ok=True)
    # The ``with`` block closes the file; no explicit ``close()`` is needed.
    with open("{}/{}.jpg".format(img_dir, _id), "wb") as f:
        for chunk in res.iter_content(chunk_size=1024):
            if chunk:  # skip empty chunks
                f.write(chunk)

def OCR_shibie(path):
    """Run OCR (``lang='chi_sim'``) on ``./某网站_img/<path>.jpg``.

    Args:
        path: File name stem of the image inside ``./某网站_img``.

    Returns:
        The text returned by ``pytesseract.image_to_string``.
    """
    import os  # local import keeps this block self-contained

    # Point pytesseract at the author's Windows install only when it is
    # actually present; otherwise keep whatever pytesseract is already
    # configured to use instead of forcing a path that cannot work.
    tesseract_exe = r'D:\Program Files\Tesseract-OCR\tesseract.exe'
    if os.path.isfile(tesseract_exe):
        pytesseract.pytesseract.tesseract_cmd = tesseract_exe

    # Open the image in a ``with`` block so its file handle is released.
    with Image.open('./某网站_img/{}.jpg'.format(path)) as image:
        return pytesseract.image_to_string(image, lang='chi_sim')

def DES_ECB_Pkck(data, keys):
    """Decrypt a Base64-encoded DES-ECB ciphertext into a UTF-8 string.

    Args:
        data: Base64 text of the ciphertext.
        keys: DES key as a string; it is UTF-8 encoded before use.

    Returns:
        The plaintext with padding removed by ``unpad`` (block size
        ``DES.block_size``), decoded as UTF-8.
    """
    cipher = DES.new(key=keys.encode("utf-8"), mode=DES.MODE_ECB)
    ciphertext = base64.b64decode(data)
    padded_plaintext = cipher.decrypt(ciphertext)
    return unpad(padded_plaintext, DES.block_size).decode("utf-8")

# ---- Search request ---------------------------------------------------------
fuzzyKeyword = "XXXXX大学"
conclusionYear = "近五年"
pageNum = 1
data = {"complete": True, "fuzzyKeyword": fuzzyKeyword, "isFuzzySearch": True, "conclusionYear": conclusionYear, "dependUnit": "",
        "keywords": "", "pageNum": pageNum, "pageSize": 10, "projectType": "", "projectTypeName": "", "code": "",
        "ratifyYear": "", "order": "enddate", "ordering": "desc", "codeScreening": "", "dependUnitScreening": "",
        "keywordsScreening": "", "projectTypeNameScreening": ""}
headers = {
        "Accept": "application/json, text/plain, */*",
        "Content-Type": "application/json;charset=UTF-8",
        # Percent-encode only the query value: quoting the whole URL would also
        # escape ":", "?" and "=" and yield an invalid Referer.
        "Referer": "https://XXXX.XXX.XXX/finalSearchList?s={}".format(parse.quote(fuzzyKeyword)),
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    }
url = "https://XXXX.XXX.XXX/api/baseQuery/completionQueryResultsData"
res = req_payload(s, url, headers, data)
res.encoding = 'UTF-8'
text = res.text
# For offline debugging, assign a previously captured Base64 ciphertext to
# `text` here instead of relying on the live response.

# ---- Decrypt the listing and OCR the first report page of each result --------
# NOTE(review): "访问" presumably marks a plaintext access-restriction notice
# returned instead of ciphertext — confirm against the site.
if "访问" not in text:
    try:
        payload = json.loads(DES_ECB_Pkck(text, 'IFROMC86'))
    except ValueError:
        # Covers invalid Base64, bad padding, undecodable bytes and malformed
        # JSON (all are ValueError subclasses); treat as "no results".
        payload = {}
    if not isinstance(payload, dict):
        payload = {}

    # `or` guards against explicit JSON nulls, which `.get(key, default)` lets through.
    result = payload.get('data') or {}
    resultsData = result.get('resultsData') or []

    pdf_url = "https://XXXX.XXX.XXX/api/baseQuery/completeProjectReport"
    # Only the first two results are processed.
    for rd_list in resultsData[:2]:
        if not rd_list:
            continue
        _id = rd_list[0]
        print(_id)
        pdf_data = {"id": _id, "index": "1"}
        pdf_headers = {
            "Accept": "application/json, text/plain, */*",
            "Authorization": "Bearer undefined",
            "Content-Type": "application/x-www-form-urlencoded",
            "Referer": "https://XXXX.XXX.XXX/finalDetails?id={}".format(parse.quote(str(_id))),
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        }
        pdf_res = req_post(s, pdf_url, pdf_headers, pdf_data).json()
        img_url = (pdf_res.get('data') or {}).get('url', '')
        if not img_url:
            # No image was downloaded for this id, so there is nothing to OCR.
            continue
        img_url = urljoin(pdf_url, img_url)
        img_res = req_get(s, img_url, pdf_headers)
        save_img(img_res, _id)

        ocr_res = OCR_shibie(_id)
        print(_id)
        print(ocr_res)



  • 6
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值