基于python的文书网网络爬虫代码以及数据

# coding:utf-8
import json
import time
import random
import base64
import datetime


import requests
from lxml import etree
from Crypto.Cipher import DES3
from Crypto.Util.Padding import pad, unpad


class Des(object):
    """Triple-DES (3DES) CBC helper mirroring the wenshu.court.gov.cn
    front-end crypto scheme.

    The IV is the current local date formatted as ``YYYYMMDD`` (exactly 8
    ASCII bytes, the 3DES block size), and ciphertext is exchanged as
    base64 text.

    NOTE(review): because the IV is derived from ``datetime.now()``, a
    value encrypted just before midnight cannot be decrypted after the
    date rolls over — acceptable here since encrypt/decrypt happen within
    one request cycle.
    """

    @staticmethod
    def encrypt(text, key):
        """Encrypt ``text`` (str) under ``key`` and return base64 str.

        ``key`` must be a valid 3DES key (16 or 24 bytes / chars); the
        caller supplies a random 24-char salt.
        """
        padded = pad(text.encode(), DES3.block_size)
        iv = datetime.datetime.now().strftime('%Y%m%d').encode()
        cipher = DES3.new(key, DES3.MODE_CBC, iv)
        # Fixed: removed dead local `x = len(text) % 8` — pad() already
        # guarantees block alignment, so the value was never used.
        ciphertext = cipher.encrypt(padded)
        return base64.b64encode(ciphertext).decode("utf-8")

    @staticmethod
    def decrypt(text, key):
        """Decrypt base64 ``text`` under ``key`` and return the plain str.

        Raises ValueError if the padding is invalid (wrong key/IV/date).
        """
        iv = datetime.datetime.now().strftime('%Y%m%d').encode()
        cipher = DES3.new(key, DES3.MODE_CBC, iv)
        raw = cipher.decrypt(base64.b64decode(text))
        return unpad(raw, DES3.block_size).decode()


class Demo(object):
    """Minimal crawler for the wenshu.court.gov.cn document API.

    Flow: ``start_page`` fetches the index page, extracts the anti-bot
    challenge (meta tag + obfuscated JS), posts it to an external helper
    service (``self.api``) that evaluates the JS, and assembles the three
    ``HM4hUBT0dDOn*`` cookies. ``get_list`` then queries the search
    endpoint and ``get_info`` fetches one document's detail record; both
    responses are 3DES-encrypted and decoded via :class:`Des`.
    """

    def __init__(self, api, cookie=''):
        # api: URL of the helper service that runs the site's anti-bot JS
        #      and returns the generated 80T cookie value.
        # cookie: pre-built Cookie header; normally filled by start_page().
        self.api = api
        self.cookie = cookie

    def make_ciphertext(self):
        """Build the ``ciphertext`` request parameter.

        Layout expected by the server: a 24-char random salt + the date
        (``YYYYMMDD``, also the 3DES IV) + the 3DES-encrypted millisecond
        timestamp, with every character replaced by its space-separated
        binary (``bin(ord(c))``) representation.
        """
        timestamp = str(int(time.time() * 1000))
        salt = ''.join(random.choice('0123456789qwertyuiopasdfghjklzxcvbnm') for _ in range(24))
        iv = datetime.datetime.now().strftime('%Y%m%d')
        # encrypt() is a staticmethod; no instance needed.
        enc = Des.encrypt(timestamp, salt)
        # Join directly instead of append-then-strip-trailing-separator.
        return ' '.join(bin(ord(ch))[2:] for ch in salt + iv + enc)

    def start_page(self):
        """Fetch the index page and assemble the anti-bot cookies.

        Side effect: sets ``self.cookie``. Raises on HTTP errors; asserts
        if the 80S cookie or the helper-generated 80T value is missing.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
        }
        url = 'http://wenshu.court.gov.cn/website/wenshu/181217BMTKHNT2W0/index.html'
        with requests.get(url, headers=headers) as req:
            req.raise_for_status()
            cookie_80s = req.cookies.get('HM4hUBT0dDOn80S')
            assert cookie_80s, '文书网异常,80s获取失败'
            html = req.text
        root = etree.HTML(html)
        # The last <meta> carries the challenge payload; the script tagged
        # r="m" is the obfuscated JS that must be evaluated to get 80T.
        meta = root.xpath('//meta[last()]/@content')[0]
        js_txt = root.xpath('//script[@r="m"]/text()')[0]
        with requests.post(self.api, json={'meta': meta, 'js_txt': js_txt}) as req:
            req.raise_for_status()
            cookie_80t = req.json()['cookie']
            assert cookie_80t, '80t生成失败'
            print(cookie_80t)
        self.cookie = f'HM4hUBT0dDOn80S={cookie_80s};HM4hUBT0dDOn80T={cookie_80t};HM4hUBT0dDOnenable=true'

    def get_list(self):
        """Query the 2020 document list (page 1, 5 rows) and fetch the
        detail record of the first hit via :meth:`get_info`.
        """
        assert self.cookie, '没有正确生成cookie'
        headers = {
            'Cookie': self.cookie,
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36",
        }

        token = ''.join(random.choice('0123456789qwertyuiopasdfghjklzxcvbnm') for _ in range(24))
        post_data = {
            'sortFields': 's50:desc',
            'ciphertext': self.make_ciphertext(),
            'pageNum': 1,
            'pageSize': 5,
            'queryCondition': '[{"key":"s42","value":"2020"}]',
            'cfg': 'com.lawyee.judge.dc.parse.dto.SearchDataDsoDTO@queryDoc',
            '__RequestVerificationToken': token,
        }
        url = 'http://wenshu.court.gov.cn/website/parse/rest.q4w'
        with requests.post(url, headers=headers, data=post_data) as req:
            req.raise_for_status()
            assert req.text, "文书网接口异常"
            # Fixed: parse the JSON body once instead of twice.
            payload = req.json()
            result = Des.decrypt(payload['result'], payload['secretKey'])
            print(result)
        first_doc_id = json.loads(result)['queryResult']['resultList'][0]['rowkey']
        self.get_info(first_doc_id)

    def get_info(self, doc_id):
        """Fetch and print the decrypted detail record for ``doc_id``."""
        print('-------详情页-------')
        headers = {
            'Cookie': self.cookie,
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36",
        }

        token = ''.join(random.choice('0123456789qwertyuiopasdfghjklzxcvbnm') for _ in range(24))
        post_data = {
            'docId': doc_id,
            'ciphertext': self.make_ciphertext(),
            'cfg': 'com.lawyee.judge.dc.parse.dto.SearchDataDsoDTO@docInfoSearch',
            '__RequestVerificationToken': token,
        }
        url = 'http://wenshu.court.gov.cn/website/parse/rest.q4w'
        with requests.post(url, headers=headers, data=post_data) as req:
            req.raise_for_status()
            assert req.text, "文书网接口异常"
            print(req.text)
            # Fixed: parse the JSON body once instead of twice.
            payload = req.json()
            result = Des.decrypt(payload['result'], payload['secretKey'])
            print(result)


if __name__ == '__main__':
    # Helper service that evaluates the site's anti-bot JS for us.
    api_endpoint = 'http://118.24.27.218:8012/wenshu'
    crawler = Demo(api=api_endpoint)
    # First obtain the cookies, then run one list + detail query.
    crawler.start_page()
    crawler.get_list()

        大部分反爬只需要把js拷贝下来之后使用pyexecjs模块来运行这些js。现在系统进行了升级,不仅需要人脸验证,还需要实名认证,之前使用模拟浏览器的方式已经行不通了。

        我们多次遍历,获取了1.2亿数据,约100GB,全部采集于开放数据。

我们多次遍历,获取了1.2亿数据,约100GB,全部采集于开放数据。

我们多次遍历,获取了1.2亿数据,约100GB,全部采集于开放数据。

我们多次遍历,获取了1.2亿数据,约100GB,全部采集于开放数据。

我们多次遍历,获取了1.2亿数据,约100GB,全部采集于开放数据。

         The Judgement Documents Network is a unified public platform for adjudication documents established in 2013 and one of the three major platforms for public disclosure. By August 23, 2017, the Judgement Documents Network had made public more than 32.47 million adjudication documents.

     The Judgement Documents Network is a unified public platform for adjudication documents established in 2013 and one of the three major platforms for public disclosure. By August 23, 2017, the Judgement Documents Network had made public more than 32.47 million adjudication documents, including more than 7,600 language documents; the total number of visits exceeded 10 billion, with an average of 17.29 million daily visits and a maximum of 50 million visits in a single day, and more than 1.75 billion visits from overseas, including more than 210 countries and regions worldwide. As of 18:00 on 30 August 2020, the total number of documents on the Magistrate's website exceeded 100 million, with a total of nearly 48 billion visits, making it the largest public website of magistrate's documents in the world.

        The dataset contains 120 million records in total. We can issue formal information-technology service invoices, and we cooperate in depth with Wuhan University and Huazhong University of Science and Technology. Research groups interested in the data may contact yemanzhongting@foxmail.com.

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值