Sharing a Scraper for the Wusong Case Database (无讼案例网)

Code


```python
'''
date: 2020.11.11
time: 17:21:24
site: https://wusong.itslaw.com/bj
by: XUZHIWEI
'''
import re
import requests
import xlwt
import time
import numpy as np
import json


def get_pages(j):
    """
    Fetch the raw source of each result page (every page carries the
    parameters needed to build the URLs of 20 cases).
    :param j: pagination offset (the API's startIndex, 20 results per page)
    :return: raw response text of the page
    """
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
        # Replace with your own session cookie; this one is tied to the author's login.
        "Cookie": "UM_distinctid=172ad1c13215a5-096f2b20b21719-5d462912-144000-172ad1c1322982;home_sessionId=true; subSiteCode=bj;cookie_allowed=true;reborn-userToken=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJhdWQiOiIxODk5NjA1MzI2NCJ9.UoDVvWzLxlyz_Qz6sJ6_FrW3EJWHRbvMCMJuuzcxglA;CNZZDATA1278721950=371480104-1592040232-https%253A%252F%252Fwww.baidu.com%252F%7C1592043421",
    }
    # The encoded conditions search for "电信网络诈骗" (telecom/network fraud)
    # and restrict results to first-instance (一审) trials.
    url = "https://www.itslaw.com/api/judgements?_timer=1592904610377&sortType=1&startIndex=" + str(
        j) + "&countPerPage=20&conditions=searchWord%2B%E7%94%B5%E4%BF%A1%E7%BD%91%E7%BB%9C%E8%AF%88%E9%AA%97%2B1%2B%E7%94%B5%E4%BF%A1%E7%BD%91%E7%BB%9C%E8%AF%88%E9%AA%97&conditions=trialRound%2B1%2B8%2B%E4%B8%80%E5%AE%A1"
    response = requests.get(url, headers=head).text
    return response


def get_source():
    """
    Extract the id parameters, build each case's URL, and save the URLs
    to an .xls file.
    :return: list of case URLs
    """
    # Build the detail URL for every case.
    i = 0
    url_list = []
    for j in range(0, 2000, 20):  # startIndex=0 is the first page
        response = get_pages(j)
        time.sleep(1)
        # Pull (id, title) pairs out of the embedded JSON with a regex.
        parameters = re.findall(r'"temporarySearchReport":false}.*?{"id":"(.*?)","title":"(.*?)","caseType', response,
                                re.S)
        for parameter in parameters:
            i += 1
            url = "https://www.itslaw.com/api/judgements/detail?_timer=1592057292299&judgementId=%s" % parameter[0]
            print("url #%s:" % i, url)
            url_list.append(url)
    save_data("每个案例的url.xls", url_list)  # xlwt writes .xls, not .csv
    print("Collected %s urls in total" % len(url_list))
    return url_list
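

# A hedged alternative to the regex above: the endpoint returns JSON, so one
# can decode it and collect every object carrying "id", "title" and
# "caseType" keys. The exact nesting of the payload is not documented here,
# so this sketch walks the whole structure instead of assuming a path.
def extract_ids_json(response_text):
    found = []

    def walk(node):
        if isinstance(node, dict):
            if {"id", "title", "caseType"} <= node.keys():
                found.append((node["id"], node["title"]))
            for value in node.values():
                walk(value)
        elif isinstance(node, list):
            for item in node:
                walk(item)

    try:
        walk(json.loads(response_text))
    except ValueError:
        pass  # not valid JSON; fall back to the regex approach
    return found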


def save_data(file_name, datas, rowx=0, colx=0):
    """
    Save data; only a 1-D or a 2-D list is supported.
    :param file_name: target file name, extension included (xlwt writes .xls)
    :param datas: the data to save, as a list
    :param rowx: starting row index
    :param colx: starting column index
    :return: None
    """
    if np.ndim(datas) == 1:
        # Save a 1-D list as a single column.
        writebook = xlwt.Workbook()
        sheet = writebook.add_sheet("result", cell_overwrite_ok=True)
        sheet.col(0).width = 35000  # column width, set once
        for data in datas:
            sheet.write(rowx, colx, data)
            rowx += 1
        writebook.save(file_name)
        print("Data saved!")
    elif np.ndim(datas) == 2:
        # Save a 2-D list as a grid.
        writebook = xlwt.Workbook()
        sheet = writebook.add_sheet("信息", cell_overwrite_ok=True)
        # Per-column widths; extend this list if more columns are added.
        col_widths = [25000, 3500, 8000, 5000, 5000, 5500, 4000, 5000, 5000, 3000, 3000, 3000, 3000, 3000]
        for rowx, row in enumerate(datas):
            for colx, infor in enumerate(row):
                sheet.col(colx).width = col_widths[colx]
                sheet.write(rowx, colx, infor)
        writebook.save(file_name)
        print("Data saved!")
    else:
        print("Data is neither a 1-D nor a 2-D list; cannot save!")

def get_oneurl_response():
    url_list = get_source()
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
        "Cookie": "UM_distinctid=172ad1c13215a5-096f2b20b21719-5d462912-144000-172ad1c1322982;home_sessionId=true; subSiteCode=bj;cookie_allowed=true;reborn-userToken=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJhdWQiOiIxODk5NjA1MzI2NCJ9.UoDVvWzLxlyz_Qz6sJ6_FrW3EJWHRbvMCMJuuzcxglA;CNZZDATA1278721950=371480104-1592040232-https%253A%252F%252Fwww.baidu.com%252F%7C1592043421",
    }
    rows = []  # 2-D list; each sub-list holds the fields extracted from one case
    # Header row: title, case type, case number, procuratorate, defendant,
    # date of birth, birthplace, education, occupation, sentence length, fine.
    rows.append(["标题", "案件类型", "案件号", "检察院名字", "被告人姓名",
                 "出生日期", "出生城市", "文化", "工作", "判处年限", "罚金"])
    rows.append(["", "", "", "", "", "", "", "", "", "", ""])  # blank spacer row
    for url in url_list:
        try:
            print(url)
            r = requests.get(url, headers=head).text
            res = json.loads(r)
            dict_data = res['data']
            wenben = dict_data['fullJudgement']

            # Title
            title = wenben['title']
            # Case type
            caseType = wenben['caseType']
            # Case number
            caseNumber = wenben['caseNumber']
            # Procuratorate (prosecuting party) name
            proponents = wenben['proponents'][0]
            proponents_name = proponents['label']

            # Defendant's name
            opponents = wenben['opponents'][0]
            opponents_name = opponents['name']

            # Extract the sentence length ("无" = none found).
            term_re = re.findall(r'徒刑(\S{1,2}年\S{1,2}个月)', r, re.S)
            sentence_term = term_re[0] if term_re else "无"

            # Extract the fine.
            try:
                money = re.findall(r'罚金人民币(.*?)元', r, re.S)[0]
            except IndexError:
                money = "无"

            # The defendant's personal details sit in the second sub-paragraph
            # of the first paragraph of the judgement text.
            paragraphs = wenben['paragraphs'][0]
            subParagraphs = paragraphs['subParagraphs'][1]
            text = subParagraphs['text'][0]

            # Extract the date of birth.
            try:
                date_of_birth = re.findall(r"\d{4}年\d{1,2}月\d{1,2}日", text, re.S)[0]
            except IndexError:
                date_of_birth = "无"

            # Extract the birthplace. Judgements write it as 省+市, 省+县,
            # 市+县, or 自治区+市/县 for autonomous regions.
            city_of_birth = "无"
            try:
                if "自治区" not in text:
                    if "省" in text and "市" in text:
                        match = re.findall(r"(\S{1,2}省\S{2,3}市)|(\S{1,2}省\S{1,3}县)", text, re.S)[0]
                        city_of_birth = match[0] or match[1]  # whichever group matched
                    elif "市" in text and "县" in text:
                        city_of_birth = re.findall(r"\S{1,2}市\S{1,2}县", text, re.S)[0]
                    elif "台湾" in text:
                        city_of_birth = "台湾"
                else:
                    # One pattern covers 内蒙古 (3 chars), 广西壮族 (4) and
                    # 新疆维吾尔 (5) autonomous regions alike.
                    match = re.findall(r"(\S{2,5}自治区\S{2,3}市)|(\S{2,5}自治区\S{2,3}县)", text, re.S)[0]
                    city_of_birth = match[0] or match[1]
            except IndexError:
                city_of_birth = "无"

            # Extract the education level by keyword.
            if '小学' in text:
                culture = "小学"
            elif '初中' in text:
                culture = "初中"
            elif '专科' in text or '中专' in text:
                culture = "中专/专科"
            elif '高中' in text:
                culture = "高中"
            elif '大学' in text:
                culture = "大学"
            else:
                culture = "无"

            # Extract the occupation (the word right after "文化,").
            try:
                work = re.findall(r'文化,(.{1,2})', text, re.S)[0]
            except IndexError:
                work = "无业"

            all_infor = [title, caseType, caseNumber, proponents_name, opponents_name, date_of_birth,
                         city_of_birth, culture, work, sentence_term, money]
            print(all_infor)
        except Exception:
            continue  # skip any case whose payload does not match the expected shape
        rows.append(all_infor)

    save_data("案例爬取信息.xls", rows)


get_oneurl_response()
```

Known issue: after roughly a hundred records have been scraped, the site starts refusing requests. A corresponding anti-ban strategy will be added in a later update.
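One possible direction, as a minimal sketch: randomize the pacing between requests, rotate the `User-Agent` header, and back off with retries when the server starts refusing. The User-Agent pool, delay ranges, and retry counts below are illustrative assumptions, not values known to get past this site's rate limiting.

```python
import random
import time

import requests

# Illustrative UA pool -- substitute any current browser strings (assumption).
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 '
    '(KHTML, like Gecko) Version/13.1 Safari/605.1.15',
]


def polite_get(url, cookie, max_retries=3):
    """GET with a random User-Agent, jittered pacing, and exponential backoff.
    All parameter values here are illustrative, not tuned for itslaw.com."""
    for attempt in range(max_retries):
        head = {
            'User-Agent': random.choice(USER_AGENTS),
            'Cookie': cookie,
        }
        time.sleep(random.uniform(2, 5))  # vary the request interval
        try:
            resp = requests.get(url, headers=head, timeout=10)
            if resp.status_code == 200:
                return resp.text
        except requests.RequestException:
            pass
        time.sleep(2 ** attempt * 10)  # back off before retrying
    return None  # give up after max_retries attempts
```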