python 爬虫工具类

第一版

# -*- coding: utf-8 -*-
import functools
import json
import pprint
import re
import time
import traceback

import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from tqdm import tqdm

# Silence the InsecureRequestWarning category in urllib3 (the helpers below
# issue their requests with verify=False).
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# Default request headers, used by the helpers below when the caller passes none.
# NOTE(review): the User-Agent value contains no spaces at all; it looks like
# whitespace was stripped from a real browser string -- confirm and restore if so.
default_headers = {
   
    'User-Agent': 'Mozilla/5.0(WindowsNT10.0;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/92.0.4515.131Safari/537.36',
    'sec-ch-ua': '"Chromium";v="92","NotA;Brand";v="99","GoogleChrome";v="92"',
    'sec-ch-ua-mobile': '?0'}


def tryError(fn):
    """
    Decorator that runs ``fn`` inside a try/except and reports any failure.

    On success the wrapped function's return value is passed through
    unchanged.  If ``fn`` raises an ``Exception``, a banner containing the
    exception's docstring, the exception text, the function name and the
    function docstring is printed to stdout, the traceback is printed, and
    the wrapper returns ``None`` instead of propagating the exception.

    :param fn: the callable to protect
    :return: the wrapping callable (metadata copied with functools.wraps)
    """

    @functools.wraps(fn)
    def wrapper(*args, **kvargs):
        try:
            return fn(*args, **kvargs)
        except Exception as e:
            # Not every callable has __name__ (e.g. functools.partial objects);
            # fall back to repr so the error report itself cannot fail.
            fn_name = getattr(fn, '__name__', repr(fn))
            # Each replacement field is kept on a single line: line breaks
            # inside the braces are a SyntaxError before Python 3.12.
            msg = (f'出错原因: {e.__doc__}  {e} \n'
                   f'函数名: {fn_name}\n'
                   f'函数说明: {fn.__doc__}')
            print("=========================")
            print(msg)
            print("=========================")
            # NOTE(review): presumably this pause lets the stdout banner show
            # up before the traceback is written to stderr -- confirm.
            time.sleep(0.2)
            traceback.print_exc()

    return wrapper


@tryError
def getText(url: str, headers: dict = None, params: dict = None) -> str:
    """
    Send a GET request and return the response body decoded as text.

    The request is sent with ``verify=False`` (no TLS certificate
    verification).  The response encoding is taken from
    ``resp.apparent_encoding`` before the text is decoded.

    :param url: address to request
    :param headers: request headers; when None or empty, ``default_headers`` is used
    :param params: optional query-string parameters
    :return: the decoded response body
    """
    # Work on a copy: the Accept-* overrides below must not leak into the
    # caller's dict or into the module-level default_headers shared by the
    # other helpers.
    t_headers = dict(headers if headers else default_headers)
    t_headers['Accept-Encoding'] = 'utf-8'
    t_headers['Accept-Language'] = 'zh-CN,zh;q=0.9'
    with requests.session() as session, session.get(url=url, headers=t_headers, verify=False, params=params) as resp:
        resp.encoding = resp.apparent_encoding
        return resp.text


@tryError
def postText(url: str, headers: dict = None, datas: dict = None) -> str:
    """
    Send a POST request and return the response body decoded as text.

    The request is sent with ``verify=False`` (no TLS certificate
    verification).  The response encoding is taken from
    ``resp.apparent_encoding`` before the text is decoded.

    :param url: address to request
    :param headers: request headers; when None or empty, ``default_headers`` is used
    :param datas: payload passed to requests as ``data=``
    :return: the decoded response body
    """
    # Work on a copy: the Accept-* overrides below must not leak into the
    # caller's dict or into the module-level default_headers shared by the
    # other helpers.
    t_headers = dict(headers if headers else default_headers)
    t_headers['Accept-Encoding'] = 'utf-8'
    t_headers['Accept-Language'] = 'zh-CN,zh;q=0.9'
    with requests.session() as session, session.post(url=url, headers=t_headers, verify=False, data=datas) as resp:
        resp.encoding = resp.apparent_encoding
        return resp.text


@tryError
def getJson(url: str, headers: dict = None, params: dict = None) -> dict:
    """
    GET ``url`` through ``getText`` and parse the body with ``json.loads``.

    :param url: address to request
    :param headers: request headers; when None or empty, ``default_headers`` is used
    :param params: optional query-string parameters
    :return: the parsed JSON value
    """
    # Hand getText a private copy so neither the caller's dict nor the shared
    # default_headers is modified by the Accept-* overrides.
    t_headers = dict(headers if headers else default_headers)
    t_headers['Accept-Encoding'] = 'utf-8'
    t_headers['Accept-Language'] = 'zh-CN,zh;q=0.9'
    return json.loads(getText(url=url, headers=t_headers, params=params))


# @tryError
# def getBin(url: str, headers: dict) -> bytes:
#     """
#       从网络上面获取二进制字节流
#     :param url:
#     :param headers:
#     :return:
#     """
#     with requests.session() as session, session.get(url=url, headers=headers, stream=True, verify=False) as resp:
#         return resp.content


@tryError
def writeBin(filePath: str, url: str, headers: dict = None) -> None:
    """
    Download ``url`` and write the raw response body to ``filePath``.

    The request is sent with ``stream=True`` and ``verify=False``.  The body
    is written chunk by chunk, so the whole download is never held in memory.

    :param filePath: destination path; an existing file is overwritten
    :param url: address to download
    :param headers: request headers; when None or empty, ``default_headers`` is used
    :return: None
    """
    t_headers = headers if headers else default_headers
    with requests.session() as session, session.get(url=url, headers=t_headers, stream=True, verify=False) as resp:
        with open(filePath, 'wb') as wf:
            for chunk in resp.iter_content(chunk_size=8192):
                if chunk:
                    wf.write(chunk)
        # Single-line replacement field: a line break inside the braces is a
        # SyntaxError before Python 3.12.
        print(f'{filePath} down ok !')


@tryError
def writeText(filePath: str, url: str, headers: dict = None) -> None:
    """
    Download ``url`` as text and save it to ``filePath`` encoded as UTF-8.

    The request is sent with ``stream=True`` and ``verify=False``.

    :param filePath: destination path; an existing file is overwritten
    :param url: address to download
    :param headers: request headers; when None or empty, ``default_headers`` is used
    :return: None
    """
    # Work on a copy: the Accept-* overrides below must not leak into the
    # caller's dict or into the module-level default_headers.
    t_headers = dict(headers if headers else default_headers)
    t_headers['Accept-Encoding'] = 'utf-8'
    t_headers['Accept-Language'] = 'zh-CN,zh;q=0.9'
    with requests.session() as session, session.get(url=url, headers=t_headers, stream=True, verify=False) as resp:
        # Decode with the detected encoding, as getText does, so the saved
        # file matches what getText would return for the same URL.
        resp.encoding = resp.apparent_encoding
        with open(filePath, 'w', encoding='utf-8') as wf:
            wf.write(resp.text)
        # Single-line replacement field: a line break inside the braces is a
        # SyntaxError before Python 3.12.
        print(f'{filePath} down ok !')


@tryError
def writeBinBar(filePath: str, url: str, headers: dict = None) -> None:
    """
    Download ``url`` to ``filePath`` in 1 KiB chunks with a tqdm progress bar.

    The request is sent with ``stream=True`` and ``verify=False``.

    :param filePath: destination path; an existing file is overwritten
    :param url: address to download
    :param headers: request headers; when None or empty, ``default_headers`` is used
    :return: None
    """
    t_headers = headers if headers else default_headers
    with requests.session() as session, session.get(url=url, headers=t_headers, stream=True, verify=False) as resp:
        # Content-Length may be absent; in that case pass total=None so tqdm
        # shows a bar without a total instead of the download failing.
        fileSize = int(resp.headers.get('Content-Length', 0)) or None
        with tqdm(initial=0, total=fileSize, unit_scale=True, unit='B') as pbar:
            with open(filePath, 'wb') as wf:
                for chunk in resp.iter_content(chunk_size=1024):
                    if chunk:
                        wf.write(chunk)
                        pbar.update(len(chunk))
        # Printed after the bar is closed so the message does not interleave
        # with the progress output.  The replacement field is kept on a single
        # line: a line break inside the braces is a SyntaxError before 3.12.
        print(f'{filePath} down ok !')


@tryError
def getHeadersKV(pstr=''):
    """
    Turn raw ``Name: value`` header lines into a dict and pretty-print it.

    The text comes from ``pstr`` or, when ``pstr`` is empty, from the file
    ``getKV.txt`` in the current directory.  The result is printed as
    ``headers={...}`` so it can be pasted into source code, and is also
    returned.

    :param pstr: header text, one ``Name: value`` pair per line
    :return: dict mapping header names to values
    """
    filePath = 'getKV.txt'
    if pstr == '':
        with open(filePath, 'r', encoding='utf-8') as rf:
            string = rf.read()
    else:
        string = pstr
    headers = {}
    for line in string.splitlines():
        line = line.strip()
        # Blank lines carry nothing; lines starting with ':' are pseudo-headers
        # such as ':authority', which are not ordinary request headers.
        if not line or line.startswith(':'):
            continue
        # Split on the FIRST colon only, so values containing ':' (URLs,
        # times) stay intact, and keep the spaces inside the value (a
        # User-Agent string needs them).
        key, sep, value = line.partition(':')
        if not sep:
            continue
        headers[key.strip()] = value.strip()
    print("headers=", end='')
    pprint.pprint(headers)
    return headers


def removeBlank(pstr: str) -> str:
    """
    Return ``pstr`` with every run of whitespace deleted (regex based).

    :param pstr: text to clean
    :return: ``pstr`` without any whitespace characters
    """
    whitespace_run = re.compile(r'\s+', re.S)
    cleaned = whitespace_run.sub('', pstr)
    return cleaned


@tryError
def getListByRe(restr: str, url: str, headers: dict = None) -> list[list]:
    """
    Fetch ``url`` and extract the named groups of every match of ``restr``.

    The page is fetched with ``getText`` and all whitespace is removed with
    ``removeBlank`` before matching, so ``restr`` must describe the text
    without whitespace.  The pattern is compiled with ``re.S``.

    :param restr: regular expression containing ``(?P<name>...)`` groups
    :param url: address of the page to fetch
    :param headers: request headers forwarded to ``getText``
    :return: one list per match, holding the named-group values in the order
             the groups appear in the pattern
    """
    pattern = re.compile(restr, re.S)
    # Take the group names from the compiled pattern rather than scanning the
    # pattern text for "P<...>", which can also hit literal text.
    names = sorted(pattern.groupindex, key=pattern.groupindex.get)
    getHtml = removeBlank(getText(url=url, headers=headers))
    return [[item.group(name) for name in names] for item in pattern.finditer(getHtml)]


@tryError
def getSingleByRe(restr: str, url: str, headers: dict = None) -> list:
    """
    Fetch ``url`` and extract the named groups of the first match of ``restr``.

    The page is fetched with ``getText`` and all whitespace is removed with
    ``removeBlank`` before matching, so ``restr`` must describe the text
    without whitespace.  The pattern is compiled with ``re.S``.

    :param restr: regular expression containing ``(?P<name>...)`` groups
    :param url: address of the page to fetch
    :param headers: request headers forwarded to ``getText``
    :return: the named-group values of the first match, in the order the
             groups appear in the pattern; an empty list when nothing matches
    """
    pattern = re.compile(restr, re.S)
    # Take the group names from the compiled pattern rather than scanning the
    # pattern text for "P<...>", which can also hit literal text.
    names = sorted(pattern.groupindex, key=pattern.groupindex.get)
    getHtml = removeBlank(getText(url=url, headers=headers))
    search = pattern.search(getHtml)
    if search is None:
        # No match: return an empty result instead of failing on None.group.
        return []
    return [search.group(name) for name in names]


# When run as a script, call getHeadersKV() with no argument, i.e. parse the
# header lines stored in getKV.txt and print them.
if __name__ == '__main__':
    getHeadersKV()

第二版

# -*- coding: utf-8 -*-
import datetime
import functools
import json
  • 3
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值