Python爬虫数据获取模板与使用方法（v2.0版本）

和谐号hexh

已于 2023-08-26 16:17:00 修改

阅读量678

点赞数

文章标签： python 爬虫 开发语言

于 2023-08-21 06:14:51 首次发布

本文链接：https://blog.csdn.net/m0_72524813/article/details/132400506

版权

1.模板架构

模板有四个py文件，我放在crawlerTemplate包下。

（1）getAgent模块

# -*- coding: utf-8 -*-
# @Time: 2023-08-20 20:14
# @Author: hexh
# @File: getAgent.py
# @Software: PyCharm
from random import randint

# 随机获取身份
def main():
    """Return one User-Agent string chosen at random from a fixed pool.

    The pool is rebuilt on every call and contains sixteen browser
    identification strings. The index is drawn with ``randint`` over the
    inclusive range ``0 .. len(pool) - 1``.

    Returns:
        str: the selected User-Agent value.
    """
    pool = (
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    )
    last_index = len(pool) - 1
    picked = randint(0, last_index)
    return pool[picked]

User-Agent是我们每次发送请求时的身份标识。

每次使用不同的User-Agent，可以更好地隐藏身份，防止被封。

参考网址：

爬虫请求网站时报错http.client.RemoteDisconnected: Remote end closed connection without response 请求网站时报错_如果我变成回忆l的博客-CSDN博客

（2）getHTMLByUrllib模块

# -*- coding: utf-8 -*-
# @File: getHTMLByUrllib.py
# @Author: 和谐号
# @Software: PyCharm
# @CreationTime: 2023-08-23 3:25
# @OverviewDescription:
import gzip
import urllib
import urllib.error
import urllib.parse
import urllib.request
from io import BytesIO

def main(info, configLog, timeoutTime):
    """Fetch a page with urllib and return its body decoded as UTF-8 text.

    Args:
        info: Request description ``[url, method, data, header]``.
            ``method`` must be ``"POST"`` or ``"GET"``; ``header`` is passed
            to ``urllib.request.Request`` as ``headers``. ``data`` is only
            read for POST requests.
        configLog: Crawler configuration. Only ``configLog["表单数据形式"]`` is
            read, and only for POST: ``"字典"`` means ``data`` is URL-encoded
            with ``urllib.parse.urlencode``; any other value means ``data``
            is a string that is sent as-is (UTF-8 encoded).
        timeoutTime: Timeout in seconds handed to ``urlopen``. A value
            ``<= 0`` means no explicit timeout is passed.

    Returns:
        The decoded response body, or ``None`` when the method is not
        supported or when the request / decompression / decoding fails.
        Every failure prints a message before ``None`` is returned.
    """
    url = info[0]
    method = info[1]
    headers = info[3]

    # Build the request object.
    if method == "POST":
        if configLog["表单数据形式"] == "字典":
            payload = urllib.parse.urlencode(info[2]).encode("utf-8")
        else:
            payload = info[2].encode("utf-8")
        req = urllib.request.Request(url=url, headers=headers, data=payload, method="POST")
    elif method == "GET":
        req = urllib.request.Request(url=url, headers=headers)
    else:
        print("请求类型错误：", method)
        return None

    try:
        if timeoutTime > 0:
            response = urllib.request.urlopen(req, timeout=timeoutTime)
        else:
            response = urllib.request.urlopen(req)

        # Close the response as soon as the body has been read.
        with response:
            raw = response.read()

        # A gzip stream starts with the magic bytes 1f 8b followed by the
        # compression method 08 (deflate); anything else is decoded directly.
        if raw[:3] == b"\x1f\x8b\x08":
            raw = gzip.decompress(raw)
        return raw.decode("utf-8")

    except urllib.error.HTTPError as e:
        print("urllib报错，响应状态码：", e.code)
        print("urllib报错，原因：", e.reason)
    except urllib.error.URLError as e:
        print("urllib报错，原因：", e.reason)
    except Exception as e:
        # Boundary handler: callers expect None rather than an exception.
        # Report the error so failures such as a read timeout or a corrupt
        # gzip body are no longer silent.
        print("urllib报错：", repr(e))
    return None

（3）getHTMLByRequests模块

# -*- coding: utf-8 -*-
# @File: getHTMLByRequests.py
# @Author: 和谐号
# @Software: PyCharm
# @CreationTime: 2023-08-23 3:26
# @OverviewDescription:

import requests
from requests.exceptions import ReadTimeout, HTTPError, RequestException


def main(info, configLog, timeoutTime):
    """Fetch a page with the requests library and return the response text.

    Args:
        info: Request description ``[url, method, data, header]``.
            ``method`` must be ``"POST"`` or ``"GET"``.
        configLog: Crawler configuration. For POST, ``configLog["ContentType"]``
            decides whether ``data`` is sent through ``json=`` (value is
            ``"json(自动配置)"`` or ``"json(手动配置)"``) or through ``data=``.
        timeoutTime: Timeout in seconds; only passed to requests when ``> 0``.

    Returns:
        ``response.text`` when the status code is 200. ``None`` when the
        method is unsupported, when the status code is not 200 (status and
        body are printed), or when a requests exception is caught and printed.
    """
    json_kinds = ["json(自动配置)", "json(手动配置)"]

    try:
        verb = info[1]
        if verb == "POST":
            options = {"timeout": timeoutTime} if timeoutTime > 0 else {}
            body_keyword = "json" if configLog["ContentType"] in json_kinds else "data"
            options[body_keyword] = info[2]
            response = requests.post(info[0], headers=info[3], **options)
        elif verb == "GET":
            options = {}
            # Query parameters are only forwarded when data is non-empty.
            if len(info[2]) > 0:
                options["params"] = info[2]
            if timeoutTime > 0:
                options["timeout"] = timeoutTime
            response = requests.get(info[0], headers=info[3], **options)
        else:
            print("请求类型错误：", verb)
            return None

        # Anything but 200 is reported and treated as a failure.
        if response.status_code != 200:
            print('请求失败，状态码:', response.status_code)
            print('Error response:', response.text)
            return None

        # If the text comes out garbled, response.content may be used instead.
        return response.text

    except ReadTimeout as err:
        print('Timeout', err)
    except HTTPError as err:
        print('Http error', err)
    except RequestException as err:
        print('Error', err)

（4）crawler模块

# -*- coding: utf-8 -*-
# @Time: 2023-08-20 22:23
# @Author: hexh
# @File: crawler.py
# @Software: PyCharm
from crawlerTemplate import getAgent, getHTMLByUrllib, getHTMLByRequests


def toDict(theList, noNeedKey):
    """Convert ``"key: value"`` text lines into a dictionary.

    Each line is split at its first colon. Lines without a colon are
    skipped, as are lines whose key appears in ``noNeedKey``. When a key
    occurs more than once, the last occurrence wins.

    Args:
        theList: Iterable of text lines (form data or header lines).
        noNeedKey: Keys that must not appear in the result.

    Returns:
        dict mapping each kept key to its value. The value has its line
        ending (``"\\n"`` or ``"\\r\\n"``) removed and, when present, the
        single space that separates it from the colon.
    """
    res = {}
    for line in theList:
        key, colon, value = line.partition(":")
        if not colon or key in noNeedKey:
            continue
        value = value.rstrip("\r\n")
        # Drop the separator space only if it is really there, so that
        # "key:value" no longer loses the first character of its value.
        res[key] = value[1:] if value.startswith(" ") else value
    return res


def getRequestInfoFromTxt(path, data, headerNoneedKey):
    """Read a request-description txt file and extract the request info.

    Expected layout, relative to the marker line ``"请求 URL:"`` at index
    ``i``: the lines before it are the form data, line ``i + 1`` is the URL,
    line ``i + 3`` is the request method, and everything from line
    ``i + 10`` on is treated as header lines.

    Args:
        path: Path of the txt file.
        data: Form data supplied by the caller. When it equals ``"auto"``
            and the marker is found, it is replaced by the list of lines
            that precede the marker.
        headerNoneedKey: Header keys to drop (forwarded to ``toDict``).

    Returns:
        ``[url, method, data, header]`` where ``header`` is always a dict.
        If the file cannot be read or the marker is missing, ``url`` is
        ``""``, ``method`` is an error text and ``header`` is ``{}``.
    """
    # Read. "utf-8-sig" behaves like UTF-8 but also skips a leading BOM,
    # which would otherwise stick to the first line and break the parsing.
    lines = []
    try:
        with open(path, "r", encoding="utf-8-sig") as f:
            lines = f.readlines()
    except (OSError, ValueError) as e:
        print(e)

    # Parse. header starts as a dict so callers can always use .get() on it,
    # including when the file could not be parsed.
    header = {}
    url = ""
    method = "未检测出请求类型，请检查配置文件"
    marker = "请求 URL:\n"
    try:
        if marker in lines:
            i = lines.index(marker)
            if data == "auto":
                data = lines[0:i]
            url = lines[i + 1].strip()
            method = lines[i + 3].strip()
            header = toDict(lines[i + 10:], headerNoneedKey)
    except IndexError as e:
        # The file ends before the URL or method line.
        print("txt文件配置错误", e)
    return [url, method, data, header]


def config(data, info, libraryUsed, isPrint):
    """Derive the crawler configuration and optionally print it.

    Args:
        data: Form data passed to the crawler entry point; only compared
            with ``"auto"`` to tell automatic from manual form data.
        info: ``[url, method, data, header]`` as read from the txt file.
            ``info[3]`` must be a dict; its ``"User-Agent"`` entry is
            replaced in place by a random one when its value is ``"True"``.
        libraryUsed: ``"auto"``, ``"r"`` (requests) or ``"u"`` (urllib).
        isPrint: When truthy, every configuration entry is printed.

    Returns:
        dict with the keys ``"表单数据获取方式"``, ``"表单数据形式"``, ``"爬虫库"``
        and ``"ContentType"``. A value of ``None`` or ``"错误"`` marks an
        invalid configuration that the caller is expected to reject.
    """
    configLog = {"表单数据获取方式": None, "表单数据形式": None, "爬虫库": None, "ContentType": None}

    # How the form data was obtained, and which shape it has.
    if data == "auto":
        configLog["表单数据获取方式"] = "自动获取"
        # Exactly one data line is treated as a raw string body; any other
        # number of lines is treated as key/value pairs.
        configLog["表单数据形式"] = "字符串" if len(info[2]) == 1 else "字典"
    else:
        configLog["表单数据获取方式"] = "手动配置"
        if isinstance(info[2], dict):
            configLog["表单数据形式"] = "字典"
        elif isinstance(info[2], str):
            configLog["表单数据形式"] = "字符串"
        else:
            configLog["表单数据形式"] = "错误"

    # Random User-Agent, requested by the header value "True".
    if info[3].get("User-Agent") == "True":
        info[3]["User-Agent"] = getAgent.main()
        print("本次采用的随机User-Agent为：", info[3]["User-Agent"])

    # Read Content-Type from the header to decide whether json= is used.
    contentType = ""
    for name in ["content-type", "Content-type", "content-Type", "Content-Type"]:
        contentType = info[3].get(name, "")
        if contentType != "":
            break
    configLog["ContentType"] = "json(手动配置)" if "json" in contentType else "非json(手动配置)"

    # With libraryUsed == "auto" both the library and the content type are
    # chosen from the form-data shape; otherwise the caller's choice is used.
    if libraryUsed == "auto":
        if configLog["表单数据形式"] == "字典":
            configLog["爬虫库"] = "requests(自动配置)"
            configLog["ContentType"] = "json(自动配置)"
        elif configLog["表单数据形式"] == "字符串":
            configLog["爬虫库"] = "requests(自动配置)"
            configLog["ContentType"] = "非json(自动配置)"
    elif libraryUsed == "r":
        configLog["爬虫库"] = "requests(手动配置)"
    elif libraryUsed == "u":
        configLog["爬虫库"] = "urllib(手动配置)"
    else:
        configLog["爬虫库"] = "错误"

    if isPrint:
        for key, value in configLog.items():
            # str() keeps printing from failing when an entry is still None.
            print(key + "：" + str(value))

    return configLog


def dataProcessing(info, configLog):
    """Normalise the form data stored in ``info[2]`` in place.

    Nothing happens unless the form data was obtained automatically
    (``configLog["表单数据获取方式"] == "自动获取"``). In that case ``info[2]``
    is a list of text lines and is replaced according to
    ``configLog["表单数据形式"]``:

    * ``"字典"``: the lines are converted with ``toDict``;
    * ``"字符串"``: the first line without its last character is used.

    Args:
        info: ``[url, method, data, header]``; modified in place.
        configLog: Configuration produced for this request.

    Returns:
        None.
    """
    if configLog["表单数据获取方式"] != "自动获取":
        return

    shape = configLog["表单数据形式"]
    if shape == "字典":
        info[2] = toDict(info[2], [])
    elif shape == "字符串":
        first_line = info[2][0]
        info[2] = first_line[:-1]


def main(filepath, libraryUsed="auto", data="auto", isPrint=True, timeoutTime=0, headerNoneedKey=None):
    """Crawl the page described by a request txt file and return the response.

    Preparing the txt file:
        * In the browser's F12 panel, switch to the raw view and copy the
          request information into the txt file.
        * For a POST request, put the form data before the request information.
        * To use a random User-Agent, write the line ``User-Agent: True``
          (one space before ``True``, none after it).

    Args:
        filepath: Path of the txt file describing the target page (required).
        libraryUsed: ``"auto"`` (default) configures the library and the
            content type automatically; ``"r"`` selects requests, ``"u"``
            selects urllib.
        data: Form data. ``"auto"`` (default) takes it from the txt file;
            any other value is used as given.
        isPrint: Whether the configuration is printed (default ``True``).
        timeoutTime: Maximum number of seconds to wait for the server;
            the default ``0`` sets no timeout.
        headerNoneedKey: Header keys to leave out. ``None`` (default) means
            ``["Date", "Server", "Transfer-Encoding"]``.

    Returns:
        The response content, or ``None`` when the configuration is invalid
        or the selected fetcher returns ``None``.

    Notes:
        With requests, the form type matters: when the Content-Type header
        mentions json the data is sent as ``json=``, otherwise as ``data=``.
        An error such as ``400 ... "Expecting object or array (near 1:1)"``
        usually points to a wrong content type; pass ``libraryUsed``
        explicitly and set Content-Type in the txt file. The urllib fetcher
        always sends the form as plain data.
    """
    skip_keys = ["Date", "Server", "Transfer-Encoding"] if headerNoneedKey is None else headerNoneedKey
    info = getRequestInfoFromTxt(filepath, data, skip_keys)

    configLog = config(data, info, libraryUsed, isPrint)

    # Reject configurations with an unset or invalid entry.
    settings = configLog.values()
    if None in settings or "错误" in settings:
        print("配置错误")
        return None

    dataProcessing(info, configLog)

    library = configLog["爬虫库"]
    if library in ("urllib(自动配置)", "urllib(手动配置)"):
        return getHTMLByUrllib.main(info, configLog, timeoutTime)
    if library in ("requests(自动配置)", "requests(手动配置)"):
        return getHTMLByRequests.main(info, configLog, timeoutTime)

2.代码中的一些解释

（如果你只是想用模板的话，这部分可以跳过不看）

（1）导包

自定义包和模块要放在软件包里，不要放在目录里，否则会带来麻烦。

如果使用目录，当两个文件不在一起时，可能会因搜索不到模块而报错。此时可以用sys.path方法添加系统路径来解决。但这样在导包的地方仍然会提示报错，不过程序能够运行。

而如果用软件包的话，两个文件或许可以离得更远，不用sys.path就行，也不会报错。

（2）header中的参数

因为并不是所有的header都是有用的，也不是header越多越好

像我例子中的url，只要cookie中的JSESSIONID和array就可以了，其它的有没有都可以。

但是如果有，必须要写对，错一个都可能不行。

特别的，Transfer-Encoding: chunked不能加到header里。

所以说，有些header是没用的，将其写到headerNoneedKey里，并设置了默认值。

（3）关于cookie

JSESSIONID，就像是一把密钥：当我们登录系统完成后，服务器会给我们一个JSESSIONID，之后在系统内，我们每次请求都要带着这把密钥，才能得到响应，这个密钥里也包含了我们的一些身份信息。当然，这个密钥也是会过期的，且是否过期与浏览器的打开与关闭无关。每次登录都会得到一个新JSESSIONID，但是原来的旧JSESSIONID只要没过期，仍然可以用。

（4）gzip压缩

有些网页是gzip压缩的，与一般的读取方式不同，识别方法就是先hex一下，看是不是以"1f8b08"开头。

3.参数介绍

需要确定的参数：

（1）txt文件中

url（目标网址），method（POST或GET），data（表单数据），header（请求头）

其中，header中有三个比较重要的字段：

①cookie：与登录有关

②User-Agent：当前身份。如果需要随机身份，需要将其值设置为：“ True”

③Content-Type：表单数据的文本类型，如果出现json字样，则要用json类型，没有则为data类型。更准确的方法是根据表单数据去判断，如果有列表[]等特殊情况，建议用json类型。

（2）crawler.main（）方法形参

最重要的就是txt文件的路径filepath，必填。

其余的都可以使用默认值，程序会自动配置。

如果报错，可以试着调调libraryUsed和Content-Type，以及data

def main(filepath, libraryUsed="auto", data="auto", isPrint=True,timeoutTime=0, headerNoneedKey=None):
    # 爬虫主函数，根据提供的目标网站txt（文件路径），返回爬虫结果
    # txt文件要求：
    # 先从F12中显示原始，将浏览器请求信息，拷贝到txt文件
    # 如果是POST方法，需要将表单data拷贝到请求信息前
    # 如果需要采用随机user-Agent，请将txt中该行设置为"User-Agent: True",注意True前有空格，后无空格
    #
    # 传入：
    # 目标网页的txt配置文件路径:filepath  //必填参数
    # 爬虫库选择参数：libraryUsed    //如果不写，默认为"auto"，自动配置爬虫库和ContextType。可选参数："r"：requests库，"u"：urllib库
    # 表单数据：data   //如果不写，默认为"auto"，自动从txt中获取，否则用形参中的data
    # 是否打印配置信息：isPrint  //如果不写，默认为True,打印配置信息
    # 爬虫timeout秒数，即最多等服务器反应的时间：timeoutTime  //如果不写，默认为0，即不设置
    # header中不需要的键：headerNoneedKey    //一般不写，取默认值["Date", "Server", "Transfer-Encoding"]
    #
    # 返回一个参数：
    # 响应内容：html
    #
    # requests库有时比urllib更快，但在使用requests库时要注意表单类型(json类型/data类型)
    # 一般来说，header里如果content-type里包含了json字样，就是json类型，json=data；否则是data类型，data=data
    # 如果出现错误：400，Error response: {"message":"Expecting object or array (near 1:1)","status":400}
    # 很有可能是context-type配置错了，可以取消libraryUsed自动配置，在txt中手动配置context—type
    # 另外注：urllib目前不分json和data，统一是data，只有用requests库时要考虑

4.模板使用

（1）准备目标网站的txt文件：

url，method，data，header 这些可以从F12中找到（注意把“原始”勾上）

url对应请求url，method对应请求方法，header对应响应标头+请求标头

如果是POST方法，data在负载里可以看到：

我们将这些内容放到一个txt文件中（data在前，其它直接复制在后面）：

例如：

ordered: true
sortType: desc
请求 URL:
https://jwgl.dhu.edu.cn/dhu/common/semesterSS
请求方法:
POST
状态代码:
200 OK
远程地址:
218.193.151.149:443
引用者策略:
strict-origin-when-cross-origin
HTTP/1.1 200 OK
x-frame-options: SAMEORIGIN
Pragma: no-cache
Cache-Control: no-cache, no-store, max-age=0
Expires: Thu, 01 Jan 1970 00:00:00 GMT
Content-Type: application/json;charset=UTF-8
Content-Language: zh-CN
Date: Sun, 20 Aug 2023 21:43:15 GMT
Server:
Set-Cookie: array=jwgl_01; Secure
Transfer-Encoding: chunked
Connection: Keep-alive
Via: 1.1 ID-0016035530113266 uproxy-3
POST /dhu/common/semesterSS HTTP/1.1
Accept: application/json, text/javascript, */*; q=0.01
Accept-Encoding: gzip, deflate, br
Accept-Language: zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6
Connection: keep-alive
Content-Length: 26
Content-Type: application/x-www-form-urlencoded;charset=UTF-8
Cookie: array=jwgl_01; array=jwgl_01; JSESSIONID=; array=jwgl_01; iPlanetDirectoryPro=
Host: jwgl.dhu.edu.cn
Origin: https://jwgl.dhu.edu.cn
Referer: 
Sec-Fetch-Dest: empty
Sec-Fetch-Mode: cors
Sec-Fetch-Site: same-origin
User-Agent: Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36 Edg/115.0.1901.203
X-Requested-With: XMLHttpRequest
sec-ch-ua: "Not/A)Brand";v="99", "Microsoft Edge";v="115", "Chromium";v="115"
sec-ch-ua-mobile: ?1
sec-ch-ua-platform: "Android"

这样我们称为一个url的配置文件txt。

将要爬取的网页的txt文件都按照上面所讲的方法准备好，放到target目录下：

（2）方法调用

在目录新建一个文件main.py，调用我们的模板：

# -*- coding: utf-8 -*-
# @Time: 2023-08-18 16:33
# @Author: hexh
# @File: main.py
# @Software: PyCharm
import os
import random

from crawlerTemplate import crawler
import time

if __name__ == "__main__":
    # Wall-clock timing of the whole run; a debugging aid only.
    start = time.time()

    folderpath = r"./target"

    # Crawl only the *.txt request descriptions. Sorting makes the run order
    # stable, and joining the folder path avoids changing the working
    # directory of the process.
    txt_names = sorted(
        name for name in os.listdir(folderpath) if name.lower().endswith(".txt")
    )

    for i, item in enumerate(txt_names):
        print(os.path.splitext(item)[0] + ":")
        print(str(i) + "\t爬取结果为：\n", crawler.main(os.path.join(folderpath, item)))
        print()
        # Optional: pause a few seconds between requests to avoid being
        # blocked, for example:
        # time.sleep(random.randint(-2500,2500)/1000+5)

    end = time.time()
    print(end - start)

    # encoded_data = json.dumps(json_data).encode("utf-8")

① 其中，end-start是我调试时用来计时的，一般可以不写。

② 中间可以设置一个几秒的休眠，避免被封。

得到数据后就可以去做数据解析和可视化了。