某投诉网站爬虫-获取对应的投诉信息

最新推荐文章于 2023-06-08 01:55:23 发布

置顶半根蓝白

最新推荐文章于 2023-06-08 01:55:23 发布

阅读量4.9k

点赞数 1

分类专栏： py爬虫

本文链接：https://blog.csdn.net/BanGenLanBai/article/details/113482917

版权

py爬虫专栏收录该内容

6 篇文章 2 订阅

订阅专栏

某投诉网站爬虫-获取对应的投诉信息

此爬虫原本是一道面试题，说难不难，说简单也不简单。站点的反爬机制我目前还没有完全摸清楚，不过从现象来看，应该是在一段时间内访问次数超过某个阈值就会封 IP 5～10 分钟，之后即可恢复正常访问。

爬虫源代码

# -*- coding:utf-8  -*-
# @Time     : 2020-12-03 13:20
# @Author   : BGLB
# @Email    : bglb@qq.com
# @Software : PyCharm
import csv
import functools
import hashlib
import json
import os
import random
import threading
import time

from requests import get

""" 
题目要求:
        1. 用任意一语言，或者Python或者其他熟悉的，写一个爬虫程序，爬取tousu.sina.com.cn网站中
        最新投诉的内容，并且输出到一个文本文件或者excel文件。
        2. 输出内容:
            1.投诉内容
            2.投诉对象
            3.投诉要求
            4.投诉时间
        3. 测试时间:测试时间为48小时
        4. 提交内容
            a.源程序并附上注释
            b.输出的文件实例

"""


def time_logging(func):
    """
    Decorator that prints how long each call of the wrapped function took.

    The elapsed time is printed in seconds (< 60 s), minutes (< 1 h) or
    hours, next to the wrapped function's name. The wrapped function's
    return value is passed through unchanged; if it raises, the exception
    propagates and nothing is printed.

    :param func: the callable to time
    :return: a wrapper with the same signature and metadata as ``func``
    """

    # Without functools.wraps the decorated function would report itself as
    # "wrapper" and lose its docstring.
    @functools.wraps(func)
    def wrapper(*args, **kw):
        # perf_counter is monotonic, so the measurement cannot be skewed by
        # system clock adjustments while the function is running.
        start_time = time.perf_counter()
        func_result = func(*args, **kw)
        elapsed = time.perf_counter() - start_time
        if elapsed < 60:
            runtime = "{:.2f}s".format(elapsed)
        elif elapsed < 3600:
            runtime = "{:.2f}m".format(elapsed / 60)
        else:
            runtime = "{:.2f}h".format(elapsed / 3600)
        content = '[{0:^15}] - 运行时间 - [{1:^6}]'.format(func.__name__, runtime)
        print("{}".format(content))
        return func_result

    return wrapper


class SinaTousu(object):
    """
    Crawler for the complaint feed API of tousu.sina.com.cn.

    Each requested page is stored as ``jsondata_<page>.json`` (raw payload)
    and ``result_<page>.csv`` (flattened rows) in the current working
    directory; ``run`` finally merges them into ``jsondata.json`` and
    ``result.csv`` and deletes the per-page files.

    :param host_str: site to crawl, a key of the host table below
    :param type_str: feed type, a key of the feed-type table below
    :param count: number of complaints wanted; translated into a page count
                  and a page size (at most 30 items per page)
    """

    # Feed type name -> value of the ``type`` query parameter.
    __data_type_dict = {
        "最热投诉": 1,
        "最新投诉": 2,
        "已回复": 3,
        "已完成": 4,
    }

    # Site name -> base URL.
    __hosts_type_dict = {
        "湖北投诉": "https://hb.tousu.sina.com.cn",
        "全国投诉": "https://tousu.sina.com.cn"
    }

    # Largest page size this crawler requests in a single call.
    __max_page_size = 30

    def __init__(self, host_str="全国投诉", type_str="最热投诉", count=100):
        self.host_str = host_str
        self.type_str = type_str
        self.__page_size = self.__max_page_size
        self.__pages = 1
        self.__max_thread_count = 10
        self.__current_thread_count = 0
        # Worker threads started by thread_manage that may still be running.
        self.__threads = []
        # Guards the shared page count, which worker threads may lower.
        self.__state_lock = threading.Lock()
        self.__url = self.__create_url()
        self.count = count
        if self.count > self.__page_size:
            # Ceiling division: 60 items need 2 pages of 30, not 3.
            self.__pages = -(-self.count // self.__page_size)
        else:
            self.__page_size = self.count

    def __create_url(self):
        """
        Build the feed URL for the configured site.

        Only the index feed is crawled; the other endpoints are listed for
        reference.

        :return: absolute URL of the index feed endpoint
        :raises KeyError: if ``host_str`` is not a known site name
        """
        base = self.__hosts_type_dict[self.host_str]
        api_dict = {
            "LAWS_FEED": base+"/api/laws/feed",
            # Complaint feed endpoint; the author found it by reading the site's JS.
            "INDEX_FEED": base+"/api/index/feed",
            "index_article": base+"/api/articles/notice",
        }
        return api_dict["INDEX_FEED"]

    def forged_param(self, page):
        """
        Build the signed query parameters for one feed page.

        The signature is the SHA-256 hex digest of six strings (timestamp,
        random string, fixed key, feed type, page size, page number) sorted
        in ascending order and concatenated.

        :param page: page number to request
        :return: dict with ts, type, page_size, page, rs and signature
        :raises KeyError: if ``type_str`` is not a known feed type
        """
        ts = int(time.time()*1000)
        key = "$d6eb7ff91ee257475%"
        # Full 0-9, a-z, A-Z alphabet. The previous construction left out
        # '9' and could never draw '0'.
        alphabet = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
        rs = "".join(random.choice(alphabet) for _ in range(16))
        type_code = self.__data_type_dict[self.type_str]
        sign_parts = sorted(
            [str(ts), rs, key, str(type_code), str(self.__page_size), str(page)])
        signature = hashlib.sha256("".join(sign_parts).encode("utf8")).hexdigest()

        return {
            "ts": ts,
            "type": type_code,
            "page_size": self.__page_size,
            "page": page,
            "rs": rs,
            "signature": signature
        }

    @staticmethod
    def parse_json(jsondata):
        """
        Flatten the feed payload into one dict per complaint.

        :param jsondata: the ``data`` object of a feed response, or None
        :return: list of row dicts (the keys are the CSV column names), or
                 None when ``jsondata`` is None
        """
        if jsondata is None:
            return
        res_lists = []
        # The URLs in the payload are concatenated after this scheme prefix.
        host = "https:"
        for item in jsondata["lists"]:
            main = item["main"]
            author = item["author"]
            res_lists.append({
                "投诉编号": main["sn"],
                "投诉对象": main["cotitle"],
                "投诉标题": main["title"],
                # The timestamp is read as epoch seconds and shown in local time.
                "投诉时间": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(main["timestamp"]))),
                "投诉详情": main["summary"],
                "投诉要求": main["appeal"],
                "详情页面": host+main["url"],
                "投诉发起人昵称": author["title"],
                "投诉发起人微博": host+author["wb_profile"],
            })
        return res_lists

    def get_json(self, param_dict):
        """
        Request one feed page and return its ``data`` object.

        :param param_dict: query parameters, as produced by ``forged_param``
        :return: the ``result.data`` object on success, otherwise None
                 (non-200 response or a non-zero API status code)
        """
        # A timeout keeps a stalled connection from hanging the crawl forever.
        res = get(self.__url, param_dict, timeout=30)
        print(res.status_code)
        if res.status_code == 456:
            print("您被封啦，请等待5~60分钟自动解封")
        if res.status_code == 200:
            # Decode only successful responses, so an odd error body cannot
            # raise before the status has been looked at.
            result = json.loads(res.content.decode("utf8"))["result"]
            if result["status"]["code"] == 0:
                return result["data"]
            print("{}参数错误！检查参数".format(result["status"]))
        return None

    @staticmethod
    def write_file(filename, datas):
        """
        Write ``datas`` to ``./<filename>``; the format follows the extension.

        ``.csv`` files get the rows only, without a header line (``combine``
        writes the header once for the merged file). ``.json`` files get
        ``datas`` dumped as JSON. Any other extension yields an empty file.

        :param filename: target file name, relative to the working directory
        :param datas: list of row dicts for csv, any JSON-serialisable
                      object for json
        :return: ``len(datas)``
        """
        ext = filename.split('.')[-1]
        path = "./"+filename
        with open(path, "w", encoding="utf8", newline='') as f:
            if ext == "csv":
                # An empty page has no first row to take the columns from.
                if datas:
                    w = csv.DictWriter(f, fieldnames=list(datas[0].keys()))
                    w.writerows(datas)
            elif ext == "json":
                json.dump(datas, f, ensure_ascii=False)
        print("写入文件-[{}]-[{}]条数据".format(path, len(datas)))
        return len(datas)

    @staticmethod
    def file_walker(path):
        """
        Collect the per-page files found directly inside ``path``.

        Only names of the form ``result_<number>.csv`` and
        ``jsondata_<number>.json`` are accepted, so merged outputs or
        unrelated files starting with the same prefix are ignored instead of
        breaking the numeric sort. Subdirectories are not searched, because
        the returned names are later opened relative to the working
        directory.

        :param path: directory to scan
        :return: ``{"json": [...], "result": [...]}``, each list sorted by
                 page number
        """
        patterns = {"json": ("jsondata_", ".json"), "result": ("result_", ".csv")}
        found = {"json": [], "result": []}
        # The first os.walk entry is the top directory; a missing directory
        # produces no entries at all.
        filenames = next(os.walk(path), (path, [], []))[2]
        for fn in filenames:
            for kind, (prefix, suffix) in patterns.items():
                if fn.startswith(prefix) and fn.endswith(suffix):
                    digits = fn[len(prefix):-len(suffix)]
                    if digits.isdecimal():
                        found[kind].append((int(digits), fn))
        return {kind: [fn for _, fn in sorted(pairs)] for kind, pairs in found.items()}

    @staticmethod
    def combine(res_filename, filelist):
        """
        Merge per-page files into ``./<res_filename>`` and delete them.

        For csv the fixed header line is written first, followed by the
        content of every file. For json the result is a single object that
        maps ``result_<page>`` to the content of each file.

        :param res_filename: name of the merged file; any existing file of
                             that name is removed first
        :param filelist: per-page file names, relative to the working
                         directory; nothing is written when it is empty
        :return: None
        """
        ext = res_filename.split('.')[-1]
        resfilepath = "./"+res_filename
        if os.path.exists(resfilepath):
            os.remove(resfilepath)
        if len(filelist) == 0:
            return
        with open(resfilepath, 'wb') as fw:
            if ext == "csv":
                # Must list the same columns, in the same order, as parse_json.
                fw.write("投诉编号,投诉对象,投诉标题,投诉时间,投诉详情,投诉要求,详情页面,投诉发起人昵称,投诉发起人微博\n".encode('utf8'))
                for file in filelist:
                    file_path = "./"+file
                    with open(file_path, 'rb') as fr:
                        fw.write(fr.read())
                    os.remove(file_path)
            elif ext == "json":
                fw.write("{".encode("utf8"))
                for index, file in enumerate(filelist):
                    file_path = "./"+file
                    # Separator before every entry except the first. The
                    # position comes from enumerate rather than from an
                    # identity comparison of ints, which is unreliable for
                    # large values.
                    if index > 0:
                        fw.write(','.encode('utf8'))
                    fw.write('"result_{}":'.format(file.split('.')[0].split('_')[-1]).encode('utf8'))
                    with open(file_path, 'rb') as fr:
                        fw.write(fr.read())
                    os.remove(file_path)
                fw.write("}".encode("utf8"))

    def set_pages_pagesize(self, pages, page_size):
        """
        Set how many pages to crawl and how many items each page holds.

        A page size above 30 is replaced by 30 and the page count is raised
        so that ``pages * page_size`` items are still covered.

        :param pages: number of pages to request
        :param page_size: items per page
        """
        self.count = pages*page_size
        if page_size > self.__max_page_size:
            self.__page_size = self.__max_page_size
            # Ceiling division, so exact multiples do not add a spare page.
            self.__pages = -(-self.count // self.__page_size)
            print("警告：每页最大数据条数为30条，已为您选取最优选择：\n{}".format({"pages": self.__pages, "page_size": self.__page_size}))
        else:
            self.__pages = pages
            self.__page_size = page_size

    def get_pages_pagesize(self):
        """Return the current settings as ``{"pages": ..., "page_size": ...}``."""
        return {"pages": self.__pages, "page_size": self.__page_size}

    def __start(self, page, isMultithreading):
        """
        Fetch one page and write its raw and parsed data to disk.

        :param page: page number to fetch
        :param isMultithreading: True when called from a worker thread; the
                                 shared page count is then updated under a lock
        """
        params = self.forged_param(page)
        data_json = self.get_json(params)
        if data_json is None:
            return
        pager = data_json['pager']
        pages = pager['page_amount']
        # NOTE(review): the total is assumed to live under 'item_count';
        # if the key is absent the page amount is shown, as before. Confirm
        # against a real response.
        item_count = pager.get('item_count', pages)
        # One lock shared by all workers. It is held only for the update,
        # not for the request, so requests still overlap.
        if isMultithreading:
            self.__state_lock.acquire()
        try:
            if pages < self.__pages:
                self.__pages = pages
                print("----数据总页数-[{}]-数据总条数-[{}]--------".format(pages, item_count))
        finally:
            if isMultithreading:
                self.__state_lock.release()
        result_data = self.parse_json(data_json)
        self.write_file("jsondata_{}.json".format(page), data_json)
        self.write_file("result_{}.csv".format(page), result_data)

    def __pause_if_needed(self):
        """Count one request and sleep 10 s after every 41st one."""
        self.__current_thread_count += 1
        if self.__current_thread_count > 40:
            self.__current_thread_count = 0
            time.sleep(10)

    def thread_manage(self, f, kwargs):
        """
        Run ``f(**kwargs)`` in a new thread, with simple throttling.

        Each start is preceded by a 0.1 s pause. When the number of this
        crawler's running workers reaches the thread limit, the call blocks
        until the new worker has finished. Every 41st call is followed by a
        10 s pause.

        :param f: callable to run in the thread
        :param kwargs: keyword arguments passed to ``f``
        :return: None
        """
        t = threading.Thread(target=f, kwargs=kwargs)

        time.sleep(0.1)
        t.start()
        # Count only this crawler's own live workers, so unrelated threads
        # in the process do not affect the throttling.
        self.__threads = [w for w in self.__threads if w.is_alive()]
        self.__threads.append(t)
        if len(self.__threads) >= self.__max_thread_count:
            t.join()

        self.__pause_if_needed()

    def rm_file(self):
        """Delete the merged output files of a previous run, if present."""
        for file in ["./result.csv", "./jsondata.json"]:
            if os.path.exists(file):
                os.remove(file)

    def run(self, page_list=None, isMultithreading=True):
        """
        Crawl the pages in random order and merge the per-page files.

        :param page_list: page numbers to crawl; defaults to 1..pages. The
                          caller's list is not modified.
        :param isMultithreading: True (default) fetches pages in worker
                                 threads; False fetches them one after
                                 another in the calling thread
        :return: None
        """
        self.rm_file()
        print("------------一共爬取{}条数据，选取最优的爬取速度为 [每次抓取量:{}, 抓取次数:{}]-------------".format(
            self.count, self.__page_size, self.__pages))
        if page_list is None:
            page_count_list = list(range(1, self.__pages+1))
        else:
            # Work on a copy so the shuffle does not reorder the caller's list.
            page_count_list = list(page_list)
        random.shuffle(page_count_list)
        self.__threads = []
        for page in page_count_list:
            if isMultithreading:
                self.thread_manage(self.__start, kwargs={"page": page, 'isMultithreading': isMultithreading})
            else:
                # Same pacing as the threaded path, but no thread is created.
                time.sleep(0.1)
                self.__start(page, isMultithreading)
                self.__pause_if_needed()
            print(
                "--------------[{}]-[{}]-当前抓取次数-[{}]------------------".format(
                    self.host_str, self.type_str, page))
        # Wait for the workers by joining them. Polling the process-wide
        # thread count burns CPU and never ends when the host process has
        # threads of its own.
        for worker in self.__threads:
            worker.join()
        self.__threads = []
        fs_dict = self.file_walker("./")
        self.combine("result.csv", fs_dict["result"])
        self.combine("jsondata.json", fs_dict["json"])
        print("抓取完毕")


if __name__ == '__main__':
    # Crawl the newest complaints of the nationwide site.
    crawler = SinaTousu("全国投诉", "最新投诉")
    # 100 pages of 30 items each; get_pages_pagesize() shows the result.
    crawler.set_pages_pagesize(100, 30)

    def main():
        # crawler.run(isMultithreading=False) is the author's switch for
        # turning multithreading off.
        crawler.run()

    # Wrapped by hand rather than with the @ syntax; the name reported by
    # the timer is still "main".
    main = time_logging(main)
    main()

    """
    时间 - 访问次数
    25s - 180
     4m - 325
    41s - 90 
    41s - 95
    6m - 406
    11s - 40 
    
    封
    
    """

爬虫解析

网站接口主要的加密方式为 sha256
主要变量有：时间戳 ts；固定密钥 key="$d6eb7ff91ee257475%"；字符表 a（[0-9, a-z, A-Z]）；当前访问页码 page；每页数据量 page_size；访问类型对应的数字 type（按代码中的映射："最热投诉"=1、"最新投诉"=2、"已回复"=3、"已完成"=4）；以及从字符表 a 中随机抽取 16 个字符拼成的字符串 rs
先准备一个数组 base_sign，其中放入六个字符串：ts、rs、key、type、page_size、page
然后把 base_sign 升序排列转为字符串最后通过 sha256 加密这个字符串就可以得到 signature 的值
网站接口分为 全国站点 和 湖北站点 目前也只发现这两个站点
每次访问最多可以获取30条数据，网站上每次固定十条数据

代码解析

代码每次运行会删除上次爬的数据
代码加入了多线程（可以通过参数关闭）；目前多线程还有些小问题，请求过快时容易触发站点的反爬

测试图片

在这里插入图片描述

大概是访问了 80多次就封了
有大佬可以请教下这个反爬该怎么避免吗？

半根蓝白

关注

1
点赞
踩
6

收藏

觉得还不错? 一键收藏
2
评论
某投诉网站爬虫-获取对应的投诉信息

某投诉网站爬虫-获取对应的投诉信息此爬虫原本是一道面试题，说难不难，说简单也不简单；因为站点的反爬机制，我现在也没摸清楚，不过目前看来应该是你在一段时间内访问数据超过对应的数量就会封ip 5-10分钟，然后就可以正常访问了爬虫源代码# -*- coding:utf-8 -*-# @Time : 2020-12-03 13:20# @Author : BGLB# @Email : [email protected]# @Software : PyCharmimport csvim
复制链接

扫一扫