Scraping Historical Articles from a WeChat Official Account

  • Directory structure
WechatSpider
│   README.md
│   chromedriver.exe
│   main.py   
│   gzhspider.py
│   requirements.txt
  • gzhspider.py file
import json
import time
import pathlib
import requests
from queue import Queue
from threading import Thread
from seleniumwire import webdriver
from selenium.webdriver.chrome.options import Options


class Spider:
    def __init__(self, url, sleep=3):
        self.headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "cookie": None,
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "none",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
        }
        self.cookie = None
        self.url = url
        self.root = url.split("?").pop(0)  # home-page URL without its query string
        self.queue = Queue()
        self.SLEEP = sleep

    def __init_headers(self):
        ch_options = Options()
        ch_options.add_argument("--headless")  # run Chrome in headless mode
        web = webdriver.Chrome(options=ch_options)
        web.get(self.url)
        # selenium-wire records every request the page made; find the one sent to the
        # official account's home page and reuse the cookie the browser attached to it.
        for request in web.requests:
            headers = dict(request.headers.items())
            if self.root in request.url and "cookie" in headers:
                self.headers["cookie"] = headers["cookie"]
                print(">> Request headers initialised...")
                web.quit()
                return True
        web.quit()
        print("Failed to obtain the cookie from the request headers")
        return False

    def __get_params(self):
        # Rebuild the request parameters (__biz, scene, uin, key, pass_ticket, ...)
        # from the query string of the official account's home-page URL.
        url = self.url.split('profile_ext?').pop()
        url = url.split("&")

        d = {}
        for i in url:
            li = i.split("=")
            key = li.pop(0)
            d[key] = "=".join(li)
        params = {
            'action': 'getmsg',
            '__biz': d['__biz'],
            'f': 'json',
            'offset': '0',
            'count': '10',
            'is_ok': '1',
            'scene': d['scene'],
            'uin': d['uin'],
            'key': d['key'],
            'pass_ticket': d['pass_ticket'],
            'wxtoken': '',
            'appmsg_token': '',
            'x5': '0'
        }
        return params

    def __crawling(self):
        page = 0
        params = self.__get_params()
        print(">> Crawling pages...")
        while True:
            # The getmsg API pages through the history ten articles at a time.
            params['offset'] = page * 10
            url = self.url.replace("https", "http")
            res = requests.get(url=url + "?", headers=self.headers, params=params)
            data = res.json()
            if res.status_code == 200 and "general_msg_list" in data:
                if data['msg_count'] == 0:
                    self.queue.put(None)  # no more articles: tell the saver thread to stop
                    break
                data = data['general_msg_list']   # this field is itself a JSON string
                data = json.loads(data)['list']   # now a list of dicts, one per push
                # Keep only the article metadata of each push.
                nex_data = [x['app_msg_ext_info'] for x in data]
                self.queue.put(nex_data)
                page += 1
                print(">> Fetched page " + str(page))
            else:
                print("Failed to fetch data")
                print(data)
                self.queue.put(None)
                break
            time.sleep(self.SLEEP)

    def __save(self, filename="test", dir_path="./"):
        if not dir_path.endswith("/"):
            dir_path = dir_path + "/"
        suffix = ".csv"
        pathlib_dir_path = pathlib.Path(dir_path)
        pathlib_dir_path.mkdir(parents=True, exist_ok=True)  # create the output directory if needed

        file_ab_name = dir_path + filename + suffix
        while True:
            nex_data = self.queue.get()
            if nex_data is None:  # sentinel from the crawler: nothing more to save
                break

            # Append one line per article: index, "title", "content_url"
            with open(file_ab_name, "a+", encoding='utf-8') as f:
                for i in range(len(nex_data)):
                    line_id = '{}'.format(str(i + 1))
                    title = '"{}"'.format(nex_data[i]['title'])
                    url = '"{}"'.format(nex_data[i]['content_url'])
                    data_line = [line_id, title, url, '\n']
                    f.writelines(",".join(data_line))

    def start(self, save_path, filename):
        if self.__init_headers():
            # Crawl and save concurrently: the crawler fills the queue, the saver drains it.
            th_craw = Thread(target=self.__crawling)
            th_save = Thread(target=self.__save, args=(filename, save_path))
            th_craw.start()
            th_save.start()
            th_craw.join()
            th_save.join()
            print("Crawling finished")
        print("Exiting...")
        
  • main.py file
from gzhspider import Spider


# [*** Edit the configuration parameters below before running ***]
# Python 3.9.12 is recommended.
# URL of the official account's home page
HOME_URL = "xxxxxxxxxxxxxxx"

# Directory in which to save the output file
SAVE_PATH = "./"

# Output file name (no extension needed)
FILE_NAME = "test1"

# Delay between requests, in seconds
SLEEP = 3


if __name__ == '__main__':
    spider = Spider(HOME_URL, SLEEP)
    spider.start(SAVE_PATH, FILE_NAME)
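
After a run, the articles end up in SAVE_PATH/FILE_NAME.csv, one row per article in the form index, "title", "content_url" (the file has no header row). A minimal sketch for reading the result back, assuming the default ./test1.csv produced by the configuration above:

import csv

with open("./test1.csv", encoding="utf-8") as f:
    for row in csv.reader(f):
        if len(row) >= 3:
            print(row[1], row[2])  # article title and its URL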

  • requirements.txt file
requests==2.28.1
selenium==4.7.2
selenium_wire==5.1.0

  • README.md file
## Recommended Python version: 3.9.12
1. Install ChromeSetup.exe from this directory (the crawler depends on Chrome 108.0.5359).
2. Install the dependencies requests and selenium-wire.
    Recommended: pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt (run it from the same directory as this file).
3. Edit the parameters in main.py (the save path and file name are optional; HOME_URL is required).
4. Run main.py from the command line: python main.py

  • chromedriver.exe file
Required by the crawler. Its version must match your locally installed Chrome browser (Chrome must be installed for this crawler to work).
Download: http://chromedriver.storage.googleapis.com/index.html
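
Keeping chromedriver.exe in sync with the locally installed Chrome by hand is error-prone. As an optional sketch (assumption: the webdriver-manager package is installed in addition to what requirements.txt lists), the matching driver can be downloaded automatically:

from seleniumwire import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager  # assumption: pip install webdriver-manager

ch_options = Options()
ch_options.add_argument("--headless")
# Let webdriver-manager fetch a chromedriver that matches the installed Chrome.
web = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=ch_options)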