微信公众号历史文章抓取
WechatSpider
│ README.md
│ chromedriver.exe
│ main.py
│ gzhspider.py
│ requirements.txt
import csv
import json
import pathlib
import time
from queue import Queue
from threading import Thread

import requests
from selenium.webdriver.chrome.options import Options
from seleniumwire import webdriver
class Spider:
    """Crawl the historical article list of a WeChat Official Account.

    Workflow (driven by :meth:`start`):
      1. ``__init_headers`` opens the history page in a headless Chrome
         (via selenium-wire) and harvests the ``cookie`` request header.
      2. ``__crawling`` pages through the ``getmsg`` JSON endpoint, pushing
         each batch of article dicts onto an internal queue.
      3. ``__save`` drains the queue and appends ``index,title,url`` rows
         to a CSV file.

    Steps 2 and 3 run concurrently on two threads, connected by the queue;
    a ``None`` sentinel on the queue tells the saver to stop.
    """

    def __init__(self, url, sleep=3):
        """
        :param url: full ``profile_ext`` history-page URL *including* its
            query string (must carry __biz/scene/uin/key/pass_ticket).
        :param sleep: seconds to pause between page requests (rate limiting).
        """
        self.headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "cookie": None,  # filled in later by __init_headers
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "none",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
        }
        self.cookie = None
        self.url = url
        # Base URL with the query string stripped; used to recognize which
        # captured browser request belongs to the history page.
        self.root = url.split("?").pop(0)
        self.queue = Queue()  # hands article batches from crawler to saver
        self.SLEEP = sleep

    def __init_headers(self):
        """Capture the session cookie with a headless browser.

        Loads ``self.url`` in headless Chrome, scans the intercepted requests
        for one aimed at ``self.root`` that carries a ``cookie`` header, and
        copies that header into ``self.headers``.

        :return: True when a cookie was captured, False otherwise.
        """
        ch_options = Options()
        ch_options.add_argument("--headless")
        web = webdriver.Chrome(options=ch_options)
        try:
            web.get(self.url)
            for request in web.requests:
                captured = dict(request.headers.items())
                # Only accept the cookie from a request related to the
                # history page itself (its URL appears in the header dump).
                if "cookie" in captured and self.root in str(request.headers.items()):
                    self.headers["cookie"] = captured["cookie"]
                    print(">>请求头初始化完成...")
                    return True
        finally:
            # Always release the browser process (the original leaked it).
            web.quit()
        print("获取请求头部文件失败")
        return False

    def __get_params(self):
        """Build the query parameters for the ``getmsg`` JSON endpoint.

        Parses the raw query string of ``self.url`` by hand.  Deliberately
        does NOT percent-decode the values: tokens such as ``key`` must be
        forwarded exactly as received.

        :return: dict of request parameters (raises KeyError if the URL is
            missing one of the required tokens).
        """
        query = self.url.split('profile_ext?').pop()
        tokens = {}
        for pair in query.split("&"):
            parts = pair.split("=")
            name = parts.pop(0)
            # Re-join with '=' so values that themselves contain '=' survive.
            tokens[name] = "=".join(parts)
        return {
            'action': 'getmsg',
            '__biz': tokens['__biz'],
            'f': 'json',
            'offset': '0',
            'count': '10',
            'is_ok': '1',
            'scene': tokens['scene'],
            'uin': tokens['uin'],
            'key': tokens['key'],
            'pass_ticket': tokens['pass_ticket'],
            'wxtoken': '',
            'appmsg_token': '',
            'x5': '0'
        }

    def __crawling(self):
        """Fetch article batches page by page and enqueue them.

        Loops over the paginated JSON endpoint (10 items per page) until the
        server reports ``msg_count == 0`` or a request fails.  A ``None``
        sentinel is ALWAYS enqueued on exit — even on an unexpected exception
        — so the saver thread can never block forever.
        """
        page = 0
        params = self.__get_params()
        print(">>循环抓取中...")
        try:
            while True:
                params['offset'] = page * 10
                # NOTE(review): the endpoint is queried over plain http here,
                # mirroring the original behavior — confirm this is intended.
                url = self.url.replace("https", "http")
                res = requests.get(url=url + "?", headers=self.headers, params=params)
                # Parse defensively: a non-JSON body (e.g. an HTML error
                # page) must not crash the thread.
                try:
                    data = res.json() if res.status_code == 200 else None
                except ValueError:
                    data = None
                if data is None or "general_msg_list" not in data:
                    print("数据抓取失败")
                    print(data)
                    break
                if data.get('msg_count', 0) == 0:
                    break  # no more articles
                # general_msg_list is itself a JSON-encoded string.
                articles = json.loads(data['general_msg_list'])['list']
                self.queue.put([x['app_msg_ext_info'] for x in articles])
                page += 1
                print(">>成功抓取" + str(page) + "页!!!")
                time.sleep(self.SLEEP)
        finally:
            self.queue.put(None)  # unblock the saver no matter what

    def __save(self, filename="test", dir_path="./"):
        """Drain the queue and append article rows to ``<dir_path>/<filename>.csv``.

        Each row is ``index,title,content_url`` with proper CSV quoting
        (titles containing quotes/commas no longer corrupt the file, and the
        original's spurious trailing comma per row is gone).  Stops when the
        ``None`` sentinel is received.
        """
        out_dir = pathlib.Path(dir_path)
        out_dir.mkdir(parents=True, exist_ok=True)
        file_path = out_dir / (filename + ".csv")
        while True:
            batch = self.queue.get()
            if batch is None:
                break
            # newline='' is required for the csv module on Windows.
            with open(file_path, "a", encoding='utf-8', newline='') as f:
                writer = csv.writer(f, quoting=csv.QUOTE_ALL)
                for i, item in enumerate(batch, start=1):
                    writer.writerow([i, item['title'], item['content_url']])

    def start(self, save_path, filename):
        """Run the crawl: init headers, then crawl and save on two threads.

        :param save_path: directory the CSV file is written into.
        :param filename: CSV file name without the ``.csv`` suffix.
        """
        if self.__init_headers():
            th_craw = Thread(target=self.__crawling)
            th_save = Thread(target=self.__save, args=(filename, save_path))
            th_craw.start()
            th_save.start()
            th_craw.join()
            th_save.join()
            print("爬取完成_ok_ok")
        print("程序退出..")
from gzhspider import Spider

# Entry-point configuration: set HOME_URL to the target account's
# history-page URL (required) before running `python main.py`.
HOME_URL = "xxxxxxxxxxxxxxx"
SAVE_PATH = "./"
FILE_NAME = "test1"
SLEEP = 3


def main():
    """Build a Spider from the module-level settings and run the crawl."""
    crawler = Spider(HOME_URL, SLEEP)
    crawler.start(SAVE_PATH, FILE_NAME)


if __name__ == '__main__':
    main()
requests==2.28.1
selenium==4.7.2
selenium_wire==5.1.0
## 推荐python版本 3.9.12
1.安装目录内 ChromeSetup.exe(依赖Chrome浏览器108.0.5359)
2.安装依赖库 requests 和 selenium-wire
推荐使用 pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt 安装依赖(必须在本文件同目录下运行)
3.修改main.py内参数(可设置保存路径和文件名, HOME_URL必传)
4.命令行运行 main.py文件 : python main.py
爬虫执行依赖 chromedriver.exe,其版本需要与自己的 Chrome 浏览器版本对应(需要安装 Chrome 浏览器才能使用本爬虫程序)
下载地址:http://chromedriver.storage.googleapis.com/index.html