实现内容为:利用微信公众号平台,获取任一检索到的公众号的文章标题、超链接以及更多信息,并汇总为 xlsx 文件,便于浏览和存储。本文仅供技术研究,请勿用于非法采集,后果自负。
准备工作
PC端
1.登录微信公众号平台
2.找到图文编辑
3.在图文/文字编辑里,找到超链接,并打开
4.打开后,目前选择的账号是:哇凉啊哇凉,按下F12
获取要素
1.token
token 获取:在 F12 的"元素"面板里,按 Ctrl+F 检索 token,复制备用
2.cookie
cookie获取,在F12-网络,箭头选择账号位置
点击"名称"下的任意请求,然后打开"标头",找到 cookie,复制备用
3.nickname,即账号的名字
代码运行
保持该网页处于打开状态(不要退出登录),然后打开 PyCharm
代码如下
import traceback
import requests
import pandas as pd
"""
日期:2024年11月28日
公众号:哇凉哇哇凉
声明:本文仅供技术研究,请勿用于非法采集,后果自负。
"""
class WeChatSpider:
    """Scrape article titles and links of a WeChat Official Account.

    Uses the mp.weixin.qq.com backend endpoints (``searchbiz`` and
    ``appmsg``), which require a valid login ``cookie`` and ``token``
    copied from a logged-in browser session (see the accompanying
    article for how to obtain them).

    Args:
        cookie: The full ``Cookie`` header value from the browser.
        token: The ``token`` query parameter from the editor page URL.
        timeout: Per-request timeout in seconds; prevents the spider
            from hanging forever on a stalled connection.
    """

    def __init__(self, cookie, token, timeout=10):
        self.session = requests.Session()
        self.timeout = timeout
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/120.0.0.0 Safari/537.36",
            "Cookie": cookie,
        }
        # Query parameters shared by every API call.
        self.base_params = {
            "lang": "zh_CN",
            "f": "json",
            "token": token,
        }

    def get_fakeid(self, nickname, begin=0, count=5):
        """Look up an account's ``fakeid`` by its nickname.

        Returns the ``fakeid`` of the first search result, or ``None``
        when the search returns no matches.

        Raises:
            Exception: If the HTTP request fails or the response cannot
                be decoded; the underlying error is chained.
        """
        search_url = "https://mp.weixin.qq.com/cgi-bin/searchbiz"
        params = {
            **self.base_params,
            "action": "search_biz",
            "query": nickname,
            "begin": begin,
            "count": count,
            "ajax": "1",
        }
        try:
            response = self.session.get(
                search_url,
                headers=self.headers,
                params=params,
                timeout=self.timeout,  # avoid hanging indefinitely
            )
            response.raise_for_status()
            data = response.json()
            if data.get("list"):
                return data["list"][0].get("fakeid")
            return None
        except Exception as e:
            # Chain the original exception so the root cause survives.
            raise Exception(
                f"获取公众号{nickname}的fakeid失败: {traceback.format_exc()}"
            ) from e

    def get_articles(self, fakeid, begin=0, count=29):
        """Fetch the account's full article list, paging until exhausted.

        Returns a list of ``{"标题": ..., "链接": ...}`` dicts (keys kept
        in Chinese because the Excel export uses them as column headers).

        Raises:
            Exception: If any page request fails; the underlying error
                is chained.
        """
        art_url = "https://mp.weixin.qq.com/cgi-bin/appmsg"
        all_articles = []
        while True:
            params = {
                **self.base_params,
                "query": "",
                "begin": begin,
                "count": count,
                "type": 9,
                "action": "list_ex",
                "fakeid": fakeid,
            }
            try:
                response = self.session.get(
                    art_url,
                    headers=self.headers,
                    params=params,
                    timeout=self.timeout,  # avoid hanging indefinitely
                )
                response.raise_for_status()
                data = response.json()
                page = data.get("app_msg_list")
                if page is None:
                    # Error response (e.g. expired session) — stop paging.
                    break
                all_articles.extend(
                    {"标题": item.get("title"), "链接": item.get("link")}
                    for item in page
                )
                if len(page) < count:
                    # A short page means we have reached the last one.
                    break
                begin += count  # advance to the next page
            except Exception as e:
                raise Exception(
                    f"获取fakeid={fakeid}的文章失败: {traceback.format_exc()}"
                ) from e
        return all_articles

    def fetch_articles_by_nickname(self, nickname, begin=0, count=5):
        """Resolve *nickname* to a fakeid, then fetch its articles.

        Raises:
            ValueError: If no account matching *nickname* is found.
        """
        fakeid = self.get_fakeid(nickname, begin, count)
        if not fakeid:
            raise ValueError(f"未找到公众号 {nickname} 的 fakeid")
        return self.get_articles(fakeid, begin, count)
def main():
    """Entry point: fetch every article of the target account and export
    the titles/links to ``<nickname>_articles.xlsx`` in the current
    working directory.
    """
    # Fill in your OWN credentials, copied from a logged-in
    # mp.weixin.qq.com browser session (F12 -> Network for the cookie,
    # page source for the token). Published credentials expire quickly
    # and should never be shared.
    cookie = "RK=q4EkVA94P3; ptcz=2f39=0; _qimeq36=; _qimei_h38=e14777d97cd7ee6ac3036c4102000006118102; eas_sid=01m7L0Z7U3N0Y8u0s4n1x2V2F0; LW_uid=k1h760t7Q380z862i1j2b4c3o4; qq_domain_video_guid_verify=be0e8adaedba6bc0; ua_id=aovHsyXofV8GQ73WAAAAAF1OHXtBu5_g57k3fC36kG0=; wxuin=09049720819582; mm_lang=zh_CN; ts_uid=2829397120; fqm_pvqid=6873888a-f647-49b3-94bd-7b5ca1e10c45; suid=user_1_1479147389; pac_uid=0_xBRhKTZ9mWx25; ptui_loginuin=1479147389; LW_sid=N1J7x3x2Q3u405I4T9H8Z5v8z3; _qimei_fingerprint=ba704b74df0639c6cf78985f43709354; uuid=b23809642eb91de37f35a5e45e7e5599; _clck=dqu2xs|1|fr8|0; rand_info=CAESIIDvgl4ECzFuI/uWIXtlvxczAEp8GrLHYacoHaVeUe1Q; slave_bizuin=3094180885; data_bizuin=3094180885; bizuin=3094180885; data_ticket=XHRoGDl/49F/nlNyI53uSO3iwIKgph2CkSZD9YDwXdBfgeovFbKLkV615FDVUCD3; slave_sid=UVg2b19BUDNvWjBkM0hfMXlubXU0S2lwZWk0N0U4bHg5QTFhdzh3RFdXS0NQSVN2MWJZTlliSjVfNDdycXFvM0JlMURoNTZlN01Qc1djeEpyeHZUUk1EZDVRRmxGQ2gyZFpoYmh2aU5tdE8zcE5RS0lpdnJnYXVmUTZtNFBqeFVkSVJtb09RWEpxMkt6ZVMy; slave_user=gh_783e32eae883; xid=295606d1be7e037ac4f8e27ad4e98de4; rewardsn=; wxtokenkey=777; _clsk=lxhd5g|1732686481063|7|1|mp.weixin.qq.com/weheat-agent/payload/record"
    token = "179888914"  # must be a currently valid token
    nickname = "哇凉啊哇凉"

    spider = WeChatSpider(cookie, token)
    try:
        articles = spider.fetch_articles_by_nickname(nickname)
        if not articles:
            print(f"未获取到公众号 {nickname} 的任何文章。")
            return
        # Convert the list of dicts into a DataFrame for export.
        df = pd.DataFrame(articles)
        # Write next to the script (portable — the original hard-coded a
        # specific user's desktop path). NOTE: `DataFrame.to_excel` no
        # longer accepts an `encoding` argument (removed in pandas 1.5);
        # openpyxl always writes UTF-8, so Chinese text is safe anyway.
        excel_file = f"{nickname}_articles.xlsx"
        df.to_excel(excel_file, index=False)
        print(f"成功将文章导出到 {excel_file}")
    except Exception as e:
        print(f"发生错误: {e}")


if __name__ == "__main__":
    main()
运行结果如下
对应文件为
声明:本文仅供技术研究,请勿用于非法采集,后果自负。