# -*- coding:utf-8 -*-
import json
import random
import re
import time
from bs4 import BeautifulSoup
from datetime import datetime
from pyExcelerator import * # 导入excel相关包
import requests
import sys
reload(sys)  # Python 2 only: site.py removes sys.setdefaultencoding at startup; reload restores it
sys.setdefaultencoding('utf-8')  # make implicit str<->unicode conversions use UTF-8 instead of ASCII
# from utils import pgs, es
class WxMps(object):
def __init__(self, _biz, _pass_ticket, _app_msg_token, _cookie, wap_sid2, _offset=0):
self.start_time = 1575129600 # 截止开始时间 2019/12/01 00:00:00
self.offset = _offset
self.biz = _biz # 公众号标志
self.msg_token = _app_msg_token # 票据(非固定)
self.pass_ticket = _pass_ticket # 票据(非固定)
self.wap_sid2 = wap_sid2
self.headers = {
'Cookie': _cookie, # Cookie(非固定)
'User-Agent': 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 '
}
self.LIKE_URL = 'https://mp.weixin.qq.com/mp/getappmsgext?__biz=%s&appmsg_type=9&mid=%s&sn=%s&idx=%s&appmsg_token=%s&is_need_ad=0'
# excel 第一行数据
self.excel_data = [u'编号', u'时间', u'文章标题', u'关键词', u'阅读量', u'点赞数', u'评论', u'文章地址', u'文章内容']
# 定义excel操作句柄
self.excle_w = Workbook()
excel_sheet_name = time.strftime('%Y-%m-%d')
self.excel_content = self.excle_w.add_sheet(excel_sheet_name)
self.container = []
self.line = 1
# wx_mps = 'wxmps' # 这里数据库、用户、密码一致(需替换成实际的)
# self.postgres = pgs.Pgs(host='localhost', port='12432', db_name=wx_mps, user=wx_mps, password=wx_mps)
# self.elastic = es.Es(host='localhost', port=12900, index='mp', doc='article')
def log(self, msg):
print u'%s: %s' % (time.strftime('%Y-%m-%d %H:%M:%S'), msg)
def start(self):
"""请求获取公众号的文章接口"""
cols = 0
for data in self.excel_data:
self.excel_content.write(0, cols, data)
cols += 1
self.excle_w.save('test3.xls')
count = 0
offset = self.offset
while True:
api = 'https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz={0}&f=json&offset={1}' \
'&count=10&is_ok=1&scene=124&uin=777&key=777&pass_ticket={2}&wxtoken=&appmsg_token' \
'={3}&x5=1&f=json'.format(self.biz, offset, self.pass_ticket, self.msg_token)
resp = requests.get(api, headers=self.headers, verify=False).json()
ret, status = resp.get('ret'), resp.get('errmsg') # 状态信息
self.log(u'状态信息,ret:%s' % ret)
if 0 == ret or 'ok' == status:
# print('Crawl article: ' + api)
general_msg_list = resp['general_msg_list']
msg_list = json.loads(general_msg_list)['list'] # 获取文章列表
self.log(u'文章列表个数:%s' % len(msg_list))
for msg in msg_list:
self.log(u'文章(%d/%d)' % (count, len(msg_list)))
# if count >= 11:
# return
# try:
# if count % 10 == 0 and count != 0:
# print '写入Excel'
# first = count - 10
# for line in range(first, count):
# cols = 0
# for data in self.excel_data:
# self.excel_content.write(line, cols, self.container[line][cols])
# cols += 1
# self.excle_w.save('test3.xls')
# except Exception as e:
# print(str(e))
comm_msg_info = msg['comm_msg_info'] # 该数据是本次推送多篇文章公共的
msg_id = comm_msg_info['id'] # 文章id
self.log(u'文章id:%s' % msg_id)
post_time = datetime.fromtimestamp(comm_msg_info['datetime']) # 发布时间
s_time = datetime.fromtimestamp(self.start_time) # 发布时间
if post_time.__lt__(s_time):
print "时间已到截止日期前,停止爬虫"
return
self.log(u'文章时间:%s' % post_time)
msg_type = comm_msg_info['type'] # 文章类型
self.log(u'文章类型:%s' % msg_type)
# msg_data = json.dumps(comm_msg_info, ensure_ascii=False) # msg原数据
# self.log(u'文章原始数据:%s' % msg_data)
if 49 == msg_type:
# 图文消息
app_msg_ext_info = msg.get('app_msg_ext_info') # article原数据
if app_msg_ext_info:
# 本次推送的首条文章
self._parse_articles(app_msg_ext_info, msg_id, post_time, msg_type, count)
count += 1
# 本次推送的其余文章
multi_app_msg_item_list = app_msg_ext_info.get('multi_app_msg_item_list')
if multi_app_msg_item_list:
for item in multi_app_msg_item_list:
msg_id = item['fileid'] # 文章id
if msg_id or not isinstance(msg_id, int):
msg_id = int(time.time()) # 设置唯一id,解决部分文章id=0出现唯一索引冲突的情况
self._parse_articles(item, msg_id, post_time, msg_type, count)
count += 1
elif 1 == msg_type:
# 文字消息
content = comm_msg_info.get('content')
# if content:
# self._save_text_and_image(msg_id, post_time, msg_type, digest=content)
elif 3 == msg_type:
# 图片消息
image_msg_ext_info = msg.get('image_msg_ext_info')
cdn_url = image_msg_ext_info.get('cdn_url')
# if cdn_url:
# self._save_text_and_image(msg_id, post_time, msg_type, cover=cdn_url)
# 0:结束;1:继续
can_msg_continue = resp.get('can_msg_continue')
if not can_msg_continue:
print('Break , Current offset : %d' % offset)
break
offset = resp.get('next_offset') # 下一次请求偏移量
print('Next offset : %d' % offset)
@staticmethod
def crawl_article_content(content_url):
"""抓取文章内容
:param content_url: 文章地址
"""
try:
html = requests.get(content_url, verify=False).text
except:
print(content_url)
pass
else:
bs = BeautifulSoup(html, 'html.parser')
js_content = bs.find(id='js_content')
if js_content:
p_list = js_content.find_all('p')
content_list = list(map(lambda p: p.text, filter(lambda p: p.text != '', p_list)))
content = ''.join(content_list)
return content
def _parse_articles(self, info, msg_id, post_time, msg_type, count):
# """解析嵌套文章数据并保存入库"""
row = []
row.append(count)
row.append(str(post_time))
self.log(u'post_time:%s' % post_time)
title = info.get('title') # 标题
row.append(title)
self.log(u'标题:%s' % title)
cover = info.get('cover') # 封面图
author = info.get('author') # 作者
self.log(u'作者:%s' % author)
digest = info.get('digest') # 关键字
row.append(digest)
self.log(u'关键字:%s' % digest)
# source_url = info.get('source_url') # 原文地址
content_url = info.get('content_url') # 微信地址
# ext_data = json.dumps(info, ensure_ascii=False) # 原始数据
content_url = content_url.replace('amp;', '').replace('#wechat_redirect', '').replace('http', 'https')
self._parse_article_detail(content_url, row)
row.append(content_url)
self.log(u'文章链接:%s' % content_url)
content = self.crawl_article_content(content_url)
self.log(u'文章内容:%s' % content)
row.append(content)
self.container.append(row)
try:
print '写入Excel, line:', self.line
cols = 0
for data in row:
if data:
self.excel_content.write(self.line, cols, data)
cols += 1
self.line += 1
self.excle_w.save('test3.xls')
except Exception as e:
print(str(e))
time.sleep(random.randint(2, 3))
def _parse_article_detail(self, content_url, row):
# """从文章页提取相关参数用于获取评论,article_id是已保存的文章id"""
try:
resp = requests.get(content_url, headers=self.headers, verify=False)
except Exception as e:
print('获取评论失败')
else:
# group(0) is current line
readNum, likeNum = self.getMoreInfo(content_url)
row.append(readNum)
row.append(likeNum)
html = resp.text
str_comment = re.search(r'var comment_id = "(.*)" \|\| "(.*)" \* 1;', html)
str_msg = re.search(r"var appmsgid = '' \|\| '(.*)'\|\|", html)
str_token = re.search(r'window.appmsg_token = "(.*)";', html)
if str_comment and str_msg and str_token:
comment_id = str_comment.group(1) # 评论id(固定)
app_msg_id = str_msg.group(1) # 票据id(非固定)
appmsg_token = str_token.group(1) # 票据token(非固定)
# 缺一不可
if appmsg_token and app_msg_id and comment_id:
print('Crawl article comments')
row.append(self._crawl_comments(app_msg_id, comment_id, appmsg_token))
def getMoreInfo(self, link):
# 获得mid,_biz,idx,sn 这几个在link中的信息
mid = link.split("&")[1].split("=")[1]
idx = link.split("&")[2].split("=")[1]
sn = link.split("&")[3].split("=")[1]
_biz = link.split("&")[0].split("_biz=")[1]
# fillder 中取得一些不变得信息
req_id = "1121Emzkpus2YzixHv8OZPro"
pass_ticket = self.pass_ticket
appmsg_token = self.msg_token
# 目标url
url = "http://mp.weixin.qq.com/mp/getappmsgext"
# 添加Cookie避免登陆操作,这里的"User-Agent"最好为手机浏览器的标识
# phoneCookie = "devicetype=iOS11.3; lang=zh_CN; pass_ticket=AXCuXaypgHXqC3Grop/GDPNNyOIKC/Any1k/BMd35JUK805ZvVRvZ47cKc6S7Xcj; rewardsn=; version=2607033d; wap_sid2=CInatZ8IElwxY0hJQUZPWjFwdkZrYU9kLUw0UUdId0htQkY3VnNzeENXRncyTlkzNVhqcHhtWWJrNENBTGcyQThlRnNkbEhJYmxfb3ZXRG1Rb2RsTTFFWHJlWWM2OThEQUFBfjDclOHhBTgNQAE=; wxtokenkey=777; wxuin=2213375241; pgv_pvid=2213375241"
phoneCookie = "rewardsn=; wxuin=2213375241; devicetype=android-24; lang:zh_CN; wxtokenkey=777; pgv_info=ssid=s2906320611; pgv_pvid=1873061148; version:2607033d; pass_ticket=%s; wap_sid2=%s" % (pass_ticket, self.wap_sid2)
headers = {
"Cookie": phoneCookie,
# "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E216 MicroMessenger/6.7.2 NetType/WIFI Language/zh_CN"
"User-Agent":"Mozilla/5.0 (Linux; Android 7.0; JMM-AL10 Build/HONORJMM-AL10; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/66.0.3359.126 MQQBrowser/6.2 TBS/044431 Mobile Safari/537.36 MMWEBID/3894 MicroMessenger/6.7.3.1360(0x2607033D) NetType/WIFI Language/zh_CN Process/toolsmp"
}
# 添加data,`req_id`、`pass_ticket`分别对应文章的信息,从fiddler复制即可。
data = {
"is_only_read": 1,
"req_id": req_id,
"pass_ticket": pass_ticket,
"is_temp_url": 0,
"appmsg_like_type": 2,
"more_read_type": 0,
"item_show_type": 0,
"appmsg_type": 9
}
"""
添加请求参数
__biz对应公众号的信息,唯一
mid、sn、idx分别对应每篇文章的url的信息,需要从url中进行提取
key、appmsg_token从fiddler上复制即可
pass_ticket对应的文章的信息,也可以直接从fiddler复制
"""
params = {
"__biz": _biz,
"mid": mid,
"sn": sn,
"idx": idx,
"key": "777",
"pass_ticket": pass_ticket,
"appmsg_token": appmsg_token,
"uin": "777",
"wxtoken": "777"
}
# 使用post方法进行提交
content = requests.post(url, headers=headers, data=data, params=params).json()
# 提取其中的阅读数和点赞数
print '提取其中的阅读数和点赞数:'
print(content["appmsgstat"]["read_num"], content["appmsgstat"]["like_num"])
readNum = content["appmsgstat"]["read_num"]
likeNum = content["appmsgstat"]["like_num"]
# 歇10s,防止被封
# time.sleep(15)
return readNum, likeNum
def _crawl_comments(self, app_msg_id, comment_id, appmsg_token):
"""抓取文章的评论"""
commentList = []
api = 'https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&scene=0&__biz={0}' \
'&appmsgid={1}&idx=1&comment_id={2}&offset=0&limit=100&uin=777&key=777' \
'&pass_ticket={3}&wxtoken=777&devicetype=android-26&clientversion=26060739' \
'&appmsg_token={4}&x5=1&f=json'.format(self.biz, app_msg_id, comment_id,
self.pass_ticket, appmsg_token)
try:
resp = requests.get(api, headers=self.headers, verify=False).json()
except:
print('Article {} no Comment')
else:
ret, status = resp['base_resp']['ret'], resp['base_resp']['errmsg']
if ret == 0 or status == 'ok':
elected_comment = resp['elected_comment']
for comment in elected_comment:
nick_name = comment.get('nick_name') # 昵称
# self.log(u'昵称:%s' % nick_name)
logo_url = comment.get('logo_url') # 头像
comment_time = datetime.fromtimestamp(comment.get('create_time')) # 评论时间
content = comment.get('content') # 评论内容
# self.log(u'评论内容:%s' % content)
content_id = comment.get('content_id') # id
like_num = comment.get('like_num') # 点赞数
# self.log(u'点赞数:%s' % like_num)
commentList.append("{昵称:%s, 时间:%s, 内容:%s, 点赞数:%s}" % (nick_name, comment_time, content, like_num))
reply_list = comment.get('reply')['reply_list'] # 回复数据
reply_content, reply_like_num, reply_create_time = None, None, None
if reply_list:
first_reply = reply_list[0]
reply_content = first_reply.get('content')
reply_like_num = first_reply.get('reply_like_num')
reply_create_time = datetime.fromtimestamp(first_reply.get('create_time'))
return "; ".join(commentList)
if __name__ == '__main__':
    # Session parameters captured from a logged-in WeChat client (via
    # Fiddler).  biz and app_msg_token identify the official account; the
    # remaining values belong to the current WeChat session and expire.
    biz = 'MjM5MjAxNDM4MA=='
    app_msg_token = '1051_V0N4pX2JM165AQpdDhHFM3K4cphiT4GBURq37O2BI8nQbqkudDryhnItAI2SzgumjOImQzBdx5wIFaGS'
    pass_ticket = '2uTeu/aMHkWuuQNFqUHXlRssyPyzYLOYAYRd9vNuaa72kVwfxe+/GQqdJlTFXVCI'
    wap_sid2 = 'CInatZ8IElxKYnFndm1keTc4cGFwenI1YnVSVzZmalM5Mjh6LTJGZlhrb1VDR1lJZ3kzam4wanhOTm1EYUlMQ0I5LS11ZmFTSFkwOV9qdHdidjFCZ1hLZV9nM2Y4eHNFQUFBfjCJlI3zBTgNQAE='
    cookie = 'wxuin=2213375241; version=2607033d; pass_ticket={}; wap_sid2={}'.format(pass_ticket, wap_sid2)
    # Build the crawler and start fetching articles and comments.
    crawler = WxMps(biz, pass_ticket, app_msg_token, cookie, wap_sid2)
    crawler.start()