# Scrape user comments from the Weibo mobile client (爬取微博客户端用户评论)
'''
# -*- coding:utf-8 -*-
@FileName: weibo_comment.py
@Date : 2018/12/5 13:01
@Author : 风夏
@Hobby : mi mi 猫
'''
import requests
import json
import time
import emoji #pip install emoji
import schedule #pip install schedule
import re
# Hot-comment endpoint of the Weibo mobile site. The first page is requested
# without max_id; later pages pass the max_id cursor returned by the previous
# response.
_FIRST_PAGE_URL = 'https://m.weibo.cn/comments/hotflow?id={0}&mid={0}&max_id_type=0'
_NEXT_PAGE_URL = 'https://m.weibo.cn/comments/hotflow?id={0}&mid={0}&max_id={1}&max_id_type=0'

# Matches the ":alias:" tokens that emoji.demojize() substitutes for emoji.
_EMOJI_ALIAS = re.compile(r":\S+?:")

# Request headers shared by every page request.
# NOTE(review): the raw cookie string is sent as the Cookie header. Passing it
# as cookies={'cookie': ...} would send a single cookie literally named
# "cookie" instead of the individual cookies in the string.
# NOTE(security): this is a hard-coded session credential; it should be loaded
# from the environment or a config file rather than committed in source.
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:69.0) Gecko/20100101 Firefox/69.0',
    'Cookie': '_T_WM=85071206373; WEIBOCN_FROM=1110006030; XSRF-TOKEN=9bf1a0; SUB=_2A25yz27UDeRhGeNN71MX8i3JzDuIHXVuMHKcrDV6PUJbktCOLUndkW1NSaEirof-g7dxz5Gao5BdGJb0jv5aSO6i; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFxSSvsZRv-x2F6AKv8nL435NHD95Qfe0BpSoz0SKMNWs4Dqcj.i--fi-z7iKysi--NiKyhi-8Fi--ci-2Ei-2Ni--ci-2piK.7; SSOLoginState=1607147140; MLOGIN=1',
}


def task(weibo_id='4578675531450036', max_pages=2):
    """Fetch hot comments of one Weibo post and append them to comment.txt.

    Pages are requested one after another; each response supplies the
    ``max_id`` cursor used to build the next page's URL. For every comment
    the author's screen name, the creation time and the emoji-stripped text
    are printed and appended to ``comment.txt`` (UTF-8), one comment per line.

    Args:
        weibo_id: Id of the post whose comments are fetched. It is used for
            both the ``id`` and ``mid`` query parameters.
        max_pages: Maximum number of pages fetched per call. The original
            author noted that the site only serves about 16 pages.

    Network errors, non-JSON bodies and payloads without a ``data`` object
    end the run early with a printed message instead of raising, so that a
    single failed request does not terminate the surrounding scheduler loop.
    """
    print("当前时间::" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    max_id = None
    for page in range(1, max_pages + 1):
        print('====================第{0}页===================='.format(page))
        if max_id is None:
            url = _FIRST_PAGE_URL.format(weibo_id)
        else:
            url = _NEXT_PAGE_URL.format(weibo_id, max_id)

        try:
            # A timeout keeps one stalled connection from blocking every
            # later scheduled run.
            response = requests.get(url, headers=_HEADERS, timeout=10)
            response.raise_for_status()
            payload = json.loads(response.text)
        except (requests.RequestException, ValueError) as exc:
            print('request for page {0} failed: {1}'.format(page, exc))
            break

        data = payload.get('data') if isinstance(payload, dict) else None
        if not isinstance(data, dict):
            # Presumably the endpoint rejected the request or there are no
            # more comments -- TODO confirm against the API's error payloads.
            print('page {0} returned no comment data, stopping'.format(page))
            break

        lines = []
        for item in data.get('data') or []:
            # Turn emoji into ":alias:" tokens, then blank those tokens out.
            comment = _EMOJI_ALIAS.sub(" ", emoji.demojize(item['text']))
            name = item['user']['screen_name']
            created_at = item['created_at']
            print(name, created_at, comment)
            lines.append(name + " " + created_at + " " + comment + "\n")

        # Write the whole page in one go instead of reopening the file for
        # every single comment.
        if lines:
            with open("comment.txt", "a", encoding='utf-8') as out:
                out.writelines(lines)

        max_id = data.get('max_id')
        if not max_id:
            # A missing or zero cursor presumably marks the last page;
            # requesting with it would not advance -- TODO confirm.
            break

        # Pause between pages to avoid tripping the site's anti-scraping
        # limits.
        time.sleep(2)
# Register task() to run every 20 seconds. For a slower cadence use e.g.
# schedule.every(30).minutes.do(task).
schedule.every(20).seconds.do(task)

# Poll the scheduler forever. The short sleep between polls keeps the loop
# from spinning a CPU core at 100% while no job is due.
while True:
    schedule.run_pending()
    time.sleep(1)