需求
爬取微博私信信息,包括:文本、对话者信息、图片路径,并将截图、文本留存。
缺陷
微博私信网页长时间查看时,经常有刷新失败、页面崩溃的情况,页面崩溃时需要手动刷新页面,比较烦人。
代码
# coding:utf8
"""
需求:
1.获取截图,一条一截,标红
2.记录对话信息:对话文本、时间
3.记录对话者id
4.将数据信息持久化保存
"""
import datetime
import os
import time
import traceback
from typing import List
from selenium import webdriver
from selenium.webdriver.remote import webelement
from selenium.webdriver.support.wait import WebDriverWait
# ------------------------------
# Global settings
# ------------------------------
SAVE_PATH = r"d:\xx"  # main directory
CHAT_PATH = r"\chat"  # private chat directory
# Number of seconds without a normal page refresh before the page is reloaded.
RELOAD_LIMIT_TIME = 180
# Year and dates of the target data; last_date_str is later than first_date_str.
year_str = '2020'
last_date_str = '11-30'
first_date_str = '2016-01-01'
# Whether to take screenshots of everything.
IS_SCREEN_SHOT = False
# ------------------------------
# 类定义
# ------------------------------
class wb_user:
    """A Weibo user, described by a ``name`` and a ``code``.

    Two users are equal only when they are instances of exactly the same
    class and both their ``code`` and ``name`` match. Instances are also
    hashable, using the same two fields, so they can be stored in sets or
    used as dictionary keys.
    """

    # Class-level defaults; __init__ shadows them on every instance.
    name = code = ''

    def __init__(self, name, code):
        """Store the user's ``name`` and ``code``."""
        self.code, self.name = code, name

    def __eq__(self, o):
        """Return True when ``o`` is the same class with equal code and name."""
        # The exact type check comes first so that objects without
        # ``code``/``name`` attributes compare unequal instead of raising.
        return type(self) == type(o) and self.code == o.code and self.name == o.name

    def __hash__(self):
        """Return a hash consistent with ``__eq__``.

        Defining ``__eq__`` alone makes Python set ``__hash__`` to None, which
        makes instances unhashable. The hash depends on ``code`` and ``name``,
        so do not mutate them while the object is a member of a set or a key
        of a dict.
        """
        return hash((self.code, self.name))
# NOTE(review): indentation was lost in this paste; this looks like a
# module-level placeholder initialised to None and presumably assigned a
# wb_user elsewhere in the file — confirm against the full source.
me = None
class a_log:
    """The information carried by a single conversation entry.

    Attributes:
        usr: object exposing string ``name`` and ``code`` attributes.
        data: the entry's text.
        time: object providing ``strftime`` (e.g. ``datetime.datetime``).
    """

    # Class-level defaults; __init__ shadows them on every instance.
    usr = data = time = None

    def __init__(self, usr, data, l_time):
        """Store the user, the text ``data`` and the timestamp ``l_time``."""
        self.usr, self.time, self.data = usr, l_time, data

    def __str__(self):
        """Return one tab-separated line: name, code, text, time.

        Newlines inside the text are written as the two characters ``\\n`` so
        the record always stays on a single line; the time is formatted as
        ``YYYY-MM-DD HH:MM``.
        """
        fields = (
            self.usr.name,
            self.usr.code,
            self.data.replace('\n', '\\n'),
            self.time.strftime('%Y-%m-%d %H:%M'),
        )
        return '\t'.join(fields)
class user_log:
"""
所有对话信息
"""
user1, user2 = None, None
logs = []
def __init__(self, user1: wb_user, user2: wb_user, logs: List[a_log]):
self.logs, self.user1, self.user2 = logs, user1, user2
def add_log(self, log: a_log):
self.logs.append(log)
def __str__(self):
ret = ''
for l in self.logs:
ret &