python爬虫,使用selenium爬取网站数据

Python 爬虫:使用 selenium 爬取网站数据,二维码识别结合使用 cv2 和 pyzbar。

使用环境

本项目是为了爬取某网站的群名片(群二维码)而开发的。

目录结构:

文件:run_app.py
文件夹:pyscript
文件:MethodsClass.py
文件:read_link_to_get.py

源码

文件:run_app.py

from pyscript import read_link_to_get


if __name__ == '__main__':
    # Name of the WeChat chat that is handed to the monitor entry point.
    target_chat = '文件传输助手'
    read_link_to_get.main(target_chat)
    

文件夹:pyscript内
文件:MethodsClass.py

import json
import os
import re
from datetime import datetime
from urllib import request
import requests
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
import random
import cv2
import numpy as np
from pyzbar.pyzbar import decode
from config import *


# 二维码解码类方法:
class OcrQrcode:
    """Decode QR codes from local image files or from image URLs.

    Decoding is attempted with OpenCV's ``QRCodeDetector`` first and falls
    back to ``pyzbar`` when OpenCV returns an empty result.

    Methods:
        file_ocr(path): decode the QR code in a local image file.
        link_ocr(url): download an image and decode the QR code in it.
        img_ocr(img): decode the QR code in an already loaded image.
    """

    def file_ocr(self, path):
        """Load the image at ``path`` and decode its QR code.

        :param path: path of a local image file.
        :return: ``{"result": decoded_text, "img": image}``. ``result`` is
            ``""`` when nothing was decoded; ``img`` is whatever
            ``cv2.imread`` returned (``None`` if the file could not be read).
        """
        img = cv2.imread(path)
        return {"result": self.img_ocr(img), "img": img}

    def link_ocr(self, url):
        """Download the image at ``url`` and decode its QR code.

        :param url: URL of the image to fetch with ``urllib.request``.
        :return: ``{"result": decoded_text, "img": image}``. ``result`` is
            ``""`` when nothing was decoded; ``img`` is ``None`` when the
            response body was empty or could not be decoded as an image.
        """
        # Close the HTTP response deterministically instead of leaking it.
        with request.urlopen(url) as response:
            raw = response.read()
        img_byt = np.array(bytearray(raw), dtype=np.uint8)
        # cv2.imdecode rejects an empty buffer, so skip it for empty bodies.
        img = cv2.imdecode(img_byt, cv2.IMREAD_COLOR) if img_byt.size else None
        return {"result": self.img_ocr(img), "img": img}

    @staticmethod
    def img_ocr(img):
        """Decode a QR code from an image array using cv2, then pyzbar.

        :param img: image as returned by ``cv2.imread`` / ``cv2.imdecode``,
            or ``None`` when loading failed.
        :return: the decoded text, or ``""`` when no QR code was decoded.
        """
        # A failed load yields None; report "nothing decoded" instead of
        # crashing inside the detectors.
        if img is None:
            return ""
        detector = cv2.QRCodeDetector()
        # detectAndDecode returns (decoded text, outline points, raw code).
        result, _points, _code = detector.detectAndDecode(img)
        if result:
            return result
        # OpenCV found nothing: fall back to pyzbar and take the first hit.
        decoded = decode(img)
        if decoded:
            return decoded[0].data.decode("utf-8")
        return result


# 浏览器chrome drivers调用方法:
class RunChrome:
    """Helpers for driving Chrome through selenium and for plain HTTP requests.

    Methods:
        run_llq(): start Chrome via chromedriver with the configured options.
        random_str(str_len): build a random identifier string.
        set_cookies(driver): install freshly randomised cookies on the driver.
        rq_get_url(g_url, g_headers, g_cookies): follow redirects manually.
        rnd_headers(): pick one of the predefined request header sets.
    """

    @staticmethod
    def run_llq():
        """Start a Chrome instance and return its selenium driver.

        Uses the module-level names ``chrome_path`` (browser binary) and
        ``driver_path`` (chromedriver binary); these are not defined in this
        file and are presumably provided by ``from config import *``.

        :return: the initialised ``Chrome`` driver.
        """
        # Locations of the browser binary and of chromedriver.
        path = chrome_path
        s = Service(driver_path)
        # Browser start-up configuration.
        option = webdriver.ChromeOptions()
        # Disabled alternatives kept for reference:
        #   option.add_argument('--headless')       # no visible window
        #   option.add_argument('--disable-gpu')    # disable GPU acceleration
        #   option.add_argument('window-size=1280x800')  # reported not to work
        #   option.add_argument('--disable-javascript')
        # Incognito mode.
        option.add_argument('--incognito')
        # Remove the window.navigator.webdriver marker.
        option.add_argument('--disable-blink-features=AutomationControlled')
        # Suppress the "controlled by automated software" infobar.
        option.add_argument('--disable-infobars')
        # Explicit path of the Chrome binary.
        option.binary_location = path
        # Drop the automation switch so the automation notice is not shown.
        option.add_experimental_option("excludeSwitches", ['enable-automation'])
        # Do not load the automation extension.
        option.add_experimental_option('useAutomationExtension', False)
        # Browser user agent. Chrome's switch is ``--user-agent=<value>`` and
        # the value must be a single line; the previous ``User-Agent=...``
        # argument with embedded newlines was not a valid switch.
        user_agent = (
            'Mozilla/5.0 (Linux; Android 11; Redmi Note 8 Pro Build/RP1A.200720.011; wv) '
            'AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/111.0.5563.116 Mobile Safari/537.36 '
            'XWEB/5197 MMWEBSDK/20230701 MMWEBID/1151 MicroMessenger/8.0.40.2420(0x2800283F) WeChat/arm64 '
            'Weixin NetType/WIFI Language/zh_CN ABI/arm64'
        )
        option.add_argument('--user-agent=' + user_agent)
        # Ask Blink not to load images (the original note says pages open
        # much faster this way).
        option.add_argument('--blink-settings=imagesEnabled=false')

        # Initialise the browser.
        _driver = Chrome(service=s, options=option)
        return _driver

    @staticmethod
    def random_str(str_len=28):
        """Return a random string of letters, digits and underscores.

        :param str_len: length of the string to generate.
        :return: the random string (empty when ``str_len`` is 0).
        """
        base_str = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_'
        return ''.join(random.choice(base_str) for _ in range(str_len))

    @staticmethod
    def set_cookies(driver):
        """Open the API host in ``driver`` and add a fresh set of cookies.

        The identity-like cookie values are regenerated randomly on each call.

        :param driver: selenium driver to configure.
        """
        # Cookies can only be added for the domain that is currently loaded,
        # so navigate to the host first.
        driver.get(r'https://readooapi.youshu.cc')
        cookies = {"FROM_YS_SOURCE": "WXH5", "HTTP_X_YS_OS": "wx", "WECHAT_ID": "5", "UNION_ID": RunChrome.random_str(),
                   "OPEN_ID": "oF-" + RunChrome.random_str(25),
                   "ys_wx_auth_id": "wo_wx66666efccac66666_oF-" + RunChrome.random_str(25),
                   "HTTP_X_YS_OPEN_TOKEN": RunChrome.random_str(40).lower()}
        for key, value in cookies.items():
            driver.add_cookie({"name": key, "value": value})

    @staticmethod
    def rq_get_url(g_url, g_headers, g_cookies, max_redirects=30):
        """Follow HTTP redirects by hand until the target URL is reached.

        Each hop is requested with ``allow_redirects=False``; the loop stops
        when the current URL contains ``uniform_join_cust``, when a response
        carries no ``Location`` header (previously this case re-requested the
        same URL forever), or after ``max_redirects`` hops.

        :param g_url: URL to start from.
        :param g_headers: request headers sent on every hop.
        :param g_cookies: cookies sent on every hop.
        :param max_redirects: upper bound on the number of requests made.
        :return: the last URL reached.
        """
        url_new = g_url
        with requests.Session() as session:
            for _ in range(max_redirects):
                _resp = session.get(url=url_new, headers=g_headers, cookies=g_cookies, allow_redirects=False)
                # requests exposes headers case-insensitively.
                location = _resp.headers.get("location")
                if location is not None:
                    url_new = location
                    print("重定向的链接地址: " + url_new)
                if location is None or re.search("uniform_join_cust", url_new):
                    break
        return url_new

    @staticmethod
    def rnd_headers():
        """Return one of the predefined header dictionaries, chosen at random.

        :return: a dict with a single ``User-Agent`` entry.
        """
        headers_arr = [{
            'User-Agent': 'Mozilla/5.0 (Linux; Android 11; Redmi Note 8 Pro Build/RP1A.200720.011; wv) '
            + 'AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/111.0.5563.116 Mobile Safari/537.36 '
            + 'XWEB/5197 MMWEBSDK/20230701 MMWEBID/1151 MicroMessenger/8.0.40.2420(0x2800283F) '
            + 'WeChat/arm64 Weixin NetType/WIFI Language/zh_CN ABI/arm64',
        }, {
            'User-Agent': 'Mozilla/5.0 (Linux; Android 10; Huawei 11 Pro Build/RP1A.200920.011; wv) '
            + 'AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/111.0.5563.116 Mobile Safari/537.36 '
            + 'XWEB/5197 MMWEBSDK/20230701 MMWEBID/1151 MicroMessenger/8.0.40.2420(0x2800283F) '
            + 'WeChat/arm64 Weixin NetType/WIFI Language/zh_CN ABI/arm64',
        }, {
            'User-Agent': 'Mozilla/5.0 (Linux; Android 11; Summing 12 Pro Build/RP1A.200910.011; wv) '
            + 'AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/109.0.5563.116 Mobile Safari/537.36 '
            + 'XWEB/5197 MMWEBSDK/20230701 MMWEBID/1151 MicroMessenger/8.0.40.2420(0x2800283F) '
            + 'WeChat/arm64 Weixin NetType/WIFI Language/zh_CN ABI/arm64',
        }]
        # random.randint(0, 3) is inclusive on both ends and could index past
        # the three entries; random.choice always stays in range.
        return random.choice(headers_arr)


# Json文件加载和写入
class JsonRW:
    """Helpers for loading/saving JSON files and reading the grouped URL list."""

    @staticmethod
    def json_load(path):
        """Load a JSON file.

        :param path: path of the JSON file.
        :return: the parsed content, or an empty dict when ``path`` does not
            exist.
        """
        if not os.path.exists(path):
            return dict()
        with open(path, "r", encoding="UTF-8") as f:
            return json.load(f)

    @staticmethod
    def json_dump(path, data_json):
        """Write ``data_json`` to ``path`` as indented, non-ASCII-escaped JSON.

        :param path: destination file; it is created or overwritten.
        :param data_json: JSON-serialisable object to store.
        """
        with open(path, "w", encoding='UTF-8') as f:
            json.dump(data_json, f, indent=4, ensure_ascii=False)

    @staticmethod
    def sort_url_read(path):
        """Read a text file of entries grouped by blank lines.

        Consecutive non-blank lines form one group; one or more blank lines
        separate groups. Surrounding whitespace is stripped from every entry.
        Runs of blank lines and a trailing blank line no longer produce empty
        groups, and whitespace-only lines count as blank.

        :param path: path of the text file.
        :return: a list of groups, each a list of stripped lines.
        """
        groups = list()
        current = list()
        with open(path, "r", encoding='UTF-8') as f:
            for line in f:
                entry = line.strip()
                if entry:
                    current.append(entry)
                elif current:
                    # A blank line closes the group collected so far.
                    groups.append(current)
                    current = list()
        # The file may end without a blank line after the last group.
        if current:
            groups.append(current)
        return groups


# 生成日志,并写入文件中
class OutLogs:
    """Collect log lines in memory and append them to a log file."""

    @staticmethod
    def print_write(path, logs):
        """Append log entries to the file at ``path``, one per line.

        :param path: log file; it is created when missing.
        :param logs: an iterable of entries, or a single string. A single
            string is written as one line (it used to be iterated character
            by character, producing one line per character).
        """
        if isinstance(logs, str):
            logs = [logs]
        with open(path, 'a', encoding='UTF-8') as f:
            # In text mode '\n' is already translated to the platform line
            # ending; writing os.linesep would double the '\r' on Windows.
            f.writelines(str(log).strip() + '\n' for log in logs)

    @staticmethod
    def print_var(logs, var):
        """Print a message and append it, timestamped, to a list.

        :param logs: the message text (a string).
        :param var: list that receives ``"YYYY-MM-DD HH:MM:SS: <message>"``.
        """
        print(logs)
        var.append(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ': ' + logs)


# 定时器类
class TimerSwitch:
    """Hour-of-day switch used to decide whether work should run now."""

    @staticmethod
    def timer_tick(run_h, stop_h):
        """Tell whether the current local hour lies in the running window.

        :param run_h: hour at which running starts (inclusive).
        :param stop_h: hour at which running stops (exclusive).
        :return: ``True`` when the current hour is inside the window. When
            ``run_h`` is not smaller than ``stop_h`` the window wraps around
            midnight, i.e. only hours in ``[stop_h, run_h)`` are excluded.
        """
        current_hour = datetime.now().hour
        if run_h < stop_h:
            # Plain window within a single day.
            return run_h <= current_hour < stop_h
        # Wrapping window: everything except the pause interval.
        return not (stop_h <= current_hour < run_h)

文件:read_link_to_get.py

import time as tm
from threading import Thread, Lock
from wxauto import WeChat
from pyscript.MethodsClass import *

# Rebind the imported class name to a shared instance: from here on,
# `OcrQrcode` in this module refers to that instance, not to the class.
OcrQrcode = OcrQrcode()
# Lock taken around each WeChat send sequence (ChatWith + SendMsg/SendFiles).
isSendLock = Lock()
# Get the current WeChat client
wx = WeChat()
# Get the session list
wx.GetSessionList()
# In-memory buffer of pending log lines shared by the functions below.
logs = list()


# 线程1:发送时间文本的计时器函数
def target_send(who_s):
    """Send the current time to a WeChat chat on every 10-minute mark.

    Runs forever and is meant to be used as a thread target. Every 10 seconds
    it checks ``TimerSwitch.timer_tick(6, 1)`` (true except during hours
    1-5); when the minute is a multiple of 10 and the second is below 30,
    the current timestamp is sent to ``who_s``.

    :param who_s: name of the chat to send the timestamp to.
    """
    while True:
        if TimerSwitch.timer_tick(6, 1):
            # Take one snapshot so minute and second belong to the same instant.
            now = datetime.now()
            if (now.minute % 10 == 0) and (now.second < 30):
                print('激活发送时间代码:')
                print('min:' + str(now.minute % 10) + '; sec:' + str(now.second))
                # Hold the lock only for the send; ``with`` releases it even
                # if the send raises, so other senders cannot deadlock.
                with isSendLock:
                    wx.ChatWith(who_s)  # open the chat window of `who_s`
                    wx.SendMsg(str(datetime.now())[:19])  # send the timestamp
                # Skip past the 30-second trigger window to avoid a second send.
                tm.sleep(30)
        tm.sleep(10)  # polling interval


# 长链接循环监测群二维码更新情况
def long_cycle_monitor(long_url, data_json, driver, m_who):
    """Repeatedly poll the long links and forward changed group QR codes.

    While ``TimerSwitch.timer_tick(6, 1)`` is true, every link in
    ``long_url`` is opened in ``driver``, the first image on the page is
    downloaded and its QR code decoded. When the decoded text differs from
    the value stored for that link, the image is saved to
    ``result_path + 'temp.png'``, sent to the chat ``m_who``, and the state is
    written to ``long_output_json_file``. Outside the time window the function
    sleeps 60 seconds per iteration. The function returns once a page yields
    no usable image URL.

    :param long_url: sequence of page URLs to monitor.
    :param data_json: mutable state dict, keyed by the link's index as a
        string, each value holding ``'二维码链接'`` and ``'图片Url'``; the key
        ``'result_wx'`` receives the list of results changed in the pass.
    :param driver: selenium driver used to load the pages.
    :param m_who: name of the chat that receives the QR code image.
    """
    global logs
    while True:
        is_continue = True
        if TimerSwitch.timer_tick(6, 1):
            OutLogs.print_var('timer_tick开关为打开状态,正常运行循环监测', logs)
            try:
                OutLogs.print_var('循环开始时间: ' + str(datetime.now())[:19], logs)
                result_wx = list()
                OutLogs.print_var('进入for循环: ', logs)
                for Num, long_link in enumerate(long_url):
                    key = str(Num)
                    OutLogs.print_var('删除所有的cookies记录', logs)
                    driver.delete_all_cookies()
                    OutLogs.print_var('第' + str(Num + 1) + '个链接' + ': ', logs)
                    driver.get(long_link)
                    # Wait for the first image to get a source: 30 polls of
                    # 0.2 s, i.e. about 6 seconds at most.
                    for _ in range(30):
                        if driver.execute_script("return document.images[0].src.length") >= 10:
                            break
                        tm.sleep(0.2)
                    pic_url = driver.execute_script("return document.images[0].src")
                    # No usable image URL: remember to leave the outer loop
                    # after this pass and move on to the next link.
                    if len(pic_url) <= 10:
                        is_continue = False
                        continue
                    OutLogs.print_var('目标url解析成功,判断data_json[' + long_link + ']是否存在', logs)
                    if key not in data_json:
                        data_json[key] = {'二维码链接': '', '图片Url': ''}
                    # Decode the QR code shown in the image.
                    OutLogs.print_var('调用OcrQrcode.link_ocr,进行二维码识别', logs)
                    res = OcrQrcode.link_ocr(pic_url)
                    OutLogs.print_var('old: ' + data_json[key]['二维码链接'], logs)
                    OutLogs.print_var('new: ' + res['result'], logs)
                    if res['result'] != data_json[key]['二维码链接']:
                        OutLogs.print_var('检测到群名片发生更新,开始保存群二维码文件', logs)
                        cv2.imwrite(result_path + 'temp.png', res['img'])
                        print(result_path + 'temp.png')
                        # Serialise WeChat access; ``with`` releases the lock
                        # even when sending raises, so a failed send can no
                        # longer leave the lock held forever.
                        with isSendLock:
                            OutLogs.print_var('打开聊天窗口发送文件', logs)
                            wx.ChatWith(m_who)  # open the chat window of `m_who`
                            # Send the saved image to the chat window.
                            wx.SendFiles(result_path + 'temp.png')
                            print(result_path + 'temp.png')
                            OutLogs.print_var('文件发送成功', logs)
                        # Store the decoded link and image URL in the state dict.
                        OutLogs.print_var('写群二维码解析出来的信息到字典中', logs)
                        data_json[key]['二维码链接'] = res['result']
                        # Delete before re-adding so the key ends up last.
                        if '图片Url' in data_json[key]:
                            del data_json[key]['图片Url']
                        data_json[key]['图片Url'] = pic_url
                        OutLogs.print_var("图片Url: " + pic_url, logs)
                        result_wx.append(data_json[key]['二维码链接'])
                        data_json['result_wx'] = result_wx
                        JsonRW.json_dump(long_output_json_file, data_json)
                        OutLogs.print_var('第一层for完成,写logs到日志文件中', logs)
                        OutLogs.print_write(output_logs, logs)
                        # Reset the in-memory log buffer.
                        logs = list()
            except Exception as e:
                OutLogs.print_write(output_logs, logs)
                # Reset the in-memory log buffer.
                logs = list()
                # Pass the message as a one-element list so it is written as
                # a single log line.
                OutLogs.print_write(output_logs, [str(e)])
        else:
            tm.sleep(60)
        # Leave the while loop once a page had no usable image.
        if not is_continue:
            break


# 短链接循环监测群二维码更新情况
def short_cycle_monitor(sort_url, data_json, driver, m_who):
    """Repeatedly poll grouped short links and forward changed group QR codes.

    While ``TimerSwitch.timer_tick(6, 1)`` is true, cookies are cleared, and
    for every group in ``sort_url`` fresh cookies are installed before each
    link of the group is opened. The first image of the page is downloaded
    and its QR code decoded; when the decoded text differs from the stored
    value, the image is saved to ``result_path + 'temp.png'``, sent to the
    chat ``m_who`` and the state is persisted. Outside the time window the
    function sleeps 60 seconds per iteration. It returns once a page yields
    no usable image URL.

    :param sort_url: list of groups, each a list of short-link URLs.
    :param data_json: mutable state dict keyed by URL, each value holding
        ``'二维码链接'`` and ``'图片Url'``; the key ``'result_wx'`` receives the
        list of results changed in the pass.
    :param driver: selenium driver used to load the pages.
    :param m_who: name of the chat that receives the QR code image.
    """
    global logs
    while True:
        is_continue = True
        if TimerSwitch.timer_tick(6, 1):
            OutLogs.print_var('run_btn开关为打开状态,正常运行循环监测', logs)
            try:
                OutLogs.print_var('循环开始时间: ' + str(datetime.now())[:19], logs)
                result_wx = list()
                OutLogs.print_var('删除所有的cookies记录', logs)
                driver.delete_all_cookies()
                # Walk the groups of short links.
                for index, arr in enumerate(sort_url):
                    OutLogs.print_var('进入第一层for: 为driver设置新的cookies', logs)
                    # Give the browser a fresh set of cookies for this group.
                    RunChrome.set_cookies(driver)
                    # Walk the short links of the group.
                    for Num, su in enumerate(arr):
                        OutLogs.print_var('第' + str(index + 1) + '轮链接' + str(Num + 1) + ': ' + su, logs)
                        driver.get(su)
                        # Wait for the first image to get a source: 30 polls
                        # of 0.2 s, i.e. about 6 seconds at most.
                        for _ in range(30):
                            if driver.execute_script("return document.images[0].src.length") >= 10:
                                break
                            tm.sleep(0.2)
                        # URL of the image that was found.
                        pic_url = driver.execute_script("return document.images[0].src")
                        # No usable image URL: stop this group and remember
                        # to leave the outer while loop.
                        if len(pic_url) <= 10:
                            is_continue = False
                            break
                        OutLogs.print_var('目标url解析成功,判断data_json[' + su + ']是否存在', logs)
                        if str(su) not in data_json:
                            data_json[su] = {'二维码链接': '', '图片Url': ''}
                        # Decode the QR code shown in the image.
                        OutLogs.print_var('调用OcrQrcode.link_ocr,进行二维码识别', logs)
                        res = OcrQrcode.link_ocr(pic_url)
                        OutLogs.print_var('old: ' + data_json[su]['二维码链接'], logs)
                        OutLogs.print_var('new: ' + res['result'], logs)
                        if res['result'] != data_json[su]['二维码链接']:
                            OutLogs.print_var('检测到群名片发生更新,保存群二维码文件', logs)
                            cv2.imwrite(result_path + 'temp.png', res['img'])
                            print(result_path + 'temp.png')
                            # Serialise WeChat access; ``with`` releases the
                            # lock even when sending raises, so a failed send
                            # can no longer leave the lock held forever.
                            with isSendLock:
                                OutLogs.print_var('准备打开聊天窗口发送文件', logs)
                                wx.ChatWith(m_who)  # open the chat window of `m_who`
                                # Send the saved image to the chat window.
                                wx.SendFiles(result_path + 'temp.png')
                                print(result_path + 'temp.png')
                                OutLogs.print_var('文件发送成功', logs)
                            # Store the decoded link and image URL in the state dict.
                            OutLogs.print_var('写群二维码解析出来的信息到字典中', logs)
                            data_json[su]['二维码链接'] = res['result']
                            # Delete before re-adding so the key ends up last.
                            if '图片Url' in data_json[su]:
                                del data_json[su]['图片Url']
                            data_json[su]['图片Url'] = pic_url
                            OutLogs.print_var("图片Url: " + pic_url, logs)
                            result_wx.append(data_json[su]['二维码链接'])
                            data_json['result_wx'] = result_wx
                            # Persist to the short-link state file. The state
                            # passed in by ``test`` is loaded from
                            # ``output_json_file``; writing to
                            # ``long_output_json_file`` overwrote the long-link
                            # monitor's state instead.
                            JsonRW.json_dump(output_json_file, data_json)
                            OutLogs.print_var('第一层for完成,写logs到日志文件中', logs)
                            OutLogs.print_write(output_logs, logs)
                            # Reset the in-memory log buffer.
                            logs = list()
            except Exception as err:
                OutLogs.print_write(output_logs, logs)
                # Reset the in-memory log buffer.
                logs = list()
                # Pass the message as a one-element list so it is written as
                # a single log line.
                OutLogs.print_write(output_logs, [str(err)])
        else:
            tm.sleep(60)
        # Leave the while loop once a page had no usable image.
        if not is_continue:
            break





# 主线程:
def main(m_who):
    """Entry point: start the time-sender thread and run the long-link monitor.

    Loads the URL list from ``long_url_file`` and the saved state from
    ``long_output_json_file``, starts ``target_send`` in a daemon thread, then
    loops forever: start Chrome, run ``long_cycle_monitor`` until it returns,
    quit Chrome, and start over.

    :param m_who: name of the WeChat chat that receives messages and files.
    """
    # Load the link list. Lines are stripped and blank lines dropped so that
    # no trailing newline or empty URL is handed to the browser.
    OutLogs.print_var('加载群文件和JSON文件', logs)
    with open(long_url_file, 'r', encoding='UTF-8') as f:
        long_url = [line.strip() for line in f if line.strip()]
    # Load the previously saved JSON state.
    long_output_json = JsonRW.json_load(long_output_json_file)
    # Thread 1: periodically sends the current time to WeChat. It is a daemon
    # thread so it stops with the main thread; ``daemon=True`` replaces the
    # deprecated ``setDaemon`` call.
    OutLogs.print_var('创建线程:发送时间到微信', logs)
    send_time_to_wx = Thread(target=target_send, args=(m_who,), name='发送时间到微信', daemon=True)
    send_time_to_wx.start()

    while True:
        # Start chromedriver.
        OutLogs.print_var('启动chromedriver运行', logs)
        driver = RunChrome.run_llq()
        try:
            # Run the monitoring loop until it returns.
            long_cycle_monitor(long_url, long_output_json, driver, m_who)
        finally:
            # Always shut the browser down, even if the monitor raised.
            driver.quit()


# 测试线程:
def test(m_who):
    """Test entry point: start the time-sender thread and run the short-link monitor.

    Starts ``target_send`` in a daemon thread, then loops forever: reload the
    grouped short links from ``input_text_file`` and the saved state from
    ``output_json_file``, start Chrome, run ``short_cycle_monitor`` until it
    returns, quit Chrome, and start over.

    :param m_who: name of the WeChat chat that receives messages and files.
    """
    # Thread 1: periodically sends the current time to WeChat. It is a daemon
    # thread so it stops with the main thread; ``daemon=True`` replaces the
    # deprecated ``setDaemon`` call.
    OutLogs.print_var('创建线程:发送时间到微信', logs)
    send_time_to_wx = Thread(target=target_send, args=(m_who,), name='发送时间到微信', daemon=True)
    send_time_to_wx.start()

    # When the monitor returns, control comes back here and a new round starts.
    while True:
        # Load the grouped link file.
        OutLogs.print_var('加载群文件和JSON文件', logs)
        short_url = JsonRW.sort_url_read(input_text_file)
        # Load the previously saved JSON state.
        output_json = JsonRW.json_load(output_json_file)

        # Start chromedriver.
        OutLogs.print_var('启动chromedriver运行', logs)
        driver = RunChrome.run_llq()
        try:
            # Run the short-link monitoring loop. The previously referenced
            # ``week_cycle_monitor`` is not defined in this module;
            # ``short_cycle_monitor`` takes exactly these arguments.
            short_cycle_monitor(short_url, output_json, driver, m_who)
        finally:
            # Always shut the browser down, even if the monitor raised.
            driver.quit()

仅供学习参考。代码中的 cookies 并非有效值,使用前需要自行抓取正确的 cookies。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

toss007

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值