pyppeteer 实战项目:自动化脚本(解决在子线程中运行 async 协程的问题,并给出窗口置顶的方法)

导入所需库

import asyncio
import time

import pyppeteer as pyp
import bs4
import requests

import win32gui
import win32con
import threading

使Chrome浏览器窗口始终置顶


def get_all_hwnd(hwnd, mouse):
    """Callback for ``win32gui.EnumWindows`` that records one window title.

    When ``hwnd`` is a window that is enabled and visible and has a non-empty
    title, the handle and title are printed and the title is appended to the
    module-level list ``windows_names`` (which must already exist).

    ``mouse`` is the extra value handed to ``EnumWindows``; it is not used.
    """
    usable = (win32gui.IsWindow(hwnd)
              and win32gui.IsWindowEnabled(hwnd)
              and win32gui.IsWindowVisible(hwnd))
    if not usable:
        return
    title = win32gui.GetWindowText(hwnd)
    if title:
        print(hwnd, title)
        windows_names.append(title)

# 桌面所有当前存在的窗口名称
def get_windows_names():
    """Collect the current window titles and pick the one to pin on top.

    Resets the module-level globals ``windows_names`` (to an empty list) and
    ``googlewin_name`` (to an empty string), fills ``windows_names`` through
    ``win32gui.EnumWindows`` with ``get_all_hwnd`` as callback, and stores in
    ``googlewin_name`` every title containing " Google" in turn, so the last
    match is the one that remains.
    """
    global windows_names, googlewin_name
    windows_names, googlewin_name = [], ""
    print("所有窗口:")
    win32gui.EnumWindows(get_all_hwnd, 0)
    for title in windows_names:
        if " Google" not in title:
            continue
        # A title containing " Google" marks the window that gets pinned.
        googlewin_name = title
        print("定位到窗口:")
        print(googlewin_name)
# 将googlewin_name窗口置顶
def place_top():
    """Pin the window whose title is stored in ``googlewin_name`` on top.

    ``googlewin_name`` is a module-level global assigned by
    ``get_windows_names()``: it does not exist before that function has run
    and is the empty string when no title matched.  In both situations, and
    when no window handle is obtained for the title, there is nothing to pin,
    so a message is printed and the function returns without calling the
    window-positioning API.

    Returns:
        None.
    """
    title = globals().get("googlewin_name")
    if not title:
        print("未定位到窗口,无法置顶")
        return
    print("置顶窗口")
    hwnd = win32gui.FindWindow(None, title)
    if not hwnd:
        print("未定位到窗口,无法置顶")
        return
    # Show the window at its normal size first (it may be in the background).
    win32gui.ShowWindow(hwnd, win32con.SW_SHOWNORMAL)
    # Make it topmost without moving, resizing or activating it.
    win32gui.SetWindowPos(
        hwnd, win32con.HWND_TOPMOST, 0, 0, 0, 0,
        win32con.SWP_NOMOVE | win32con.SWP_NOACTIVATE
        | win32con.SWP_NOOWNERZORDER | win32con.SWP_SHOWWINDOW
        | win32con.SWP_NOSIZE)
    # To undo the pinning, call SetWindowPos again with
    # win32con.HWND_NOTOPMOST in place of win32con.HWND_TOPMOST.
def set_top():
    """Read commands from standard input and pin the window on request.

    Each line read with ``input()`` is a command; the line ``"1"`` calls
    ``place_top()``, every other line is ignored.  The loop ends (returning
    ``None``) once standard input is closed or cannot be read, because
    retrying ``input()`` in that state would spin forever.  A failure inside
    ``place_top()`` is reported and the loop keeps waiting for the next
    command.
    """
    while True:
        try:
            command = input()
        except (EOFError, OSError):
            # No more input will ever arrive; stop instead of busy-looping.
            return
        if command != "1":
            continue
        try:
            place_top()
        except Exception as exc:
            # Best effort: report the failure and keep accepting commands.
            print("置顶窗口失败:", repr(exc))

多线程问题的解决(以下为解决线程问题的关键代码片段)

async def getOjSourceCode(loginUrl):
    """Launch the Chrome instance used by the crawler (excerpt).

    Only the browser launch is shown here; ``loginUrl`` is not used by this
    excerpt.  The three ``handleSIG*`` options are switched off because,
    when pyppeteer is started from a thread other than the main one, it
    otherwise fails with "ValueError: signal only works in main thread"
    (see https://blog.csdn.net/weixin_41822224/article/details/103719863).
    """
    width, height = 1400, 600  # page width and height
    launch_options = {
        'handleSIGINT': False,
        'handleSIGTERM': False,
        'handleSIGHUP': False,
        'headless': False,
        'userDataDir': r"D:\用户\LENOVO\Chrom浏览器用户数据\用于爬虫使用\UserData18132506429",
        # The window is made 500 px taller than the page height.
        'args': [f'--window-size={width},{height+500}'],
        'executablePath': r'C:\Users\LENOVO\AppData\Local\Google\Chrome\Application\chrome.exe',
    }
    browser = await pyp.launch(**launch_options)

def crawl():
    """Run ``getOjSourceCode`` to completion on a newly created event loop.

    A new asyncio event loop is created and installed as the current loop
    before the coroutine is scheduled, so the function does not depend on a
    loop already existing for the calling thread.  Once the task has
    finished, ``result()`` is called on it, which re-raises any exception the
    coroutine ended with.
    """
    url = "https://space.bilibili.com/1002437080/fans/follow?spm_id_from=333.999.0.0"
    print("0")
    print("1")
    worker_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(worker_loop)
    job = asyncio.ensure_future(getOjSourceCode(url))
    worker_loop.run_until_complete(asyncio.wait([job]))
    # The loop is left open on purpose; it is never closed here.
    job.result()

爬虫部分的代码

async def getOjSourceCode(loginUrl):
    """Open the follow list at ``loginUrl`` and cancel follows page by page.

    Steps:
      1. Launch a visible Chrome through pyppeteer and open ``loginUrl``.
      2. Pin the browser window on top (``get_windows_names`` / ``place_top``).
      3. Click the pager's 6th entry (the last page, per the original note).
      4. Repeatedly click the pager's 5th entry (the second-to-last page, per
         the original note) and, for list rows 1..20, hover the row's action
         button and click the 2nd item of its drop-down menu.
      5. Pause 100 s after pages 4 and 7; return after more than 10 pages.

    Every wait uses ``await asyncio.sleep`` rather than ``time.sleep`` so the
    event loop is not blocked while the coroutine pauses.

    Args:
        loginUrl: URL of the follow-list page to open.

    Returns:
        None.
    """
    width, height = 1400, 600  # page width and height
    pause = 5  # seconds to let the page react after each action
    content = ("#page-follows > div > div.follow-main > "
               "div.follow-content.section > div.content")
    pager = content + " > ul.be-pager"

    # The three handleSIG* switches avoid "ValueError: signal only works in
    # main thread" when pyppeteer runs outside the main thread, see
    # https://blog.csdn.net/weixin_41822224/article/details/103719863
    browser = await pyp.launch(
        handleSIGINT=False,
        handleSIGTERM=False,
        handleSIGHUP=False,
        headless=False,
        userDataDir=r"D:\用户\LENOVO\Chrom浏览器用户数据\用于爬虫使用\UserData18132506429",
        args=[f'--window-size={width},{height+500}'],
        executablePath=r'C:\Users\LENOVO\AppData\Local\Google\Chrome\Application\chrome.exe')
    page = await browser.newPage()
    await antiAntiCrawler(page)
    await page.setExtraHTTPHeaders({
        'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8'
    })
    await page.setViewport({'width': width, 'height': height})
    await page.goto(loginUrl)
    count = 0  # pages processed so far
    cnt = 0    # successful menu clicks so far

    await asyncio.sleep(pause)
    # Pin the browser window on top before interacting with the page.
    get_windows_names()
    place_top()

    # Jump to the last page of the list.
    element = await page.querySelector(pager + " > li:nth-child(6)")
    await element.click(options={'button': 'left',  # left, right or middle
                                 'clickCount': 1,   # 1 or 2
                                 'delay': 500,      # milliseconds
                                 })

    print(element, "第", count)
    await asyncio.sleep(pause)
    while True:
        try:
            # Move to the second-to-last page.
            element = await page.querySelector(pager + " > li:nth-child(5)")
            await element.click()
            await asyncio.sleep(pause)
            for i in range(1, 21):
                action = (content + " > ul.relation-list > li:nth-child("
                          + str(i) + ") > div.content > div > "
                          "div.be-dropdown.fans-action-btn.fans-action-follow")
                try:
                    # The "followed" button; hovering (not focusing) it is
                    # what opens the drop-down menu.
                    element = await page.querySelector(action + " > span")
                    await element.hover()
                    await asyncio.sleep(pause)

                    # Second menu item, clicked to cancel the follow.
                    element = await page.querySelector(
                        action + " > ul > li:nth-child(2)")
                    await element.click()
                    cnt += 1
                    print("已取消关注"+str(cnt)+"个")
                    await asyncio.sleep(pause)
                except Exception as exc:
                    # One failing row must not stop the rest of the page.
                    print("取消关注"+str(cnt + 1)+"个时失败")
                    print(repr(exc))
            count += 1
            print("第"+str(count)+"页")
            if count in (4, 7):
                await asyncio.sleep(100)
            elif count > 10:
                return
        except Exception as exc:
            # Keep retrying as before, but report the error and wait so a
            # persistent failure does not become a silent busy loop.
            print(repr(exc))
            await asyncio.sleep(pause)

# "#page-follows > div > div.follow-main > div.follow-content.section > div.content > ul.relation-list > li:nth-child(5) > div.content > div > div.be-dropdown.fans-action-btn.fans-action-follow > span"
# "#page-follows > div > div.follow-main > div.follow-content.section > div.content > ul.relation-list > li:nth-child(5) > div.content > div > div.be-dropdown.fans-action-btn.fans-action-follow > ul > li:nth-child(2)"
# "#page-follows > div > div.follow-main > div.follow-content.section > div.content > ul.relation-list > li:nth-child(6) > div.content > div > div.be-dropdown.fans-action-btn.fans-action-follow > ul > li:nth-child(2)"




async def antiAntiCrawler(page):
    """Apply anti-bot-detection tweaks to ``page``.

    Sets a desktop Chrome user-agent string and registers a script, evaluated
    on every new document, that makes ``navigator.webdriver`` read as false.

    Args:
        page: object providing the awaitable methods ``setUserAgent`` and
            ``evaluateOnNewDocument`` (a pyppeteer page in this script).
    """
    # The pieces are joined by implicit string concatenation; no backslash
    # belongs inside the literal, otherwise it ends up in the header value.
    await page.setUserAgent(
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/78.0.3904.70 Safari/537.36')

    await page.evaluateOnNewDocument(
        '() =>{ Object.defineProperties(navigator,'
        '{ webdriver:{ get: () => false } }) }')

# url = "https://www.bilibili.com/video/BV1De4y1Z7jb?spm_id_from=333.999.0.0&vd_source=4890592f27e3e22248b3d74c7563dde8"# 8-29 2



# "https://space.bilibili.com/616010334/fans/follow?spm_id_from=333.999.0.0"
# 18132506429: 1002437080
# paprika: 616010334

def crawl():
    """Run ``getOjSourceCode`` to completion on a newly created event loop.

    A new asyncio event loop is created and installed as the current loop
    before the coroutine is scheduled, so the function does not depend on a
    loop already existing for the calling thread.  Once the task has
    finished, ``result()`` is called on it, which re-raises any exception the
    coroutine ended with.
    """
    url = "https://space.bilibili.com/1002437080/fans/follow?spm_id_from=333.999.0.0"
    print("0")
    print("1")
    worker_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(worker_loop)
    job = asyncio.ensure_future(getOjSourceCode(url))
    worker_loop.run_until_complete(asyncio.wait([job]))
    # The loop is left open on purpose; it is never closed here.
    job.result()
# def crawl():
#     url = "https://space.bilibili.com/1002437080/fans/follow?spm_id_from=333.999.0.0"
#     print("0")
#     asyncio.get_event_loop().run_until_complete(getOjSourceCode(url))
#     print("1")
# asyncio.run(set_top)

运行代码

# Two worker threads: light1 waits for console commands (set_top),
# light2 drives the browser (crawl).
light1 = threading.Thread(target=set_top, )
light2 = threading.Thread(target=crawl, )
# Daemon mode is left disabled, so both threads are non-daemon threads.
# light1.setDaemon(True)
# light2.setDaemon(True)
# The crawler thread is started before the console-command thread.
light2.start()
light1.start()

完整代码

import asyncio
import time

import pyppeteer as pyp
import bs4
import requests

import win32gui
import win32con
import threading

def get_all_hwnd(hwnd, mouse):
    """Callback for ``win32gui.EnumWindows`` that records one window title.

    When ``hwnd`` is a window that is enabled and visible and has a non-empty
    title, the handle and title are printed and the title is appended to the
    module-level list ``windows_names`` (which must already exist).

    ``mouse`` is the extra value handed to ``EnumWindows``; it is not used.
    """
    usable = (win32gui.IsWindow(hwnd)
              and win32gui.IsWindowEnabled(hwnd)
              and win32gui.IsWindowVisible(hwnd))
    if not usable:
        return
    title = win32gui.GetWindowText(hwnd)
    if title:
        print(hwnd, title)
        windows_names.append(title)

# 桌面所有当前存在的窗口名称
def get_windows_names():
    """Collect the current window titles and pick the one to pin on top.

    Resets the module-level globals ``windows_names`` (to an empty list) and
    ``googlewin_name`` (to an empty string), fills ``windows_names`` through
    ``win32gui.EnumWindows`` with ``get_all_hwnd`` as callback, and stores in
    ``googlewin_name`` every title containing " Google" in turn, so the last
    match is the one that remains.
    """
    global windows_names, googlewin_name
    windows_names, googlewin_name = [], ""
    print("所有窗口:")
    win32gui.EnumWindows(get_all_hwnd, 0)
    for title in windows_names:
        if " Google" not in title:
            continue
        # A title containing " Google" marks the window that gets pinned.
        googlewin_name = title
        print("定位到窗口:")
        print(googlewin_name)
# 将googlewin_name窗口置顶
def place_top():
    """Pin the window whose title is stored in ``googlewin_name`` on top.

    ``googlewin_name`` is a module-level global assigned by
    ``get_windows_names()``: it does not exist before that function has run
    and is the empty string when no title matched.  In both situations, and
    when no window handle is obtained for the title, there is nothing to pin,
    so a message is printed and the function returns without calling the
    window-positioning API.

    Returns:
        None.
    """
    title = globals().get("googlewin_name")
    if not title:
        print("未定位到窗口,无法置顶")
        return
    print("置顶窗口")
    hwnd = win32gui.FindWindow(None, title)
    if not hwnd:
        print("未定位到窗口,无法置顶")
        return
    # Show the window at its normal size first (it may be in the background).
    win32gui.ShowWindow(hwnd, win32con.SW_SHOWNORMAL)
    # Make it topmost without moving, resizing or activating it.
    win32gui.SetWindowPos(
        hwnd, win32con.HWND_TOPMOST, 0, 0, 0, 0,
        win32con.SWP_NOMOVE | win32con.SWP_NOACTIVATE
        | win32con.SWP_NOOWNERZORDER | win32con.SWP_SHOWWINDOW
        | win32con.SWP_NOSIZE)
    # To undo the pinning, call SetWindowPos again with
    # win32con.HWND_NOTOPMOST in place of win32con.HWND_TOPMOST.
def set_top():
    """Read commands from standard input and pin the window on request.

    Each line read with ``input()`` is a command; the line ``"1"`` calls
    ``place_top()``, every other line is ignored.  The loop ends (returning
    ``None``) once standard input is closed or cannot be read, because
    retrying ``input()`` in that state would spin forever.  A failure inside
    ``place_top()`` is reported and the loop keeps waiting for the next
    command.
    """
    while True:
        try:
            command = input()
        except (EOFError, OSError):
            # No more input will ever arrive; stop instead of busy-looping.
            return
        if command != "1":
            continue
        try:
            place_top()
        except Exception as exc:
            # Best effort: report the failure and keep accepting commands.
            print("置顶窗口失败:", repr(exc))
async def getOjSourceCode(loginUrl):
    """Open the follow list at ``loginUrl`` and cancel follows page by page.

    Steps:
      1. Launch a visible Chrome through pyppeteer and open ``loginUrl``.
      2. Pin the browser window on top (``get_windows_names`` / ``place_top``).
      3. Click the pager's 6th entry (the last page, per the original note).
      4. Repeatedly click the pager's 5th entry (the second-to-last page, per
         the original note) and, for list rows 1..20, hover the row's action
         button and click the 2nd item of its drop-down menu.
      5. Pause 100 s after pages 4 and 7; return after more than 10 pages.

    Every wait uses ``await asyncio.sleep`` rather than ``time.sleep`` so the
    event loop is not blocked while the coroutine pauses.

    Args:
        loginUrl: URL of the follow-list page to open.

    Returns:
        None.
    """
    width, height = 1400, 600  # page width and height
    pause = 5  # seconds to let the page react after each action
    content = ("#page-follows > div > div.follow-main > "
               "div.follow-content.section > div.content")
    pager = content + " > ul.be-pager"

    # The three handleSIG* switches avoid "ValueError: signal only works in
    # main thread" when pyppeteer runs outside the main thread, see
    # https://blog.csdn.net/weixin_41822224/article/details/103719863
    browser = await pyp.launch(
        handleSIGINT=False,
        handleSIGTERM=False,
        handleSIGHUP=False,
        headless=False,
        userDataDir=r"D:\用户\LENOVO\Chrom浏览器用户数据\用于爬虫使用\UserData18132506429",
        args=[f'--window-size={width},{height+500}'],
        executablePath=r'C:\Users\LENOVO\AppData\Local\Google\Chrome\Application\chrome.exe')
    page = await browser.newPage()
    await antiAntiCrawler(page)
    await page.setExtraHTTPHeaders({
        'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8'
    })
    await page.setViewport({'width': width, 'height': height})
    await page.goto(loginUrl)
    count = 0  # pages processed so far
    cnt = 0    # successful menu clicks so far

    await asyncio.sleep(pause)
    # Pin the browser window on top before interacting with the page.
    get_windows_names()
    place_top()

    # Jump to the last page of the list.
    element = await page.querySelector(pager + " > li:nth-child(6)")
    await element.click(options={'button': 'left',  # left, right or middle
                                 'clickCount': 1,   # 1 or 2
                                 'delay': 500,      # milliseconds
                                 })

    print(element, "第", count)
    await asyncio.sleep(pause)
    while True:
        try:
            # Move to the second-to-last page.
            element = await page.querySelector(pager + " > li:nth-child(5)")
            await element.click()
            await asyncio.sleep(pause)
            for i in range(1, 21):
                action = (content + " > ul.relation-list > li:nth-child("
                          + str(i) + ") > div.content > div > "
                          "div.be-dropdown.fans-action-btn.fans-action-follow")
                try:
                    # The "followed" button; hovering (not focusing) it is
                    # what opens the drop-down menu.
                    element = await page.querySelector(action + " > span")
                    await element.hover()
                    await asyncio.sleep(pause)

                    # Second menu item, clicked to cancel the follow.
                    element = await page.querySelector(
                        action + " > ul > li:nth-child(2)")
                    await element.click()
                    cnt += 1
                    print("已取消关注"+str(cnt)+"个")
                    await asyncio.sleep(pause)
                except Exception as exc:
                    # One failing row must not stop the rest of the page.
                    print("取消关注"+str(cnt + 1)+"个时失败")
                    print(repr(exc))
            count += 1
            print("第"+str(count)+"页")
            if count in (4, 7):
                await asyncio.sleep(100)
            elif count > 10:
                return
        except Exception as exc:
            # Keep retrying as before, but report the error and wait so a
            # persistent failure does not become a silent busy loop.
            print(repr(exc))
            await asyncio.sleep(pause)

# "#page-follows > div > div.follow-main > div.follow-content.section > div.content > ul.relation-list > li:nth-child(5) > div.content > div > div.be-dropdown.fans-action-btn.fans-action-follow > span"
# "#page-follows > div > div.follow-main > div.follow-content.section > div.content > ul.relation-list > li:nth-child(5) > div.content > div > div.be-dropdown.fans-action-btn.fans-action-follow > ul > li:nth-child(2)"
# "#page-follows > div > div.follow-main > div.follow-content.section > div.content > ul.relation-list > li:nth-child(6) > div.content > div > div.be-dropdown.fans-action-btn.fans-action-follow > ul > li:nth-child(2)"




async def antiAntiCrawler(page):
    """Apply anti-bot-detection tweaks to ``page``.

    Sets a desktop Chrome user-agent string and registers a script, evaluated
    on every new document, that makes ``navigator.webdriver`` read as false.

    Args:
        page: object providing the awaitable methods ``setUserAgent`` and
            ``evaluateOnNewDocument`` (a pyppeteer page in this script).
    """
    # The pieces are joined by implicit string concatenation; no backslash
    # belongs inside the literal, otherwise it ends up in the header value.
    await page.setUserAgent(
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/78.0.3904.70 Safari/537.36')

    await page.evaluateOnNewDocument(
        '() =>{ Object.defineProperties(navigator,'
        '{ webdriver:{ get: () => false } }) }')

# url = "https://www.bilibili.com/video/BV1De4y1Z7jb?spm_id_from=333.999.0.0&vd_source=4890592f27e3e22248b3d74c7563dde8"# 8-29 2



# "https://space.bilibili.com/616010334/fans/follow?spm_id_from=333.999.0.0"
# 18132506429: 1002437080
# paprika: 616010334

def crawl():
    """Run ``getOjSourceCode`` to completion on a newly created event loop.

    A new asyncio event loop is created and installed as the current loop
    before the coroutine is scheduled, so the function does not depend on a
    loop already existing for the calling thread.  Once the task has
    finished, ``result()`` is called on it, which re-raises any exception the
    coroutine ended with.
    """
    url = "https://space.bilibili.com/1002437080/fans/follow?spm_id_from=333.999.0.0"
    print("0")
    print("1")
    worker_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(worker_loop)
    job = asyncio.ensure_future(getOjSourceCode(url))
    worker_loop.run_until_complete(asyncio.wait([job]))
    # The loop is left open on purpose; it is never closed here.
    job.result()
# def crawl():
#     url = "https://space.bilibili.com/1002437080/fans/follow?spm_id_from=333.999.0.0"
#     print("0")
#     asyncio.get_event_loop().run_until_complete(getOjSourceCode(url))
#     print("1")
# asyncio.run(set_top)

# Two worker threads: light1 waits for console commands (set_top),
# light2 drives the browser (crawl).
light1 = threading.Thread(target=set_top, )
light2 = threading.Thread(target=crawl, )
# Daemon mode is left disabled, so both threads are non-daemon threads.
# light1.setDaemon(True)
# light2.setDaemon(True)
# The crawler thread is started before the console-command thread.
light2.start()
light1.start()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值