导入所需库
import asyncio
import time
import pyppeteer as pyp
import bs4
import requests
import win32gui
import win32con
import threading
使Chrome浏览器窗口始终置顶
def get_all_hwnd(hwnd, mouse):
    """Record the title of one top-level window (``EnumWindows`` callback).

    The title is printed next to its handle and appended to the module-level
    ``windows_names`` list, which ``get_windows_names`` resets before each
    enumeration.

    Args:
        hwnd: Window handle supplied by ``win32gui.EnumWindows``.
        mouse: Extra value forwarded by ``EnumWindows``; unused here.
    """
    # Only windows that exist, are enabled and are visible are of interest.
    usable = (win32gui.IsWindow(hwnd)
              and win32gui.IsWindowEnabled(hwnd)
              and win32gui.IsWindowVisible(hwnd))
    if not usable:
        return

    title = win32gui.GetWindowText(hwnd)
    if not title:
        # Untitled windows cannot be matched by name later, so skip them.
        return

    print(hwnd, title)
    windows_names.append(title)
# 桌面所有当前存在的窗口名称
def get_windows_names():
    """Enumerate the visible windows and remember the Chrome window's title.

    Resets the module-level ``windows_names`` list and ``googlewin_name``
    string, fills ``windows_names`` through ``get_all_hwnd``, then stores in
    ``googlewin_name`` the last collected title containing ``" Google"``.
    ``googlewin_name`` stays empty when no title matches.
    """
    global windows_names, googlewin_name
    windows_names = []
    googlewin_name = ""

    print("所有窗口:")
    win32gui.EnumWindows(get_all_hwnd, 0)

    # When several titles contain " Google", the last one enumerated wins.
    google_titles = [title for title in windows_names if " Google" in title]
    if google_titles:
        googlewin_name = google_titles[-1]

    print("定位到窗口:")
    print(googlewin_name)
# 将googlewin_name窗口置顶
def place_top():
    """Pin the window titled ``googlewin_name`` above all other windows.

    Reads the module-level ``googlewin_name`` set by ``get_windows_names``.
    If no title was located, or no window with that title can be found, a
    message is printed and nothing is pinned; previously an empty title or a
    null handle was passed straight to ``ShowWindow`` / ``SetWindowPos``.
    """
    print("置顶窗口")

    if not googlewin_name:
        # get_windows_names() found no title containing " Google"; looking up
        # an empty title could select an unrelated window.
        print("未找到可置顶的窗口")
        return

    hwnd = win32gui.FindWindow(None, googlewin_name)
    if not hwnd:
        # The window may have been closed or retitled since it was located.
        print("未找到可置顶的窗口")
        return

    # Author's note: the window must be shown at normal size (not minimized)
    # before it can be pinned.
    win32gui.ShowWindow(hwnd, win32con.SW_SHOWNORMAL)

    # Change only the z-order: keep position and size, do not activate.
    # Passing win32con.HWND_NOTOPMOST instead would undo the pinning.
    flags = (win32con.SWP_NOMOVE
             | win32con.SWP_NOACTIVATE
             | win32con.SWP_NOOWNERZORDER
             | win32con.SWP_SHOWWINDOW
             | win32con.SWP_NOSIZE)
    win32gui.SetWindowPos(hwnd, win32con.HWND_TOPMOST, 0, 0, 0, 0, flags)
def set_top():
    """Read commands from stdin and re-pin the browser window on ``"1"``.

    Loops until stdin is exhausted or unusable. Each line equal to ``"1"``
    triggers ``place_top()``; every other line is ignored. A failure inside
    ``place_top()`` is reported and the loop keeps listening.
    """
    while True:
        try:
            command = input()
        except (EOFError, OSError, ValueError):
            # stdin is closed or unreadable. The old bare ``except: pass``
            # retried immediately and spun forever at full CPU in this case.
            return

        if command != "1":
            continue

        try:
            place_top()
        except Exception as exc:
            # Pinning is best-effort: keep listening, but do not hide the error.
            print(f"置顶窗口失败: {exc!r}")
多线程问题的解决(仅截取解决线程问题的关键代码)
async def getOjSourceCode(loginUrl):
    """Launch Chrome through pyppeteer (excerpt showing only the launch step).

    Args:
        loginUrl: Page URL; not used by this excerpt.
    """
    width, height = 1400, 600  # page width and height
    launch_options = {
        # pyppeteer's signal handlers are switched off because this coroutine
        # runs in a worker thread, where installing them fails with
        # "ValueError: signal only works in main thread". See
        # https://blog.csdn.net/weixin_41822224/article/details/103719863
        "handleSIGINT": False,
        "handleSIGTERM": False,
        "handleSIGHUP": False,
        "headless": False,
        "userDataDir": r"D:\用户\LENOVO\Chrom浏览器用户数据\用于爬虫使用\UserData18132506429",
        "args": [f'--window-size={width},{height+500}'],
        "executablePath": r'C:\Users\LENOVO\AppData\Local\Google\Chrome\Application\chrome.exe',
    }
    browser = await pyp.launch(**launch_options)
def crawl():
    """Thread entry point: run ``getOjSourceCode`` on a fresh event loop.

    A new loop is created and installed for the calling thread, the crawler
    coroutine is scheduled on it, and the call blocks until the coroutine
    finishes. An exception raised by the coroutine is re-raised here. The
    loop is left open afterwards.
    """
    url = "https://space.bilibili.com/1002437080/fans/follow?spm_id_from=333.999.0.0"
    print("0")
    print("1")

    # Create and install a dedicated loop for this thread.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    crawl_task = loop.create_task(getOjSourceCode(url))
    loop.run_until_complete(asyncio.wait([crawl_task]))
    # Surfaces any exception raised inside the coroutine.
    crawl_task.result()
编写的爬虫代码
async def getOjSourceCode(loginUrl):
    """Open the follow list at *loginUrl* and unfollow accounts page by page.

    Launches a visible Chrome with a fixed user-data directory, pins its
    window on top, clicks the pager entry the author identifies as the last
    page, and then repeatedly steps to the previous pager entry, hovering
    over each of up to 20 list items and clicking the second entry of its
    drop-down menu (unfollow). Processing stops once more than 10 pages have
    been handled, or after several consecutive pager failures. The browser
    is closed when the coroutine ends.

    Args:
        loginUrl: URL of the follow-list page to open.
    """
    width, height = 1400, 600  # page width and height
    content = ("#page-follows > div > div.follow-main > "
               "div.follow-content.section > div.content")
    pager = content + " > ul.be-pager"
    max_page_failures = 5  # consecutive pager failures tolerated before giving up

    browser = await pyp.launch(
        # Signal handlers are switched off because this coroutine runs in a
        # worker thread, where installing them fails with
        # "ValueError: signal only works in main thread". See
        # https://blog.csdn.net/weixin_41822224/article/details/103719863
        handleSIGINT=False,
        handleSIGTERM=False,
        handleSIGHUP=False,
        headless=False,
        userDataDir=r"D:\用户\LENOVO\Chrom浏览器用户数据\用于爬虫使用\UserData18132506429",
        args=[f'--window-size={width},{height+500}'],
        executablePath=r'C:\Users\LENOVO\AppData\Local\Google\Chrome\Application\chrome.exe')
    try:
        page = await browser.newPage()
        await antiAntiCrawler(page)
        await page.setExtraHTTPHeaders({
            'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8'
        })
        await page.setViewport({'width': width, 'height': height})
        await page.goto(loginUrl)

        count = 0  # pages processed
        cnt = 0    # accounts unfollowed

        # Every pause uses asyncio.sleep: time.sleep would block the event
        # loop that this coroutine shares with the browser connection.
        await asyncio.sleep(5)

        # Pin the browser window on top before interacting with the page.
        get_windows_names()
        place_top()

        # Pager entry the author identifies as the last page.
        element = await page.querySelector(pager + " > li:nth-child(6)")
        await element.click(options={
            'button': 'left',   # left, right or middle; defaults to left
            'clickCount': 1,    # 1 or 2
            'delay': 500,       # milliseconds
        })
        print(element, "第", count)
        await asyncio.sleep(5)

        failures = 0
        while True:
            try:
                # Pager entry the author identifies as the second-to-last page.
                element = await page.querySelector(pager + " > li:nth-child(5)")
                await element.click()
            except Exception as exc:
                # The old bare ``except: pass`` retried at once and forever.
                # Report the failure, back off, and give up eventually.
                failures += 1
                print(f"翻页失败({failures}/{max_page_failures}): {exc!r}")
                if failures >= max_page_failures:
                    return
                await asyncio.sleep(5)
                continue
            failures = 0
            await asyncio.sleep(5)

            for i in range(1, 21):
                item = (content + " > ul.relation-list > li:nth-child(" + str(i)
                        + ") > div.content > div"
                        " > div.be-dropdown.fans-action-btn.fans-action-follow")
                try:
                    # "Following" (or "mutual") button; focus() does not open
                    # its menu, hovering does.
                    element = await page.querySelector(item + " > span")
                    await element.hover()
                    await asyncio.sleep(5)
                    # Second entry of the drop-down menu: unfollow.
                    element = await page.querySelector(item + " > ul > li:nth-child(2)")
                    await element.click()
                    cnt += 1
                    print("已取消关注"+str(cnt)+"个")
                    await asyncio.sleep(5)
                except Exception:
                    # A missing or unclickable row is skipped; cnt is unchanged.
                    print("取消关注"+str(cnt + 1)+"个时失败")

            count += 1
            print("第"+str(count)+"页")
            if count == 4 or count == 7:
                # Long pause after pages 4 and 7 (presumably to avoid rate
                # limiting; TODO confirm).
                await asyncio.sleep(100)
            elif count > 10:
                return
    finally:
        # Do not leave the Chrome process running after the crawl ends.
        await browser.close()
# "#page-follows > div > div.follow-main > div.follow-content.section > div.content > ul.relation-list > li:nth-child(5) > div.content > div > div.be-dropdown.fans-action-btn.fans-action-follow > span"
# "#page-follows > div > div.follow-main > div.follow-content.section > div.content > ul.relation-list > li:nth-child(5) > div.content > div > div.be-dropdown.fans-action-btn.fans-action-follow > ul > li:nth-child(2)"
# "#page-follows > div > div.follow-main > div.follow-content.section > div.content > ul.relation-list > li:nth-child(6) > div.content > div > div.be-dropdown.fans-action-btn.fans-action-follow > ul > li:nth-child(2)"
async def antiAntiCrawler(page):
    """Apply anti-bot-detection tweaks to *page*.

    Sets a desktop Chrome user agent and registers a script, evaluated on
    every new document, that makes ``navigator.webdriver`` report ``false``.

    Args:
        page: pyppeteer page to configure.
    """
    # The first fragment used to end in "\ ", an invalid escape sequence that
    # put a literal backslash into the user agent that was sent.
    await page.setUserAgent('Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                            'AppleWebKit/537.36 (KHTML, like Gecko) '
                            'Chrome/78.0.3904.70 Safari/537.36')
    # A newer user agent noted by the original author, kept for reference:
    # Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.70
    await page.evaluateOnNewDocument(
        '() =>{ Object.defineProperties(navigator,'
        '{ webdriver:{ get: () => false } }) }')
# url = "https://www.bilibili.com/video/BV1De4y1Z7jb?spm_id_from=333.999.0.0&vd_source=4890592f27e3e22248b3d74c7563dde8"# 8-29 2
# "https://space.bilibili.com/616010334/fans/follow?spm_id_from=333.999.0.0"
# 18132506429: 1002437080
# paprika: 616010334
def crawl():
    """Thread entry point: run ``getOjSourceCode`` on a fresh event loop.

    A new loop is created and installed for the calling thread, the crawler
    coroutine is scheduled on it, and the call blocks until the coroutine
    finishes. An exception raised by the coroutine is re-raised here. The
    loop is left open afterwards.
    """
    url = "https://space.bilibili.com/1002437080/fans/follow?spm_id_from=333.999.0.0"
    print("0")
    print("1")

    # Create and install a dedicated loop for this thread.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    crawl_task = loop.create_task(getOjSourceCode(url))
    loop.run_until_complete(asyncio.wait([crawl_task]))
    # Surfaces any exception raised inside the coroutine.
    crawl_task.result()
# def crawl():
# url = "https://space.bilibili.com/1002437080/fans/follow?spm_id_from=333.999.0.0"
# print("0")
# asyncio.get_event_loop().run_until_complete(getOjSourceCode(url))
# print("1")
# asyncio.run(set_top)
运行代码
# Listener thread: reads stdin and re-pins the browser window on request.
light1 = threading.Thread(target=set_top)
# Crawler thread: runs the pyppeteer coroutine on its own event loop.
light2 = threading.Thread(target=crawl)

# The daemon flag is left at its default for both threads.
# The crawler is started before the stdin listener.
light2.start()
light1.start()
完整代码
import asyncio
import time
import pyppeteer as pyp
import bs4
import requests
import win32gui
import win32con
import threading
def get_all_hwnd(hwnd, mouse):
    """Record the title of one top-level window (``EnumWindows`` callback).

    The title is printed next to its handle and appended to the module-level
    ``windows_names`` list, which ``get_windows_names`` resets before each
    enumeration.

    Args:
        hwnd: Window handle supplied by ``win32gui.EnumWindows``.
        mouse: Extra value forwarded by ``EnumWindows``; unused here.
    """
    # Only windows that exist, are enabled and are visible are of interest.
    usable = (win32gui.IsWindow(hwnd)
              and win32gui.IsWindowEnabled(hwnd)
              and win32gui.IsWindowVisible(hwnd))
    if not usable:
        return

    title = win32gui.GetWindowText(hwnd)
    if not title:
        # Untitled windows cannot be matched by name later, so skip them.
        return

    print(hwnd, title)
    windows_names.append(title)
# 桌面所有当前存在的窗口名称
def get_windows_names():
    """Enumerate the visible windows and remember the Chrome window's title.

    Resets the module-level ``windows_names`` list and ``googlewin_name``
    string, fills ``windows_names`` through ``get_all_hwnd``, then stores in
    ``googlewin_name`` the last collected title containing ``" Google"``.
    ``googlewin_name`` stays empty when no title matches.
    """
    global windows_names, googlewin_name
    windows_names = []
    googlewin_name = ""

    print("所有窗口:")
    win32gui.EnumWindows(get_all_hwnd, 0)

    # When several titles contain " Google", the last one enumerated wins.
    google_titles = [title for title in windows_names if " Google" in title]
    if google_titles:
        googlewin_name = google_titles[-1]

    print("定位到窗口:")
    print(googlewin_name)
# 将googlewin_name窗口置顶
def place_top():
    """Pin the window titled ``googlewin_name`` above all other windows.

    Reads the module-level ``googlewin_name`` set by ``get_windows_names``.
    If no title was located, or no window with that title can be found, a
    message is printed and nothing is pinned; previously an empty title or a
    null handle was passed straight to ``ShowWindow`` / ``SetWindowPos``.
    """
    print("置顶窗口")

    if not googlewin_name:
        # get_windows_names() found no title containing " Google"; looking up
        # an empty title could select an unrelated window.
        print("未找到可置顶的窗口")
        return

    hwnd = win32gui.FindWindow(None, googlewin_name)
    if not hwnd:
        # The window may have been closed or retitled since it was located.
        print("未找到可置顶的窗口")
        return

    # Author's note: the window must be shown at normal size (not minimized)
    # before it can be pinned.
    win32gui.ShowWindow(hwnd, win32con.SW_SHOWNORMAL)

    # Change only the z-order: keep position and size, do not activate.
    # Passing win32con.HWND_NOTOPMOST instead would undo the pinning.
    flags = (win32con.SWP_NOMOVE
             | win32con.SWP_NOACTIVATE
             | win32con.SWP_NOOWNERZORDER
             | win32con.SWP_SHOWWINDOW
             | win32con.SWP_NOSIZE)
    win32gui.SetWindowPos(hwnd, win32con.HWND_TOPMOST, 0, 0, 0, 0, flags)
def set_top():
    """Read commands from stdin and re-pin the browser window on ``"1"``.

    Loops until stdin is exhausted or unusable. Each line equal to ``"1"``
    triggers ``place_top()``; every other line is ignored. A failure inside
    ``place_top()`` is reported and the loop keeps listening.
    """
    while True:
        try:
            command = input()
        except (EOFError, OSError, ValueError):
            # stdin is closed or unreadable. The old bare ``except: pass``
            # retried immediately and spun forever at full CPU in this case.
            return

        if command != "1":
            continue

        try:
            place_top()
        except Exception as exc:
            # Pinning is best-effort: keep listening, but do not hide the error.
            print(f"置顶窗口失败: {exc!r}")
async def getOjSourceCode(loginUrl):
    """Open the follow list at *loginUrl* and unfollow accounts page by page.

    Launches a visible Chrome with a fixed user-data directory, pins its
    window on top, clicks the pager entry the author identifies as the last
    page, and then repeatedly steps to the previous pager entry, hovering
    over each of up to 20 list items and clicking the second entry of its
    drop-down menu (unfollow). Processing stops once more than 10 pages have
    been handled, or after several consecutive pager failures. The browser
    is closed when the coroutine ends.

    Args:
        loginUrl: URL of the follow-list page to open.
    """
    width, height = 1400, 600  # page width and height
    content = ("#page-follows > div > div.follow-main > "
               "div.follow-content.section > div.content")
    pager = content + " > ul.be-pager"
    max_page_failures = 5  # consecutive pager failures tolerated before giving up

    browser = await pyp.launch(
        # Signal handlers are switched off because this coroutine runs in a
        # worker thread, where installing them fails with
        # "ValueError: signal only works in main thread". See
        # https://blog.csdn.net/weixin_41822224/article/details/103719863
        handleSIGINT=False,
        handleSIGTERM=False,
        handleSIGHUP=False,
        headless=False,
        userDataDir=r"D:\用户\LENOVO\Chrom浏览器用户数据\用于爬虫使用\UserData18132506429",
        args=[f'--window-size={width},{height+500}'],
        executablePath=r'C:\Users\LENOVO\AppData\Local\Google\Chrome\Application\chrome.exe')
    try:
        page = await browser.newPage()
        await antiAntiCrawler(page)
        await page.setExtraHTTPHeaders({
            'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8'
        })
        await page.setViewport({'width': width, 'height': height})
        await page.goto(loginUrl)

        count = 0  # pages processed
        cnt = 0    # accounts unfollowed

        # Every pause uses asyncio.sleep: time.sleep would block the event
        # loop that this coroutine shares with the browser connection.
        await asyncio.sleep(5)

        # Pin the browser window on top before interacting with the page.
        get_windows_names()
        place_top()

        # Pager entry the author identifies as the last page.
        element = await page.querySelector(pager + " > li:nth-child(6)")
        await element.click(options={
            'button': 'left',   # left, right or middle; defaults to left
            'clickCount': 1,    # 1 or 2
            'delay': 500,       # milliseconds
        })
        print(element, "第", count)
        await asyncio.sleep(5)

        failures = 0
        while True:
            try:
                # Pager entry the author identifies as the second-to-last page.
                element = await page.querySelector(pager + " > li:nth-child(5)")
                await element.click()
            except Exception as exc:
                # The old bare ``except: pass`` retried at once and forever.
                # Report the failure, back off, and give up eventually.
                failures += 1
                print(f"翻页失败({failures}/{max_page_failures}): {exc!r}")
                if failures >= max_page_failures:
                    return
                await asyncio.sleep(5)
                continue
            failures = 0
            await asyncio.sleep(5)

            for i in range(1, 21):
                item = (content + " > ul.relation-list > li:nth-child(" + str(i)
                        + ") > div.content > div"
                        " > div.be-dropdown.fans-action-btn.fans-action-follow")
                try:
                    # "Following" (or "mutual") button; focus() does not open
                    # its menu, hovering does.
                    element = await page.querySelector(item + " > span")
                    await element.hover()
                    await asyncio.sleep(5)
                    # Second entry of the drop-down menu: unfollow.
                    element = await page.querySelector(item + " > ul > li:nth-child(2)")
                    await element.click()
                    cnt += 1
                    print("已取消关注"+str(cnt)+"个")
                    await asyncio.sleep(5)
                except Exception:
                    # A missing or unclickable row is skipped; cnt is unchanged.
                    print("取消关注"+str(cnt + 1)+"个时失败")

            count += 1
            print("第"+str(count)+"页")
            if count == 4 or count == 7:
                # Long pause after pages 4 and 7 (presumably to avoid rate
                # limiting; TODO confirm).
                await asyncio.sleep(100)
            elif count > 10:
                return
    finally:
        # Do not leave the Chrome process running after the crawl ends.
        await browser.close()
# "#page-follows > div > div.follow-main > div.follow-content.section > div.content > ul.relation-list > li:nth-child(5) > div.content > div > div.be-dropdown.fans-action-btn.fans-action-follow > span"
# "#page-follows > div > div.follow-main > div.follow-content.section > div.content > ul.relation-list > li:nth-child(5) > div.content > div > div.be-dropdown.fans-action-btn.fans-action-follow > ul > li:nth-child(2)"
# "#page-follows > div > div.follow-main > div.follow-content.section > div.content > ul.relation-list > li:nth-child(6) > div.content > div > div.be-dropdown.fans-action-btn.fans-action-follow > ul > li:nth-child(2)"
async def antiAntiCrawler(page):
    """Apply anti-bot-detection tweaks to *page*.

    Sets a desktop Chrome user agent and registers a script, evaluated on
    every new document, that makes ``navigator.webdriver`` report ``false``.

    Args:
        page: pyppeteer page to configure.
    """
    # The first fragment used to end in "\ ", an invalid escape sequence that
    # put a literal backslash into the user agent that was sent.
    await page.setUserAgent('Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                            'AppleWebKit/537.36 (KHTML, like Gecko) '
                            'Chrome/78.0.3904.70 Safari/537.36')
    # A newer user agent noted by the original author, kept for reference:
    # Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.70
    await page.evaluateOnNewDocument(
        '() =>{ Object.defineProperties(navigator,'
        '{ webdriver:{ get: () => false } }) }')
# url = "https://www.bilibili.com/video/BV1De4y1Z7jb?spm_id_from=333.999.0.0&vd_source=4890592f27e3e22248b3d74c7563dde8"# 8-29 2
# "https://space.bilibili.com/616010334/fans/follow?spm_id_from=333.999.0.0"
# 18132506429: 1002437080
# paprika: 616010334
def crawl():
    """Thread entry point: run ``getOjSourceCode`` on a fresh event loop.

    A new loop is created and installed for the calling thread, the crawler
    coroutine is scheduled on it, and the call blocks until the coroutine
    finishes. An exception raised by the coroutine is re-raised here. The
    loop is left open afterwards.
    """
    url = "https://space.bilibili.com/1002437080/fans/follow?spm_id_from=333.999.0.0"
    print("0")
    print("1")

    # Create and install a dedicated loop for this thread.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    crawl_task = loop.create_task(getOjSourceCode(url))
    loop.run_until_complete(asyncio.wait([crawl_task]))
    # Surfaces any exception raised inside the coroutine.
    crawl_task.result()
# def crawl():
# url = "https://space.bilibili.com/1002437080/fans/follow?spm_id_from=333.999.0.0"
# print("0")
# asyncio.get_event_loop().run_until_complete(getOjSourceCode(url))
# print("1")
# asyncio.run(set_top)
# Listener thread: reads stdin and re-pins the browser window on request.
light1 = threading.Thread(target=set_top)
# Crawler thread: runs the pyppeteer coroutine on its own event loop.
light2 = threading.Thread(target=crawl)

# The daemon flag is left at its default for both threads.
# The crawler is started before the stdin listener.
light2.start()
light1.start()