Target site: https://ibaotu.com/sy/17-0-0-0-0-112.html (for learning purposes only)
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
import time
import os
from bs4 import BeautifulSoup
option = ChromeOptions()
# Hide the "Chrome is being controlled by automated test software" banner and
# the Blink flag that some sites use to detect Selenium.
option.add_experimental_option('excludeSwitches', ['enable-automation'])
option.add_argument("--disable-blink-features")
option.add_argument("--disable-blink-features=AutomationControlled")
# Launch Chrome
driver = webdriver.Chrome(options=option)
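# Optional hardening (a sketch, not required for this site): some pages also
# check navigator.webdriver. Chromium drivers expose execute_cdp_cmd, which can
# inject a script that runs before every page load:
driver.execute_cdp_cmd(
    "Page.addScriptToEvaluateOnNewDocument",
    {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
)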
# Open the listing page
driver.get('https://ibaotu.com/sy/17-0-0-0-0-112.html')
# The login pop-up is a plain div, so it is located like any other element.
# Click the "log in" entry in the header.
driver.find_element(By.CSS_SELECTOR, 'body > header.b-header.b-header-float-fixed.b-header-classify.header-have-boxshodow.b-header-fixed--no > div > div.b-header-right.clearfix > div:nth-child(4) > div > p').click()
# Record the window handles, then pick QQ as the login method in the pop-up.
current_window = driver.window_handles
driver.implicitly_wait(5)
driver.switch_to.window(current_window[0])
time.sleep(2)
driver.find_element(By.CSS_SELECTOR, 'body > div.re-popbox.reg-pop.login-New > div > div.login-in-way.clearfix > a.in-way-WX.ibaotu-md-click.auth-type-QQ.btn-social-login-item').click()
# driver.get_screenshot_as_file("D:/1.PNG")  # debug screenshot, if needed
# Switch to the newly opened QQ login window, then into its inner iframe.
current_window1 = driver.window_handles
driver.switch_to.window(current_window1[1])
driver.switch_to.frame(0)
# Switch the QQ widget from QR-code mode to account/password mode.
driver.find_element(By.CSS_SELECTOR, '#switcher_plogin').click()
# Log in (replace these placeholders with your own QQ credentials)
username = 'your_qq_account'
password = 'your_qq_password'
time.sleep(3)
driver.switch_to.window(current_window1[1])
# Pitfall: the login form sits inside an iframe, so we must enter the iframe
# again before locating the input fields.
driver.switch_to.frame(0)
driver.find_element(By.CSS_SELECTOR, '#u').send_keys(username)
time.sleep(2)
driver.find_element(By.CSS_SELECTOR, '#p').send_keys(password)
time.sleep(3)
driver.find_element(By.ID, "login_button").click()
time.sleep(3)
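# The fixed sleeps around the window switches are fragile. A sturdier pattern
# (a sketch, not what this script uses) is to wait explicitly for the new
# window to appear before switching:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_and_switch(drv, window_count, timeout=10):
    # Block until the browser reports `window_count` handles, then switch to the newest one.
    WebDriverWait(drv, timeout).until(EC.number_of_windows_to_be(window_count))
    drv.switch_to.window(drv.window_handles[-1])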
# If a slider captcha appears, unlock it manually here.
# Then switch back to the ibaotu window.
current_window2 = driver.window_handles
driver.switch_to.window(current_window2[0])
all_window_height = []  # record the page's maximum height after each scroll
all_window_height.append(driver.execute_script("return document.body.scrollHeight;"))  # height of the initial page
while True:
    driver.execute_script("scroll(0,15000)")  # drag the scrollbar down
    time.sleep(3)
    check_height = driver.execute_script("return document.body.scrollHeight;")
    if check_height == all_window_height[-1]:  # unchanged height means we reached the bottom
        break
    else:
        all_window_height.append(check_height)  # otherwise record the new height and keep scrolling
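# The loop above works; a reusable variant with a safety cap (the cap value is
# an assumption, to avoid spinning forever on pages that keep growing):
def scroll_to_bottom(drv, pause=3, max_rounds=50):
    last_height = drv.execute_script("return document.body.scrollHeight;")
    for _ in range(max_rounds):
        drv.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = drv.execute_script("return document.body.scrollHeight;")
        if new_height == last_height:
            break  # height stopped changing, i.e. we reached the bottom
        last_height = new_height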
# Parse the fully loaded page
soup = BeautifulSoup(driver.page_source, 'html.parser')  # the complete rendered DOM
body = soup.find('div', attrs={'class': 'skin-wrap body-background-gradient'})
body = body.find('div', attrs={'class': 'search-list box-bg-search box-bottom-gradient clearfix'})  # find returns only the first match
body = body.find_all('div', attrs={'class': 'hover-pop'})
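# The three chained find calls above can be collapsed into one CSS selector
# (a sketch; should yield the same nodes, assuming a single search-list
# container on the page; the loop below keeps using `body`):
cards = soup.select('div.search-list div.hover-pop')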
url_list = []
f = open('D:/tags.txt', 'w', encoding='utf-8')
for hover in body:
    a_label = hover.find('a')
    txt_url = 'https:' + a_label['href']  # hrefs are protocol-relative (//ibaotu.com/...)
    url_list.append(txt_url)
    f.write(txt_url)
    f.write('\n')
f.close()
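# Optional: re-read the saved URLs from disk, e.g. when resuming a later run
# without re-scraping the listing page (a sketch; equivalent to url_list above):
with open('D:/tags.txt', 'r', encoding='utf-8') as f_url:
    url_list = [line.strip() for line in f_url if line.strip()]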
count = 0
path = 'D:/'
length = len(url_list)
# Walk the detail pages backwards and grab the first 20 (see the note below the loop).
for i in range(length - 1, length - 21, -1):
    if count < 20:
        print(url_list[i])
        driver.get(url_list[i])
        soup = BeautifulSoup(driver.page_source, 'html.parser')  # rendered DOM of the detail page
        body = soup.find('div', attrs={'class': 'related-search clearfix'})
        if not os.path.isdir(path + str(count)):
            os.mkdir(path + str(count))
        txt_path = path + str(count) + '/' + str(count) + '.txt'
        # Write this image's related-search tags into their own file;
        # the with block closes the handle automatically.
        with open(txt_path, 'w', encoding='utf-8') as txt_file:
            for a in body.find_all('a'):
                print(a.text)
                txt_file.write(a.text)
                txt_file.write('\n')
        count += 1
        time.sleep(6)
    else:
        break
Note: the site updates at irregular intervals, with new images pushed in from the front so that older ones overflow toward the last page. Crawling forward from the first image on the front page therefore produces duplicates: an image that sat at position x before an update sits at x+k afterwards, and if the next crawl happens to cover position x+k it re-downloads the same image. That is why the crawl runs backwards, starting from the last image on the last page.
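A more robust fix than relying on crawl order is to persist the URLs that have already been crawled and skip them on later runs. A minimal sketch, assuming a bookkeeping file at a hypothetical D:/seen_urls.txt:

import os

SEEN_PATH = 'D:/seen_urls.txt'  # hypothetical bookkeeping file

def load_seen(path=SEEN_PATH):
    # Return the set of URLs recorded by previous runs (empty on the first run).
    if not os.path.exists(path):
        return set()
    with open(path, 'r', encoding='utf-8') as fh:
        return {line.strip() for line in fh if line.strip()}

def mark_seen(url, path=SEEN_PATH):
    # Append a freshly crawled URL so later runs can skip it.
    with open(path, 'a', encoding='utf-8') as fh:
        fh.write(url + '\n')

With this in place, the download loop can check membership in load_seen() before calling driver.get and call mark_seen afterwards, so duplicates are avoided no matter how positions shift between site updates.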
Result:
Tags obtained: