Target site: https://ibaotu.com/sy/17-0-0-0-0-112.html (for learning purposes only)
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
import time
import os
from bs4 import BeautifulSoup
option = ChromeOptions()
# Hide the "Chrome is being controlled by automated test software" banner and
# the Blink flag that some sites use to detect Selenium.
option.add_experimental_option('excludeSwitches', ['enable-automation'])
option.add_argument("--disable-blink-features")
option.add_argument("--disable-blink-features=AutomationControlled")
# Launch Chrome
driver = webdriver.Chrome(options=option)
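# Optional hardening (a sketch, not required for this site): some pages also
# check navigator.webdriver. Chromium drivers expose execute_cdp_cmd, which can
# inject a script that runs before every page load:
driver.execute_cdp_cmd(
    "Page.addScriptToEvaluateOnNewDocument",
    {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
)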
# Open the listing page
driver.get('https://ibaotu.com/sy/17-0-0-0-0-112.html')
# The login pop-up is a plain div, so it is located like any other element.
# Click the "log in" entry in the header.
driver.find_element(By.CSS_SELECTOR, 'body > header.b-header.b-header-float-fixed.b-header-classify.header-have-boxshodow.b-header-fixed--no > div > div.b-header-right.clearfix > div:nth-child(4) > div > p').click()
# Record the window handles, then pick QQ as the login method in the pop-up.
current_window = driver.window_handles
driver.implicitly_wait(5)
driver.switch_to.window(current_window[0])
time.sleep(2)
driver.find_element(By.CSS_SELECTOR, 'body > div.re-popbox.reg-pop.login-New > div > div.login-in-way.clearfix > a.in-way-WX.ibaotu-md-click.auth-type-QQ.btn-social-login-item').click()
# driver.get_screenshot_as_file("D:/1.PNG")  # debug screenshot, if needed
# Switch to the newly opened QQ login window, then into its inner iframe.
current_window1 = driver.window_handles
driver.switch_to.window(current_window1[1])
driver.switch_to.frame(0)
# Switch the QQ widget from QR-code mode to account/password mode.
driver.find_element(By.CSS_SELECTOR, '#switcher_plogin').click()
# Log in (replace these placeholders with your own QQ credentials)
username = 'your_qq_account'
password = 'your_qq_password'
time.sleep(3)
driver.switch_to.window(current_window1[1])
# Pitfall: the login form sits inside an iframe, so we must enter the iframe
# again before locating the input fields.
driver.switch_to.frame(0)
driver.find_element(By.CSS_SELECTOR, '#u').send_keys(username)
time.sleep(2)
driver.find_element(By.CSS_SELECTOR, '#p').send_keys(password)
time.sleep(3)
driver.find_element(By.ID, "login_button").click()
time.sleep(3)
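# The fixed sleeps around the window switches are fragile. A sturdier pattern
# (a sketch, not what this script uses) is to wait explicitly for the new
# window to appear before switching:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_and_switch(drv, window_count, timeout=10):
    # Block until the browser reports `window_count` handles, then switch to the newest one.
    WebDriverWait(drv, timeout).until(EC.number_of_windows_to_be(window_count))
    drv.switch_to.window(drv.window_handles[-1])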
# If a slider captcha appears, unlock it manually here.
# Then switch back to the ibaotu window.
current_window2 = driver.window_handles
driver.switch_to.window(current_window2[0])
all_window_height = []  # record the page's maximum height after each scroll
all_window_height.append(driver.execute_script("return document.body.scrollHeight;"))  # height of the initial page
while True:
    driver.execute_script("scroll(0,15000)")  # drag the scrollbar down
    time.sleep(3)
    check_height = driver.execute_script("return document.body.scrollHeight;")
    if check_height == all_window_height[-1]:  # unchanged height means we reached the bottom
        break
    else:
        all_window_height.append(check_height)  # otherwise record the new height and keep scrolling
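# The loop above works; a reusable variant with a safety cap (the cap value is
# an assumption, to avoid spinning forever on pages that keep growing):
def scroll_to_bottom(drv, pause=3, max_rounds=50):
    last_height = drv.execute_script("return document.body.scrollHeight;")
    for _ in range(max_rounds):
        drv.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = drv.execute_script("return document.body.scrollHeight;")
        if new_height == last_height:
            break  # height stopped changing, i.e. we reached the bottom
        last_height = new_height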
# Parse the fully loaded page
soup = BeautifulSoup(driver.page_source, 'html.parser')  # the complete rendered DOM
body = soup.find('div', attrs={'class': 'skin-wrap body-background-gradient'})
body = body.find('div', attrs={'class': 'search-list box-bg-search box-bottom-gradient clearfix'})  # find returns only the first match
body = body.find_all('div', attrs={'class': 'hover-pop'})
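# The three chained find calls above can be collapsed into one CSS selector
# (a sketch; should yield the same nodes, assuming a single search-list
# container on the page; the loop below keeps using `body`):
cards = soup.select('div.search-list div.hover-pop')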
url_list = []
f = open('D:/tags.txt', 'w', encoding='utf-8')
for hover in body:
    a_label = hover.find('a')
    txt_url = 'https:' + a_label['href']  # hrefs are protocol-relative (//ibaotu.com/...)
    url_list.append(txt_url)
    f.write(txt_url)
    f.write('\n')
f.close()
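# Optional: re-read the saved URLs from disk, e.g. when resuming a later run
# without re-scraping the listing page (a sketch; equivalent to url_list above):
with open('D:/tags.txt', 'r', encoding='utf-8') as f_url:
    url_list = [line.strip() for line in f_url if line.strip()]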
count = 0
path = 'D:/'
length = len(url_list)
# Walk the detail pages backwards and grab the first 20 (see the note below the loop).
for i in range(length - 1, length - 21, -1):
    if count < 20:
        print(url_list[i])
        driver.get(url_list[i])
        soup = BeautifulSoup(driver.page_source, 'html.parser')  # rendered DOM of the detail page
        body = soup.find('div', attrs={'class': 'related-search clearfix'})
        if not os.path.isdir(path + str(count)):
            os.mkdir(path + str(count))
        txt_path = path + str(count) + '/' + str(count) + '.txt'
        # Write this image's related-search tags into their own file;
        # the with block closes the handle automatically.
        with open(txt_path, 'w', encoding='utf-8') as txt_file:
            for a in body.find_all('a'):
                print(a.text)
                txt_file.write(a.text)
                txt_file.write('\n')
        count += 1
        time.sleep(6)
    else:
        break
Note: the site updates at irregular intervals, with new images pushed in from the front so that older ones overflow toward the last page. Crawling forward from the first image on the front page therefore produces duplicates: an image that sat at position x before an update sits at x+k afterwards, and if the next crawl happens to cover position x+k it re-downloads the same image. That is why the crawl runs backwards, starting from the last image on the last page.
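A more robust fix than relying on crawl order is to persist the URLs that have already been crawled and skip them on later runs. A minimal sketch, assuming a bookkeeping file at a hypothetical D:/seen_urls.txt:

import os

SEEN_PATH = 'D:/seen_urls.txt'  # hypothetical bookkeeping file

def load_seen(path=SEEN_PATH):
    # Return the set of URLs recorded by previous runs (empty on the first run).
    if not os.path.exists(path):
        return set()
    with open(path, 'r', encoding='utf-8') as fh:
        return {line.strip() for line in fh if line.strip()}

def mark_seen(url, path=SEEN_PATH):
    # Append a freshly crawled URL so later runs can skip it.
    with open(path, 'a', encoding='utf-8') as fh:
        fh.write(url + '\n')

With this in place, the download loop can check membership in load_seen() before calling driver.get and call mark_seen afterwards, so duplicates are avoided no matter how positions shift between site updates.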
Result:
Tags obtained: