1. 打开网页,批量保存
注意:其中的 xpath 可能会随页面改版而改变,运行失败时请自行调试并更新
from selenium import webdriver
from time import sleep
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
import time
import traceback
import logging
# Log file that records every link the save loop fails to process; 'w+'
# truncates it, so each run starts with an empty log.
f = open("C:\\yuyang2\\桌面\\yyDownloadError.log", 'w+', encoding='utf-8')

# XPaths of the entries on the Shimo workbench page, keyed by a short name.
# They depend on the current page layout and may need to be updated.
# NOTE(review): 'jpgc' and 'jphg' hold the same XPath — confirm this is intended.
xpathDict = {
    'jpgc': '//*[@id="root"]/div/div[2]/div[2]/div/div[1]/div/section/div/div/div/div[1]/div/div/div[1]/div[2]/a/div[2]/div[1]/div/div',
    'jprb': '//*[@id="root"]/div/div[2]/div[2]/div/div[1]/div/section/div/div/div/div[1]/div/div/div[1]/div[3]/a/div[2]/div[1]/div/div',
    'jphg': '//*[@id="root"]/div/div[2]/div[2]/div/div[1]/div/section/div/div/div/div[1]/div/div/div[1]/div[2]/a/div[2]/div[1]/div/div',
}

# NOTE(review): the chromedriver path is passed positionally (legacy calling
# convention); newer Selenium releases may expect a Service object — confirm
# against the installed version.
driver = webdriver.Chrome("C:\\Program Files (x86)\\Google\\Chrome\\Application\\chromedriver.exe")
driver.get("https://shimo.im/login?from=home")
driver.implicitly_wait(15)  # implicit wait: up to 15 s for each element lookup

# Explicit-wait settings: timeout in seconds and polling interval in seconds.
waitTime, frequency = 10, 1

# Log in by scanning the WeChat QR code; the scan itself is done by hand.
driver.find_element(By.CLASS_NAME, "wechat").click()
# Block until the browser has been redirected to the desktop page.
WebDriverWait(driver, waitTime, frequency).until(EC.url_to_be("https://shimo.im/desktop"))
driver.find_element(By.LINK_TEXT, "工作台").click()

# Open the document located by the 'jphg' XPath once it is present in the DOM.
# (The variable keeps its historical name 'jpgc_ele'.)
jpgc_ele = WebDriverWait(driver, waitTime, frequency).until(
    EC.presence_of_element_located((By.XPATH, xpathDict['jphg'])))
jpgc_ele.click()

# Despite its name, page_url holds the full HTML source of the current page.
page_url = driver.page_source
# Every Baidu Pan share link that appears in an href attribute of the page.
url_list = re.findall('href="(https://pan.baidu.com.*?)"', page_url)
def _record_failure(url, index, error):
    """Write a failure entry for one link to the error log and echo it.

    Must be called from inside an ``except`` block so that
    ``traceback.format_exc()`` can pick up the active exception.
    """
    messsage = "链接为%s的第%d个视频保存失败!,错误信息为%s %s\n" % (
        url, index, error, traceback.format_exc())
    f.write(messsage)
    print(messsage)


# XPaths of the folder nodes in the "save to" dialog tree, in click order.
# NOTE(review): 'hg' points at the same li[2] node as 'rb' but stops at the
# div instead of span/span — confirm against the live page.
selectTree = {
    'movie': '//div[2]/div/ul/li/ul/li[6]/div/span/span',
    'jlfq': '//div[2]/div/ul/li/ul/li[6]/ul/li[9]/div/span/span',
    'gc': '//div[2]/div/ul/li/ul/li[6]/ul/li[9]/ul/li/div/span/span',
    'rb': '//div[2]/div/ul/li/ul/li[6]/ul/li[9]/ul/li[2]/div/span/span',
    'hg': '//div[2]/div/ul/li/ul/li[6]/ul/li[9]/ul/li[2]/div',
}

# The Baidu login is needed once, on the first link that actually reaches the
# save step.  Tracking it with a flag (rather than testing i == 0) keeps the
# login from being skipped when the first link fails early.
logged_in = False

try:
    for i, url in enumerate(url_list):
        print("正在保存第%d个视频!" % (i))
        driver.get(url)  # open the share link

        try:
            time.sleep(0.5)
            # Fill in the extraction code once the input box is present.
            WebDriverWait(driver, waitTime, frequency).until(
                EC.presence_of_element_located((By.ID, "accessCode"))).send_keys("WJSY")
        except Exception as e:
            _record_failure(url, i, e)
            continue

        driver.find_element(By.LINK_TEXT, "提取文件").click()
        time.sleep(0.1)

        try:
            # Tick the checkbox of the shared content.
            WebDriverWait(driver, waitTime, frequency).until(
                EC.presence_of_element_located((By.CLASS_NAME, "EOGexf"))).click()
        except Exception as e:
            _record_failure(url, i, e)
            continue

        driver.find_element(By.LINK_TEXT, "保存到网盘").click()

        if not logged_in:
            # Switch the login dialog to QR-code mode; the scan is done by hand.
            WebDriverWait(driver, waitTime, frequency).until(
                EC.presence_of_element_located((By.ID, "TANGRAM__PSP_11__footerQrcodeBtn"))).click()
            # The user-name element appearing is taken as "login finished".
            WebDriverWait(driver, waitTime, frequency).until(
                EC.presence_of_element_located((By.CLASS_NAME, "user-name")))
            # After logging in, select the content again and retry the save.
            WebDriverWait(driver, waitTime, frequency).until(
                EC.presence_of_element_located((By.CLASS_NAME, "EOGexf"))).click()
            driver.find_element(By.LINK_TEXT, "保存到网盘").click()
            logged_in = True

        time.sleep(1)

        # Move the mouse over the folder-picker dialog.
        user_info_ele = driver.find_element(By.ID, "fileTreeDialog")
        ActionChains(driver).move_to_element(user_info_ele).perform()

        # Click down the folder tree; 'gc' and 'rb' are skipped, so only
        # 'movie', 'jlfq' and 'hg' are clicked, in that order.
        for k, xpath in selectTree.items():
            if k in ('gc', 'rb'):
                continue
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, xpath))).click()

        # Confirm the chosen target folder.
        driver.find_element(
            By.CSS_SELECTOR,
            "#fileTreeDialog > div.dialog-footer.g-clearfix > a.g-button.g-button-blue-large > span > span").click()
        # Wait for the transfer tip element to show up before moving on.
        WebDriverWait(driver, waitTime, frequency).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'trans-tip-view')))
finally:
    # Always release the log file and the browser, even if a step above raised.
    f.close()
    driver.quit()
2. 后处理
根据被禁的 url 列表,从网页中找出每个 url 对应的名字,并连同 url 一起重新保存
from selenium import webdriver
from time import sleep
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
import time
import traceback
# XPath of the document entry to open on the Shimo workbench page; it depends
# on the current page layout and may need to be updated.
xpathDict = {}
xpathDict['jphg'] = '//*[@id="root"]/div/div[2]/div[2]/div/div[1]/div/section/div/div/div/div[1]/div/div/div[1]/div[2]/a/div[2]/div[1]/div/div'

# NOTE(review): the chromedriver path is passed positionally (legacy calling
# convention); newer Selenium releases may expect a Service object — confirm
# against the installed version.
driver = webdriver.Chrome("C:\\Program Files (x86)\\Google\\Chrome\\Application\\chromedriver.exe")
try:
    driver.get("https://shimo.im/login?from=home")
    driver.implicitly_wait(15)  # implicit wait: up to 15 s for each element lookup

    # Explicit-wait settings: timeout in seconds and polling interval in seconds.
    waitTime, frequency = 10, 1

    # Log in by scanning the WeChat QR code; the scan itself is done by hand.
    driver.find_element(By.CLASS_NAME, "wechat").click()
    # Block until the browser has been redirected to the desktop page.
    WebDriverWait(driver, waitTime, frequency).until(EC.url_to_be("https://shimo.im/desktop"))
    driver.find_element(By.LINK_TEXT, "工作台").click()

    # Open the document located by the 'jphg' XPath once it is present.
    # (The variable keeps its historical name 'jpgc_ele'.)
    jpgc_ele = WebDriverWait(driver, waitTime, frequency).until(
        EC.presence_of_element_located((By.XPATH, xpathDict['jphg'])))
    jpgc_ele.click()

    # Despite its name, page_url holds the full HTML source of the page.
    page_url = driver.page_source
finally:
    # Only the captured HTML is used from here on, so the browser is closed
    # whether or not the steps above succeeded.
    driver.quit()

# Every Baidu Pan share link that appears in an href attribute of the page.
url_list = re.findall('href="(https://pan.baidu.com.*?)"', page_url)
# Titles of the form 【...】 (up to four whitespace-separated tokens inside the
# brackets, plus one optional trailing token) that are followed by "链接".
# Raw string so that \S and \s reach the regex engine without relying on
# Python's handling of invalid string escapes.
name_list = re.findall(r'(【\S+\s*\S*\s*\S*\s*\S*】\s*\S*)链接:*<', page_url)

# Names and links are paired by position, so a count mismatch would silently
# attach wrong names; fail loudly instead (an assert would vanish under -O).
if len(url_list) != len(name_list):
    raise ValueError(
        "found %d links but %d names; cannot pair them by position"
        % (len(url_list), len(name_list)))

# Map each share link to its title.
selectDict = dict(zip(url_list, name_list))

# The banned list is read as plain lines; a line is either a section header
# or, presumably, a bare share URL (verify against the actual file).
with open("C:\\yuyang2\\桌面\\橘片被禁列表.txt", 'r', encoding='utf-8') as f:
    line_list = f.read().splitlines()

save_parts = []
for line in line_list:
    # Keep the "欧美:" section header as-is.
    if line == "欧美:":
        save_parts.append(line + '\n')
    # For a known link, emit "<name>: <url>".
    if line in selectDict:
        save_parts.append(selectDict[line] + ": " + line + '\n')
save_str = ''.join(save_parts)

# 'a+' appends, so results of earlier runs are kept in the output file.
with open("C:\\yuyang2\\桌面\\橘片被禁列表_yy.txt", 'a+', encoding='utf-8') as f:
    f.write(save_str)
注:如果在第 1 步保存失败时将名字和 url 一起打印(记录),则不需要这一步后处理