Scraping with requests_html no longer returns anything; the site has apparently added blocking. So we need a different approach: instead of sending requests to the site, drive everything through Selenium. As before, open the first page of any work, grab the page source, and extract all the pagination links and all the download links on the current page from that source. The main update is this method:
def getAllSession(url):  # all links on the page currently loaded in the driver
    print('Entering getAllSession()!')
    html = driver.page_source
    pattern1 = re.compile(r'(/Musiclist/mmc_\w+\.htm)')  # pagination links
    pattern2 = re.compile(r'(/down_\d{1,6}\.html)')      # download links
    m = pattern1.findall(html)  # pagination links
    n = pattern2.findall(html)  # download links
    # print('m:', m, '\nn:', n)
    L1 = []
    for x in m:  # de-duplicate the pagination links
        if x not in L1:
            L1.append(x)
    L1.sort()
    # L1.pop(0)  # drop the link back to the first page
    downLink = [[], []]
    downLink[0] = L1  # pagination links
    downLink[1] = n   # download links
    # print('downLink:', downLink)
    return downLink
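For orientation, a minimal usage sketch (it uses the driver and targetUrl set up in the full script below; the link values in the comments are illustrative, not real output):

driver.get(targetUrl)  # the driver must already be on a list page
downLink = getAllSession(targetUrl)
# downLink[0]: de-duplicated, sorted page links, e.g. ['/Musiclist/mmc_235_2916_1.htm', ...]
# downLink[1]: download links in page order, e.g. ['/down_123456.html', ...]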
With the pagination links and all the download links in hand, write them out to a CSV file, then download automatically:
from selenium import webdriver
import requests
import time
import re
import pandas as pd

driver = webdriver.Chrome()
targetUrl = 'https://www.pingshu8.com/MusicList/mmc_235_2916_1.Htm'
headUrl = 'https://www.pingshu8.com'
a = 0      # download progress counter, used by openXss()
lenxs = 0  # total episode count; set in downCsv(), read by openXss()

def openDriver(url):
    driver.get(url)  # point the browser at url

def getAllSession(url):  # all links on the page currently loaded in the driver
    print('Entering getAllSession()!')
    html = driver.page_source
    pattern1 = re.compile(r'(/Musiclist/mmc_\w+\.htm)')  # pagination links
    pattern2 = re.compile(r'(/down_\d{1,6}\.html)')      # download links
    m = pattern1.findall(html)  # pagination links
    n = pattern2.findall(html)  # download links
    # print('m:', m, '\nn:', n)
    L1 = []
    for x in m:  # de-duplicate the pagination links
        if x not in L1:
            L1.append(x)
    L1.sort()
    # L1.pop(0)  # drop the link back to the first page
    downLink = [[], []]
    downLink[0] = L1  # pagination links
    downLink[1] = n   # download links
    # print('downLink:', downLink)
    return downLink
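# The URL scheme all of this relies on, inferred from the regexes above: list pages
# are /Musiclist/mmc_<category>_<work-id>_<page>.htm, so the trailing number is the
# page number. jump_link() below reads the largest page number visible on page 1,
# then rebuilds the links for pages 2..N by plain string concatenation.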
def jump_link():  # the jump links for every page, derived from page 1
    print('Entering jump_link()!')
    # split the page number out of each pagination link found on page 1
    pagenum_list = []
    prefix = ''
    for n in getAllSession(targetUrl)[0]:
        parts = re.split(r'/Musiclist/mmc_\d{2,3}_\d+_', n)  # strip the page-number prefix
        if len(parts) < 2:
            continue  # a link without a trailing page number
        pagenum = parts[1].split('.htm')[0]  # strip the page-number suffix
        pagenum_list.append(int(pagenum))
        prefix = re.match(r'(/Musiclist/mmc_\d{2,3}_\d+_)', n).group(1)
    num = max(pagenum_list) if pagenum_list else 1  # highest page number = total pages
    # with the total page count known, assemble the jump link for pages 2..num
    pagejump = []
    pages = 2
    while pages <= num:
        pagejump.append(headUrl + prefix + str(pages) + '.htm')
        pages += 1
    return pagejump
def down_link(url):  # the full download URLs for one page
    print('Entering down_link()!')
    save_down_link = []
    for j in getAllSession(url)[1]:  # the raw /down_NNN.html links for this page
        links = headUrl + j  # prepend the site root to get a complete URL
        # print(links, type(links))
        save_down_link.append(links)
    return save_down_link
def mp3Name(url):  # the episode titles on the current page
    print('Entering mp3Name()!')
    # titles sit in every third <li>, starting at li[5]: li[5], li[8], ..., li[32],
    # which allows for at most 10 titles per page
    numlist = []
    num = 5
    while num < 33:
        numlist.append(num)
        num += 3
    pstitle = []
    maxfor = len(down_link(url))  # read only as many titles as there are download links
    for nums in numlist[0:maxfor]:
        sel = driver.find_element_by_xpath(
            f'/html/body/div[2]/div[13]/div[1]/ul[2]/div[2]/ul/form/li[{nums}]/a').text
        pstitle.append(sel)
    return pstitle
def getAllLinks(url):  # open every page and collect its titles and download links
    print('Entering getAllLinks()!')
    ps_link_list = []
    ps_txt_list = []
    mylist = []
    ps_link_list.append(down_link(targetUrl))
    ps_txt_list.append(mp3Name(targetUrl))
    for i in jump_link():
        openDriver(i)  # open the next page
        try:
            driver.find_element_by_link_text('首页')  # '首页' (home) only appears once the page has rendered
        except Exception:
            time.sleep(50)  # slow page: wait before scraping it anyway
        ps_link_list.append(down_link(i))
        ps_txt_list.append(mp3Name(i))
    for links in ps_link_list:
        links.sort()
    print(ps_link_list)
    x = 10                     # a full page normally holds 10 items
    y = len(ps_link_list)      # number of pages
    z = len(ps_link_list[-1])  # items on the last page
    m = 0
    while m < y - 1:  # every page except the last, assumed full
        for n in range(x):
            mylist.append((ps_txt_list[m][n], ps_link_list[m][n]))
        m += 1
    # the last page may hold fewer than 10 items; iterating 10 times would overrun
    for l in range(z):
        mylist.append((ps_txt_list[-1][l], ps_link_list[-1][l]))
    return mylist
def openXss(url, title):  # download one episode
    # progress counter, printed as e.g. 1/30
    global a
    a += 1
    print('\n', a, '/', lenxs)
    openDriver(url)
    time.sleep(1)
    handle = driver.current_window_handle  # remember the current tab
    # clicking the download button opens a new tab whose URL is the real file address
    driver.find_element_by_id('clickina').click()
    handles = driver.window_handles  # all open tabs
    for newHand in handles:
        if newHand != handle:  # pick out the newly opened tab
            driver.switch_to.window(newHand)
            link = driver.current_url  # the direct download address
            print(link)
            driver.close()
    driver.switch_to.window(handles[0])  # back to the original tab
    myfile = requests.get(link)
    filename = title  # the episode title becomes the file name
    with open(f'e:\\pydownload\\{filename}.mp3', 'wb') as f:
        f.write(myfile.content)
def Name():  # the title of the whole work, taken from the page header
    name = driver.find_element_by_xpath('/html/body/div[2]/div[12]/div/h1').text
    return name

def csv(records):  # write the (title, link) pairs to <work title>.csv
    print('Entering csv()!')
    mp3name = Name()
    df = pd.DataFrame(records)
    df.columns = ['title', 'link']
    df.to_csv(f'{mp3name}.csv', encoding='gbk', index=False)
def running(url):
    openDriver(url)
    def ex():  # is there a pagination bar, i.e. does the work span several pages?
        try:
            driver.find_element_by_xpath('/html/body/div[2]/div[13]/div[1]/ul[2]/div[3]/div/a[1]')
            return True
        except Exception:
            return False
    if ex():
        # multi-page work: walk every page and write everything to CSV
        return csv(getAllLinks(targetUrl))
    # single-page work: this one page holds everything, and it may have
    # fewer than the usual 10 items, so iterate only over what exists
    mylist = []
    ps_link_list = down_link(targetUrl)
    ps_txt_list = mp3Name(targetUrl)
    for l in range(len(ps_link_list)):
        mylist.append((ps_txt_list[l], ps_link_list[l]))
    return csv(mylist)
def downCsv():
    global a, lenxs  # openXss() reads both, so they must live at module level
    mp3name = Name()
    data = pd.read_csv(f'{mp3name}.csv', encoding='GB18030')
    content = data.values
    lenxs = len(content)
    a = 0
    j = 0
    while j < 5:  # retry the whole run up to 5 times if a download fails midway
        try:
            for i in content:
                title = i[0]
                link = i[1]
                print(title, link)
                openXss(link, title)
            break  # every episode finished; stop retrying
        except Exception:
            j += 1
            print(f'Retry {j}')

running(targetUrl)
downCsv()
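One fragile spot is the bare requests.get(link) inside openXss(): a stalled server hangs the whole run, and a half-written file looks complete on disk. A possible hardening, sketched under the assumption that the direct link serves a plain MP3 over HTTP (save_mp3, the 30-second timeout, and the chunk size are my own choices, not part of the original script):

def save_mp3(link, filename, folder='e:\\pydownload'):
    # Hypothetical replacement for the download step in openXss():
    # stream the body so large files never sit whole in memory, and
    # give up after 30 seconds of silence instead of hanging forever.
    resp = requests.get(link, stream=True, timeout=30)
    resp.raise_for_status()  # surface HTTP errors instead of saving an error page as .mp3
    with open(f'{folder}\\{filename}.mp3', 'wb') as f:
        for chunk in resp.iter_content(chunk_size=65536):
            f.write(chunk)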