最近学了一下爬虫,于是写了一段代码来检验学习成果(目标是爬取某动漫网站上的动漫《DARLING in the FRANXX》)。
使用的 Python 版本是 3.7。
import requests
import re
from selenium import webdriver
import os
# Browser-like User-Agent sent with the index page request.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}

INDEX_URL = "http://www.imomoe.in/view/7288.html"
SITE_ROOT = "http://imomoe.in"
OUTPUT_DIR = "DarlingInTheFranxx"
EPISODE_COUNT = 24
REQUEST_TIMEOUT = 30  # seconds; prevents a stalled connection from hanging forever
CHUNK_SIZE = 1024

# The literal dots are escaped and the wildcard is limited to characters that
# can appear inside an attribute value, so a match cannot run across quotes,
# tags or whitespace into unrelated markup.
EPISODE_LINK_RE = re.compile(r"""/player/7288-0-[^\s"'<>]*?\.html""")
VIDEO_URL_RE = re.compile(r"""https://[^\s"'<>]+?\.mp4""")


def find_episode_paths(html):
    """Return the player-page paths found in the index page HTML, in page order."""
    return EPISODE_LINK_RE.findall(html)


def find_video_url(page_source):
    """Return the first https .mp4 URL in *page_source*, or None if there is none."""
    match = VIDEO_URL_RE.search(page_source)
    return match.group(0) if match else None


def fetch_player_source(page_url):
    """Return the source of the "play2" iframe on the player page at *page_url*.

    No better way was found to obtain the iframe's source, so a real Chrome
    instance is driven through Selenium.
    """
    browser = webdriver.Chrome()
    try:
        browser.get(page_url)
        browser.switch_to.frame("play2")
        return browser.page_source
    finally:
        # Always close the browser, even when navigation or the frame switch fails.
        browser.quit()


def download_video(video_url, target):
    """Stream *video_url* to the file *target*.

    The data is written to ``target + ".part"`` and renamed only after the
    whole body has been received, so an interrupted download never leaves a
    truncated file that a later run would mistake for a finished episode.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    partial = target + ".part"
    with requests.get(video_url, stream=True, timeout=REQUEST_TIMEOUT) as response:
        response.raise_for_status()
        with open(partial, 'wb') as out_file:
            for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                if chunk:
                    out_file.write(chunk)
    os.replace(partial, target)


def main():
    """Download every episode that is not already present in OUTPUT_DIR."""
    # The User-Agent must be sent as request headers, not as query parameters.
    response = requests.get(INDEX_URL, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    episode_paths = find_episode_paths(response.text)

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    for number in range(1, EPISODE_COUNT + 1):
        target = os.path.join(OUTPUT_DIR, f"DITF{number}.mp4")
        if os.path.exists(target):
            print(f"第{number}集已经存在")
            continue

        # The index page may list fewer links than expected; report it
        # instead of failing with an IndexError.
        if number > len(episode_paths):
            print(f"未找到第{number}集的播放页链接")
            continue

        page_source = fetch_player_source(SITE_ROOT + episode_paths[number - 1])
        video_url = find_video_url(page_source)
        if video_url is None:
            print(f"未找到第{number}集的视频地址")
            continue

        print(f"正在下载第{number}集")
        download_video(video_url, target)
        print(f"已下载第{number}集")


if __name__ == "__main__":
    main()
运行测试结果如下: