import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
# Fetch a page and return it parsed.
def getHtml(url):
    """Download *url* and return it as a parsed BeautifulSoup document.

    Parameters
    ----------
    url : str — page to download.

    Returns
    -------
    BeautifulSoup : the parsed HTML document.
    """
    headers = {
        # Pretend to be a regular desktop Chrome so the site serves the page.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
    }
    # timeout guards against hanging forever on a dead connection
    response = requests.get(url, headers=headers, timeout=10)
    # apparent_encoding sniffs the real charset so Chinese text decodes correctly
    response.encoding = response.apparent_encoding
    isStatus(response)  # log success/failure of the request
    html = BeautifulSoup(response.text, 'html.parser')
    return html
# Report whether an HTTP response succeeded.
def isStatus(response):
    """Print whether the request succeeded and return the result.

    Parameters
    ----------
    response : object with a ``status_code`` attribute (e.g. requests.Response).

    Returns
    -------
    bool : True when status_code == 200, else False.
    """
    ok = response.status_code == 200
    if ok:
        print('访问成功')  # "access succeeded"
    else:
        print('访问失败')  # "access failed"
    return ok
# Extract the image ids from the parsed gallery page.
def extractData(text, li):
    """Append the ``primaryid`` of every thumbnail link on the page to *li*.

    Parameters
    ----------
    text : BeautifulSoup document (the result of getHtml).
    li : list — receives the extracted ids (mutated in place).

    Returns
    -------
    list : the same *li*, with the ids appended.
    """
    # Thumbnails live in div.mask elements inside the div.clearfix container.
    container = text.find('div', class_='clearfix')
    if container is None:
        # Page layout changed or the page failed to load: nothing to extract.
        return li
    for data in container.find_all('div', class_='mask'):
        # The <a class="a3"> link carries the image id in its "primaryid" attribute.
        li.append(data.find('a', class_='a3').get('primaryid'))
    return li
# Open the download URL in a simulated browser so the site serves the image.
def getMySelenium(url):
    """Open *url* in a Chrome browser driven by selenium, wait, then quit.

    Parameters
    ----------
    url : str — download page to open.
    """
    chrome_options = webdriver.ChromeOptions()
    # Hide the "Chrome is being controlled by automated test software" banner.
    chrome_options.add_experimental_option("excludeSwitches", ['enable-automation'])
    driver = webdriver.Chrome(options=chrome_options)
    driver.get(url)
    driver.maximize_window()
    # Give the page time to trigger the download before closing the browser.
    time.sleep(5)
    driver.quit()
if __name__ == '__main__':
    li = []
    # Ask the user which gallery page to fetch; kept as a string because it is
    # spliced directly into the URLs below.
    i = input('请输入要获得第几页的图片')
    url = 'https://www.dpm.org.cn/lights/royal/p/' + i + '.html'
    html = getHtml(url)
    ids = extractData(html, li)
    for image_id in ids:
        # NOTE(review): the page number *i* is reused as the img_size path
        # segment, as in the original — presumably what this site expects; verify.
        url2 = ('https://www.dpm.org.cn/download/lights_image/id/'
                + image_id + '/img_size/' + i + '.html')
        print('正在下载', url2)
        getMySelenium(url2)
# Blog-post metadata carried over from the page this code was copied from
# (not part of the program):
# 2021-11-02
# 最新推荐文章于 2021-11-10 17:26:45 发布