# 本文研究如何用 Python 批量下载豆瓣上某位女明星的照片到本地：按分页抓取，下载路径也按分页保存。
# -*- coding: utf-8 -*-
#author:bingw
#create_time:2022/11/16 14:44
#name:YM_prc.py
import requests
from lxml import etree
import time
import os
class YM:
    """Fetch one page of a Douban celebrity photo gallery and save its images.

    Each instance is bound to a single gallery page number. ``downloads`` is
    the entry point: it fetches the listing page (``spider``), extracts the
    image URLs (``parser``) and writes every image into the per-page folder
    returned by ``create_path``.
    """

    # Step of the ``start`` query parameter between consecutive pages
    # (presumably the number of photos the site lists per page -- confirm).
    PAGE_SIZE = 30
    # Seconds to wait on a request before giving up instead of hanging.
    REQUEST_TIMEOUT = 10

    def __init__(self, headers, page):
        """Store the request headers and the 1-based gallery page number.

        Args:
            headers: HTTP headers (dict) sent with every request.
            page: 1-based page number of the gallery to download.
        """
        self.before_url = 'https://movie.douban.com/celebrity/1052359/photos/?type=C&start='
        self.after_url = '&sortby=like&size=a&subtype=a'
        self.headers = headers
        self.page = page
        # Pause (seconds) applied by ``parser`` after each parsed listing page.
        self.time = 0.6

    def _page_url(self):
        """Return the listing URL for ``self.page``."""
        offset = self.PAGE_SIZE * (self.page - 1)
        return f'{self.before_url}{offset}{self.after_url}'

    def spider(self):
        """Fetch the listing page and return its HTML in a list.

        Returns:
            A list with the decoded HTML of the page, or an empty list when
            the server answers with a non-success status code.
        """
        url = self._page_url()
        print(url)
        response = requests.get(
            url=url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        if not response.ok:
            # An error page contains no photos; report it instead of
            # silently parsing it.
            print(f'page {self.page}: HTTP {response.status_code}, skipped')
            return []
        return [response.content.decode('utf8')]

    def parser(self):
        """Extract image URLs from the fetched listing HTML.

        Returns:
            A list with one entry per fetched page; each entry is the list of
            ``src`` attribute values found under ``div.cover``.
        """
        src_list = []
        for result in self.spider():
            html = etree.HTML(result)
            src = html.xpath('//div[@class="cover"]/a/img/@src')
            print(src)
            src_list.append(src)
            # Throttle between pages.
            time.sleep(self.time)
        return src_list

    def create_path(self):
        """Create (if needed) and return the per-page output folder."""
        # Create the data folder for this page.
        path = f"data/杨幂/{self.page}/"
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(path, exist_ok=True)
        return path

    def downloads(self):
        """Download every image of this page into the ``create_path`` folder.

        Files are named ``<page>-<index>.jpg`` with a 1-based index. Images
        whose request returns a non-success status are reported and skipped
        rather than being written to disk as broken ``.jpg`` files.
        """
        src_list = self.parser()
        # Save into the per-page folder instead of a hard-coded absolute
        # directory that is never created.
        save_dir = self.create_path()
        for src in src_list:
            for j, img in enumerate(src, start=1):
                # Send the same user-agent/referer as for the listing page.
                # NOTE(review): the image host may reject requests without
                # them -- confirm.
                response = requests.get(
                    url=img, headers=self.headers, timeout=self.REQUEST_TIMEOUT
                )
                if not response.ok:
                    print(f'{self.page}-{j}: HTTP {response.status_code}, skipped')
                    continue
                print(self.page, j)
                target = os.path.join(save_dir, f'{self.page}-{j}.jpg')
                with open(target, 'wb') as w:
                    w.write(response.content)
if __name__ == '__main__':
    # Headers sent with every request: a desktop-browser user-agent and the
    # gallery page as referer.
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
        "referer": "https://movie.douban.com/celebrity/1052359/photos/"
    }
    # Download gallery pages 1 through 10, one YM instance per page.
    for page in range(1, 11):
        print(f'正在抓取:第 {page} 页图片!')
        yangmi = YM(headers=headers, page=page)
        yangmi.downloads()
    # Reported once, after every page has been processed.
    print('图片下载完毕!')