**Pixiv mirror:** https://www.vilipix.com/

Example:

```python
import time
import re
import requests as rs
import os
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup as bs4
import datetime
'''
Steps for scraping page content with multiple threads:
1. Work out how to extract the content of a single page.
2. Use a thread pool to repeat that extraction in parallel.
Illustrated here by downloading Pixiv ranking images.
'''
# File names must not contain \ / : * ? " < > |
def change(name):
    name = name.strip()
    return re.sub(r'[\\/:*?"<>|]', '_', name)
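# e.g. change('Fate/stay night: UBW') returns 'Fate_stay night_ UBW'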
def mkdir(path):
    folder = os.path.exists(path)
    if not folder:
        os.makedirs(path)
        print(f"======= created folder {path.split('/')[-1]} =======")
    else:
        print(f"======= folder {path.split('/')[-1]} already exists =======")
err_id = 1  # counter used to name fallback folders when a title cannot be used
def scrip_pixiv(date, mode, page, file_src):
    '''
    :param date: date string, e.g. 20230511
    :param mode: ranking type ('daily' for today; weekly and monthly also exist)
    :param page: page number
    :param file_src: destination folder path
    :return:
    '''
    global err_id
    # Pixiv mirror home page
    url_index = 'https://www.vilipix.com'
    url_rank = url_index + '/ranking'
    params = {
        'date': date,
        'mode': mode,
        'p': page
    }
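    # With these params, requests builds a URL such as
    # https://www.vilipix.com/ranking?date=20230511&mode=daily&p=1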
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.51'
    }
    resp_rank = rs.get(url_rank, headers=headers, params=params)
    # If the ranking request succeeded, scrape the ranking page
    if resp_rank.ok:
        bs_rank = bs4(resp_rank.text, 'html.parser')
        all_div_illust = bs_rank.find_all('div', class_='illust')
        for illust_div in all_div_illust:
            # Get the link from the <a> tag
            a_link = illust_div.a.get('href')
            # Build the full detail-page URL
            url_detail = url_index + a_link
            # Request the illustration's detail page
            resp_detail = rs.get(url_detail, headers=headers)
            if resp_detail.ok:
                bs_detail = bs4(resp_detail.text, 'html.parser')
                # Create a folder for this illustration
                file_name = change(bs_detail.title.string.split('-')[0])
                sec_src = file_src + '/' + file_name
                try:
                    mkdir(sec_src)
                except OSError:
                    sec_src = f'{file_src}/invalid_name_{err_id}'
                    err_id += 1
                    mkdir(sec_src)
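                # Caveat: err_id is shared mutable state; with ThreadPoolExecutor
                # the increment is not atomic, so guarding it with a threading.Lock
                # would make the fallback names collision-free.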
                ul_illust_pages = bs_detail.find('ul', class_='illust-pages')
                print(f'======== downloading: {file_name} ========')
                # Iterate over every image in the set
                for detail_li in ul_illust_pages.find_all('li'):
                    # Get the image URL
                    url_img = detail_li.img.get('src')
                    # Request and start the download
                    resp_img = rs.get(url_img, headers=headers)
                    # Derive each image's file name from its URL
                    img_name = re.findall(r"/(\w+\.\w+)\?", url_img)[0]
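                    # e.g. for '.../123456_p0.jpg?x-oss-process=...' the regex
                    # captures '123456_p0.jpg' (assumes the URL has a query string)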
                    if resp_img.ok:
                        try:
                            with open(sec_src + '/' + img_name, 'wb') as f:
                                f.write(resp_img.content)
                            print(f'{img_name} downloaded')
                        except OSError:
                            print('download failed')
                        resp_img.close()
                print(f'======== finished: {file_name} ========')
                print()
                # time.sleep(1)
            resp_detail.close()
    resp_rank.close()
if __name__ == '__main__':
    # Yesterday's date, e.g. 20230627
    day = (datetime.datetime.now() + datetime.timedelta(days=-1)).strftime('%Y%m%d')
    file_src = 'C:/Users/wsx/Desktop/animatess_' + day
    # Create the top-level folder
    mkdir(file_src)
    st = time.time()
    all_page = 10
    with ThreadPoolExecutor(10) as executor:
        for page in range(1, all_page + 1):
            # Pass the arguments to submit directly instead of through a lambda
            executor.submit(scrip_pixiv, day, 'daily', page, file_src)
    # Single-threaded version for comparison:
    # for page in range(1, 4):
    #     scrip_pixiv('20230603', 'daily', page, file_src)
    ed = time.time()
    print(f'Total time: {ed - st} seconds')
```
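One caveat with a bare `executor.submit(...)` is that an exception raised inside a worker thread is silently stored on the returned future and never printed. A minimal sketch (reusing the same `scrip_pixiv`, `day`, `all_page` and `file_src` names from above) of how the main block could keep the futures and surface errors via `as_completed`:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

# Collect the futures so that worker exceptions are not silently dropped
with ThreadPoolExecutor(10) as executor:
    futures = [executor.submit(scrip_pixiv, day, 'daily', page, file_src)
               for page in range(1, all_page + 1)]
    for future in as_completed(futures):
        try:
            # result() re-raises any exception that occurred in the worker thread
            future.result()
        except Exception as e:
            print(f'page task failed: {e}')
```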