GitHub - muzi-xiaoren/pixiv_Crawler: pixiv站爬虫
如果有用的话,拜托各位点个star吧。
pixiv_spider.py
from download import *
if __name__ == "__main__":
    # Interactive entry point: pick a crawl mode, create the output
    # directory, then delegate to the matching crawler from download.py.
    print('''1.下载排行榜(日/周/月榜)
2.下载画师主页
3.下载个人主页最近更新''')
    base_url = 'https://www.pixiv.net/'
    while True:
        choice = input('请输入想要下载的模式:')
        if choice == '1':  # e.g. https://www.pixiv.net/ranking.php?mode=monthly&p=1&format=json
            print('输入下载的模式: daily / daily_r18 / weekly / weekly_r18 / monthly')
            mode = input('输入上面的选项之一:')
            os.makedirs(mode, exist_ok=True)
            page = int(input('输入想要下载的页数(50张为一页):'))
            for i in range(page):
                # BUG FIX: build a fresh URL for every page. The original did
                # `url += ...` each iteration, so page 2 onward appended a new
                # query string onto the previous page's full URL.
                url = f"{base_url}ranking.php?mode={mode}&p={i + 1}&format=json"
                crawler_ranking(url, i, mode)
        elif choice == '2':  # e.g. https://www.pixiv.net/ajax/user/23945843/profile/all?lang=zh
            num = input('输入作者主页号:')
            mode = 'user' + num
            os.makedirs(mode, exist_ok=True)
            url = base_url + 'ajax/user/' + num + '/profile/all?lang=zh'
            crawler_users(url, mode)
        elif choice == '3':  # e.g. https://www.pixiv.net/ajax/follow_latest/illust?lang=zh&mode=r18&p=1
            num = int(input('是否只下载r18(否输入0 是输入1)'))
            page = int(input('输入想要下载的页数(60张为一页):'))
            mode = 'latest'
            query = 'ajax/follow_latest/illust?lang=zh'
            if num:
                query += '&mode=r18'
                mode += '_r18'
            os.makedirs(mode, exist_ok=True)
            for i in range(page):
                # BUG FIX: one fresh URL per page (original kept appending
                # "&p=1&p=2..." to the same string).
                url = f"{base_url}{query}&p={i + 1}"
                crawler_latest(url, i, mode)
        else:
            print('输入错误,请输入 1, 2 or 3')
            continue
        break
    print('下载完成')
# https://www.pixiv.net/artworks/92691155
download.py
import requests
import os
from threading import Thread
import threading
# Lock = threading.Lock()
# Shared request headers for pixiv's JSON APIs.
# NOTE: 'Cookie' is empty by default — a logged-in pixiv session cookie must
# be pasted here before running, otherwise authenticated endpoints (e.g. the
# follow-latest feed and r18 rankings) will not return usable data.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 Edg/84.0.522.63',
'Cookie': ''
}
def download_img(url, referer, i, mode):
    """Download one image to ``mode/<i>_<basename>`` unless it already exists.

    Args:
        url: direct image URL (the "original" quality variant).
        referer: artwork page URL; pixiv rejects download requests that lack
            a Referer header.
        i: running index prefixed to the filename so ranking order is kept.
        mode: target directory (assumed to exist).
    """
    headers_download = {"referer": str(referer)}
    # Reuse the URL basename so the real extension (.jpg or .png) is kept.
    name = str(i) + '_' + url.split("/")[-1]
    path = f"{mode}/{name}"
    if os.path.exists(path):
        print(f'{name}存在', end=' ')
        return
    response = requests.get(url=url, headers=headers_download)
    # BUG FIX: without this, an HTTP error page would be saved as the image.
    response.raise_for_status()
    with open(path, "wb") as file:
        file.write(response.content)
def download_img_1(url, referer, mode):
    """Download one image to ``mode/<basename>`` unless it already exists.

    Same as ``download_img`` but without the numeric index prefix (used for
    artist-page and latest-feed downloads, where order does not matter).

    Args:
        url: direct image URL (the "original" quality variant).
        referer: artwork page URL; pixiv rejects download requests that lack
            a Referer header.
        mode: target directory (assumed to exist).
    """
    headers_download = {"referer": str(referer)}
    # Reuse the URL basename so the real extension (.jpg or .png) is kept.
    name = url.split("/")[-1]
    path = f"{mode}/{name}"
    if os.path.exists(path):
        print(f'{name}存在', end=' ')
        return
    response = requests.get(url=url, headers=headers_download)
    # BUG FIX: without this, an HTTP error page would be saved as the image.
    response.raise_for_status()
    with open(path, "wb") as file:
        file.write(response.content)
def crawler_ranking(url, page, mode):
    """Download one ranking page (50 works) in parallel threads.

    Args:
        url: ranking JSON endpoint, e.g.
            https://www.pixiv.net/ranking.php?mode=daily&p=1&format=json
        page: zero-based page index, used only to number the saved files.
        mode: output directory name (also the ranking mode string).
    """
    res = requests.get(url, headers=headers)
    datas = res.json()["contents"]
    images_list = [
        {
            "title": data["title"],
            "user_name": data["user_name"],
            "p_id": data["illust_id"],
            "referer": f"https://www.pixiv.net/artworks/{data['illust_id']}",
        }
        for data in datas
    ]
    for i, image in enumerate(images_list):
        # Per-work endpoint listing every page of the work, each with
        # thumb_mini/small/regular/original URL variants.
        image_url = f"https://www.pixiv.net/ajax/illust/{image['p_id']}/pages?lang=zh"
        # BUG FIX: original was print({...}) which printed a one-element set.
        print(image['p_id'])
        image_data = requests.get(image_url, headers=headers).json()["body"]
        for b in image_data:
            # pixiv refuses the download URL without a Referer header.
            t = Thread(target=download_img,
                       args=(b['urls']['original'], image["referer"], page * 50 + i + 1, mode),
                       name=image['p_id'])
            t.start()
            print(f'第{page * 50 + i + 1}张正在下载')
def crawler_users(url, mode):
    """Fetch an artist's profile JSON and download every illustration.

    Args:
        url: profile endpoint, e.g.
            https://www.pixiv.net/ajax/user/<id>/profile/all?lang=zh
        mode: output directory name.
    """
    profile = requests.get(url, headers=headers).json()["body"]
    # "illusts" is a mapping keyed by illustration id.
    for index, pid in enumerate(list(profile["illusts"].keys()), start=1):
        referer = f"https://www.pixiv.net/artworks/{pid}"
        # Per-work endpoint listing every page with
        # thumb_mini/small/regular/original URL variants.
        pages_api = f"https://www.pixiv.net/ajax/illust/{pid}/pages?lang=zh"
        detail = requests.get(pages_api, headers=headers).json()["body"]
        for entry in detail:
            worker = Thread(
                target=download_img_1,
                args=(entry['urls']['original'], referer, mode),
                name=pid,
            )
            # pixiv refuses the download URL without a Referer header.
            worker.start()
            print(f'第{index}张正在下载')
def crawler_latest(url, page, mode):
    """Download one page (60 works) of the followed-users latest feed.

    Args:
        url: feed endpoint, e.g.
            https://www.pixiv.net/ajax/follow_latest/illust?lang=zh&p=1
        page: zero-based page index, used only for the progress counter.
        mode: output directory name.
    """
    feed = requests.get(url, headers=headers).json()["body"]
    illust_ids = feed["page"]["ids"]
    for offset, pid in enumerate(illust_ids):
        referer = f"https://www.pixiv.net/artworks/{pid}"
        # Per-work endpoint listing every page with
        # thumb_mini/small/regular/original URL variants.
        pages_api = f"https://www.pixiv.net/ajax/illust/{pid}/pages?lang=zh"
        detail = requests.get(pages_api, headers=headers).json()["body"]
        for entry in detail:
            worker = Thread(
                target=download_img_1,
                args=(entry['urls']['original'], referer, mode),
                name=pid,
            )
            # pixiv refuses the download URL without a Referer header.
            worker.start()
            print(f'第{page * 60 + offset + 1}张正在下载')
使用说明请到 GitHub 仓库中查看,README 的 .md 文件无法在此处完整复制。