# P站爬虫代码 (Pixiv crawler)
# 实现方法 (Approach):
# 首先在P站日榜页面找到每个图片的 data-id，然后进入相应图片的页面，找到图源 url 爬取原图。
# (First find each image's data-id on the Pixiv daily-ranking page, then open that
# image's page and extract the source URL to download the original image.)
import urllib.request
import ssl
import requests
import re
import time
import os
# Local directory where downloaded images are stored.
# NOTE(review): the name shadows the builtin `dir()`; renaming (e.g. to
# DOWNLOAD_DIR) would be cleaner but requires updating getPixiv, which reads it.
dir = '/Users/doudou/Desktop/pixiv'
def getPixiv():
dir_content = os.listdir(dir) # 获取文件夹内容
ssl._create_default_https_context = ssl._create_unverified_context # 解决不受信任SSL证书问题
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.2 Safari/605.1.15'
} # 定义headers
response_1 = requests.get("https://www.pixiv.net/ranking.php", headers=headers)
daily_list = response_1.text # 获取pixiv日榜的html文字
with open("daily.txt", "w") as f:
f.write(daily_list)
ID = re.findall('"data-type=".*?"data-id="(.*?)"',daily_list) #取括号内的片段
#print(ID)#获取日榜图片的id
part = "https://www.pixiv.net/artworks/" # 定义除ID外的部分
for site in ID:
URL = part + str(site)
#print(URL)
opener = urllib.request.build_opener()
opener.addheaders = [('Referer', URL)]
response_2 = requests.get(URL,headers = headers) # 获取展示页面的html文字
html = response_2.text # 得到文字
with open("daily.txt&#