可爬取任意标签,以 cosplay 标签为例,代码如下(仅供学习交流使用):
from urllib.request import urlopen
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
from urllib.error import HTTPError
import os
import re
# Root page URL to crawl (the cosplay tag listing); change this to target
# a different tag.
url_cosplay = "https://www.tujigu.com/s/36/"

# Module-level counters, both starting at 1.
# NOTE(review): presumably used to number downloaded images and output
# folders respectively -- their use is not visible in this section; confirm.
total_image_name = 1
total_file_name = 1
def getImage(url,file):#下载图片,url为下载的url,f为保存的文件夹名称
if not os.path.exists('E:/spiders/%s'%file):
os.makedirs('E:/spiders/%s'%file)
content = getContent(url)
count = 2
x = 1
fdir = 'E:/spiders/'+str(file)+'/'
while content is not None:
if count==2:#判断是否是第一次进入
content = getContent(url)
else:
content = getContent(urlnext)
if content == None:#当内容为空,下载完毕
print("下载完毕"