Scraping approach:
1. First, find the image category information on the listing page and collect each category's URL.
2. Requesting each category URL returns the HTML source of the corresponding page.
3. With the source in hand, use BS4 to locate the tags that hold the image information.
4. Once you have an image's URL, just save the file. (A minimal sketch of steps 1-3 follows this list.)
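Before the full script, here is a small sketch of steps 1-3: fetch the listing page and locate the category links with BeautifulSoup. The selectors (the div with class "mt10 feilei" and the a tags with target="_blank") are taken from the full script below; they track the chinaz page layout at the time of writing, and if the layout has changed, find() returns None and they need updating.

import requests
from bs4 import BeautifulSoup

headers = {"user-agent": "Mozilla/5.0"}
html = requests.get("https://sc.chinaz.com/tupian/", headers=headers).text
soup = BeautifulSoup(html, "html.parser")
# Category links sit inside <div class="mt10 feilei"> as <a target="_blank"> tags.
box = soup.find("div", attrs={"class": "mt10 feilei"})
for a in box.find_all("a", attrs={"target": "_blank"}):
    print(a.attrs.get("title"), a.attrs.get("href"))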
code:
import requests
from bs4 import BeautifulSoup
class zhanzhang():
    def __init__(self):
        self.url = "https://sc.chinaz.com/tupian/"
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36"
        }
        self.img_a_label = []  # <a> tags of the category links
        self.img_url_2 = []    # hrefs of the category pages
    def deal_label(self, html):
        # Locate the category block and collect every category link.
        soup = BeautifulSoup(html, "html.parser")
        fenlei_list = soup.find("div", attrs={"class": "mt10 feilei"}).find_all("div")
        for fenlei in fenlei_list:
            self.img_a_label.extend(fenlei.find_all("a", attrs={"target": "_blank"}))
        for img in self.img_a_label:
            # print(img.attrs["title"])
            # print(img.attrs["href"])
            self.img_url_2.append(img.attrs["href"])
    def run(self):
        res = requests.get(self.url, headers=self.headers)
        html = res.text
        self.deal_label(html)
        self.get_img_url()
    def get_img(self, html):
        soup = BeautifulSoup(html, "html.parser")
        # Find the URL of each target image on the category page.
        img_list = soup.find("div", attrs={"id": "container"}).find_all("img")
        for i, img_url in enumerate(img_list):
            # "src2" holds the lazy-loaded image; stripping "_s" swaps the
            # thumbnail for the full-size file.
            url = "https:" + img_url.attrs["src2"].replace("_s", "")
            # self.img_down(url, i)
    def img_down(self, url, num):
        res = requests.get(url, headers=self.headers)
        with open("C:/Users/孤桥/Desktop/项目/站长之家/{}.jpg".format(num), "wb") as file:
            file.write(res.content)
    def get_img_url(self):
        for img in self.img_url_2:
            # This yields the page URL for each image category (the href
            # begins with "/tupian/", which self.url already contains);
            # every image on that page is then scraped.
            url = self.url + img[8:]
            res = requests.get(url, headers=self.headers)
            self.get_img(res.text)
            break  # test on the first category only; remove to crawl them all
zhanzhang_spide = zhanzhang()
zhanzhang_spide.run()
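As written, the script runs in test mode: get_img_url stops after the first category, and the self.img_down(url, i) call in get_img is commented out; uncomment it and drop the break to crawl everything. For a real run, a slightly more defensive download helper is worth having. A minimal sketch (img_down_safe and the save_dir default are illustrative names, not part of the original script):

import os
import requests

def img_down_safe(url, num, headers, save_dir="downloads"):
    # Defensive variant of img_down: creates the folder, checks the HTTP
    # status, and skips (rather than crashes on) failed requests.
    os.makedirs(save_dir, exist_ok=True)
    try:
        res = requests.get(url, headers=headers, timeout=10)
        res.raise_for_status()
    except requests.RequestException as e:
        print("skip", url, e)
        return
    with open(os.path.join(save_dir, "{}.jpg".format(num)), "wb") as f:
        f.write(res.content)

raise_for_status() turns 4xx/5xx responses into exceptions, so a broken image link is skipped instead of silently writing an empty .jpg file.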