爬取所需库
from bs4 import BeautifulSoup
import requests
import time
没有下载相关库的话,打开终端输入pip install ~,按下回车 (~为要下载的库,例如bs4、requests、selenium)
#可以参考链接:
https://blog.csdn.net/Itmastergo/article/details/129438321
1.爬取过程
(1)打开浏览器输入网站:https://pic.netbian.com/ 搜索
(2)打开抓包工具,点击4K动漫,title="4K动漫"前面有个href属性,可能会有用处
(3)关闭抓包工具,打开4K动漫,仔细看上面的链接,可以发现,链接=主页的链接+href内容,所以下一步是爬取href内容
(4)bs4爬取——爬取主页href内容
#爬取主页面——href内容
def func_main(url):
    """Fetch the site home page and return the href of the "4K anime" category.

    Args:
        url: Base URL of the site (e.g. "https://pic.netbian.com/").

    Returns:
        The relative href of the category link whose title is "4K动漫图片"
        (to be appended to the base URL to form the category page address).
    """
    # Bug fix: use the `url` parameter instead of the global `main_url`,
    # so the function actually honours its argument.
    main_page = requests.get(url)
    # The site serves GBK-encoded HTML; set the encoding before reading .text.
    main_page.encoding = "gbk"
    main_page_text = main_page.text
    # Bug fix: close AFTER the body has been read (the original closed first,
    # which only worked because requests loads content eagerly).
    main_page.close()
    # Parse the HTML with BeautifulSoup.
    content1 = BeautifulSoup(main_page_text, "html.parser")
    # The category bar is <div class="classify clearfix">; grab the anchor
    # titled "4K动漫图片" and return its href attribute.
    child_page_last = content1.find("div", class_="classify clearfix") \
                              .find("a", title="4K动漫图片").get("href")
    return child_page_last  # tail part of the category-page link
(5)返回4K动漫页面,抓包工具点击第一张图片,图片的下载地址img标签在src属性里面
(6)打开页面源代码,输入img标签的相关内容,确认是否在源代码里面
确认是在页面的源代码里面,接下来准备使用bs4,爬取子页面的所有图片img里面的src属性内容
(7)bs4爬取——爬取子页src以及下载图片
img标签的位置
爬取子页面函数
#爬取子页面
def func_child(url, child_page_last):
    """Download every thumbnail image listed on the category page.

    Args:
        url: Base URL of the site (e.g. "https://pic.netbian.com/").
        child_page_last: Relative href of the category page, as returned
            by func_main.

    Side effects:
        Writes each image into the local "img/" directory, prints "over!"
        after each file, and sleeps 1 second between downloads.
    """
    import os
    # Bug fix: the original crashed with FileNotFoundError when the "img"
    # directory did not exist.
    os.makedirs("img", exist_ok=True)
    # Bug fix: use the `url` parameter instead of the global `main_url`.
    child_url = url + child_page_last  # category page = base URL + href
    child_page = requests.get(child_url)
    child_page.encoding = "gbk"  # pages are GBK-encoded
    content2 = BeautifulSoup(child_page.text, "html.parser")
    child_page.close()  # bug fix: release the connection once parsed
    # All <img> tags live inside the <li> items of <ul class="clearfix">.
    ul = content2.find("ul", attrs={"class": "clearfix"})
    for it in ul.find_all("img"):
        src_last = it.get("src").strip("/")
        src = f'{url}{src_last}'  # download URL = base URL + src path
        # (dropped the redundant second .strip("/") on the full URL)
        img_download = requests.get(src)
        name_last = src.split("/")[-1]  # file name = last path component
        with open("img/" + name_last, mode="wb") as f:
            f.write(img_download.content)  # save the image bytes
        img_download.close()
        print("over!")
        time.sleep(1)  # throttle: be polite to the server
2.完整代码
from bs4 import BeautifulSoup
import requests
import time
# Scrape the home page for the category href.
def func_main(url):
    """Fetch the site home page and return the href of the "4K anime" category.

    Args:
        url: Base URL of the site (e.g. "https://pic.netbian.com/").

    Returns:
        The relative href of the anchor titled "4K动漫图片".
    """
    # Bug fix: use the `url` parameter instead of the global `main_url`.
    main_page = requests.get(url)
    # Site pages are GBK-encoded; set before reading .text.
    main_page.encoding = "gbk"
    main_page_text = main_page.text
    # Bug fix: close only after the body has been read.
    main_page.close()
    content1 = BeautifulSoup(main_page_text, "html.parser")
    # Locate the category bar and pull the href of the 4K-anime link.
    child_page_last = content1.find("div", class_="classify clearfix") \
                              .find("a", title="4K动漫图片").get("href")
    return child_page_last
# Scrape the category page and download all listed images.
def func_child(url, child_page_last):
    """Download every thumbnail image listed on the category page.

    Args:
        url: Base URL of the site.
        child_page_last: Relative href of the category page (from func_main).

    Side effects:
        Writes each image into the local "img/" directory, prints "over!"
        after each file, and sleeps 1 second between downloads.
    """
    import os
    # Bug fix: create the output directory instead of crashing when absent.
    os.makedirs("img", exist_ok=True)
    # Bug fix: use the `url` parameter instead of the global `main_url`.
    child_url = url + child_page_last
    child_page = requests.get(child_url)
    child_page.encoding = "gbk"
    content2 = BeautifulSoup(child_page.text, "html.parser")
    child_page.close()  # bug fix: release the connection once parsed
    # <img> tags sit inside the <li> items of <ul class="clearfix">.
    ul = content2.find("ul", attrs={"class": "clearfix"})
    for it in ul.find_all("img"):
        src_last = it.get("src").strip("/")
        src = f'{url}{src_last}'  # download URL = base URL + src path
        img_download = requests.get(src)
        name_last = src.split("/")[-1]
        with open("img/" + name_last, mode="wb") as f:
            f.write(img_download.content)
        img_download.close()
        print("over!")
        time.sleep(1)  # 1-second delay between requests
if __name__ == '__main__':
    # Base URL of the wallpaper site (also read as a global by the helpers).
    main_url = "https://pic.netbian.com/"
    # Step 1: fetch the home page and extract the 4K-anime category href.
    category_href = func_main(main_url)
    # Step 2: download every image listed on that category page.
    func_child(main_url, category_href)
    print("all over!")