# bs4介绍及案例
from bs4 import BeautifulSoup
# Sample markup for the demo: four <li> items that each wrap a link, plus one
# bare <a> placed directly under <ul> (i.e. not inside any <li>).
html = """
<ul>
<li><a href="zhangwuji.com">张无忌</a></li>
<li id="abc"><a href="zhouxingchi.com">周某某</a></li>
<li><a href="zhubajie.com">猪八戎</a></li>
<li><a href="wuzetian.com">武则天</a></li>
<a href="jinmaoshiwang.com">金毛狮王</a>
</ul>
"""

# Build the parse tree using the "html.parser" backend.
page = BeautifulSoup(html, "html.parser")

# Only <li> elements are collected here, so the bare <a> that is a direct
# child of <ul> is never visited by the loop below.
li_list = page.find_all("li")
for li in li_list:
    a = li.find("a")
    if a is None:
        # find() yields None when the <li> contains no link; skip it rather
        # than fail on the attribute access below.
        continue
    text = a.text
    href = a.get("href")
    print(text, href)
# 案例
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Site root of the gallery being scraped.
domain = "https://www.umeitu.com"

# NOTE on resolving links found inside a page:
#   * an href that starts with "/" is relative to the site root, so it is
#     appended to the domain;
#   * any other relative href is resolved against the URL of the page it was
#     found on: drop everything after that URL's last "/" and append the href.
# urljoin() applies both rules (and leaves absolute URLs untouched), so it is
# used below instead of plain string concatenation.
url = "https://www.umeitu.com/bizhitupian/xiaoqingxinbizhi/"

# Seconds to wait on each HTTP request; without a timeout a stalled
# connection would block the script indefinitely.
REQUEST_TIMEOUT = 10

# Fetch and parse the listing page. Nothing useful can be done if it fails,
# so an HTTP error status is raised instead of parsing an error page.
resp = requests.get(url, timeout=REQUEST_TIMEOUT)
resp.raise_for_status()
resp.encoding = "utf-8"

# Sequence number used as the output file name; it advances only after a
# successful download so the saved files stay numbered without gaps.
n = 1

main_page = BeautifulSoup(resp.text, "html.parser")
a_list = main_page.find_all("a", attrs={"class": "TypeBigPics"})
for a in a_list:
    href = a.get("href")
    if not href:
        # Anchor without a usable target: nothing to follow.
        continue
    child_url = urljoin(url, href)

    # Open the detail page that embeds the full-size image.
    child_resp = requests.get(child_url, timeout=REQUEST_TIMEOUT)
    if not child_resp.ok:
        print(f"skip {child_url}: HTTP {child_resp.status_code}")
        continue
    child_resp.encoding = "utf-8"
    child_bs = BeautifulSoup(child_resp.text, "html.parser")

    # The image is looked up inside <div class="ImageBody">; either lookup
    # may come back as None if the page layout differs.
    div = child_bs.find("div", attrs={"class": "ImageBody"})
    img = div.find("img") if div is not None else None
    img_src = img.get("src") if img is not None else None
    if not img_src:
        print(f"skip {child_url}: no image found")
        continue
    # The src may itself be relative to the detail page.
    img_src = urljoin(child_url, img_src)

    img_resp = requests.get(img_src, timeout=REQUEST_TIMEOUT)
    if not img_resp.ok:
        # Do not save an error response body as if it were a picture.
        print(f"skip {img_src}: HTTP {img_resp.status_code}")
        continue

    with open(f"{n}.jpg", mode="wb") as f:
        f.write(img_resp.content)
    print(f"{n}图片下载完毕")
    n += 1