import os
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup
# Scrape the listing page, collect every <img> URL inside the "TypeList"
# container, then download each image into the local folder "图片文件夹".
url = "https://www.youmeitu.com/meinv/"
img_urls = []
headers = {
    "user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
}
# A timeout keeps the script from hanging forever on a stalled connection;
# raise_for_status() stops early instead of parsing an HTTP error page.
resp = requests.get(url, headers=headers, timeout=10)
resp.raise_for_status()
page = BeautifulSoup(resp.text, 'html.parser')
result1 = page.find("div", attrs={"class": "TypeList"})
# find() returns None when the container is absent from the page.
results2 = result1.find_all("img") if result1 is not None else []
for result2 in results2:
    c = result2.get("src")  # Tag.get() returns None when the attribute is missing
    if c:
        # Resolve relative src values against the page URL so they can be fetched.
        img_urls.append(urljoin(url, c))

# Start requesting the images and saving them.
folder = "图片文件夹"
os.makedirs(folder, exist_ok=True)  # open() below fails if the folder is missing
for img_url in img_urls:
    # Use the last path segment as the file name. urlsplit() separates the
    # query string, so characters such as "?" never end up in the name.
    name = os.path.basename(urlsplit(img_url).path)
    if not name:
        continue
    try:
        img = requests.get(img_url, headers=headers, timeout=10)
        img.raise_for_status()
    except requests.RequestException as exc:
        # One broken image should not abort the remaining downloads.
        print(f"skipped {img_url}: {exc}")
        continue
    filename = os.path.join(folder, name)
    # Image data is bytes, so the file must be opened in binary mode.
    with open(filename, mode="wb") as f:
        f.write(img.content)
遇到的问题
1. 从 BeautifulSoup 的标签(Tag)里提取属性值要用 get("属性名"),例如 img.get("src")
2. 文件名里不能含有 "?" 等系统不允许的字符(图片地址末尾常带 "?参数",要先用 split("?")[0] 去掉),否则会报错(OSError: [Errno 22] Invalid argument)
3. 写入图片时 open() 的 mode 要用 "wb"(二进制写入),否则报错(TypeError: write() argument must be str, not bytes)
from bs4 import BeautifulSoup
import requests
import re
# Fetch the Douban Top 250 page and demonstrate basic BeautifulSoup accessors.
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4750.0 Safari/537.36"}
resp = requests.get('https://movie.douban.com/top250', headers=headers, timeout=10)
resp.raise_for_status()
# BeautifulSoup expects markup (str/bytes), not the Response object itself.
soup = BeautifulSoup(resp.text, 'lxml')
print(soup.prettify())  # print the parsed document with standard indentation
print(soup.title.string)  # print the text of the <title> node
first_p = soup.p  # first <p> node, or None when the page has no <p>
if first_p is not None:
    p_attrs = first_p.attrs  # dict of all attributes of the <p> node
    # Same as first_p['name'], except get() returns None instead of raising
    # KeyError when the attribute is absent.
    p_name = first_p.get('name')
    p_children = first_p.contents  # list of the direct children of the <p> node
# Match on node text; the value may be a plain string or a compiled regex.
# `string=` is the current name of the deprecated `text=` keyword.
link_texts = soup.find_all(string=re.compile('link'))