# Python web-scraping practice notes (part 2)
# Reference: Luffy City IT course on Bilibili (路飞学城IT)
# Step 1: fetch the page HTML
import requests
from bs4 import BeautifulSoup
# Fetch the HTML of the gallery index page.
url = "https://www.umeitu.com/p/gaoqing/"
# timeout prevents the request from hanging forever;
# raise_for_status surfaces HTTP errors (404/500) immediately instead of
# letting the parser fail later on an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()
response.encoding = 'utf-8'  # site serves UTF-8 but may not declare it in headers
html = response.text
# Step 2: parse the HTML content
# Parse the index page and download every linked full-size image.
bs = BeautifulSoup(html, "html.parser")
# find() returns a single Tag, so find_all() can be chained onto it;
# find_all() itself returns a list and cannot be chained further.
pic_all = bs.find("div", class_="TypeList").find_all("a", class_="TypeBigPics")
datalist = []  # collected full-size image URLs
# enumerate replaces the manual `count = 1; count += 1` bookkeeping.
for count, pic in enumerate(pic_all, start=1):
    # href is site-relative; prepend the domain to build the child-page URL.
    temp = "https://www.umeitu.com" + pic.get('href')
    print(temp)
    # Request the child page that holds the full-size image.
    resp1 = requests.get(temp, timeout=10)
    resp1.encoding = "utf-8"
    child_page = BeautifulSoup(resp1.text, "html.parser")
    pic_add = child_page.find("div", attrs={"class": "ImageBody"}).find("img").get("src")
    print(pic_add)
    # Remember the image URL.
    datalist.append(pic_add)
    # Save the image locally; "wb" because image bytes are not text.
    # `with` guarantees the file handle is closed (the original leaked it).
    with open("pic_%s.jpg" % count, mode="wb") as f:
        f.write(requests.get(pic_add, timeout=10).content)
    print("第%d张图片下载完成" % count)
print(datalist)