Data Parsing with bs4
1. Module Installation
pip install beautifulsoup4
from bs4 import BeautifulSoup
2. bs4 Parsing Workflow
Initialize a BeautifulSoup object:
page = BeautifulSoup(html, "html.parser")
- page.find("tag", attribute="value") finds a single element (only the first match is returned). When the attribute is class, use ==class_== as the keyword to avoid clashing with Python's class keyword.
- page.find_all("tag", attribute="value") finds all matching elements.
- .text gets the text inside a tag.
- .get("attribute") gets the value of an attribute.
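A minimal sketch of these four calls on a made-up HTML snippet (the tag names, class names, and hrefs below are hypothetical, purely for illustration):
from bs4 import BeautifulSoup

html = """
<ul>
  <li class="item"><a href="/a.html">First</a></li>
  <li class="item"><a href="/b.html">Second</a></li>
</ul>
"""
page = BeautifulSoup(html, "html.parser")

# find() returns only the first match; class_ avoids the class keyword
li = page.find("li", class_="item")
print(li.text)  # -> First

# find_all() returns every match as a list
for li in page.find_all("li", class_="item"):
    a = li.find("a")
    print(a.text, a.get("href"))  # text content and attribute value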
3. bs4 in Practice
Approach (a skeleton sketch of these steps follows the list):
- Fetch the page source
- Initialize a BeautifulSoup object
- Locate the tags that hold the data
- Extract the tag contents or attribute values
- Save the data
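Under stated assumptions (a placeholder URL and a hypothetical class name "row"), a bare-bones sketch of this five-step flow looks like:
import requests
from bs4 import BeautifulSoup

# 1. Fetch the page source (the URL is a placeholder)
resp = requests.get("https://example.com/list")
resp.encoding = "utf-8"

# 2. Initialize the BeautifulSoup object
soup = BeautifulSoup(resp.text, "html.parser")

# 3. Locate the tags that hold the data ("row" is hypothetical)
rows = soup.find_all("div", class_="row")

# 4. + 5. Extract the contents and save them
with open("data.txt", "w", encoding="utf-8") as fp:
    for row in rows:
        fp.write(row.text.strip() + "\n")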
3.1 Task: Scrape images from the 优美图库 (Umei) gallery
import time
import requests
from bs4 import BeautifulSoup
import os
fp = open("优美图库.csv", "w", encoding="utf-8")
aim_url = {}
url = "https://www.umei.cc/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36"
}
# Fetch the home page source
resp = requests.get(url=url, headers=headers)
resp.encoding = "utf-8"
first_page = resp.text
# Initialize the bs4 object (the lxml parser requires: pip install lxml)
bs4_page = BeautifulSoup(first_page, "lxml")
li_lst_01 = bs4_page.find_all("li", class_="nav-li on")
# Iterate over the li tags to collect the sub-topics under each module
for li in li_lst_01:
    a_lst_01 = li.find("div", class_="sonnav").find_all("a")
    for a in a_lst_01:
        img_type = a.text  # renamed from "type" to avoid shadowing the builtin
        type_url = url.rstrip("/") + a["href"]
        aim_url[img_type] = type_url
        # fp.write(f"{img_type},{type_url}\n")
    # fp.write("\n")
print("可爬取的图片类型有:")
print(aim_url.keys())
type_name = input("请输入你要爬取的主题:")
# Create a subdirectory for the chosen category
# (os.makedirs also creates the parent "优美图库" directory if it is missing)
if not os.path.exists("优美图库/" + type_name):
    os.makedirs("优美图库/" + type_name)
# Fetch the source of the category page
resp_2 = requests.get(aim_url[type_name], headers=headers)
resp_2.encoding = "utf-8"
# Initialize
type_page = BeautifulSoup(resp_2.text, "html.parser")
div_list = type_page.find_all("div", class_="item masonry_brick")
# Iterate over the div tags to get each picture's detail-page URL
for div in div_list:
    picture_name = div.find("div", class_="title").text
    picture_url = url.rstrip("/") + div.find("div", class_="title").find("a")["href"]
    # Visit each detail page to find the actual image address
    resp_3 = requests.get(url=picture_url, headers=headers)
    resp_3.encoding = "utf-8"
    # Initialize
    picture_page = BeautifulSoup(resp_3.text, "html.parser")
    picture_last = picture_page.find("div", class_="big-pic").find("img")["src"]
    # Download the image
    resp_4 = requests.get(url=picture_last, headers=headers)
    # Save it locally
    with open("优美图库/" + type_name + "/" + picture_name + ".jpg", "wb") as fp_1:
        fp_1.write(resp_4.content)
    print(picture_name + " downloaded!")
    time.sleep(2)
    # print(f'{picture_name},{picture_url}\n')
# Finish up
fp.close()
resp.close()
resp_2.close()
print("All images downloaded!")
💎💎💎 When saving image files, open them with mode "wb" and write the binary response body, i.e. ".content".
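A minimal sketch of the difference (the image URL is hypothetical): resp.content is the raw bytes of the response body, while resp.text is the decoded string.
import requests

resp = requests.get("https://example.com/pic.jpg")  # hypothetical image URL
with open("pic.jpg", "wb") as f:  # "wb": write in binary mode
    f.write(resp.content)         # bytes, not the decoded str from resp.text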
To create a directory, import the os module; the pattern is
import os
# If the target directory does not exist, create it
# (os.makedirs also creates missing parent directories; os.mkdir does not)
if not os.path.exists("优美图库/" + type_name):
    os.makedirs("优美图库/" + type_name)
3.2 Task: Scrape the text of 三国演义 (Romance of the Three Kingdoms)
import time
import requests
import os
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# GET request: takes a url and returns the response for that page
class Request_get():
    def __init__(self, url):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36"
        }
        self.url = url

    def response(self):
        self.resp = requests.get(url=self.url, headers=self.headers)
        return self.resp
if __name__ == '__main__':
    url = "https://www.gushicimingju.com/novel/sanguoyanyi/"
    R = Request_get(url)
    resp = R.response()
    # Create a 三国演义 directory to hold the scraped chapters
    if not os.path.exists("三国演义"):
        os.mkdir("三国演义")
    soup = BeautifulSoup(resp.text, "html.parser")
    li_lst = soup.find("ul", class_="content-left left-2-col").find_all("li")
    for li in li_lst:
        name = li.find("a").text
        # Build the chapter URL with urljoin instead of str.lstrip:
        # lstrip strips *characters*, not a prefix, and can corrupt the path
        href = urljoin(url, li.find("a")["href"])
        # Fetch the source of the single chapter page
        R_2 = Request_get(href)
        resp_1 = R_2.response()
        # print(resp_1.text)
        # Initialize and parse the chapter page
        soup_2 = BeautifulSoup(resp_1.text, "html.parser")
        page = soup_2.find("div", class_="shici-content check-more").text
        with open("三国演义/" + name, "w", encoding="utf-8") as fp:
            fp.write(page)
        # print(f"{name},{href}")
        print(name + "\tdownloaded!")
        time.sleep(2)
    print("All chapters downloaded!")
When building a target page's URL by concatenation, always compare the result against the real page address to confirm the join succeeded!!! 💎💎💎
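A minimal sketch of the pitfall behind this warning, using a hypothetical chapter href: str.lstrip() treats its argument as a set of characters to remove, not as a prefix, so it can silently eat the start of the path; urllib.parse.urljoin is the safe way to build the full URL.
from urllib.parse import urljoin

base = "https://www.gushicimingju.com/novel/sanguoyanyi/"
href = "/novel/sanguoyanyi/yi.html"  # hypothetical chapter href

# Wrong: lstrip strips any leading characters found in its argument set,
# so the "y" and "i" of "yi.html" are removed as well.
print(href.lstrip("/novel/sanguoyanyi/"))  # -> ".html"

# Right: urljoin resolves the href against the base URL.
print(urljoin(base, href))
# -> https://www.gushicimingju.com/novel/sanguoyanyi/yi.html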