bs4解析数据
bs4的使用
bs4是对爬取下来的整个网页数据进行筛选,筛选出我们自己想要的数据
from bs4 import BeautifulSoup
# 1. Prepare the HTML data to parse (in practice it is fetched with requests or selenium).
# Fix: use a context manager so the file handle is closed deterministically
# instead of leaking it with open(...).read().
with open('test2.html', encoding='utf-8') as html_file:
    data = html_file.read()
# 2. Create a BeautifulSoup object (it auto-corrects malformed HTML structure).
# BeautifulSoup(data, parser)
soup = BeautifulSoup(data, 'lxml')
# 3. Get tags and tag contents through the BeautifulSoup object
# 1) Getting tags
# soup.select(css_selector)     - all tags matched by the selector; returns a list of tag objects
# soup.select_one(css_selector) - first tag matched by the selector; returns a single tag object
result = soup.select('p')
print(result) # [<p>我是段落1</p>, <p>我是段落2</p>, <p>我是超链接3</p>]
result = soup.select_one('p')
print(result) # <p>我是段落1</p>
result = soup.select('#p1')
print(result) # [<p id="p1">我是超链接3</p>]
result = soup.select_one('#p1')
print(result) # <p id="p1">我是超链接3</p>
result = soup.select('div p')
print(result) # [<p>我是段落2</p>, <p id="p1">我是超链接3</p>]
result = soup.select('div>p')
print(result) # [<p>我是段落2</p>]
result = soup.select('.p3')
print(result) # [<p class=p3>我是段落2</p>]
# 2) Getting tag contents
# a. tag.string - text of the tag (only valid when the content is pure text; otherwise None)
p2 = soup.select_one('div>p')
print(p2) # <p>我是段落2</p>
print(p2.string) # '我是段落2'
s1 = soup.select_one('#s1')
print(s1) # <span id="s1">我是<b>span1</b></span>
print(s1.string) # None
# b. tag.get_text() - all text inside the tag, including text of nested tags
print(p2.get_text()) # '我是段落2'
print(s1.get_text()) # '我是span1'
# c. tag.contents - list of direct children (mix of strings and tag objects)
print(p2.contents) # ['我是段落2']
result = s1.contents
print(result) # ['我是', <b>span1</b>]
print(result[-1].get_text()) # 'span1'
# 3) Getting tag attributes via tag.attrs (a dict of attribute name -> value)
a1 = soup.select_one('div>a')
print(a1) # <a href="https://www.baidu.com">我是超链接2</a>
print(a1.attrs['href']) # 'https://www.baidu.com'
img1 = soup.select_one('img')
print(img1) # <img alt="" src="http://www.gaoimg.com/uploads/allimg/210801/1-210P1151401S1.jpg"/>
print(img1.attrs['src']) # 'http://www.gaoimg.com/uploads/allimg/210801/1-210P1151401S1.jpg'
# Extra:
# soup.select/select_one(css_selector) - search the whole document
# tag.select/select_one(css_selector)  - search only inside the given tag
ps = soup.select('p')
print(ps) # [<p>我是段落1</p>, <p>我是段落2</p>, <p id="p1">我是超链接3</p>]
div1 = soup.select_one('div')
ps = div1.select('p')
print(ps) # [<p>我是段落2</p>, <p id="p1">我是超链接3</p>]
爬取某瓣电影的数据
import csv
from bs4 import BeautifulSoup
import requests
# Scrape the Douban Top250 list: for each movie collect
# [poster image URL, title, one-line blurb, rating] and save to CSV.
all_shuju = []
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
}
# The list is paginated 25 movies per page; start=0,25,...,225 covers all 250.
for i in range(10):
    response = requests.get(f'https://movie.douban.com/top250?start={25*i}&filter=', headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    all_movie_li = soup.select('#content > div > div.article > ol > li')
    for li in all_movie_li:
        img_url = li.select_one('.pic>a>img').attrs['src']
        print(img_url)
        name = li.select_one('.title').get_text()
        print(name)
        # Some entries have no one-line blurb: select_one('.inq') returns None
        # and .get_text() raises AttributeError. Catch only that (a bare
        # except would also swallow KeyboardInterrupt and real bugs).
        try:
            des = li.select_one('.inq').get_text()
        except AttributeError:
            des = ''
        print(des)
        score = li.select_one('.rating_num').get_text()
        print(score)
        print('----------------------------------------------')
        all_shuju.append([img_url, name, des, score])

# Write the collected rows to CSV. The context manager guarantees the file
# is flushed and closed even if writing fails (the original leaked the handle).
# newline='' is required by the csv module to avoid blank lines on Windows.
with open('files3/test.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['图片链接', '电影名字', '介绍', '评分'])
    writer.writerows(all_shuju)