1. BS4数据解析常见方法
BS4数据解析方法是把需要的数据进行截取。处理数据的时间比较长
测试用网页:
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>模拟登录</title>
</head>
<body background="e6fa7ebdb49b3f57569742132926fc4d.jpg" style="background-size: cover">
<h1 style="text-align: center">
欢迎注册
</h1>
<h3 style="text-align: center">
好好学习,天天向上
</h3>
<form action="" method="post" style="text-align:center">
<input name="user" placeholder="昵称" type="text"><br><br>
<input name="password" placeholder="密码" type="text"><br><br>
<input name="XXX" value="1" type="radio">我同意<a href="https://ti.qq.com/agreement/index.html">服务协议</a>与
<a href="https://rule.tencent.com/rule/preview/3fd52bde-6555-453b-9ab8-c5f1f3d22c62">隐私保护协议</a><br><br>
<input type="submit" value="立即注册">
</form>
</body>
</html>
from bs4 import BeautifulSoup
file = open("模拟登录.html", "r", encoding='utf-8')
response = file.read()
file.close()
html_data = BeautifulSoup(response, 'lxml')
print(html_data.h1) # 获取h1标签代码
可见,直接获取h1标签内的所有内容
获取h1标签中的属性
print(html_data.h1.attrs)
获取h1标签具体属性
print(html_data.h1['style'])
获取title标签下的文字
print(html_data.title.text)
找所有满足要求的数据html_data.find_all(‘标签’,‘属性’)
print(html_data.find_all('body', style="background-size: cover"))
最后·如果由多个满足条件的标签,以列表的形式返回
2.BS4数据解析,爬取豆瓣电影属性
这里主要以练习BS4数据处理为目标,所以整个过程比较繁琐
import requests
from bs4 import BeautifulSoup
url = "https://movie.douban.com/subject/1292052/"
headers = {
"User-Agent":
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 "
"Safari/537.36 "
}
response = requests.get(url, headers=headers)
# print(response.text)
'''
爬取豆瓣电影详情页数据
1、导演,2、演员,3、类型
4、国家,5、语言,6、上映时间
'''
def WritMsg(_file, iterable):
for _msg in iterable:
StrForWrit = _msg.text
_file.write(f" {StrForWrit} ")
html_data = BeautifulSoup(response.text, 'lxml')
file = open("msg.txt", "w", encoding="utf-8")
# 电影名
file.write("电影名:\n")
file.write(html_data.find('span', property="v:itemreviewed").text)
file.write("\n--------------------------\n")
# print(html_data.find_all('div', id="info"))
html_data = html_data.find_all('div', id="info")[0]
# 导演
write = html_data.find_all('a', rel="v:directedBy")[0].text
file.write(f"导演:\n{write}\n--------------------------\n")
# 演员
file.write("演员:\n")
msg = html_data.find_all('span', class_='attrs')[2]
# 每隔五个人换行
msg = msg.find_all('a')
flag = 0
for name in msg:
if flag == 5:
file.write("\n")
flag = 0
file.write(f"{name.text}|")
flag += 1
file.write("\n--------------------------\n")
# 类型
file.write("电影类型:\n")
WritMsg(file, html_data.find_all('span', property="v:genre"))
file.write("\n--------------------------\n")
# 制片国家/地区
dic = {}
write = html_data.text
for cheat in write.split('\n'):
array = cheat.split(':')
if len(array) > 1:
dic[array[0]] = array[1]
file.write(f"制片国家/地区:\n{dic['制片国家/地区']}")
file.write("\n--------------------------\n")
# 语言
file.write(f"语言:\n{dic['语言']}")
file.write("\n--------------------------\n")
# 上映时间
file.write("上映时间:\n")
# print(html_data.find_all('span', property="v:initialReleaseDate")[0].text)
WritMsg(file, html_data.find_all('span', property="v:initialReleaseDate"))
file.write("\n--------------------------\n")
file.close()
运行结果: