1.
<!DOCTYPE html>
<!-- The page content is Simplified Chinese, so the document language is declared as zh-CN. -->
<html lang="zh-CN">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>简单网页示例</title>
  <!-- Page-level CSS; add further rules here if needed. -->
  <style>
    /* Example rule for <a> elements carrying class="myLink". */
    .myLink {
      color: blue;
      text-decoration: none;
      /* Further declarations can be added here. */
    }
    /* Applied while the pointer hovers over the link. */
    .myLink:hover {
      color: red;
    }
  </style>
</head>
<body>
  <h1>欢迎来到我的网页</h1>
  <p>这是一个简单的网页示例。</p>
  <!-- Anchor with href and class attributes. rel="noopener noreferrer" stops the
       page opened through target="_blank" from reaching back via window.opener. -->
  <a href="https://www.example.com" class="myLink" target="_blank" rel="noopener noreferrer">点击这里访问示例网站</a>
  <!-- Additional content can be added here. -->
</body>
</html>
2.
from bs4 import BeautifulSoup
html_doc = """
<html>
<head>
<title>《数据采集与预处理》填空题</title>
</head>
<body>
<p id="list">网站列表</p>
<p class="txt">百度 - Baidu</p>
<a href="http://www.baidu.com" class="site">百度</a><br/>
<p class="txt">腾讯 - Tencent</p>
<a href="http://www.tencent.com" class="site">腾讯</a><br/>
<p class="txt">搜狐 - Sohu</p>
<a href="http://www.sohu.com" class="site">搜狐</a><br/>
<p class="txt">湘潭理工学院</p>
<a href="http://www.xtit.edu.cn" class="home">
<img src="xtit.png" alt="暂无图片" height="50px" style="background-color: gray;">
</a>
</body>
</html>
"""
# Task: print the value of the href attribute of every <a> tag.
# Parse the HTML fixture with the stdlib-backed parser.
soup = BeautifulSoup(html_doc, 'html.parser')
# Walk every <a> element in document order and print its href.
for anchor in soup.find_all('a'):
    link_target = anchor['href']
    print(link_target)
3.
from bs4 import BeautifulSoup
html_doc = """
<html>
<head>
<title>《数据采集与预处理》填空题</title>
</head>
<body>
<p id="list">网站列表</p>
<p class="txt">百度 - Baidu</p>
<a href="http://www.baidu.com" class="site">百度</a><br/>
<p class="txt">腾讯 - Tencent</p>
<a href="http://www.tencent.com" class="site">腾讯</a><br/>
<p class="txt">搜狐 - Sohu</p>
<a href="http://www.sohu.com" class="site">搜狐</a><br/>
<p class="txt">湘潭理工学院</p>
<a href="http://www.xtit.edu.cn" class="home">
<img src="xtit.png" alt="暂无图片" height="50px" style="background-color: gray;">
</a>
</body>
</html>
"""
# Task: print the value of the src attribute of every <img> tag.
# Parse the HTML fixture with the stdlib-backed parser.
soup = BeautifulSoup(html_doc, 'html.parser')
# Walk every <img> element in document order and print its src.
for image in soup.find_all('img'):
    image_source = image['src']
    print(image_source)
4.
from bs4 import BeautifulSoup
html_doc = """
<html>
<head>
<title>《数据采集与预处理》填空题</title>
</head>
<body>
<p id="list">网站列表</p>
<p class="txt">百度 - Baidu</p>
<a href="http://www.baidu.com" class="site">百度</a><br/>
<p class="txt">腾讯 - Tencent</p>
<a href="http://www.tencent.com" class="site">腾讯</a><br/>
<p class="txt">搜狐 - Sohu</p>
<a href="http://www.sohu.com" class="site">搜狐</a><br/>
<p class="txt">湘潭理工学院</p>
<a href="http://www.xtit.edu.cn" class="home">
<img src="xtit.png" alt="暂无图片" height="50px" style="background-color: gray;">
</a>
</body>
</html>
"""
# Task: print the text of every <p> tag.
# Parse the HTML fixture with the stdlib-backed parser.
soup = BeautifulSoup(html_doc, 'html.parser')
# Walk every <p> element in document order and print its text content.
for paragraph in soup.find_all('p'):
    paragraph_text = paragraph.get_text()
    print(paragraph_text)
5.
from bs4 import BeautifulSoup
html_content = """
<html>
<head>
<title>《数据采集与预处理》操作题</title>
</head>
<body>
<div class="multiplewrap center">
<div class="multiple">
<div class="multiplemin" aos="fade-up">
<a href="/zhyw/615.html" target="_blank" class="item">
<div class="data">
<h3>2024-05-15</h3>
</div>
<div class="imgbox">
<img src="/uploads/images/20240515/dd0a6938517dedfcd716e60133fa2460.jpg" />
</div>
<div class="text">
<h3>xxx学院举办党纪学习教育专题读书班开班式 暨专家辅导报告会</h3>
</div>
</a>
<a href="/zhyw/611.html" target="_blank" class="item">
<div class="data">
<h3>2024-05-11</h3>
</div>
<div class="imgbox">
<img src="/uploads/images/20240511/9001b0ad35dded871c5a8953e6a75291.jpg" />
</div>
<div class="text">
<h3>校党委书记xx带队赴yy“访企拓岗”</h3>
</div>
</a>
<a href="/zhyw/607.html" target="_blank" class="item">
<div class="data">
<h3>2024-05-07</h3>
</div>
<div class="imgbox">
<img src="/uploads/images/20240507/65ad7e59d16aeaddefd6a06081c88968.png" />
</div>
<div class="text">
<h3>影响因子11.1商学院青年教师xx在国际顶级期刊发表论文</h3>
</div>
</a>
<a href="/zhyw/590.html" target="_blank" class="item">
<div class="data">
<h3>2024-05-05</h3>
</div>
<div class="imgbox">
<img src="/uploads/images/20240506/5d3623a5frg2276f49a5e346e0ba66f9.jpg" />
</div>
<div class="text">
<h3>别太羡慕!“新青媒”姐妹“搭子”考研上岸!</h3>
</div>
</a>
</div>
</div>
</div>
</body>
</html>
"""
# Task: print the src of every <img> that sits inside a <div> inside an <a>.
soup = BeautifulSoup(html_content, 'html.parser')
# One CSS selector expresses the whole a -> div -> img chain. select() returns
# each matching <img> exactly once, in document order, so nested <div>s cannot
# cause duplicate output and several images in one <div> are all reported.
# The [src] filter skips images that carry no src attribute.
for img in soup.select('a div img[src]'):
    print(img['src'])
6.
from bs4 import BeautifulSoup
html_content = '''
<html>
<head><title>《数据采集与预处理》操作题</title></head>
<body>
<div class="multiplewrap center">
<div class="multiple">
<div class="multiplemin" aos="fade-up">
<a href="/zhyw/615.html" target="_blank" class="item">
<div class="data"> <h3>2024-05-15</h3></div>
<div class="imgbox"><img src="/uploads/images/20240515/dd0a6938517dedfcd716e60133fa2460.jpg" /></div>
<div class="text"><h3>xxx学院举办党纪学习教育专题读书班开班式 暨专家辅导报告会</h3></div>
</a>
<a href="/zhyw/611.html" target="_blank" class="item">
<div class="data"> <h3>2024-05-11</h3></div>
<div class="imgbox"> <img src="/uploads/images/20240511/9001b0ad35dded871c5a8953e6a75291.jpg" /></div>
<div class="text"> <h3>校党委书记xx带队赴yy“访企拓岗”</h3></div>
</a>
<a href="/zhyw/607.html" target="_blank" class="item">
<div class="data"> <h3>2024-05-07</h3></div>
<div class="imgbox"> <img src="/uploads/images/20240507/65ad7e59d16aeaddefd6a06081c88968.png" /></div>
<div class="text"> <h3>影响因子11.1商学院青年教师xx在国际顶级期刊发表论文</h3> </div>
</a>
<a href="/zhyw/590.html" target="_blank" class="item">
<div class="data"> <h3>2024-05-05</h3></div>
<div class="imgbox"> <img src="/uploads/images/20240506/5d3623a5frg2276f49a5e346e0ba66f9.jpg" /></div>
<div class="text"> <h3>别太羡慕!“新青媒”姐妹“搭子”考研上岸</h3></div>
</a>
</div>
</div>
</div>
</body>
</html>
'''
# Parse the HTML fixture with the stdlib-backed parser.
soup = BeautifulSoup(html_content, 'html.parser')
# Print one news title per line. Both the date and the title are <h3> elements
# in this markup, so the search is limited to the <h3> inside <div class="text">;
# a bare find_all('h3') would also print the dates from <div class="data">.
news_titles = soup.select('div.text h3')
for title in news_titles:
    print(title.text.strip())
7.
from bs4 import BeautifulSoup
html_content = '''
<html>
<head>
<title>《数据采集与预处理》操作题</title>
</head>
<body>
<div class="multiplewrap center">
<div class="multiple">
<div class="multiplemin" aos="fade-up">
<a href="/zhyw/615.html" target="_blank" class="item">
<div class="data"><h3>2024-05-15</h3></div>
<div class="imgbox"><img src="/uploads/images/20240515/dd0a6938517dedfcd716e60133fa2460.jpg"/></div>
<div class="text"><h3>xxx学院举办党纪学习教育专题读书班开班式暨专家辅导报告会</h3></div>
</a>
<a href="/zhyw/611.html" target="_blank" class="item">
<div class="data"><h3>2024-05-11</h3></div>
<div class="imgbox"><img src="/uploads/images/20240511/9001b0ad35dded871c5a8953e6a75291.jpg"/></div>
<div class="text"><h3>校党委书记xx带队赴yy“访企拓岗”</h3></div>
</a>
<a href="/zhyw/607.html" target="_blank" class="item">
<div class="data"><h3>2024-05-07</h3></div>
<div class="imgbox"><img src="/uploads/images/20240507/65ad7e59d16aeaddefd6a06081c88968.png"/></div>
<div class="text"><h3>影响因子11.1商学院青年教师xx在国际顶级期刊发表论文</h3></div>
</a>
<a href="/zhyw/590.html" target="_blank" class="item">
<div class="data"><h3>2024-05-05</h3></div>
<div class="imgbox"><img src="/uploads/images/20240506/5d3623a5frg2276f49a5e346e0ba66f9.jpg"/></div>
<div class="text"><h3>别太羡慕!“新青媒”姐妹“搭子”考研上岸!</h3></div>
</a>
</div>
</div>
</div>
</body>
</html>
'''
# Parse the HTML fixture with the stdlib-backed parser.
soup = BeautifulSoup(html_content, 'html.parser')
# Every <a> element is treated as one news entry.
for entry in soup.find_all('a'):
    # The title lives in <div class="text"><h3>, the date in <div class="data"><h3>.
    headline = entry.find('div', class_='text').h3.text.strip()
    published = entry.find('div', class_='data').h3.text.strip()
    # One entry per line: title, tab, publication date.
    print(f"{headline}\t{published}")
8.
from bs4 import BeautifulSoup
html_content = '''
<html>
<head>
<title>《数据采集与预处理》操作题</title>
</head>
<body>
<div class="multiplewrap center">
<div class="multiple">
<div class="multiplemin" aos="fade-up">
<a href="/zhyw/615.html" target="_blank" class="item">
<div class="data">
<h3>2024-05-15</h3>
</div>
<div class="imgbox">
<img src="/uploads/images/20240515/dd0a6938517dedfcd716e60133fa2460.jpg" />
</div>
<div class="text">
<h3>xxx学院举办党纪学习教育专题读书班开班式 暨专家辅导报告会</h3>
</div>
</a>
<a href="/zhyw/611.html" target="_blank" class="item">
<div class="data">
<h3>2024-05-11</h3>
</div>
<div class="imgbox">
<img src="/uploads/images/20240511/9001b0ad35dded871c5a8953e6a75291.jpg" />
</div>
<div class="text">
<h3>校党委书记xx带队赴yy“访企拓岗”</h3>
</div>
</a>
<a href="/zhyw/607.html" target="_blank" class="item">
<div class="data">
<h3>2024-05-07</h3>
</div>
<div class="imgbox">
<img src="/uploads/images/20240507/65ad7e59d16aeaddefd6a06081c88968.png" />
</div>
<div class="text">
<h3>影响因子11.1商学院青年教师xx在国际顶级期刊发表论文</h3>
</div>
</a>
<a href="/zhyw/590.html" target="_blank" class="item">
<div class="data">
<h3>2024-05-05</h3>
</div>
<div class="imgbox">
<img src="/uploads/images/20240506/5d3623a5frg2276f49a5e346e0ba66f9.jpg" />
</div>
<div class="text">
<h3>别太羡慕!“新青媒”姐妹“搭子”考研上岸!</h3>
</div>
</a>
</div>
</div>
</div>
</body>
</html>
'''
# Parse the HTML string with the stdlib-backed parser.
soup = BeautifulSoup(html_content, 'html.parser')
# Each <a class="item"> carries one news entry: its title and its link.
news_links = soup.find_all('a', class_='item')
for entry in news_links:
    # Title text from the <h3> inside <div class="text">, whitespace stripped.
    text_box = entry.find('div', class_='text')
    headline = text_box.h3.get_text(strip=True)
    # The URL is the href attribute of the <a> itself.
    link = entry['href']
    # One entry per line: title, tab, URL.
    print(f"{headline}\t{link}")
9.
from bs4 import BeautifulSoup
# 假设你已经有了HTML源代码,这里我们直接将它作为字符串
html_content = '''
<html>
<head>
<title>《数据采集与预处理》操作题</title>
</head>
<body>
<div class="information-r" aos="fade-left">
<div class="information-rmin">
<a href="/uploads/files/20230602/0833b370bea5t3e3c23584ae89a09e99.doc" download class="item">
<div class="text">
<h3>xxx学院学生课程成绩、学分认定审批表</h3>
<p>2024.04.22</p>
</div>
<div class="more">立即下载</div>
</a>
<a href="/uploads/files/20230602/39121321545d39741fa55c2ac0feca9c.docx" download class="item">
<div class="text">
<h3>xxx学院体育免修审批表</h3>
<p>2024.04.12</p>
</div>
<div class="more">立即下载</div>
</a>
<a href="/uploads/files/20230602/06c5699d3f6847f899b672bd7c9e76c7.doc" download class="item">
<div class="text">
<h3>xxx学院课程免修申请表</h3>
<p>2023.06.01</p>
</div>
<div class="more">立即下载</div>
</a>
</div>
</div>
</body>
</html>
'''
# Task: show the name of every downloadable resource.
# Requirements:
# (1) Collect the names of all downloadable resources.
# (2) Print one resource name per line.
# Parse the HTML with the stdlib-backed parser.
soup = BeautifulSoup(html_content, 'html.parser')
# Every <a class="item"> is one download entry; its <h3> holds the resource name.
for entry in soup.find_all('a', class_='item'):
    name_heading = entry.find('h3')
    print(name_heading.get_text(strip=True))
10.
from bs4 import BeautifulSoup
# HTML fixture: a download list with three <a class="item"> entries, each holding
# a resource name (<h3>) and an upload date (<p>).
html_content = '''
<html>
<head>
<title>《数据采集与预处理》操作题</title>
</head>
<body>
<div class="information-r" aos="fade-left">
<div class="information-rmin">
<a href="/uploads/files/20230602/0833b370bea5t3e3c23584ae89a09e99.doc" download class="item">
<div class="text">
<h3>xxx学院学生课程成绩、学分认定审批表</h3>
<p>2024.04.22</p>
</div>
<div class="more">立即下载</div>
</a>
<a href="/uploads/files/20230602/39121321545d39741fa55c2ac0feca9c.docx" download class="item">
<div class="text">
<h3>xxx学院体育免修审批表</h3>
<p>2024.04.12</p>
</div>
<div class="more">立即下载</div>
</a>
<a href="/uploads/files/20230602/06c5699d3f6847f899b672bd7c9e76c7.doc" download class="item">
<div class="text">
<h3>xxx学院课程免修申请表</h3>
<p>2023.06.01</p>
</div>
<div class="more">立即下载</div>
</a>
</div>
</div>
</body>
</html>
'''
# Task: show the name and upload time of every downloadable resource.
# Requirements:
# (1) Collect the name and upload time of all downloadable resources.
# (2) Print one resource per line, name and upload time separated by a tab, e.g.:
#     xxx学院学生课程成绩、学分认定审批表	2024.04.22
# Parse the HTML content with the stdlib-backed parser.
soup = BeautifulSoup(html_content, 'html.parser')
# Every <a class="item"> is one resource entry.
for entry in soup.find_all('a', class_='item'):
    # <h3> holds the resource name, <p> holds the upload date.
    name_text = entry.find('h3').text.strip()
    date_text = entry.find('p').text.strip()
    print(f"{name_text}\t{date_text}")
11.
from bs4 import BeautifulSoup
html_content = '''
<html>
<head>
<title>《数据采集与预处理》操作题</title>
</head>
<body>
<div class="information-r" aos="fade-left">
<div class="information-rmin">
<a href="/uploads/files/20230602/0833b370bea5t3e3c23584ae89a09e99.doc" download class="item">
<div class="text">
<h3>xxx学院学生课程成绩、学分认定审批表</h3>
<p>2024.04.22</p>
</div>
<div class="more">立即下载</div>
</a>
<a href="/uploads/files/20230602/39121321545d39741fa55c2ac0feca9c.docx" download class="item">
<div class="text">
<h3>xxx学院体育免修审批表</h3>
<p>2024.04.12</p>
</div>
<div class="more">立即下载</div>
</a>
<a href="/uploads/files/20230602/06c5699d3f6847f899b672bd7c9e76c7.doc" download class="item">
<div class="text">
<h3>xxx学院课程免修申请表</h3>
<p>2023.06.01</p>
</div>
<div class="more">立即下载</div>
</a>
</div>
</div>
</body>
</html>
'''
# Task: show the name and URL of every downloadable resource.
# Requirements:
# (1) Collect the name and URL of all downloadable resources.
# (2) Print one resource per line, name and URL separated by a tab, e.g.:
#     xxx学院学生课程成绩、学分认定审批表	….doc
# Parse the HTML content with the stdlib-backed parser.
soup = BeautifulSoup(html_content, 'html.parser')
# Every <a class="item"> is one resource entry.
for entry in soup.find_all('a', class_='item'):
    # <h3> holds the resource name; the URL is the href attribute of the <a> itself.
    name_text = entry.find('h3').text.strip()
    link = entry['href']
    print(f"{name_text}\t{link}")
12.
from bs4 import BeautifulSoup
html_content = '''
<html>
<head>
<title>《数据采集与预处理》操作题</title>
</head>
<body>
<div class="information-r" aos="fade-left">
<div class="information-rmin">
<a href="/uploads/files/20230602/0833b370bea5t3e3c23584ae89a09e99.doc" download class="item">
<div class="text">
<h3>xxx学院学生课程成绩、学分认定审批表</h3>
<p>2024.04.22</p>
</div>
<div class="more">立即下载</div>
</a>
<a href="/uploads/files/20230602/39121321545d39741fa55c2ac0feca9c.docx" download class="item">
<div class="text">
<h3>xxx学院体育免修审批表</h3>
<p>2024.04.12</p>
</div>
<div class="more">立即下载</div>
</a>
<a href="/uploads/files/20230602/06c5699d3f6847f899b672bd7c9e76c7.doc" download class="item">
<div class="text">
<h3>xxx学院课程免修申请表</h3>
<p>2023.06.01</p>
</div>
<div class="more">立即下载</div>
</a>
</div>
</div>
</body>
</html>
'''
# Task: show the upload time, name and URL of every resource.
# Requirements:
# (1) Collect the upload time, name and URL of all resources.
# (2) Print one resource per line, the three fields separated by tabs, e.g.:
#     2024.04.22	xxx学院学生课程成绩、学分认定审批表	...doc
# The parser must be named 'html.parser'; a bare 'parser' is not a known
# feature name and makes BeautifulSoup raise FeatureNotFound.
soup = BeautifulSoup(html_content, 'html.parser')
# Every <a class="item"> is one resource entry.
resources = soup.find_all('a', class_='item')
for resource in resources:
    # <h3> holds the resource name, <p> holds the upload date.
    resource_name = resource.find('h3').text.strip()
    resource_date = resource.find('p').text.strip()
    # href is an attribute of the <a> tag itself, so it is read by subscript;
    # find('href') would look for a child <href> element and return None.
    resource_url = resource['href']
    print(f"{resource_date}\t{resource_name}\t{resource_url}")