首先,打开求是网,目标页面为《求是》2019年第1期(求是网,qstheory.cn)。
User-Agent 的获取方法与网页解析思路,可参见 CSDN 博客《BeautifulSoup库TapTap评论爬虫》。
导入库
import requests
from bs4 import BeautifulSoup
字段设置
# Target article page: Qiushi (求是) 2019 issue 1 table of contents on qstheory.cn.
url = 'http://www.qstheory.cn/dukan/qs/2014/2019-01/01/c_1123924172.htm'

# A browser-like User-Agent so the server treats this request as a normal
# page visit rather than an automated client.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0'}
发送请求和匹配
# Send the HTTP request with the User-Agent header attached.  A timeout is
# set so the script fails fast instead of hanging forever on a stalled
# connection (requests has no default timeout).
response = requests.get(url, headers=headers, timeout=30)
# Proceed only when the server answered with HTTP 200 OK.
if response.status_code == 200:
    # Parse the raw HTML bytes with BeautifulSoup's built-in parser.
    soup = BeautifulSoup(response.content, 'html.parser')
    # Select <strong> elements nested anywhere inside an <a href="..."> tag;
    # on this page each article title is wrapped that way.
    links = soup.select('a[href] strong')
    # Check whether any matching elements were found.
    if links:
        for link in links:
            # The selector matches <strong> as a *descendant* of <a>, so the
            # <a> is not necessarily the direct parent — walk up to the
            # nearest enclosing <a> to read its href reliably.
            link_url = link.find_parent('a')['href']
            # strip=True trims surrounding whitespace from the title text.
            link_text = link.get_text(strip=True)
            # Print the article URL and its title.
            print(link_url)
            print(link_text)
    else:
        print('No link element found.')
else:
    print(f'Failed to retrieve the webpage. Status code: {response.status_code}')
运行脚本即可爬取出每篇文章的 URL 与标题,结果截图如下。