import requests
from bs4 import BeautifulSoup
if __name__ == '__main__':
    # Search-result URL template; %d is the page-number placeholder.
    url = "https://so.gushiwen.org/search.aspx?type=guwen&page=%d&value=三国演义"
    # Relative hrefs of every chapter's detail page, collected across all result pages.
    url_list = []
    headers = {
        # Well-formed desktop Chrome UA. The original had stray spaces
        # ("AppleWebKit / 537.36", "likeGecko") that some servers reject.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36"
    }
    # The result list spans 14 pages (per the original note); range(1, 15)
    # covers pages 1-14 — the original range(1, 13) silently dropped the
    # last two pages.
    for page_num in range(1, 15):
        # Fill the placeholder to get this page's concrete URL.
        # (The original wrapped this in a redundant format() call.)
        new_url = url % page_num
        resp = requests.get(url=new_url, headers=headers)
        # Force UTF-8 so the Chinese text decodes correctly.
        resp.encoding = "utf-8"
        soup = BeautifulSoup(resp.text, "lxml")
        for item in soup.select(".sons>.cont"):
            # Store each chapter's relative link for the detail pass below.
            url_list.append(item.p.a["href"])
    # "with" guarantees the file is closed even if a request fails (the
    # original never closed it). Filename fixed from the hidden-dotfile
    # typo ".sanguo.txt".
    with open("sanguo.txt", "w", encoding="utf-8") as fp:
        for item in url_list:
            # Use a distinct name; the original clobbered the `url` template here.
            detail_url = "https://so.gushiwen.org" + item
            resp = requests.get(url=detail_url, headers=headers)
            resp.encoding = "utf-8"
            detail_soup = BeautifulSoup(resp.text, "lxml")
            # Chapter number text (e.g. "第一回").
            detail_num = detail_soup.select(".cont>h1>span>b")[0].text
            # Chapter heading = number + first title line of the body.
            detail_title = detail_num + detail_soup.select(".cont>.contson>p")[0].text
            # Full chapter body text.
            content = detail_soup.find('div', class_='contson').text
            fp.write(detail_title + ":" + "\n" + content + "\n")
            print("正在爬取" + detail_num)
    print("爬取完成")
爬取三国演义
最新推荐文章于 2024-06-02 21:49:12 发布
本文通过Python爬虫技术,从古文网站抓取《三国演义》各章节内容,展示了如何使用requests和BeautifulSoup库实现网页解析。重点在于页面抓取和信息提取的过程,适合理解基础爬虫技术的朋友。
摘要由CSDN通过智能技术生成