'''
'''
import requests
from bs4 import BeautifulSoup
def _fetch_html(url, headers, timeout=10):
    """Download *url* and return the response body decoded as text.

    Raises ``requests.HTTPError`` for a non-success status code and other
    ``requests.RequestException`` subclasses on connection errors/timeouts.
    """
    response = requests.get(url=url, headers=headers, timeout=timeout)
    response.raise_for_status()
    # When the Content-Type header carries no charset, requests falls back to
    # ISO-8859-1 for text responses, which garbles Chinese pages. In that case
    # use the encoding detected from the body instead.
    if response.encoding is None or response.encoding.lower() == 'iso-8859-1':
        response.encoding = response.apparent_encoding
    return response.text


if __name__ == '__main__':
    # Script entry point: scrape every chapter linked from the book's table of
    # contents and append "<title>:<content>" lines to ./三国.txt.
    from urllib.parse import urljoin

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
    }
    # Fetch the source of the table-of-contents page.
    url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
    page_text = _fetch_html(url, headers)
    # Parse chapter titles and detail-page URLs out of the index page.
    soup = BeautifulSoup(page_text, 'lxml')
    li_list = soup.select('.book-mulu >ul > li')
    # The context manager guarantees the file is flushed and closed even if a
    # request or parse step fails part-way through.
    with open('./三国.txt', 'w', encoding='utf-8') as fp:
        for li in li_list:
            anchor = li.a
            # Skip list items that carry no usable chapter link.
            if anchor is None or not anchor.get('href'):
                continue
            # get_text() always yields a str, whereas .string is None when the
            # anchor has more than one child node.
            title = anchor.get_text()
            # urljoin handles root-relative, relative and absolute hrefs alike.
            detail_url = urljoin(url, anchor['href'])
            # Fetch and parse the chapter detail page.
            detail_page_text = _fetch_html(detail_url, headers)
            soup_temp = BeautifulSoup(detail_page_text, 'lxml')
            detail_text = soup_temp.find('div', class_='chapter_content')
            if detail_text is None:
                # Page layout differs from what is expected; report and move on
                # instead of crashing on a None attribute access.
                print(title, '爬取失败')
                continue
            # Extract the chapter body text.
            content = detail_text.text
            fp.write(title + ':' + content + '\n')
            print(title, '爬取成功')
# Crawler study notes: bs4, part 2 (爬虫学习之bs4_02)
# Blog page footer: "latest recommended article published 2023-08-10 09:41:11"