import requests
from bs4 import BeautifulSoup
import lxml
def getsg():
headrs = {
'User - Agent': 'Mozilla / 5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 95.0.4638.69Safari / 537.36Edg / 95.0.1020.53'
}
url = 'https://www.shicimingju.com/book/hongloumeng.html'
page_text = requests.get(url=url,headers=headrs)
page_text.encoding = 'utf-8'
page_text1 = page_text.text
# print(page_text1)
sp = BeautifulSoup(page_text1,'lxml')
list_all = sp.select('.book-mulu>ul>li')
fp = open('./get/红楼梦.txt','w',encoding='utf-8')
for list in list_all:
title = list.a.string
list_url = 'https://www.shicimingju.com'+list.a['href']
list_page_url = requests.get(url=list_url,headers=headrs)
list_page_ur
Python爬取小说《红楼梦》
本文介绍如何利用Python编写网络爬虫,抓取在线版《红楼梦》的小说内容,探讨了网页解析、数据存储等关键技术,并分享了相关实战经验。
摘要由CSDN通过智能技术生成