在这里插入代码片
```python
import re
import os
import requests
def get_toc(html, start_url='https://www.kanunu8.com/book3/6879/'):
    """Extract the chapter URLs from the book's table-of-contents page.

    The table of contents is taken to be the text between the literal
    marker ``正文`` and the first following ``</tbody>``. Every ``href``
    value found in that region is appended to ``start_url``.

    Args:
        html: Decoded HTML text of the index page.
        start_url: Prefix prepended verbatim to each href found in the
            table of contents. Defaults to the book's index URL.

    Returns:
        A list of chapter URLs in the order they appear on the page, or
        an empty list when the page contains no table-of-contents region.
    """
    toc_match = re.search(r'正文(.*?)</tbody>', html, re.S)
    if toc_match is None:
        # Page layout differs from what is expected: report "no chapters"
        # instead of failing with an IndexError.
        return []
    hrefs = re.findall(r' href="(.*?)"', toc_match.group(1), re.S)
    return [start_url + href for href in hrefs]
def get_article(html):
    """Extract the chapter title and body text from a chapter page.

    The title is the text that follows the first ``size="4">`` up to the
    next ``<``. The body is the content of the first ``<p>...</p>``
    element, with HTML line-break tags removed.

    Args:
        html: Decoded HTML text of a chapter page.

    Returns:
        A ``(chapter_name, text_block)`` tuple of strings.

    Raises:
        ValueError: If the title or the body paragraph cannot be found.
    """
    title_match = re.search(r'size="4">(.*?)<', html, re.S)
    if title_match is None:
        raise ValueError('chapter title not found in page')
    body_match = re.search(r'<p>(.*?)</p>', html, re.S)
    if body_match is None:
        raise ValueError('chapter text not found in page')

    # The title is later used as a file name, so drop surrounding whitespace.
    chapter_name = title_match.group(1).strip()
    # Remove every spelling of the line-break tag (<br>, <br/>, <br />).
    text_block = re.sub(r'<br\s*/?>', '', body_match.group(1), flags=re.I)
    return chapter_name, text_block
def save(chapter, article, directory=r'E:\爬虫\文件\动物农场'):
    """Write one chapter to ``<directory>/<chapter>.txt`` as UTF-8 text.

    The target directory is created if it does not exist yet. An existing
    file with the same name is overwritten.

    Args:
        chapter: Chapter title; used as the file name without extension.
        article: Chapter text to write.
        directory: Folder that receives the file. The default is written
            as a raw string so its backslashes are not read as escape
            sequences.
    """
    os.makedirs(directory, exist_ok=True)
    target = os.path.join(directory, chapter + '.txt')
    with open(target, 'w', encoding='utf-8') as f:
        f.write(article)
def _fetch_html(url, timeout=10):
    """Download ``url`` and return its body decoded as GBK text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a chapter.
    response.raise_for_status()
    return response.content.decode('gbk')


if __name__ == '__main__':
    # Fetch the index page, then download and save each chapter in order.
    index_url = 'https://www.kanunu8.com/book3/6879/'
    for chapter_url in get_toc(_fetch_html(index_url)):
        chapter_name, text_block = get_article(_fetch_html(chapter_url))
        save(chapter_name, text_block)
从 https://www.kanunu8.com/book3/6879/ 爬取《动物农场》所有章节
最新推荐文章于 2023-08-24 16:54:05 发布