# Book page: http://dushu.baidu.com/pc/detail?gid=4306063500
# The novel's content is loaded asynchronously; the API returns its data as JSON.
import requests
import asyncio
import aiohttp
import os
async def getPage(title, cid):
    """Download one chapter and save it to ``resources/西游记/<title>.txt``.

    Args:
        title: Chapter title; used verbatim as the output file name and in
            the success message.
        cid: Chapter id string; appended to the hard-coded book id in the
            request URL.

    Errors from the HTTP request, JSON decoding, a response without
    ``data.novel.content``, or the file write are not caught here and
    propagate to the caller.
    """
    url = 'http://dushu.baidu.com/api/pc/getChapterContent?data={"book_id":"4306063500","cid":"4306063500|' + cid + '","need_bookinfo":1}'
    # makedirs with exist_ok=True creates the missing ``resources`` parent too
    # (plain os.mkdir fails with FileNotFoundError in that case) and is a
    # no-op when the directory already exists.
    os.makedirs('resources/西游记', exist_ok=True)
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            dic = await resp.json()
    # The response is fully read at this point, so the file is written after
    # the connection has been released.
    with open(f'resources/西游记/{title}.txt', 'w', encoding='utf-8') as f:
        f.write(dic['data']['novel']['content'])
    print(title + '爬取成功!')
async def getCatalog(url):
    """Fetch the chapter catalog from *url* and download all chapters concurrently.

    The catalog JSON is expected to hold the chapter list under
    ``data.novel.items``, each entry providing ``cid`` and ``title``.
    One ``getPage`` download is scheduled per entry.

    Args:
        url: Catalog API URL returning the JSON described above.

    A chapter whose download raises does not abort the others; each failure
    is reported on stdout once every download has finished.
    """
    # NOTE: requests.get is a blocking call; it runs before any chapter
    # download has been scheduled.
    dic = requests.get(url).json()
    items = dic['data']['novel']['items']
    if not items:
        # Nothing to download; return instead of awaiting an empty batch.
        return
    titles = [item['title'] for item in items]
    # return_exceptions=True keeps one failed chapter from hiding or
    # cancelling the rest, and hands every exception back for reporting.
    results = await asyncio.gather(
        *(getPage(item['title'], item['cid']) for item in items),
        return_exceptions=True,
    )
    for title, result in zip(titles, results):
        if isinstance(result, BaseException):
            print(f'{title}爬取失败: {result!r}')
def main():
    """Script entry point: run the catalog download for the hard-coded book id."""
    catalog_url = 'http://dushu.baidu.com/api/pc/getCatalog?data={"book_id":"4306063500"}'
    crawl = getCatalog(catalog_url)
    asyncio.run(crawl)
# Start the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()