# Multithreaded / asynchronous download of the novel 《锦衣断案录》
#
# Original article: https://blog.csdn.net/lingyuncelia/article/details/117112935

import time
start_time=time.time()
import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor
import os
# Index page of the novel; it lists a link to every chapter page.
url="https://www.soshuw.com/JinYiDuanAnLu/"
# XPath selecting the href of the first <a> inside each <dd> of the first <dl>
# under the novel's container div.
x1="//div[@id='novel111739']/dl[1]/dd/a[1]/@href"
r=requests.get(url,timeout=30)
r.encoding="utf-8"
r=r.text
selector = etree.HTML(r)
# Extract the page id of every chapter by trimming the path prefix and the
# ".html" suffix from each href.  Working on the hrefs one by one (instead of
# editing the repr() of the whole list) cannot be confused by brackets, quotes
# or ", " inside a link, and an empty result stays an empty list rather than
# turning into [''].
t_url=[
    href.replace("/JinYiDuanAnLu/","").replace(".html","")
    for href in selector.xpath(x1)
]

def download_novel(p, out_dir='d:/zhusc/t'):
    """Download one chapter page and save it as a UTF-8 text file.

    Args:
        p: Chapter page id as used in the chapter URL
            ``https://www.soshuw.com/JinYiDuanAnLu/{p}.html`` and in the
            element ids ``chapter{p}`` / ``con{p}`` of that page.
        out_dir: Directory the chapter file is written to.  Defaults to the
            directory this script has always used.

    Returns:
        None.  On success ``{out_dir}/{p}{title}.txt`` contains the chapter
        title on the first line followed by the chapter text.  Any failure is
        reported on stdout as ``error <p> <exception>`` and is not re-raised,
        so one bad chapter does not stop the other downloads.
    """
    import re  # local import: only needed to sanitise the file name

    try:
        url=f'https://www.soshuw.com/JinYiDuanAnLu/{p}.html'
        r=requests.get(url,timeout=30)
        r.encoding="utf-8"
        selector = etree.HTML(r.text)
        # Chapter title; raises IndexError (reported below) if the page does
        # not have the expected structure.
        x1=f'//*[@id="chapter{p}"]/div[3]/div[2]/h1/text()'
        a=selector.xpath(x1)[0]
        print(a)
        # Text nodes of the content element; the first node is skipped.
        x2=f'//*[@id="con{p}"]/text()[position()>1]'
        paragraphs=selector.xpath(x2)
        # Join the text nodes directly.  Editing the repr() of the list removed
        # every "[" and "]" from the chapter text and depended on the exact
        # whitespace layout of the page.
        stripped=(node.strip() for node in paragraphs)
        b="\n".join(line for line in stripped if line)
        # Characters that are not allowed in Windows file names would make
        # open() fail, so replace them in the title part of the name.
        safe_title=re.sub(r'[\\/:*?"<>|\r\n\t]','_',a)
        with open(f'{out_dir}/{p}{safe_title}.txt','w',encoding='utf-8') as nb:
            nb.write(a)
            nb.write('\n')
            nb.write(b)
    except Exception as exc:
        # Best-effort download: report which chapter failed and why.
        print("error",p,exc)

if __name__=='__main__':
    # The downloads are network-bound, so fan them out over a thread pool.
    with ThreadPoolExecutor(16) as t:
        # Iterate the scraped chapter list itself instead of a fixed
        # range(530): a shorter index no longer raises IndexError and a longer
        # one is downloaded completely.
        for p in t_url:
            t.submit(download_novel,p)
    # Leaving the with-block waits for all submitted downloads to finish.
    # Then concatenate the chapter files with the Windows shell.
    # NOTE(review): on a second run new.txt itself matches *.txt — confirm
    # whether it should be deleted or excluded first.
    os.system("cd /d d:/zhusc/t && copy /b *.txt new.txt")
    print("done")
# Elapsed time since the script started.
print('下载所需时间为：',time.time()-start_time)  # roughly one minute

#1.爬取小说各页下载链接
import time
start_time=time.time()
import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor
import os
# Index page of the novel; it lists a link to every chapter page.
url="https://www.soshuw.com/JinYiDuanAnLu/"
# XPath selecting the href of the first <a> inside each <dd> of the first <dl>
# under the novel's container div.
x1="//div[@id='novel111739']/dl[1]/dd/a[1]/@href"
r=requests.get(url,timeout=30)
r.encoding="utf-8"
r=r.text
selector = etree.HTML(r)
# Extract the page id of every chapter by trimming the path prefix and the
# ".html" suffix from each href.  Working on the hrefs one by one (instead of
# editing the repr() of the whole list) cannot be confused by brackets, quotes
# or ", " inside a link, and an empty result stays an empty list rather than
# turning into [''].
t_url=[
    href.replace("/JinYiDuanAnLu/","").replace(".html","")
    for href in selector.xpath(x1)
]
#2.异步爬取各页内容
import asyncio
import aiohttp
# URL of a single chapter page; {page} is the chapter page id.
template = 'https://www.soshuw.com/JinYiDuanAnLu/{page}.html'
async def get(session, queue):
    """Worker coroutine: take page ids from ``queue`` until it is empty.

    For each page id the chapter page is fetched through ``session``, the
    title is read from the page, and the raw page text is written to
    ``k:/zhusc/t/{page}{title}.txt``.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with an awaitable ``text(encoding=...)``
            (the caller passes an ``aiohttp.ClientSession``).
        queue: ``asyncio.Queue`` pre-filled with chapter page ids.

    Returns:
        None, once the queue is empty.  A failure on one page is printed as
        ``error <page> <exception>`` and the worker moves on to the next page.
    """
    while True:
        try:
            page = queue.get_nowait()
        except asyncio.QueueEmpty:
            return
        url = template.format(page=page)
        try:
            # The context manager releases the connection even if reading the
            # body fails.
            async with session.get(url) as resp:
                r=await resp.text(encoding='utf-8')
            selector = etree.HTML(r)
            x1=f'//*[@id="chapter{page}"]/div[3]/div[2]/h1/text()'
            title=selector.xpath(x1)[0]
            print(page,title)
            file_name=f'k:/zhusc/t/{page}{title}.txt'
            with open(file_name,'w',encoding="utf-8") as nb:
                nb.write(r)
        except Exception as exc:
            # Without this, a single bad page (e.g. missing title) would end
            # the worker and its share of the queue would be left to others.
            print("error",page,exc)
async def main():
    """Download every chapter in ``t_url`` with 100 concurrent workers.

    All page ids are put on a queue, then 100 ``get`` workers sharing one
    ``aiohttp.ClientSession`` drain it.  Returns None after every worker has
    finished; an exception that escaped a worker is printed, not raised.
    """
    async with aiohttp.ClientSession() as session:
        queue = asyncio.Queue()
        for page in t_url:
            queue.put_nowait(page)
        workers = [get(session, queue) for _ in range(100)]
        # asyncio.wait() no longer accepts bare coroutines (deprecated in 3.8,
        # rejected since 3.11); gather() wraps them in tasks itself.
        # return_exceptions=True keeps the "wait for every worker" behaviour
        # even if one of them fails.
        results = await asyncio.gather(*workers, return_exceptions=True)
        for outcome in results:
            if isinstance(outcome, BaseException):
                print("error",outcome)
# Create and install the event loop explicitly: asyncio.get_event_loop() with
# no current loop is deprecated and raises on recent Python versions.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(main())
# Elapsed time since the script started.
print("Total time is:",time.time()-start_time)