多线程爬完整小说django后端完整程序(纯原创)

最新推荐文章于 2023-02-04 19:42:31 发布

晚风拂柳颜

最新推荐文章于 2023-02-04 19:42:31 发布

阅读量4.6k

点赞数

分类专栏： python小项目　文章标签： python3 爬虫

本文链接：https://blog.csdn.net/qq_32394351/article/details/103427180

版权

python小项目专栏收录该内容

32 篇文章 0 订阅

订阅专栏

三分钟爬取一本完整小说,项目地址点击此处

主要逻辑部分 novel_download.py 代码100行出头，如下:

from lxml import etree
import requests
from threading import Thread,enumerate
import os
from time import sleep,time

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}

def thread_it(func,*args):
    """Run ``func(*args)`` in a background daemon thread.

    The thread is a daemon, so it does not keep the interpreter alive once
    the main thread exits.

    Args:
        func: Callable to execute in the new thread.
        *args: Positional arguments forwarded to ``func``.

    Returns:
        The started ``Thread`` object, so callers may ``join()`` it.
    """
    # ``Thread.setDaemon()`` is deprecated; pass the flag to the constructor.
    t = Thread(target=func, args=args, daemon=True)
    t.start()
    return t

def getAll(url = "http://www.wanbenxiaoshuo.net/files/article/html/0/6/"):
    """Fetch a novel's index page and extract its metadata and chapter links.

    Args:
        url: Index URL of the novel. Any ``index.html`` in it is removed so
            that chapter hrefs can be appended directly to the directory URL.

    Returns:
        A 4-tuple ``(name, author, novel_type, novel_list)``; ``novel_list``
        holds chapter URLs built as ``url + href``. If the response status is
        not 200 or the title node is missing, ``(None, None, None, None)`` is
        returned. ``author`` / ``novel_type`` fall back to ``""`` when their
        nodes are absent.
    """
    not_found = (None, None, None, None)
    url = url.replace("index.html","")
    # A timeout keeps a stalled server from blocking the caller forever.
    r = requests.get(url,headers=headers,timeout=30)
    if r.status_code != 200:
        # Always hand back a 4-tuple: callers unpack the result directly.
        return not_found
    r.encoding = r.apparent_encoding
    page_source = etree.HTML(r.text)
    name = page_source.xpath('//*[@id="wp"]/div/div[2]/h1/text()')
    if not name:
        return not_found
    author = page_source.xpath('//*[@id="wp"]/div/div[2]/p/a/text()')
    novel_type = page_source.xpath('//*[@id="wp"]/div/div[2]/p/text()[2]')
    hrefs = page_source.xpath('//*[@id="wp"]/div/div[3]/div/ul/li/a/@href')
    novel_list = [(url+i) for i in hrefs]
    # The author/type nodes may be missing even when the title is present.
    return (
        name[0],
        author[0] if author else "",
        novel_type[0] if novel_type else "",
        novel_list,
    )

def getOne(link='http://www.wanbenxiaoshuo.net/files/article/html/0/6/51713.html'):
    """Fetch one chapter page and return its title and cleaned body text.

    Args:
        link: URL of the chapter page.

    Returns:
        ``(title, content)``. ``content`` is the chapter's text nodes (the
        first one is skipped) joined together with the site's indentation
        sequences stripped. If the response status is not 200 or no title
        node is found, ``(None, None)`` is returned.
    """
    # A timeout keeps one stalled request from hanging a worker forever.
    r = requests.get(link, headers=headers, timeout=30)
    if r.status_code != 200:
        # Always hand back a pair: callers unpack the result directly.
        return None, None
    r.encoding = r.apparent_encoding
    page_source = etree.HTML(r.text)
    node_title = page_source.xpath('//*[@id="content"]/div[1]/h1/text()')
    if not node_title:
        return None, None
    node_content = page_source.xpath('//*[@id="content"]/div[2]/text()')
    content = ''.join(node_content[1:]).replace("\r\n\xa0\xa0\xa0\xa0","").replace("\n\t\t\xa0\xa0\xa0\xa0","")
    content = content.replace("\n\t\t","")
    return node_title[0], content

def writeOne(title,content):
    """Format one chapter as text: tab-indented title line, body, blank line.

    Args:
        title: Chapter title string.
        content: Chapter body string.

    Returns:
        The formatted chapter text.
    """
    pieces = ("\t\t", title, "\n", content, "\n\n")
    return "".join(pieces)

def runApp(novel_list,name,t1,cwd='',max_retries=30):
    """Download all chapters with background threads, then merge them.

    Chapters are split into batches of 20. Each batch is handled by one
    daemon thread that writes ``{cwd}downloads/{name}/{i}.txt``. This call
    returns once every thread has been started; the worker that finishes
    last reports the elapsed time and merges the chunk files via ``hebing``.

    Args:
        novel_list: Chapter URLs in reading order.
        name: Novel name, used as the download sub-directory name.
        t1: Start timestamp (from ``time()``) used to report elapsed seconds.
        cwd: Prefix placed directly in front of ``downloads/``.
        max_retries: Attempts per chapter before the chapter is skipped.
    """
    # Local import: the file only imports Thread/enumerate from threading.
    from threading import Lock

    article_num = len(novel_list)
    xc_num = article_num//20+1
    print(f"待开启线程数量为{xc_num}")

    # Count unfinished workers explicitly. Inspecting the process-wide thread
    # list is racy and also sees threads that do not belong to this download.
    remaining = xc_num
    remaining_lock = Lock()

    def inter(link,f,i):
        # Fetch one chapter and append it to f, retrying a bounded number of
        # times (the previous version recursed without any limit).
        for _ in range(max_retries):
            try:
                title, content = getOne(link)
                txt = writeOne(title, content)
            except Exception:
                print("\n爬得太快被拒绝连接，等1s递归继续")
                sleep(1)
                continue
            f.write(txt)
            print(f"\r线程{i}正在写入 {title}", end="")
            return True
        print(f"\n线程{i}多次重试失败，已跳过 {link}")
        return False

    def inner(name,i,begin,end,cwd):
        nonlocal remaining
        try:
            with open(f"{cwd}downloads/{name}/{i}.txt", mode='w+', encoding='utf-8') as f:
                for link in novel_list[begin:end]:
                    inter(link, f, i)
        finally:
            # The chunk file is closed (and therefore flushed) at this point,
            # so the last worker can merge without losing buffered text.
            with remaining_lock:
                remaining -= 1
                left = remaining
            print(f"\n线程{i}执行完毕")
            print(f"\n剩余线程数量{left}")
            if left == 0:
                print("\n全本下载完毕")
                t2 = time()
                print(f"\n本次下载小说总共耗时{round(t2 - t1)}s")
                hebing(f"{cwd}downloads/{name}")

    for i in range(1,xc_num+1):
        begin = 20*(i-1)
        end = 20*i if i != xc_num else article_num
        if i == xc_num:
            print(f"\n全部线程开启完毕")
        thread_it(inner,name,i,begin,end,cwd)
        # Stagger thread start-up to reduce the burst of requests.
        sleep(0.5)

def paixuRule(elem):
    """Sort key: the integer written before the first ``.`` of a file name."""
    prefix, _, _ = elem.partition(".")
    return int(prefix)

def hebing(path):
    """Merge the numbered chunk files inside ``path`` into ``path + ".txt"``.

    Files are concatenated in ascending order of the integer that precedes
    the first ``.`` in their name. Entries that are not regular files, or
    whose name does not start with an integer, are ignored instead of
    aborting the merge.

    Args:
        path: Directory that holds the per-thread chunk files.
    """
    chunks = []
    for entry in os.listdir(path):
        try:
            order = int(entry.split(".")[0])
        except ValueError:
            # Not a numbered chunk (e.g. a stray hidden file): skip it.
            continue
        full = os.path.join(path, entry)
        if os.path.isfile(full):
            chunks.append((order, full))
    chunks.sort(key=lambda pair: pair[0])

    # Context managers close both files even if a read or write fails.
    with open(path+".txt",mode='w+',encoding='utf-8') as merged:
        for _, full in chunks:
            with open(full,mode="r",encoding="utf-8") as part:
                merged.write(part.read())
    print("小说合并完成")

if __name__ == '__main__':
    # Script entry point: download one hard-coded novel into ./downloads/.
    t1 = time()
    info = getAll(url = "http://www.wanbenxiaoshuo.net/files/article/html/0/68/index.html")
    if not info or not info[0]:
        # getAll found no title (or the request failed): nothing to download.
        print("获取小说信息失败，请检查链接或网络")
    else:
        name,_,_,novel_list = info
        # makedirs also creates the parent "downloads" directory if missing.
        os.makedirs("downloads/" + name, exist_ok=True)
        runApp(novel_list,name,t1)
        # Workers are daemon threads, so the main thread must stay alive.
        # Sleep instead of spinning so the wait does not burn a CPU core.
        while True:
            sleep(1)

views.py代码如下:

from django.shortcuts import render,HttpResponse
from .models import Novel
from .novel_download import *
from django.http import StreamingHttpResponse
import pypinyin
# Create your views here.

# 不带声调的(style=pypinyin.NORMAL)
def pinyin(word):
    """Return ``word`` as tone-less pinyin, concatenated without separators."""
    syllables = pypinyin.pinyin(word, style=pypinyin.NORMAL)
    return ''.join(''.join(candidates) for candidates in syllables)

# 带声调的(默认)
def yinjie(word):
    """Return the pinyin of ``word`` in pypinyin's default style.

    The readings returned for each item are concatenated and followed by a
    single space, so the result ends with a trailing space when non-empty.
    """
    # heteronym=True enables multiple readings for polyphonic characters
    readings = pypinyin.pinyin(word, heteronym=True)
    return ''.join(''.join(options) + " " for options in readings)

def index(request):
    """Site root: a minimal page that links to the novel app."""
    body = "django work<br><a href='novel/'>进入</a></br>"
    return HttpResponse(body)

def novel_index(request):
    """Render the novel app's index template."""
    template_name = 'novel/index.html'
    return render(request, template_name)

def novel_list(request):
    """Render the novel list template."""
    template_name = 'novel/novel_list.html'
    return render(request, template_name)

def novel_edit(request):
    """Render the novel edit template."""
    template_name = 'novel/novel_edit.html'
    return render(request, template_name)
def download_action(request):
    """Stream a previously downloaded novel to the client as an attachment.

    Reads the ``name`` query parameter and serves the file
    ``<cwd>/novel/downloads/<name>``. The download file name sent to the
    browser is the pinyin form of ``name``. If the parameter is missing, is
    not a bare file name, or the file does not exist, an error page is
    returned instead.
    """
    def file_iterator(file_name,chunk_size=512):
        # Yield the file's text in pieces of chunk_size characters until EOF.
        with open(file_name,encoding='utf-8') as f:
            while True:
                c = f.read(chunk_size)
                if not c:
                    break
                yield c

    name = request.GET.get('name')
    # Accept only a bare file name. A value with a directory component, such
    # as "../../settings.py", would otherwise escape the downloads directory
    # and expose arbitrary files.
    if name and name == os.path.basename(name) and name not in (".", ".."):
        filepath = os.getcwd() + "/novel/downloads/"+name
        # isfile (not exists): a directory cannot be streamed.
        if os.path.isfile(filepath):
            response =  StreamingHttpResponse(file_iterator(filepath))
            response['Content-Type'] = 'application/octet-stream'
            name_pinyin = pinyin(name)
            response['Content-Disposition'] = f'attachment; filename={name_pinyin}'
            return response
    return HttpResponse("错误，未找到该文件的相关下载！<br>也许还在后台下载中，请隔段时间再试")

def edit_action(request):
    """Register a novel from a submitted index URL and start downloading it.

    Reads the ``novelindex`` POST field (URL of the novel's chapter index),
    fetches the novel's metadata and chapter list, stores a ``Novel`` row,
    starts the background download into ``<cwd>/novel/downloads/<name>/``
    and responds with a link to the future download. An error page is
    returned when the field is missing or no novel is found at the URL.
    """
    # Local stdlib imports so this view does not depend on file-level changes.
    from html import escape
    from urllib.parse import quote

    error_page = "错误，未找到该文件的相关下载！"

    # .get() avoids an unhandled exception when the form field is absent.
    novelindex = request.POST.get("novelindex")  # URL of the chapter index page
    if not novelindex:
        return HttpResponse(error_page)
    # NOTE(review): the URL is fetched server-side without any host
    # validation; consider restricting it to the supported site.

    t1 = time()
    # One fetch yields both the metadata and the chapter list.
    info = getAll(url=novelindex)
    if not info or not info[0]:
        return HttpResponse(error_page)
    name,author,noveltype,novel_list = info

    Novel.objects.create(name = name,author = author,noveltype = noveltype,novelindex = novelindex)
    cwd = os.getcwd()+"/novel/"
    # makedirs tolerates an existing directory and creates missing parents.
    os.makedirs(cwd + "downloads/" + name, exist_ok=True)
    runApp(novel_list, name,t1,cwd)

    # name/author/type come from a scraped page: escape them before placing
    # them in HTML, and percent-encode the name inside the link.
    shown = f"{escape(str(name))} {escape(str(author))} {escape(str(noveltype))}"
    href = f"/novel/download/?name={quote(str(name))}.txt"
    return HttpResponse(f"it's ok<br>{shown}<br><a href='{href}'>大约等三分钟点击下载</a>")