笔趣阁 (www.qu.la) novel scraper (Python implementation)

```python
import os
import random
import re
import time

import requests
import gevent
from gevent import monkey
from bs4 import BeautifulSoup

# Patch the standard library for cooperative sockets before any requests are made.
monkey.patch_all(select=False)

# Free HTTPS proxies; these go stale quickly, so refresh the list before running.
IPs = [{'HTTPS': 'HTTPS://182.114.221.180:61202'},
       {'HTTPS': 'HTTPS://60.162.73.45:61202'},
       {'HTTPS': 'HTTPS://113.13.36.227:61202'},
       {'HTTPS': 'HTTPS://1.197.88.101:61202'}]
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cookie': 'UM_distinctid=1638b54c8f3279-0003db1d70474a-39614807-384000-1638b54c8f4843; CNZZDATA1261736110=613318700-1527048008-null%7C1530014624; Hm_lvt_5ee23c2731c7127c7ad800272fdd85ba=1530014621,1530014629,1530014706,1530015295; bookid=34778; bcolor=; font=; size=; fontcolor=; width=; chapterid=1896093; chaptername=%25u7B2C1%25u7AE0%2520%25u65B0%25u4E16%25u754C%25u548C%25u65B0%25u8EAB%25u4EFD; Hm_lpvt_5ee23c2731c7127c7ad800272fdd85ba=1530016490'
}
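# The Cookie above is a session capture from the original author's browser;
# the site likely serves chapter pages without it.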


def setDir():
    # Make sure the output directory exists before any chapter is written.
    os.makedirs('./Noval', exist_ok=True)


def getNoval(url, id, data, faillist):
    """Fetch one chapter; on failure, record its index for a retry pass."""
    try:
        headers = HEADERS
        IP = random.choice(IPs)
        res = requests.get(url, headers=headers, proxies=IP, timeout=5)
        res.encoding = res.apparent_encoding
        # Replace the non-breaking space with a plain space -- same meaning.
        html = res.text.replace('\xa0', ' ')
        soup = BeautifulSoup(html, 'lxml')
        content = soup.find('div', attrs={'id': 'content'})
        name = soup.find('div', attrs={'class': 'bookname'}).h1.text

        if name:
            s = name + '\n'
            s = s + content.text
            data[id] = s

    except Exception:
        faillist.append(id)


def getNoval2(url, id, data):
    """Retry a chapter indefinitely until it downloads successfully."""
    while True:
        try:
            headers = HEADERS
            IP = random.choice(IPs)
            res = requests.get(url, headers=headers, proxies=IP)
            res.encoding = res.apparent_encoding
            # Replace the non-breaking space with a plain space -- same meaning.
            html = res.text.replace('\xa0', ' ')
            soup = BeautifulSoup(html, 'lxml')
            content = soup.find('div', attrs={'id': 'content'})
            name = soup.find('div', attrs={'class': 'bookname'}).h1
            if name:
                s = name.text + '\n'
                s = s + content.text
                data[id] = s
        except Exception:
            continue
        else:
            break


def getContentFile2(url):
    """Fetch the book's index page; return the chapter URL list and the book title."""
    headers = HEADERS
    IP = random.choice(IPs)
    res = requests.get(url, headers=headers, proxies=IP)
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, 'lxml')
    info = soup.find('div', attrs={'id': 'info'})
    bookname = info.h1.text
    datalist = soup.find('div', attrs={'id': 'list'})
    data = list(
        map(lambda x: 'https://www.qu.la' + x['href'],
            datalist.find_all(attrs={'href': re.compile(r'/book.*?\.html')})))
    return data, bookname


def BuildGevent(baseurl):
    content, bookname = getContentFile2(baseurl)  # version2
    steps = 2  # number of chapters fetched concurrently per batch
    length = len(content)
    count = 0
    name = "%s.txt" % bookname
    data = {}
    faillist = []
    # First pass: download chapters in batches of `steps` greenlets.
    while count * steps < length:
        WaitingList = [gevent.spawn(getNoval, content[i + count * steps], i + count * steps, data, faillist)
                       for i in range(steps) if i + count * steps < length]
        gevent.joinall(WaitingList)
        print(count)
        count += 1
    count = 0
    print("Retrying failed chapters")
    # Second pass: retry every failed chapter until it succeeds.
    faillistlen = len(faillist)
    while count * steps < faillistlen:
        WaitingList = [gevent.spawn(getNoval2, content[faillist[i + count * steps]], faillist[i + count * steps], data)
                       for i in range(steps) if i + count * steps < faillistlen]
        gevent.joinall(WaitingList)
        count += 1  # without this the retry loop never terminates
    # Join chapters in index order so the book reads in sequence even though
    # greenlets finish out of order.
    String = '\n'.join(data[i] for i in sorted(data))
    with open('./Noval/' + name, 'w', encoding='gb18030', errors='ignore') as ff:
        ff.write(String)


if __name__ == '__main__':
    starttime = time.time()
    setDir()
    url = 'https://www.qu.la/book/34778/'
    BuildGevent(url)
    endtime = time.time()
    print("Total use time: %.6f" % (endtime - starttime))
Below is a simple novel downloader GUI built with Python's tkinter library. In the window you enter the novel's URL and the chapter range to download, then click the 下载 (Download) button to start. To fetch the novel from 笔趣阁 it uses the requests and BeautifulSoup libraries to download and parse the HTML pages; make sure both are installed before running.

Here is the implementation:

```python
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import tkinter as tk
from tkinter import ttk
from tkinter import messagebox


class NovelDownloader:
    def __init__(self, url, start, end):
        self.url = url
        self.start = start
        self.end = end
        self.chapter_urls = []
        self.chapter_texts = []

    def get_chapter_urls(self):
        response = requests.get(self.url)
        soup = BeautifulSoup(response.text, 'html.parser')
        chapter_list = soup.find_all('div', id='list')[0].find_all('a')
        # urljoin handles both relative and root-relative hrefs; plain string
        # concatenation would double the path for hrefs like /book/....
        self.chapter_urls = [urljoin(self.url, i['href'])
                             for i in chapter_list[self.start - 1:self.end]]

    def get_chapter_texts(self):
        for url in self.chapter_urls:
            response = requests.get(url)
            soup = BeautifulSoup(response.text, 'html.parser')
            chapter_title = soup.find_all('div', id='maininfo')[0].find_all('h1')[0].text
            chapter_text = soup.find_all('div', id='content')[0].text.replace('\xa0', '\n')
            self.chapter_texts.append(chapter_title + '\n\n' + chapter_text)

    def save_to_file(self):
        with open('novel.txt', 'w', encoding='utf-8') as f:
            f.write('\n\n'.join(self.chapter_texts))


class NovelDownloaderGUI:
    def __init__(self, root):
        self.root = root
        self.root.title('小说下载器')  # "Novel downloader"
        self.root.geometry('400x200')
        self.url_label = ttk.Label(self.root, text='小说网址:')  # "Novel URL:"
        self.url_label.grid(column=0, row=0, padx=5, pady=5)
        self.url_entry = ttk.Entry(self.root, width=30)
        self.url_entry.grid(column=1, row=0, padx=5, pady=5)
        self.range_label = ttk.Label(self.root, text='章节范围:')  # "Chapter range:"
        self.range_label.grid(column=0, row=1, padx=5, pady=5)
        self.start_entry = ttk.Entry(self.root, width=5)
        self.start_entry.insert(0, '1')
        self.start_entry.grid(column=1, row=1, padx=5, pady=5)
        self.end_label = ttk.Label(self.root, text='到')  # "to"
        self.end_label.grid(column=2, row=1, padx=5, pady=5)
        self.end_entry = ttk.Entry(self.root, width=5)
        self.end_entry.insert(0, '10')
        self.end_entry.grid(column=3, row=1, padx=5, pady=5)
        self.download_button = ttk.Button(self.root, text='下载',  # "Download"
                                          command=self.download_novel)
        self.download_button.grid(column=1, row=2, padx=5, pady=5)

    def download_novel(self):
        url = self.url_entry.get()
        start = int(self.start_entry.get())
        end = int(self.end_entry.get())
        downloader = NovelDownloader(url, start, end)
        downloader.get_chapter_urls()
        downloader.get_chapter_texts()
        downloader.save_to_file()
        messagebox.showinfo('提示', '下载完成!')  # "Done: download finished!"


if __name__ == '__main__':
    root = tk.Tk()
    app = NovelDownloaderGUI(root)
    root.mainloop()
```

The window has a URL field, two chapter-range fields, and a 下载 (Download) button. Enter the novel's URL, set the chapter range, and click the button; the program downloads the chapters from 笔趣阁 and saves them to a text file named novel.txt. Note that this downloader is written for 笔趣阁; other novel sites will likely need their own selectors before it works.
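One limitation of the GUI above: `download_novel` does all of the network work on the tkinter main thread, so the window freezes until the file is saved. Below is a minimal, hypothetical variant, assuming the same classes, that moves the download onto a background thread; the widget update is handed back to the main loop with `after`, since tkinter widgets are not thread-safe:

```python
import threading


class ThreadedNovelDownloaderGUI(NovelDownloaderGUI):
    """Hypothetical subclass: same window, but the download runs off the main thread."""

    def download_novel(self):
        url = self.url_entry.get()
        start = int(self.start_entry.get())
        end = int(self.end_entry.get())

        def work():
            downloader = NovelDownloader(url, start, end)
            downloader.get_chapter_urls()
            downloader.get_chapter_texts()
            downloader.save_to_file()
            # Defer the messagebox to the main loop; tkinter is not thread-safe.
            self.root.after(0, lambda: messagebox.showinfo('提示', '下载完成!'))

        threading.Thread(target=work, daemon=True).start()
```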