Background: one day I suddenly wanted to read a novel, but the novel site offered no download and was buried in ads, so...
Approach:
–Analyze the site's structure
–Scrape the table of contents (chapter titles and links)
–Download the chapter pages with multiple threads
–Extract the chapter text with regular expressions (a short sketch of this step follows the list)
–Save the chapters and write any failures to an error log
–Merge the per-chapter txt files into a single file
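Before the full script, here is a minimal sketch of the regex cleaning step. The text scraped from a chapter page starts with a redundant "(title)" header, and paragraphs are separated by pairs of ideographic spaces (\u3000\u3000); the script cuts everything up to the first ")" and everything after the last space pair. The sample string below is hypothetical:

import re

raw = "(第5章 开端)\u3000\u3000正文第一段。\u3000\u3000正文第二段。\u3000\u3000(本章完)"
# strip the "(title)" header: cut through the first ")"
head = re.search(r"[)]", raw).span()[0] + 1
# locate the last "\u3000\u3000" by searching the reversed string,
# then cut everything after it off the tail
tail = re.search("\u3000\u3000", raw[::-1]).span()[0] + 2
print(raw[head:-tail])  # -> \u3000\u3000正文第一段。\u3000\u3000正文第二段。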
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 2 18:23:49 2019
@author: 24709
"""
import urllib.request
import multiprocessing
from bs4 import BeautifulSoup
import re
import os
import time
# Novel home page: http://m.wenxuedu.com/html/208249
dirPath = "C:/Users/24709/Desktop/txtxt/"
# Folder holding all the chapter txt files (note the difference between \ and /);
# the merged book ends up in outfile.txt
titles = []  # all chapter titles
urls = []    # all chapter URL paths
################ Scrape chapter titles and URLs from the TOC pages ##################################
def geturl():
    print('Loading the chapter list .....')
    for index in range(5, 6):  # (5, 151) scrapes TOC pages 5 through 150
        for attempt in range(2):  # retry each page once before giving up
            try:
                request = urllib.request.Request("http://m.wenxuedu.com/html/208249_" + str(index))
                response = urllib.request.urlopen(request, timeout=8)
                data = response.read().decode('utf-8')
                soup = BeautifulSoup(data, "html.parser")
                i = 0
                for link in soup.findAll('li'):
                    # keep only <li><a> entries whose text contains "章";
                    # the first five such entries are navigation links, so skip them
                    if re.search(r'章', str(link.a.string)):
                        if i > 4:
                            print(str(link.a.string))
                            titles.append(str(link.a.string))
                            urls.append(str(link.a.get('href')))
                        i = i + 1
                break  # page parsed successfully, no retry needed
            except:
                if attempt == 1:
                    # both attempts failed: record the page index in the error log
                    writefile('error_log', "index:{} \n".format(index))
########################### Download chapter content from its URL ##################################
def getcontent(url):
    request = urllib.request.Request("http://m.wenxuedu.com" + url)
    response = urllib.request.urlopen(request)
    data = response.read().decode('utf-8')
    soup = BeautifulSoup(data, "html.parser")
    # the chapter body sits in <div id="novelcontent"><p>...</p>; str() keeps
    # the markup, so drop the <br/> tags and the enclosing <p> ... </p> ([3:-4])
    a = str(soup.find(id='novelcontent').p).replace('<br/>', '')[3:-4]
    try:
        # strip the redundant "(title)" header and the trailing junk
        head = re.search(r"[)]", a).span()[0] + 1
        a = a[head:-1 * (re.search('\u3000\u3000', a[::-1]).span()[0] + 2)]
    except:
        pass
    c = a  # fall back to page 1 alone if there is no page 2
    try:
        # long chapters are split in two: page 2 lives at <url>_2
        request = urllib.request.Request("http://m.wenxuedu.com" + url[:-1] + "_2")
        response = urllib.request.urlopen(request)
        data = response.read().decode('utf-8')
        soup = BeautifulSoup(data, "html.parser")
        b = str(soup.find(id='novelcontent').p).replace('<br/>', '')[3:-4]
        try:
            # page 2 repeats the "(title)" header; strip it as well
            head = re.search(r"[)]", b).span()[0] + 1
            c = a + b[head:-1]
        except:
            c = a + b
    except:
        pass  # this chapter has no second page
    return c
############################ Write to a file ###########################################
def writefile(title, content):
    # append mode, so the error log can accumulate entries across calls
    with open(dirPath + title + ".txt", 'a', encoding='utf-8') as f:
        f.write(content)
####################### Try to download a chapter; log failures ##############################
def download(title_url):
    try:
        writefile(title_url[0], getcontent(title_url[1]))
    except:
        # record the failed chapter's title and URL in the error log
        writefile('error_log', "title:{} url:{} \n".format(title_url[0], title_url[1]))
########################## Merge the chapter txt files ##############################################
def linkTheBook():
    print("------------------- Merging txt files -------------------")
    start0 = time.time()
    files = []
    for name in os.listdir(dirPath):
        # skip files without a chapter number in the name (e.g. the error log)
        if re.search(r'(\d+)', name):
            files.append(name)
    # pull the chapter number out with a regex and sort numerically
    files.sort(key=lambda i: int(re.search(r'(\d+)', i)[0]))
    res = ""
    i = 0
    for name in files:
        if name.endswith(".txt"):
            i += 1
            title = name[0:len(name) - 4]  # filename without the ".txt" suffix
            with open(dirPath + name, "r", encoding='utf-8') as f:
                content = f.read()
            res += "\n%s\n\n%s" % (title, content)
    with open(dirPath + "outfile.txt", "w", encoding='utf-8') as outFile:
        outFile.write(res)
    end0 = time.time()
    print("------------------- Merge finished -------------------")
    print("The whole book: " + str(len(files)) + " chapters, " + str(len(res)) + " characters")
    print('Elapsed time %0.2f s.' % (end0 - start0))
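# Why a numeric sort: a plain lexicographic sort would place "第10章" before
# "第2章" ('1' < '2' at the first differing character) and scramble the book.
# A hypothetical example:
#   sorted(["第10章.txt", "第2章.txt"])
#       -> ["第10章.txt", "第2章.txt"]   (wrong order)
#   sorted(["第10章.txt", "第2章.txt"], key=lambda s: int(re.search(r"\d+", s)[0]))
#       -> ["第2章.txt", "第10章.txt"]   (reading order)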
#######################################################################
if __name__ == "__main__":
    start = time.time()
    # scrape the chapter titles and URLs from the TOC into titles and urls
    geturl()
    print("------------------- Starting download -------------------")
    p = []
    print('Main process PID: %s' % os.getpid())
    for [title, url] in zip(titles, urls):
        # one process per chapter, downloading different chapters in parallel
        p.append(multiprocessing.Process(target=download, args=([title, url],)))
    print("Waiting for all the processes to finish ........")
    for i in p:
        i.start()
    for i in p:
        i.join()
    end = time.time()
    print("------------------- All downloads finished -------------------")
    print('Elapsed time %0.2f s.' % (end - start))
    ###################################
    #linkTheBook()
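The outline above says "multiple threads", but the script actually spawns one multiprocessing.Process per chapter, which is heavyweight for purely I/O-bound downloads. A bounded thread pool is a natural alternative; this is a sketch of mine (not the original code) that reuses the download() helper from the script:

from concurrent.futures import ThreadPoolExecutor

def download_all(titles, urls, max_workers=8):
    # at most max_workers chapters in flight at once, which is also
    # kinder to the site than one process per chapter
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # consume the iterator so every task runs to completion here
        list(pool.map(download, zip(titles, urls)))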
This post is a record of my own learning; comments and corrections are welcome.