Python code for scraping novels from a website

The User-Agent string for headers

Using Chrome as an example: open any web page and press F12, or use the three-dot menu in the top-right corner > More tools > Developer tools.

Copy the value that follows the "User-Agent:" field into the code below.
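Once copied, the headers line in the script should look something like this (the exact value depends on your browser and version; the string below is only an example):

headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'}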

import requests
from lxml import etree
import time

headers={'User-Agent': 'paste your own User-Agent string here'}
num=0
allnum=0

def writeInTxt(title,content,file_name):
    global num,allnum
    print(str(allnum)+'/'+str(num)+': '+title[0])  # progress: total chapters / current chapter number
    num+=1
    # append the chapter to "<novel name>.txt"
    file_write_in=open(file_name+'.txt','a',encoding='utf-8')
    file_write_in.write('\n\n\n\n'+title[0]+'\n\n')
    for content_ in content:
        if content_=='\n':   # skip bare newline text nodes
            pass
        else:
            file_write_in.write(content_)
    file_write_in.close()  # close() must be called with parentheses


def getContent(infoUrl):
    contentReq=requests.get(infoUrl,headers=headers)
    contentReq.encoding=contentReq.apparent_encoding  # let requests guess the page encoding
    contentHtml=etree.HTML(contentReq.text)

    titleByXpath=contentHtml.xpath('//h1/text()')                    # chapter title
    contentByXpath=contentHtml.xpath('//div[@id="content"]/text()')  # chapter body text nodes
    return (titleByXpath,contentByXpath)

def chapterUrl(InfoUrlList):
    fullChapterUrlList=[]
    for InfoUrlSingle in InfoUrlList:
        headUrl='https://www.mubige.com' # change to the mubige or biquge site root as needed
        fullUrl=headUrl+InfoUrlSingle
        fullChapterUrlList.append(fullUrl)
    return fullChapterUrlList


def getInfoUrl(novelUrl):
    global allnum,num
    novelReq=requests.get(novelUrl,headers=headers)
    novelReq.encoding=novelReq.apparent_encoding
    novelReqHtml=etree.HTML(novelReq.text)
    novelName=novelReqHtml.xpath('//h1/text()')
    contentUrl=novelReqHtml.xpath('//dd/a/@href')
    num=int(input('Start fetching from chapter number: '))
    starnum=num+8  # offset into the <dd> link list; the first entries are skipped (these sites typically list the newest chapters before the full list)
    allnum=len(contentUrl[9:])
    print('"'+novelName[0]+'" has '+str(len(contentUrl[starnum:]))+' chapters to fetch!! (mind the crawl speed)')
    return (contentUrl[starnum:],novelName)

if __name__ == "__main__":
    novelUrl=input('Enter the URL of the novel on biquge or mubige: ')
    InfoUrlList,novelName=getInfoUrl(novelUrl)
    contentUrlList=chapterUrl(InfoUrlList)
    for contentUrl in contentUrlList:
        title,content=getContent(contentUrl)
        writeInTxt(title,content,novelName[0])
        # time.sleep(0.5)  # adjust the delay between requests

After running the code, you will be prompted to enter the novel URL.

Note: if the crawl fails with an error partway through, just rerun the script and enter the chapter number where it stopped.
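If you prefer not to restart by hand, the main loop can also be wrapped in a simple retry. This is only a sketch, assuming the functions and variables above (getContent, writeInTxt, contentUrlList, novelName) are already defined; the retry count and delay are arbitrary choices, not part of the original script:

for contentUrl in contentUrlList:
    for attempt in range(3):  # arbitrary number of retries per chapter
        try:
            title,content=getContent(contentUrl)
            writeInTxt(title,content,novelName[0])
            break
        except Exception as error:
            print('Retrying after error:',error)
            time.sleep(2)  # arbitrary back-off before retrying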

======================= divider =======================

If you run into 503 errors, use the version below 👇

import requests
from lxml import etree
import time
 
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'}
num=0
allnum=0
 
def writeInTxt(title,content,file_name):
    global num,allnum
    print(str(allnum)+'/'+str(num)+': '+title[0])  # progress: total chapters / current chapter number
    num+=1
    # append the chapter to "<novel name>.txt"
    file_write_in=open(file_name+'.txt','a',encoding='utf-8')
    file_write_in.write('\n\n\n\n'+title[0]+'\n\n')
    for content_ in content:
        if content_=='\n':   # skip bare newline text nodes
            pass
        else:
            file_write_in.write(content_)
    file_write_in.close()  # close() must be called with parentheses
 
 
def getContent(infoUrl):
    contentReq=requests.get(infoUrl,headers=headers)
    contentReq.encoding=contentReq.apparent_encoding
    contentHtml=etree.HTML(contentReq.text)
 
    titleByXpath=contentHtml.xpath('//h1/text()')
    contentByXpath=contentHtml.xpath('//div[@id="content"]/text()')
    return (titleByXpath,contentByXpath)
 
def chapterUrl(InfoUrlList):
    fullChapterUrlList=[]
    for InfoUrlSingle in InfoUrlList:
        headUrl='http://www.xbiquge.la' # site root without trailing slash; chapter hrefs already start with "/"
        fullUrl=headUrl+InfoUrlSingle
        fullChapterUrlList.append(fullUrl)
    return fullChapterUrlList
 
 
def getInfoUrl(novelUrl):
    global allnum,num
    novelReq=requests.get(novelUrl,headers=headers)
    novelReq.encoding=novelReq.apparent_encoding
    novelReqHtml=etree.HTML(novelReq.text)
    novelName=novelReqHtml.xpath('//h1/text()')
    contentUrl=novelReqHtml.xpath('//dd/a/@href')
    # num=int(input('Start fetching from chapter number: '))
    num=-8  # start from the very first chapter (starnum becomes 0, so no links are skipped)
    starnum=num+8
    allnum=len(contentUrl[9:])
    print('"'+novelName[0]+'" has '+str(len(contentUrl[starnum:]))+' chapters')
    return (contentUrl[starnum:],novelName)
 
if __name__ == "__main__":
    # novelUrl=input('Enter the novel URL: ')
    novelUrl='http://www.xbiquge.la/5/5395/'
    InfoUrlList,novelName=getInfoUrl(novelUrl)
    contentUrlList=chapterUrl(InfoUrlList)
    num=0
    while num<len(contentUrlList):  # num is advanced inside writeInTxt
        title,content=getContent(contentUrlList[num])
        if not title or title[0]=='503 Service Temporarily Unavailable':
            time.sleep(5)  # back off, then retry the same chapter
            print(num)
            continue
        else:
            writeInTxt(title,content,novelName[0])
            # time.sleep(1)  
 

    # for num in range(0,len(contentUrlList)):
    #     title,content=getContent(contentUrlList[num])
    #     if title[0]=='503 Service Temporarily Unavailable':
    #         num=num-1
    #         continue
    #     else:
    #         writeInTxt(title,content,novelName[0])
    #         # time.sleep(1)  
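The script above detects a 503 by matching the page title text. An alternative (not part of the original code) is to check the HTTP status code directly. A minimal sketch, reusing the headers defined above; fetchChapter is a hypothetical helper that could stand in for getContent in the loop:

def fetchChapter(infoUrl):
    # retry on HTTP 503 by checking the status code instead of the page title
    while True:
        resp=requests.get(infoUrl,headers=headers)
        if resp.status_code==503:
            time.sleep(5)  # back off before retrying the same URL
            continue
        resp.encoding=resp.apparent_encoding
        html=etree.HTML(resp.text)
        return (html.xpath('//h1/text()'),html.xpath('//div[@id="content"]/text()'))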
 
