【Python入门爬虫】爬取笔趣阁小说目录

[Python] 纯文本查看 复制代码

import time

from bs4 import BeautifulSoup

import requests

import urllib.parse

#模拟小说搜索

# 模拟小说搜索
def search(url):
    """Query the site's search endpoint and return matching novels.

    Args:
        url: full search URL (keyword already GBK-percent-encoded).

    Returns:
        A list of dicts ``{"name": ..., "url": ..., "author": ...}``.
        - empty list: no results (we landed back on the site index page);
        - one item: the search redirected straight to a single book's page;
        - many items: the normal ``#main li`` result listing.
    """
    print("访问:" + url)
    # NOTE(review): no error handling — a non-200 or network failure will
    # raise; timeout added so a dead server cannot hang the script forever.
    response = requests.get(url, timeout=10)
    # The search endpoint may redirect directly to a book page; keep the
    # final URL so a single-result hit records the right address.
    url = response.url
    # 防止中文乱码: site pages are GBK-encoded (see the HTML meta charset)
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, "lxml")

    searchList = []
    i = 0
    result_items = soup.select('#main li')
    if len(result_items) <= 0:
        if soup.title.string == '笔趣阁':
            # Bounced back to the site's front page: nothing found.
            return []
        # Redirected straight to one book's detail page.
        name = soup.select('#info > h1')[0].string
        author = soup.select('#info > p:nth-child(2) > a')[0].string
        novel = {"name": name, "url": url, "author": author}
        print("id:%d\t书名:%s\t作者:%s" % (i, name, author))
        searchList.append(novel)
        return searchList

    # Normal multi-result listing.
    for child in result_items:
        link = child.select('.s2 a')[0]
        name = link.string
        url = link.get('href')
        author = child.select('.s4')[0].string
        novel = {"name": name, "url": url, "author": author}
        searchList.append(novel)
        print("id:%d\t书名:%s\t作者:%s" % (i, name, author))
        i += 1
    return searchList

#爬取小说属性

# 爬取小说属性
def getNovelAtrr(url):
    """Fetch a book's detail page and scrape its attributes.

    Args:
        url: the book's detail-page URL (used as the base for chapter links).

    Returns:
        Dict with keys ``name, url, profile, author, updataTime, wordnum,
        catalogList`` where ``catalogList`` is a list of one-entry dicts
        mapping chapter title -> absolute chapter URL.
    """
    print("访问:" + url)
    response = requests.get(url, timeout=10)
    # 防止中文乱码: site pages are GBK-encoded (see the HTML meta charset)
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, "lxml")

    # 获取小说名, 作者, 简介, 更新日期, 字数, 目录
    name = soup.select('#info h1')[0].string
    author = soup.select('#info > p:nth-child(2) > a')[0].string
    profile = soup.select('#intro')[0].text
    # The 4th <p> looks like "最后更新:2020-01-01[ 123456字]"; slice out the
    # date between the fixed 5-char prefix and '[', and the count before '字'.
    updata_wordnum = str(soup.select('#info > p:nth-child(4)')[0].text)
    index = updata_wordnum.find("[")
    lastindex = updata_wordnum.find("字")
    updataTime = updata_wordnum[5:index]
    wordnum = updata_wordnum[index + 2:lastindex]

    catalogList = []
    for item in soup.select('#list > dl > dd a'):
        # BUG FIX: the original reused the variable `name` here, clobbering
        # the novel title so Novel["name"] became the LAST chapter's title.
        chapter_url = url + item.get("href")
        chapter_name = item.text
        catalogList.append({chapter_name: chapter_url})

    Novel = {
        "name": name,
        "url": url,
        "profile": profile,
        "author": author,
        "updataTime": updataTime,
        "wordnum": wordnum,
        "catalogList": catalogList,
    }
    return Novel

#打开小说章节

# 打开小说章节
def openCatalog(url):
    """Fetch one chapter page, print its text, and return it.

    Args:
        url: absolute chapter URL.

    Returns:
        The chapter body text (``#content``). Returning it is new but
        backward-compatible — existing callers ignore the return value.
    """
    print("访问:" + url)
    response = requests.get(url, timeout=10)
    # 防止中文乱码: site pages are GBK-encoded (see the HTML meta charset)
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, "lxml")
    content = soup.select('#content')[0].text
    print(content)
    return content

#选择搜索页的某本书,并访问某章节

# 选择搜索页的某本书, 并访问某章节
def openNovel(id, searchList):
    """Interactively page through a book's chapter list and open one chapter.

    Args:
        id: index into ``searchList`` chosen by the user.
        searchList: result list from ``search()``.
    """
    if id >= len(searchList):
        print("没有这本书")
        return

    Novel = getNovelAtrr(searchList[id]["url"])
    catalog = Novel["catalogList"]
    page = 0
    limit = 10  # chapters shown per page
    while True:
        for i in range(limit):
            idx = page * limit + i
            if idx >= len(catalog):  # guard: last page may be short
                break
            print("id:%d\t %s" % (idx, catalog[idx]))
        cmdid = int(input("输入‘-1’上一页,输入‘-2’下一页,输入章节id访问章节:"))
        if cmdid == -1:
            if page > 0:
                page -= 1
        elif cmdid == -2:
            # Reconstructed: the source line was truncated mid-expression
            # ("if(page*limit" — the '<' comparison was lost in extraction).
            # Only advance when another page of chapters exists.
            if (page + 1) * limit < len(catalog):
                page += 1
        elif 0 <= cmdid < len(catalog):
            chapter = catalog[cmdid]  # one-entry dict: {title: url}
            for key in chapter:
                openCatalog(chapter[key])
            break

if __name__ == '__main__':
    searchUrl = 'https://www.52bqg.com/modules/article/search.php?searchkey='
    word = str(input("输入搜索关键字:"))
    # 拼接链接: the search keyword must be GBK-encoded before percent-quoting,
    # because the site expects GBK query parameters.
    url = searchUrl + urllib.parse.quote(word.encode('gbk'))
    searchList = search(url)
    if len(searchList) == 0:
        print("检索失败!")
    else:
        openNovel(int(input("输入小说id:")), searchList)
    # BUG FIX: the original unconditionally called
    # getNovelAtrr("https://www.52bqg.com/book_361/") here — leftover debug
    # code that fired an extra network request on every run; disabled.

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值