python爬取有声小说 —— 用python写的有声小说爬虫

querybook.py

from bs4 import BeautifulSoup

from lxml import html

import xml

import requests

import splider

class QuName:
    """Walk the category list pages of www.ting89.com and download every book.

    For each category ``n`` in ``1..number-1`` it fetches
    ``/booklist/<n>.html``, discovers how many sub-pages the category has,
    and hands every book title found to ``splider.YsSpider`` for download.
    """

    def __init__(self, number):
        # number: count of site categories; getBookList walks 1..number-1
        # (NOTE(review): range(1, number) skips category `number` itself —
        # confirm whether the last category is intentionally excluded).
        self.number = number

    def getPageNum(self, url):
        """Return the total page count parsed from a category page.

        Returns 1 when the pager cannot be found or parsed, so the caller's
        ``range(2, pageNum + 1)`` stays valid (the original bare ``except``
        returned None, which made ``range()`` raise TypeError).
        """
        f = requests.get(url)  # fetch the category page HTML
        soup = BeautifulSoup(f.content, "lxml")
        try:
            pageNum = soup.find('div', class_="pagesnums").find('span').text
            print('getPageNum执行成功')
            # Pager text presumably looks like "1/12页" — TODO confirm.
            # Take the digits after the last '/', instead of the brittle
            # fixed [3:5] slice the original used.
            tail = pageNum.split('/')[-1]
            digits = ''.join(ch for ch in tail if ch.isdigit())
            return int(digits) if digits else 1
        except Exception:
            # .find() returned None (layout changed) or parsing failed.
            print('getPageNum执行失败')
            return 1
        finally:
            print('___________________________')

    def getBookList(self):
        """Visit every category and every paginated sub-page of it."""
        for num in range(1, self.number):
            base = 'http://www.ting89.com/booklist/' + str(num) + '.html'
            pageNum = self.getPageNum(base)
            self.getBookInfo(base)
            print(base)
            # Sub-pages are named "<num>_<page>.html" for pages 2..pageNum.
            # The original used range(2, pageNum) and skipped the last page.
            for num1 in range(2, pageNum + 1):
                page = ('http://www.ting89.com/booklist/' + str(num)
                        + '_' + str(num1) + '.html')
                self.getBookInfo(page)
                print(page)

    def getBookInfo(self, url):
        """Print cover URL and metadata of each book on one list page,
        then download the whole book by its title via splider.YsSpider."""
        f = requests.get(url)  # fetch the list page HTML
        soup = BeautifulSoup(f.content, "lxml")
        try:
            bookList = soup.find('div', class_="clist").findAll('li')
            for i in bookList:
                imgUrl = i.find('img')
                print('书籍封面', imgUrl['src'])
                pList = i.findAll('p')
                for j in pList:
                    print(j.text)
                # Download every chapter of this book; the <b> tag holds
                # the book title that YsSpider searches for.
                splider.YsSpider(i.find('b').text).download_files()
        except Exception:
            # Best-effort per page: log and move on (kept from the original,
            # but no longer a bare except that would swallow SystemExit).
            print('getBookInfo执行失败')
        finally:
            print('___________________________')

qn = QuName(13)  # 13 = number of categories on the site (hard-coded shortcut)

qn.getBookList()

splider.py

import requests

import urllib

import re

import os

import time

class YsSpider:
    """Search www.ting89.com for an audiobook by name and download every
    chapter as an mp3 file into a directory named after the book.

    NOTE(review): ``index_pattern`` and ``chapter_pattern`` are visibly
    garbled — the HTML markup inside the regexes was stripped when this
    code was published, so ``searchbook()`` / ``get_chapter_list()`` cannot
    match anything until the original patterns are restored.
    """

    def __init__(self, name):
        self.search_name = name                                   # book title to search for
        self.search_url = "http://www.ting89.com/search.asp?searchword="
        self.home_url = "http://www.ting89.com/books/"
        # NOTE(review): empty pattern — original regex lost in transcription.
        self.index_pattern = r""""""
        # NOTE(review): HTML around the capture group lost in transcription.
        self.chapter_pattern = r"""(.+?)"""
        # Extracts (media host/path, file stem) from the player page.
        self.down_pattern = r"""url=(.*)/(.+?)\.mp3"""
        self.book_id = ''        # id of the chosen search result
        self.book_name = ''      # title of the chosen search result
        self.Chapter_list = []   # chapter titles, filled by _getAllUrl

    def searchbook(self):
        """Return the site id of the first book matching the search name.

        Exits the process when nothing matches (kept from the original).
        """
        resp = requests.get(
            self.search_url
            + urllib.parse.quote(self.search_name, encoding='gb2312'))
        data = resp.content.decode('gbk')  # site serves GBK-encoded pages
        result = re.findall(self.index_pattern, data)
        if not result:
            print('*******没有找到你输入的相关书籍,请更换后重新运行程序*******')
            exit()
        for index, item in enumerate(result):
            print('%d.%s' % (index + 1, item[1]))
        # Choice was interactive once; hard-wired to the first hit.
        # Renamed from `str`, which shadowed the builtin.
        choice = '1'
        self.book_name = result[int(choice) - 1][1]
        self.book_id = result[int(choice) - 1][0]
        return self.book_id

    def get_chapter_list(self):
        """Return [(chapter_page_path, chapter_title), ...] for the book."""
        data = requests.get(
            self.home_url + self.searchbook() + '.html').content.decode('gbk')
        return re.findall(self.chapter_pattern, data)

    def _getAllUrl(self):
        """Resolve the download URL of every chapter.

        Fetches the first chapter's player page to discover the media host,
        then derives all chapter URLs from it.
        """
        chapter_list = self.get_chapter_list()
        pages = [x[0] for x in chapter_list]               # per-chapter page paths
        self.Chapter_list = [x[1] for x in chapter_list]   # chapter titles
        data = requests.get(
            "http://www.ting89.com" + pages[0]).content.decode('gbk')
        result = re.findall(self.down_pattern, data)
        # A first file stem like "0...1" means the files are a zero-padded
        # numeric sequence; otherwise the chapter titles are the file names.
        return self.sub_get_url(result[0][0], self.Chapter_list,
                                re.search("^0.*1$", result[0][1]))

    def sub_get_url(self, down_url, _list, down_url_flag):
        """Build the full mp3 URL list for all chapters.

        down_url_flag truthy -> files are a 1-based numeric sequence,
        zero-padded to the width of the largest 0-based index (kept from
        the original); falsy -> chapter titles are the file names.
        """
        urls = []
        if down_url_flag:
            if not _list:  # guard: original crashed on an empty list
                return urls
            width = len(str(len(_list) - 1))
            for seq in range(1, len(_list) + 1):
                tmp_url = down_url + '/' + str(seq).zfill(width) + '.mp3'
                urls.append(urllib.request.quote(tmp_url, safe='/:?='))
        else:
            for item in _list:
                tmp_url = down_url + '/' + item + '.mp3'
                urls.append(urllib.request.quote(tmp_url, safe='/:?='))
        return urls

    def save_a_file(self, url, path, chapter):
        """Download ``url`` to ``path`` unless it already exists.

        Retries forever on failure, as the original did — but with a loop
        instead of unbounded recursion (which risked RecursionError).
        """
        while True:
            try:
                print('尝试下载', chapter)
                if not os.path.exists(path):
                    response = requests.get(url)
                    with open(path, 'wb') as f:
                        f.write(response.content)
                    print(chapter, '保存成功')
                    response.close()
                    time.sleep(1)  # be polite to the server
                else:
                    print('文件已经存在')
                return
            except Exception:
                print('爬取失败,已下载至', chapter, '即将重新尝试下载')

    def download_files(self):
        """Download every chapter of the book into ./<book_name>/."""
        result = self._getAllUrl()  # download URL of every chapter
        root = os.path.join(os.getcwd(), self.book_name)
        if not os.path.exists(root):
            os.mkdir(root)
        for index, chapter_url in enumerate(result):
            path = os.path.join(root, self.Chapter_list[index]) + '.mp3'
            self.save_a_file(chapter_url, path, self.Chapter_list[index])

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值