# 遇到小说收费,看不过瘾?
# 然后找了一个盗版的小说网站,随便写了百来行代码,然后龟速开爬。
# 为啥龟速?毕竟俺单个线程,没有异步,不会正则和xpath,用了个龟速的beautifulsoup来写
# 但是嘛,重在代码简单易懂,好改。各位看官凑合着用
#!/usr/bin/env python3
# _*_ coding: utf-8 _*_
# File : mybs4.py
# Author: DaShenHan&道长-----先苦后甜,任凭晚风拂柳颜------
# Date : 2019/7/16
from bs4 import BeautifulSoup as bt
import requests
from urllib.request import quote
import time
import os
def findnovel(novelname="修仙", homeurl="https://www.biqule.com/"):
    """Search the biquge-style site for novels whose title matches *novelname*.

    :param novelname: search keyword (novel title, possibly partial)
    :param homeurl: base URL of the site, ending with a slash
    :return: list of ``[title, url, author]`` lists, one per search hit
    """
    # The site's search endpoint expects the keyword encoded as gb2312.
    keyword = quote(novelname, encoding='gb2312')
    url = f'{homeurl}modules/article/search.php?searchkey={keyword}'
    print(url)
    # Fix: a timeout so a dead host cannot hang the whole run.
    r = requests.get(url, timeout=30)
    r.encoding = r.apparent_encoding  # result pages are not utf-8
    soup = bt(r.text, 'html.parser')
    # Result rows live in <li> elements; per-row fields are <span> elements.
    soup1 = bt(str(soup.find_all("li")), 'html.parser')
    # <span class="s2"> is the title cell; its <a href> is the book URL.
    soup2 = bt(str(soup.find_all("span", class_='s2')), 'html.parser')
    info = {}  # title -> tuple of the row's span texts, author first
    for child in soup1.children:
        spans = bt(str(child), 'html.parser').find_all("span")
        texts = tuple(t for t in (s.get_text() for s in spans) if t != "")
        # Fix: a well-formed row yields 6 non-empty span texts; the original
        # indexed texts[5] unconditionally and crashed on malformed rows
        # (NavigableString separators already produced empty tuples).
        if len(texts) >= 6:
            # Key by title (texts[1]); texts[3] is the author column.
            info[texts[1]] = (texts[3], texts[0], texts[1], texts[2],
                              texts[4], texts[5])
    results = []  # [title, url, author] per hit
    for child in soup2.children:
        # hasattr filters out NavigableStrings; Tags delegate .href to find().
        if hasattr(child, 'href') and child.a is not None:
            bookname = child.get_text()
            bookurl = child.a['href']  # value of the <a href> attribute
            # Fix: the original did info[bookname] and raised KeyError when a
            # title in the s2 column had no matching row; fall back to "".
            meta = info.get(bookname)
            results.append([bookname, bookurl, meta[0] if meta else ""])
    return results
def download(
        homeurl="https://www.biqule.com/book_72715/",
        savename="重生之都市修仙.txt"):
    """Download a whole novel from its index page into ``download/<savename>``.

    :param homeurl: novel index page, e.g. https://www.biqule.com/book_72715/
    :param savename: file name to save under the local ``download`` directory
    :return: None
    """
    # Fix: exists()+mkdir() was race-prone; makedirs(exist_ok=True) is atomic
    # from this process's point of view.
    os.makedirs("download", exist_ok=True)
    savepath = f"download/{savename}"
    # Fix: a timeout so a dead host cannot hang the whole run.
    r = requests.get(homeurl, timeout=30)
    r.encoding = r.apparent_encoding  # site pages are not utf-8
    # Chapter links live in <dd> elements of the index page.
    soup = bt(str(bt(r.text, 'html.parser').find_all("dd")), 'html.parser')
    # Count chapter entries: Tag children that contain an <a> link.
    num = sum(1 for child in soup.children
              if hasattr(child, 'href') and child.a is not None)
    print(f"本小说共有 {num} 章节")
    # NOTE: append mode is the original (resume-friendly) behaviour; rerunning
    # on an existing file duplicates content.
    # Fix: 'with' releases the handle even if a request fails mid-download
    # (downsouplist also closes it; the double close is harmless).
    with open(savepath, 'a', encoding='utf-8') as file:
        downsouplist(homeurl, soup, file, num)
def downsouplist(url, soup, file, num):
    """Fetch every chapter linked from *soup* and append each to *file*.

    Closes *file* when finished. *num* is the total chapter count and is
    used only for the progress printout inside downhtml.
    """
    counter = 1
    for node in soup.children:
        # Skip NavigableStrings and Tags without an <a> link.
        if not hasattr(node, 'href') or node.a is None:
            continue
        downhtml(url + node.a['href'], file, counter, num)
        counter += 1
    file.close()
def downhtml(url, file, index, num):
    """Download one chapter page and append its text to *file*.

    :param url: full URL of the chapter page
    :param file: open writable text file handle (not closed here)
    :param index: 1-based position of this chapter, for the progress display
    :param num: total chapter count, for the progress display
    """
    # Fix: a timeout so one stalled chapter cannot hang the whole download.
    r = requests.get(url, timeout=30)
    r.encoding = r.apparent_encoding  # site pages are not utf-8
    soup = bt(r.text, 'html.parser')
    # The chapter body sits in the element with id="content".
    for node in soup.find_all(id='content'):
        file.write(node.get_text() + '\n\n')
    # Progress percentage (the float() wrapper was redundant and is removed).
    print("已下载:%.3f%%" % (index / num * 100))
def downtest(homeurl="https://www.biqule.com/book_72715/",savename="重生之都市修仙.txt"):
    """Run download() for *homeurl* and report the elapsed wall-clock time."""
    started = time.time()
    download(homeurl, savename)
    tt = time.time() - started
    print(f'花费时间:{tt}秒')
def mydownload(novelname="魔皇", authorname="八月飞鹰"):
    """Search for *novelname* and download the first hit written by *authorname*.

    Does nothing if no search result matches the author.
    """
    hits = findnovel(novelname)
    print(hits)
    # Each hit is [title, url, author]; take the first one by this author.
    match = next((h for h in hits if h[2] == authorname), None)
    if match is not None:
        downtest(match[1], novelname + ".txt")
def readme():
    """Print a short BeautifulSoup `find_all` cheat sheet (in Chinese)."""
    print("""
    ===========================达神beatifulsoup教程===============================
    soup = bt(ret, 'html.parser') 先构建beatifulsoup对象,然后关键是find_all用法
    contents = soup.find_all(id='content')
    find_all(name,text,attrs) 查找标签 , 查找文本 , 基于attrs参数
    用法1: li = soup.find_all('li') 查找标签为li的并返回一个列表li
    用法2: li = soup.find_all (id = 'flask') 查看id=值
    用法3: li = soup.find_all (class_='s2') 查看class=值 注意,由于class是py的关键字,soup里必须后面跟上下划线
    用法4:find_class = soup.find(attrs={'class':'s2'}) 在attrs里面传入json文本
    组合起来使用: 如需要找span下class为s2的:
    find_ret = soup.find_all("span",class_="s2")
    ==============================================================================
    """)
def findNovelList(novelname):
    """Print title, author and homepage for every search hit of *novelname*."""
    # findnovel returns [title, url, author] triples.
    for title, page, author in findnovel(novelname):
        print(f"小说名称:{title} 作者:{author} 主页:{page}")
if __name__ == '__main__':
    # Demo entry point: print the usage tutorial, then download one novel.
    readme()
    # Other example invocations (uncomment to use):
    # downtest()
    # mydownload()
    # findNovelList("紫阳")
    downtest('https://www.biqule.com/book_47626/',"紫阳帝尊.txt")