Python: Scrape and Download a Specified Novel in Real Time (Source Code Included)
import requests
import re
import time
import random
def download(book_name):
    # Search the site for the novel
    search_real_url = 'https://www.biquge5200.cc/modules/article/search.php?searchkey=' + book_name
    novel_list = []
    try:
        novel_source = requests.get(search_real_url).text
        reg1 = r'<td class="odd"><a href="(.*?)">(.*?)</a></td>.*?<td class="odd">(.*?)</td>'
        # All search results (novel URL, title, author)
        novel_list = re.findall(reg1, novel_source, re.S)
    except Exception as e:
        print(e)
    # Check whether the search returned any results
    if not novel_list:
        print('The novel you are looking for does not exist; please check the title and try again')
        return None
    for novel_url, novel_name, novel_author in novel_list:
        if novel_name == book_name:
            print('Novel to download: %s  Author: %s' % (novel_name, novel_author))
            return novel_url, novel_name
    return None
def get_chapter(url):
    # Fetch the chapter index page
    chapter_list = []
    try:
        # Source code of the chapter index page
        chapter_page_source = requests.get(url).text
        reg2 = r'<dd><a href="(.*?)">(.*?)</a></dd>'
        chapter_list = re.findall(reg2, chapter_page_source)
    except Exception as e:
        print(e)
    return chapter_list
def get_content(chapter_list, novel_name):
    """
    :param chapter_list: list of (chapter_url, chapter_name) tuples
    :param novel_name: novel title, used as the output file name
    """
    count = 0
    length = len(chapter_list)
    for chapter_url, chapter_name in chapter_list:
        try:
            # Random delay so we do not hammer the site
            time.sleep(1 + random.random())
            content_source = requests.get(chapter_url).text
            reg = r'<div id="content">(.*?)</div>'
            content = re.findall(reg, content_source, re.S)[0]
            # Strip the site's watermark paragraphs
            cleaned = re.sub(r'<p>(.*?)\(《》\)</p>', ' ', str(content))
            # Turn the remaining HTML into plain, indented text
            contents = cleaned.replace('</p><p class="">', '\n').replace('&nbsp;', '\n\t\t').replace('<p>', '').replace('</p>', '')
            count += 1
            with open(novel_name + '.txt', 'a', encoding='utf-8') as f:
                f.write(chapter_name + '\n' + contents + '\n' * 2)
            print('Writing: ' + chapter_name)
            print('Progress: %0.2f%%' % (count / length * 100))
        except Exception as e:
            print(e)
if __name__ == '__main__':
    book_name = input('Enter the novel title: ')  # e.g. '圣墟', '凡人修仙传'
    result = download(book_name)
    if result:
        novel_url, novel_name = result
        chapter_list = get_chapter(novel_url)
        get_content(chapter_list, novel_name)
The code above is for learning purposes only.
(Result screenshot omitted.)
This scrapes 《凡人修仙传》 from 笔趣阁 (biquge). Do not scrape other people's sites excessively; you do so at your own risk.
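Note that the script above issues bare requests.get calls. As a minimal sketch (the User-Agent string is just a common example, not something this particular site requires), sending browser-like headers with a timeout and a delay makes a scraper far less likely to be blocked:

import time
import random
import requests

session = requests.Session()
# Many sites reject the default python-requests User-Agent.
session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})

def polite_get(url):
    time.sleep(1 + random.random())      # pause between requests
    return session.get(url, timeout=10)  # fail fast instead of hanging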
Second implementation:
This version uses XPath to scrape 《凡人修仙传仙界篇》 from 笔趣阁; the code is easier to follow.
Note: the site's search feature is broken, so searching by a typed-in title is not possible.
Instead, open the site yourself, pick a book, and update the URLs in the class, like so:
self.server = 'http://www.biquge.info/<book_id>/'
self.target = 'http://www.biquge.info/<book_id>/'
Of course, you do not have to scrape this particular site; you can modify the scraper to target another one. The overall XPath approach is laid out below (see the short sketch after this note).
Most sites do have some anti-scraping measures, so configure your requests sensibly and access them politely.
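Before the full class, here is a small self-contained sketch (the HTML fragment is a made-up stand-in for the real page) of what the two XPath expressions used below actually extract:

from lxml import etree

# Made-up fragment mimicking the chapter list structure of the real page.
html = '''
<div id="list"><dl>
<dd><a href="123.html">Chapter 1</a></dd>
<dd><a href="124.html">Chapter 2</a></dd>
</dl></div>
'''
element = etree.HTML(html)
print(element.xpath('//*[@id="list"]/dl/dd/a/@href'))   # ['123.html', '124.html']
print(element.xpath('//*[@id="list"]/dl/dd/a/text()'))  # ['Chapter 1', 'Chapter 2']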
import requests
import sys
import time
import io
from lxml import etree
"""
类说明:下载《笔趣阁》网小说《凡人修仙传仙界篇》
Parameters:
无
Returns:
无
Modify:
2019-12-24
"""
class downloader(object):
    def __init__(self):
        self.server = 'http://www.biquge.info/22_22533/'  # prefix for chapter links
        self.target = 'http://www.biquge.info/22_22533/'  # chapter index page
        self.names = []   # chapter titles
        self.urls = []    # chapter links
        self.nums = 0     # number of chapters
        self.books = None # book title (<h1> element)
"""
函数说明:获取下载链接
Parameters:
无
Returns:
无
Modify:
2019-12-24
"""
    def get_download_url(self):
        req = requests.get(url=self.target)
        html = req.content.decode()
        element = etree.HTML(html)
        trs = element.xpath('//*[@id="list"]/dl/dd/a/@href')
        trns = element.xpath('//*[@id="list"]/dl/dd/a/text()')
        name = element.xpath('//*[@id="info"]/h1')[0]
        self.books = name
        for tr in trs:
            self.urls.append(self.server + tr)
        for tr in trns:
            self.names.append(tr)
        self.nums = len(self.urls)
"""
函数说明:获取章节内容
Parameters:
target - 下载连接(string)
Returns:
texts - 章节内容(string)
Modify:
2019-12-24
"""
    def get_contents(self, target):
        req = requests.get(url=target)
        contents = req.content.decode('utf-8')  # set the encoding
        trs = etree.HTML(contents)
        text = trs.xpath('//*[@id="content"]/text()')
        buf = io.StringIO()
        for tr in text:
            buf.write('\n\t' + tr)  # add indentation so the text reads better
        time.sleep(1)  # sleep one second between chapters
        return buf.getvalue()
"""
函数说明:将爬取的文章内容写入文件
Parameters:
name - 章节名称(string)
path - 当前路径下,小说保存名称(string)
text - 章节内容(string)
Returns:
无
Modify:
2017-09-13
"""
    def writer(self, name, path, text):
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            f.writelines(text)
            f.write('\n\n')
if __name__ == "__main__":
dl = downloader()
dl.get_download_url()
print('《'+dl.books.text+'》开始下载:')
for i in range(dl.nums):
dl.writer(dl.names[i], dl.books.text+'.txt', dl.get_contents(dl.urls[i]))
sys.stdout.write(" 已下载:%.3f%%" % float(i / dl.nums) + '\r')
sys.stdout.flush()
print(dl.books.text+'下载完成')
Finally: I have written free-IP proxy-pool scrapers for some twenty-odd sites; if you need the XPath scripts, @ this account.
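As a rough illustration of how such a proxy pool plugs into requests (the proxy addresses below are placeholders, not working IPs):

import random
import requests

# Placeholder proxies; in practice these would come from a scraped proxy pool.
PROXIES = [
    'http://1.2.3.4:8080',
    'http://5.6.7.8:3128',
]

def fetch_with_proxy(url):
    proxy = random.choice(PROXIES)  # rotate proxies to spread the load
    return requests.get(url, proxies={'http': proxy, 'https': proxy}, timeout=10)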