# 增加了:1、使面向对象化 2、加入了异常判断,防止程序因报错中断 3、检查txt文件是否存在,如存在,跳过并下载下一个文件
# 增加了:多线程,可同时download多个文件 2018.1.11
import requests
from bs4 import BeautifulSoup
import time
import os
import threading
class Book1:
    """Crawler that batch-downloads novels listed on a quanshuwang index page.

    For every book linked from *start_url* the crawler resolves the book's
    summary page, then its table of contents, and appends each chapter's text
    to ``G:/小说/<书名>.txt``.  Books whose output file already exists are
    skipped so the script can be re-run safely.
    """

    def __init__(self, start_url):
        # Browser-like User-Agent header — many sites reject requests
        # without one, so always send it.
        self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
        self.start_url = start_url

    # Start crawling.  NOTE(review): the (misspelled) public name ``star_url``
    # is kept unchanged so existing callers keep working.
    def star_url(self):
        """Crawl the index page and download every novel it lists."""
        try:
            books = self.requests(self.start_url)
            # Locate the per-book entry <div>s on the index page.
            books_url = books.find('div', class_="tab-item clearfix").find_all('div', class_="yd-book-item yd-book-item-pull-left")
        except Exception:
            print('目标小说解析错误!!!!!')
            # Original fell through and crashed with NameError on an
            # undefined ``books_url``; bail out instead.
            return
        for a in books_url:
            name = ''  # pre-bind: the except below prints it even on early failure
            try:
                book1 = a.find('a')
                book1_href = book1['href']
                book1_gy = self.requests(book1_href)  # book summary page
                allbook_url = book1_gy.find('div', class_="b-oper").find('a')['href']
                open_url = self.requests(allbook_url)
                name = open_url.find('div', class_="chapName").find('strong').get_text()  # novel title
                path = 'G:/小说/' + name + '.txt'
                exist = os.path.exists(path)
                if exist:
                    print('\n《' + name + '》' + ',已存在\n')
                    continue
                print('《' + name + '》' + ',下载开始')
                chapter = open_url.find('div', class_="clearfix dirconone").find_all('a')  # table of contents
            except Exception:
                print(name + ',目录读取错误!')
                continue
            for i in chapter:
                title = ''  # pre-bind: printed by the except below
                try:
                    title = i.get_text()  # chapter title
                    href = i['href']      # chapter page URL
                    html = self.requests(href)
                    # Extract the chapter body text.
                    content = html.find('div', class_="mainContenr").get_text()
                    # ``with`` closes the handle — the original leaked one
                    # file descriptor per chapter.
                    with open(path, 'a', encoding='utf-8') as text:
                        text.write('\n' + title + '\n\n\n' + content + '\n\n\n')
                    print(name + ' ' + '《' + title + '》' + '下载完成')
                except Exception:
                    print(name + ': ' + title + ',章节内容读取错误!')
                    continue
            print('《' + name + '》' + ',下载完成' + '\n\n\n')

    # Fetch and parse a page.
    def requests(self, url):
        """Fetch *url* and return a parsed ``BeautifulSoup`` tree.

        Returns ``None`` when the request or parse fails (callers run inside
        their own try blocks, so the resulting AttributeError is caught there).
        """
        try:
            content = requests.get(url, headers=self.headers)
            content.encoding = 'gbk'  # the site serves GBK-encoded pages
            soup = BeautifulSoup(content.text, 'lxml')
            return soup
        except Exception:
            print('网页解析发生错误!!!!')
            return None
# 定义多线程执行函数
def threads(count, interval=5):
    """Start *count* worker threads, each running ``Book1.star_url``.

    Sleeping *interval* seconds between starts staggers the initial burst
    of HTTP requests.  *interval* generalizes the previously hard-coded
    5-second pause (same default, so existing callers are unaffected).

    NOTE(review): relies on the module-level name ``Book1`` being rebound
    to an instance before this is called.
    """
    for _ in range(count):
        threading.Thread(target=Book1.star_url).start()
        time.sleep(interval)
# 执行多线程,要几个加几个(加鸡腿的效率操作O(∩_∩)O哈)
# Entry point: spin up the workers (add as many threads as you like).
# Guarded so that importing this module no longer launches 10 scraper
# threads as a side effect.  The rebinding of the class name ``Book1`` to
# an instance is intentional and kept for backward compatibility —
# ``threads`` looks the instance up via the module-level name.
if __name__ == '__main__':
    Book1 = Book1('http://www.quanshuwang.com/all/lastupdate_5_0_0_0_1_0_1.html')
    threads(10)
# python爬虫-多线程小说批量下载
# 最新推荐文章于 2024-07-27 00:17:58 发布