Taking a novel site (qishu.de) as an example, the code is as follows:
# coding: utf-8
import os
import random
import re
import time

import requests
from bs4 import BeautifulSoup

# Current working directory; everything is downloaded beneath it
current_path = os.getcwd()
# Deduplication helper: collect the names of all folders already created
# under 小说爬虫 (defined for reference; the script below relies on
# download_history.txt instead)
def check_duplicated():
    book_path = os.path.join(current_path, '小说爬虫')
    # Accumulate every folder name found under book_path
    book_path_list = []
    for root, dirs, files in os.walk(book_path):
        # dirs lists the folder names directly under the current root
        book_path_list.extend(dirs)
    return book_path_list
# Crawler for qishu.de
def get_qishu_book():
    url = 'https://www.qishu.de/'
    # Fetch the home page
    result = requests.get(url)
    result.encoding = 'gbk'
    soup = BeautifulSoup(result.text, 'html.parser')
    # Download root directory
    download_path = os.path.join(current_path, '小说爬虫', '奇书网')
    if not os.path.exists(download_path):
        os.makedirs(download_path)
    # Load the download history; create the file if it does not exist yet
    history_path = os.path.join(current_path, '小说爬虫', 'download_history.txt')
    exist_book_list = []
    if os.path.isfile(history_path):
        with open(history_path, 'r', encoding='utf-8') as f:
            exist_book_list = [line.strip() for line in f.readlines()]
    else:
        with open(history_path, 'w', encoding='utf-8') as f:
            pass
    # Category links in the site header
    class_div = soup.find('div', class_="wrap header")
    class_tag = class_div.find_all('a', href=True, attrs={"target": "_blank"})
    for class_information in class_tag:
        class_name = class_information.get_text()
        class_path = os.path.join(download_path, class_name)
        if not os.path.exists(class_path):
            os.makedirs(class_path)
        # page.txt records crawl progress; resume from it if it exists
        page_file = os.path.join(class_path, 'page.txt')
        if os.path.isfile(page_file):
            with open(page_file, 'r') as f:
                page = int(f.readlines()[0])
        else:
            with open(page_file, 'w') as f:
                page = 1
        class_link = f"https://www.qishu.de{class_information['href']}"
        # Fetch the first listing page of this category
        response = requests.get(class_link)
        response.encoding = 'gbk'
        class_soup = BeautifulSoup(response.text, 'html.parser')
        # Total page count: take the digits from the "尾页" (last page) link
        page_div = class_soup.find('div', class_="tspage")
        page_tag = page_div.find_all('a', string='尾页')
        page_total = int(re.findall(r"\d+", page_tag[0]['href'])[0])
        print(f'Crawling category {class_name}, {page_total} pages in total...')
        while page <= page_total:
            print(f'Crawling page {page}...')
            # Page 1 uses the plain category URL; later pages use index_{page}.html
            if page == 1:
                class_link = f"https://www.qishu.de{class_information['href']}"
            else:
                class_link = f"https://www.qishu.de{class_information['href']}index_{page}.html"
            page += 1
            # Persist progress so an interrupted run resumes where it stopped
            with open(page_file, 'w') as f:
                f.write(str(page))
            # Collect all books on this page
            page_response = requests.get(class_link)
            page_response.encoding = 'gbk'
            page_soup = BeautifulSoup(page_response.text, 'html.parser')
            page_content_div = page_soup.find('div', class_="list")
            content_tag = page_content_div.find_all('a')
            for book_content in content_tag:
                # Only anchors wrapping a cover image are book entries
                img = book_content.find('img')
                if img is not None:
                    book_name = book_content.get_text()
                    if book_name not in exist_book_list:
                        book_link = f"https://www.qishu.de{book_content['href']}"
                        # Fetch the book's detail page
                        book_response = requests.get(book_link)
                        book_response.encoding = 'gbk'
                        book_soup = BeautifulSoup(book_response.text, 'html.parser')
                        book_content_div = book_soup.find('div', class_="showDown")
                        try:
                            # Prefer the txt download
                            book_tag = book_content_div.find_all('a', string='Txt格式下载')[0]
                            book_download_link = book_tag['href']
                            result_response = requests.get(book_download_link)
                            save_path = os.path.join(class_path, book_name + '.txt')
                            if result_response.status_code == 200:
                                with open(save_path, 'wb') as f:
                                    f.write(result_response.content)
                                # Record the download
                                with open(history_path, 'a', encoding='utf-8') as f:
                                    f.write(book_name + '\n')
                                # Random sleep to avoid hammering the server
                                time.sleep(random.uniform(1, 5))
                            elif result_response.status_code == 502:
                                # No txt available; fall back to the epub download
                                book_tag = book_content_div.find_all('a', string='Epub格式下载')[0]
                                book_download_link = book_tag['href']
                                result_response = requests.get(book_download_link)
                                save_path = os.path.join(class_path, book_name + '.epub')
                                if result_response.status_code == 200:
                                    with open(save_path, 'wb') as f:
                                        f.write(result_response.content)
                                    # Record the download
                                    with open(history_path, 'a', encoding='utf-8') as f:
                                        f.write(book_name + '\n')
                                    # Random sleep to avoid hammering the server
                                    time.sleep(random.uniform(1, 5))
                            else:
                                print(f'{book_name} download failed, code: {result_response.status_code}')
                                with open(os.path.join(class_path, 'except.txt'), 'a') as f:
                                    f.write(f'{book_name} download failed, code: {result_response.status_code}\n')
                        except Exception as e:
                            print(f'Caught exception: {e}, book: {book_name}, page: {page - 1}, category: {class_name}')
                            # Log the failure and move on to the next book
                            with open(os.path.join(class_path, 'except.txt'), 'a') as f:
                                f.write(f'Caught exception: {e}, book: {book_name}, page: {page - 1}, category: {class_name}\n')
while True:
    try:
        get_qishu_book()
    except Exception:
        # Connection dropped; sleep two minutes, then resume from page.txt
        time.sleep(120)
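
A possible hardening, sketched below under the assumption that the site accepts a generic browser User-Agent: instead of letting the outer loop restart the whole crawl on every transient error, requests can retry 5xx responses at the HTTP layer. make_session is a hypothetical helper, not part of the original script, and 502 is deliberately left out of the retry list because the crawler above treats it as the signal that no txt download exists.

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session() -> requests.Session:
    # Hypothetical helper: retry GETs up to 3 times with exponential backoff
    # on 503/504; 502 is excluded because the crawler above interprets it
    # as "txt not offered, fall back to epub"
    retry = Retry(total=3, backoff_factor=2, status_forcelist=[503, 504])
    session = requests.Session()
    session.mount('https://', HTTPAdapter(max_retries=retry))
    # Some sites reject the default python-requests User-Agent
    session.headers.update({'User-Agent': 'Mozilla/5.0'})
    return session

Replacing each requests.get(...) call in get_qishu_book with session.get(...) would also reuse one TCP connection per host instead of opening a new one for every request.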