This code is for learning and educational purposes only!!
Web crawlers:
A web crawler is an automated program that fetches and extracts data from the internet. It simulates the behavior of a human user, browsing web pages to scrape information, then saving or processing the data it needs.
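As a minimal sketch of that idea (assuming only the requests and beautifulsoup4 packages, with example.com as a placeholder URL), fetching and parsing a page looks like this:

import requests
from bs4 import BeautifulSoup

# Fetch a page; example.com is just a placeholder target
response = requests.get('https://example.com', timeout=10)
soup = BeautifulSoup(response.text, 'html.parser')
print(soup.title.text)  # print the page's <title> text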
The demo code below scrapes a novel.
First, the result: once the download finishes, all chapters are merged straight into a single file, which is far more convenient than picking through one file per chapter.
The code is as follows:
To scrape a specific novel, just change the novel ID to match; it is the number in the index-page URL (http://www.qiuyelou.net/<ID>/).
import os
import time

import requests
from bs4 import BeautifulSoup
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}
# Novel ID on the site; change this to download a different novel
ids = "16585"
url = f'http://www.qiuyelou.net/{ids}/'
def get_soup(url):
    # Send a request, fetch the page, and return a parsed BeautifulSoup object
    response = requests.get(url, headers=headers, timeout=10)
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup
def download_chapter(chap_url, title):
    # Download one chapter, retrying up to 3 times on request failures
    retries = 3
    while retries > 0:
        try:
            chapter_response = requests.get(chap_url, headers=headers, timeout=10)
            chapter_response.encoding = "utf-8"
            chapter_soup = BeautifulSoup(chapter_response.text, 'html.parser')
            # Extract the chapter title and body text; the body lives in a div
            # whose id is "content" plus the chapter number taken from the URL
            chapter_title = chapter_soup.find('div', class_='title').h1.text
            idclass = chap_url.split("/")[-1].replace(".html", "")
            chapter_content = chapter_soup.find('div', id=f'content{idclass}').text
            # Append the chapter title and body to a single combined file
            with open(f"缓存/{title}.txt", 'a', encoding='utf-8') as f:
                f.write(chapter_title + '\n\n')
                f.write(chapter_content + '\n\n')
            print('Downloaded:', chapter_title)
            break
        except requests.exceptions.RequestException:
            print('Request failed, retrying...')
            retries -= 1
            time.sleep(1)
    else:
        # The while/else branch runs only if every retry was used up without a break
        print('Could not download chapter:', chap_url)
soup = get_soup(url)
title = soup.find('div', class_='title').h1.text
print('Downloading novel:', title)
# Build the full URL for every chapter link on the index page
chapter_urls = [f'http://www.qiuyelou.net/{ids}/{chapter["href"]}' for chapter in soup.select('dd a')]
print(chapter_urls)
# Create the output directory ("缓存" means "cache") before any chapter is written
os.makedirs("缓存", exist_ok=True)
for chap_url in chapter_urls:
    download_chapter(chap_url, title)
print('Novel download complete!')
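If the site starts rejecting rapid-fire requests, one gentle tweak is a short pause between chapters. This is a sketch, assuming a half-second delay is enough to stay polite; adjust to taste:

# A gentler download loop: pause briefly between chapters
for chap_url in chapter_urls:
    download_chapter(chap_url, title)
    time.sleep(0.5)  # assumed delay; increase it if the server still throttles you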