Teaching myself Python web scraping: this script crawls a novel from the Ciyuanji site (次元姬小说) and saves it to a specified directory. Just change the novel's URL and the save path and it scrapes automatically. Source code shared below.
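The script targets Python 3 and relies on two third-party packages, requests and beautifulsoup4; assuming you use pip, installing them with "pip install requests beautifulsoup4" is enough.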
import requests  # library for sending HTTP requests
from bs4 import BeautifulSoup  # library for parsing the fetched HTML
import os
import time

urls = []        # URLs of free chapters
names = []       # titles of free chapters
fees_urls = []   # URLs of paid (locked) chapters
fees_names = []  # titles of paid (locked) chapters

# A browser-like User-Agent so the request is not rejected as an obvious bot
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
}
def get_count(url):
    """Fetch the book's detail page and collect every chapter's URL and title."""
    response = requests.get(url, headers=headers)
    if response.ok:
        soup = BeautifulSoup(response.text, "html.parser")
        # Free chapters; the class string must match the tag's class attribute exactly
        all_a = soup.find_all("a", attrs={"class": "Link_link__LjyZ2 book_detail_item__EMrK7"})
        # Paid chapters carry an extra "lock" class
        fees = soup.find_all("a", attrs={"class": "Link_link__LjyZ2 book_detail_item__EMrK7 book_detail_lock__eNRvE"})
        for a in all_a:
            urls.append(f'https://www.ciyuanji.com{a["href"]}')
            names.append(a.string)
        for fee in fees:
            fees_urls.append(f'https://www.ciyuanji.com{fee["href"]}')
            fees_names.append(fee.string)
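
# After get_count() runs, the four module-level lists hold parallel data that
# main() later pairs up with zip(), e.g. (hypothetical values):
#   urls  -> ['https://www.ciyuanji.com/chapter/...', ...]
#   names -> ['Chapter 1 ...', ...]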
def get_fiction(url, name):
    """Download one chapter and save it as <name>.txt under folder_path."""
    response = requests.get(url, headers=headers)
    if response.ok:
        folder_path = r"D:\1\pythonTest\fiction"  # save path for the novel
        if not os.path.exists(folder_path):  # check whether the folder exists
            # Create the folder if it does not exist yet
            os.makedirs(folder_path)
        file_name = f"{name}.txt"
        # Build the full file path
        file_path = os.path.join(folder_path, file_name)
        # encoding='utf-8' so the Chinese text is written correctly on Windows
        with open(file_path, 'w', encoding='utf-8') as f:
            soup = BeautifulSoup(response.content, "html.parser")
            articles = soup.find_all('article', attrs={"class": "chapter_article__vWEkb"})
            # Each paragraph of the chapter body is a <p> inside the article
            for p in articles[0].find_all('p'):
                f.write(p.get_text() + "\n")
        # The with-block flushes and closes the file automatically
def main():
    get_count("https://www.ciyuanji.com/b_d_1226.html")  # the book's detail page
    urls.extend(fees_urls)  # append the paid chapters after the free ones
    names.extend(fees_names)
    for url, name in zip(urls, names):
        get_fiction(url, name)
        print(f'{name} downloaded')
        time.sleep(10)  # sleep 10 seconds before sending the next request


if __name__ == '__main__':
    main()
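
One caveat: chapter titles are taken straight from the page, and on Windows a title containing a character like ? * : " or | will make open() fail. A minimal sketch of a sanitizer you could apply to name before building file_name; the safe_name helper below is my own addition, not part of the original script:

import re

def safe_name(name):
    # Replace characters that Windows forbids in file names with '_'
    return re.sub(r'[\\/:*?"<>|]', '_', str(name)).strip()

# Usage inside get_fiction: file_name = f"{safe_name(name)}.txt"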