# Assignment requirements:
# 1. Write a web crawler program. (1、写一个网络爬虫程序)
# 2. Scrape data from a target site; at least 5 key fields. (2、爬取目标网站数据,关键项不能少于5项。)
# 3. Store data in a database supporting CRUD operations. (3、存储数据到数据库,可以进行增删改查操作。)
import requests
import os
from bs4 import BeautifulSoup
import time
def book_page_list(book_id):
    """Fetch the chapter list (table of contents) for a book on biquw.com.

    Args:
        book_id: Numeric book identifier used in the site's URL path.

    Returns:
        A list of ``<a>`` Tag objects, one per chapter (text = chapter
        title, ``href`` = chapter page id).

    Raises:
        requests.RequestException: on network failure.
        AttributeError: if the page has no ``div.book_list`` element.
    """
    url = 'http://www.biquw.com/book/{}/'.format(book_id)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36'
    }
    # BUG FIX: the second positional argument of requests.get is `params`,
    # not `headers` — the UA header was never actually sent before.
    response = requests.get(url, headers=headers)
    # The site does not declare its charset reliably; let requests guess.
    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, 'lxml')
    booklist = soup.find('div', class_='book_list').find_all('a')
    return booklist
def book_page_text(bookid, booklist):
    """Download every chapter in *booklist* and save each as a .txt file.

    Files are written to ``./<bookid>/<chapter name>.txt``. The directory
    is expected to exist already (created by the caller).

    Args:
        bookid: Numeric book identifier (also the output directory name).
        booklist: Iterable of ``<a>`` Tags as returned by book_page_list().

    Raises:
        requests.RequestException: on network failure.
        AttributeError: if a chapter page has no ``div#htmlContent``.
    """
    for book_page in booklist:
        # '*' is illegal in filenames on Windows; drop it from the title.
        page_name = book_page.text.replace('*', '')
        page_id = book_page['href']
        time.sleep(3)  # be polite to the server between chapter requests
        url = 'http://www.biquw.com/book/{}/{}'.format(bookid, page_id)
        headers = {
            # BUG FIX: header key was 'User- agent' (embedded space),
            # which is not a valid User-Agent header.
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36'
        }
        # BUG FIX: pass headers via the keyword argument; positionally it
        # would be interpreted as query-string params.
        response_book = requests.get(url, headers=headers)
        response_book.encoding = response_book.apparent_encoding
        soup_book = BeautifulSoup(response_book.text, 'lxml')
        book_content = soup_book.find('div', id="htmlContent")
        # Explicit UTF-8: the platform default encoding (e.g. GBK-less
        # locales) can fail on Chinese text. NOTE: mode 'a' means a rerun
        # appends duplicate content to existing chapter files.
        with open("./{}/{}.txt".format(bookid, page_name), 'a', encoding='utf-8') as f:
            # \xa0 is &nbsp; left over from the HTML — strip it.
            f.write(book_content.text.replace('\xa0', ''))
        print("当前下载章节:{}".format(page_name))
    print("章节内容获取成功!")
if __name__ == '__main__':
    # Entry point: ask for a book id, fetch its table of contents, then
    # download every chapter into ./<bookid>/.
    bookid = input("请输入书号(数字):").strip()  # strip stray whitespace from user input
    # makedirs(exist_ok=True) avoids the isdir/mkdir check-then-act race
    # and is a no-op when the directory already exists.
    os.makedirs('./{}'.format(bookid), exist_ok=True)
    try:
        booklist = book_page_list(bookid)
        print("获取目录成功!")
        time.sleep(5)  # pause before hammering the chapter pages
        book_page_text(bookid, booklist)
    except Exception as e:
        # Top-level boundary: report and exit instead of a raw traceback.
        print(e)
        print("获取目录失败,请确保书号输入正确!")
# 运行效果 (sample run output follows in the original write-up)