I recently came across another way to scrape data online, so I'm sharing it here. I haven't worked with any scraping frameworks yet; this uses plain requests and BeautifulSoup.
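Before the full script, here is a minimal sketch of the pattern it builds on: requests fetches the HTML, and a BeautifulSoup CSS selector pulls the book links out of the index page. The URL and selector are the same ones used below; the shortened user-agent string is just for brevity.

import requests
from bs4 import BeautifulSoup

# Fetch the book index and print the first few book links
resp = requests.get('http://www.shicimingju.com/book/',
                    headers={'user-agent': 'Mozilla/5.0'})
soup = BeautifulSoup(resp.text, 'lxml')
for a in soup.select('.bookmark-list>ul>li>h2>a')[:3]:
    print(a.text, a['href'])

With that pattern in mind, here is the full script: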
import requests
import os
from bs4 import BeautifulSoup
shici_url = 'http://www.shicimingju.com'
url = 'http://www.shicimingju.com/book/'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'
}
# Request the book index page
response = requests.get(url=url, headers=headers)
page_text = response.text
soup = BeautifulSoup(page_text, 'lxml')
# Get the list of <a> tags, one per book title
a_list = soup.select('.bookmark-list>ul>li>h2>a')

# Fetch the text of a single chapter (detail) page
def get_book_detail(page_url):
    book_detail_content = requests.get(url=page_url, headers=headers).text
    soup = BeautifulSoup(book_detail_content, 'lxml')
    # Some chapter pages wrap the text in <p> tags and others don't,
    # so fall back to the bare container when the first selector finds nothing
    book_content = soup.select('.chapter_content>p')
    if not book_content:
        book_content = soup.select('.chapter_content')
    content = ''
    for book_c in book_content:
        content = content + book_c.text
    # Return the assembled chapter text
    return content

# Fetch a book's chapter-list page and download every chapter into f
def get_book_list(book_url, f):
    book_list_content = requests.get(url=book_url, headers=headers).text
    soup = BeautifulSoup(book_list_content, 'lxml')
    # One <a> per chapter in the book's table of contents
    book_mulu = soup.select('.book-mulu>ul>li>a')
    for book in book_mulu:
        page_title = book.text
        print(page_title + " download started...")
        page_url = shici_url + book['href']
        # Fetch the chapter's full text from its detail page
        content = get_book_detail(page_url)
        f.write(page_title + "\n\n" + content + "\n\n\n")
        print(page_title + " download finished...")
    f.close()

# Create the output directory if it doesn't exist
file_path = './史书/'
if not os.path.exists(file_path):
    os.mkdir(file_path)

n = 0
for a in a_list:
    n = n + 1
    # Book title
    book_name = a.text
    print("<<%s>> downloading..." % book_name)
    # Create a txt file named after the current book
    file_name = file_path + str(n) + '.' + book_name + '.txt'
    f = open(file_name, 'a+', encoding='utf-8')
    # Build the book's URL and walk its chapter-list page
    book_url = shici_url + a['href']
    get_book_list(book_url, f)
The screenshot below shows the txt files produced by a successful run.
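One thing the script doesn't do is pause between requests or recover from network hiccups, and downloading every chapter of every book fires off a lot of them. A small wrapper along these lines could replace the bare requests.get calls; this is my own sketch, not part of the original script, and it assumes the same headers dict defined above.

import time
import requests

def fetch(url, headers, retries=3, delay=1.0):
    # Retry transient failures and pause between attempts
    # so we don't hammer the site
    for _ in range(retries):
        try:
            resp = requests.get(url=url, headers=headers, timeout=10)
            resp.raise_for_status()
            return resp.text
        except requests.RequestException:
            time.sleep(delay)
    raise RuntimeError('failed to fetch ' + url)

Inside get_book_detail and get_book_list, the requests.get(...).text calls would become fetch(page_url, headers) and fetch(book_url, headers), and a short time.sleep between chapters would make the crawl gentler still.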