import re
from time import sleep

import requests
from bs4 import BeautifulSoup

# Fetch book names and the links into each book's table of contents
def get_book():
    """Scrape the book index page and return a list of book records.

    Returns:
        list[dict]: one dict per book with keys
            'href'      -- absolute URL of the book's table-of-contents page
            'book_name' -- the title with the leading "N、" index stripped

    Entries whose text does not match the expected "N、Title" pattern, or
    that lack an <a> tag, are skipped instead of raising (the original
    crashed with IndexError on a non-matching entry).
    """
    url = 'http://www.shicimingju.com/book/'
    # timeout so a stalled server cannot hang the scraper forever
    html = requests.get(url, timeout=10).text
    soup = BeautifulSoup(html, 'lxml')
    bookmark_list = soup.find('div', class_='bookmark-list')
    li_list = bookmark_list.find_all('li')
    # compile once instead of re-parsing the pattern on every iteration
    name_re = re.compile(r'\d+、(.*)')
    datas = []
    for li in li_list:
        link = li.find('a')
        match = name_re.search(li.text.replace('\n', ''))
        if link is None or match is None:
            continue  # malformed entry -- skip rather than crash
        datas.append({
            'href': 'http://www.shicimingju.com' + link['href'],
            'book_name': match.group(1),
        })
    return datas
# Fetch the table of contents and collect the chapter-content links
def get_mulu_detail(urls):
    """Fetch a book's table-of-contents page and return its chapter URLs.

    Args:
        urls: absolute URL of the book's TOC page (a single URL despite
              the plural name, which is kept for backward compatibility).

    Returns:
        list[str]: absolute chapter-content URLs in page order.
    """
    # timeout so a stalled server cannot hang the scraper forever
    html = requests.get(urls, timeout=10).text
    soup = BeautifulSoup(html, 'lxml')
    book_mulu = soup.find('div', class_='book-mulu')
    # one absolute link per <li> entry in the table of contents
    return [
        'http://www.shicimingju.com' + li.find('a')['href']
        for li in book_mulu.find_all('li')
    ]
# Crawl the body text of every chapter
def get_content(urls):
    """Download every chapter page and return its cleaned text.

    Args:
        urls: iterable of absolute chapter URLs.

    Returns:
        list[str]: chapter texts with newlines and non-breaking spaces
        removed, in the same order as *urls*.
    """
    content = []
    for url in urls:
        # timeout so a stalled server cannot hang the scraper forever
        html = requests.get(url, timeout=10).text
        soup = BeautifulSoup(html, 'lxml')
        container = soup.find('div', class_='www-main-container')
        content.append(container.text.replace('\n', '').replace('\xa0', ''))
    return content
# Save the chapters of each book into a file named '<book name>.txt'
def save_books(contents, book_name):
    """Append all chapter texts to ``<book_name>.txt`` (UTF-8).

    The file is opened once for the whole book instead of being reopened
    for every chapter (the original re-opened it inside the loop).
    Append mode preserves the original behavior of accumulating text
    across repeated runs.
    """
    filename = book_name + '.txt'
    with open(filename, 'a', encoding='utf-8') as f:
        for content in contents:
            f.write(content)
# Script driver
def main():
    """Crawl every book: index -> table of contents -> chapters -> disk."""
    for book in get_book():
        sleep(2)  # throttle requests between pages
        chapter_urls = get_mulu_detail(book['href'])
        sleep(2)
        chapters = get_content(chapter_urls)
        sleep(2)
        save_books(chapters, book['book_name'])
# Entry point: run the crawler only when executed directly, not on import.
if __name__ == '__main__':
    main()
# The final, cleaned-up version of the code follows
import re
from time import sleep
import requests
from bs4 import BeautifulSoup
def get_book():
    """Return ``[{'href': toc_url, 'book_name': title}, ...]`` scraped
    from the site's book index page."""
    index_url = 'http://www.shicimingju.com/book/'
    page = requests.get(index_url).text
    soup = BeautifulSoup(page, 'lxml')
    # every book is one <li> inside the bookmark-list container
    entries = soup.find('div', class_='bookmark-list').find_all('li')
    pattern = r'\d+、(.*)'
    books = []
    for entry in entries:
        toc_link = 'http://www.shicimingju.com' + entry.find('a')['href']
        # strip the leading "N、" index to keep only the title
        title = re.findall(pattern, entry.text.replace('\n', ''))[0]
        books.append({'href': toc_link, 'book_name': title})
    return books
def get_mulu_detail(urls):
    """Return the absolute chapter URLs listed on a book's TOC page."""
    soup = BeautifulSoup(requests.get(urls).text, 'lxml')
    menu = soup.find('div', class_='book-mulu')
    # each <li> in the menu links to one chapter page
    return ['http://www.shicimingju.com' + item.find('a')['href']
            for item in menu.find_all('li')]
def get_content(urls):
    """Download each chapter page and return its cleaned text, in order."""
    chapters = []
    for page_url in urls:
        page = BeautifulSoup(requests.get(page_url).text, 'lxml')
        body = page.find('div', class_='www-main-container')
        # drop newlines and non-breaking spaces from the chapter text
        chapters.append(body.text.replace('\n', '').replace('\xa0', ''))
    return chapters
def save_books(contents, book_name):
    """Append every chunk in *contents* to ``<book_name>.txt`` (UTF-8)."""
    target = book_name + '.txt'
    for chunk in contents:
        # append mode: repeated calls keep extending the same file
        with open(target, 'a', encoding='utf-8') as out:
            out.write(chunk)
def main():
    """Drive the full crawl: book list -> TOC -> chapters -> files on disk."""
    for entry in get_book():
        sleep(2)  # be polite to the server between requests
        toc = get_mulu_detail(entry['href'])
        sleep(2)
        texts = get_content(toc)
        sleep(2)
        save_books(texts, entry['book_name'])
# Guard so importing this module does not start the crawl.
if __name__ == '__main__':
    main()