# 心灯录-爬虫 (Xin Deng Lu web scraper)
import requests
import os
from bs4 import BeautifulSoup
def main():
    """Download the chapters linked from the book's index page as text files.

    Fetches ``http://www.daode.org/rdbook/xdl/index.html``, walks its anchors
    (skipping the first five), requests each linked page, and writes the text
    of that page's ``<p class="style15">`` element to
    ``D:/心灯录/<anchor text>.txt`` encoded as UTF-8. One progress line is
    printed per saved chapter.

    Anchors that have no ``href`` or no text, and pages that have no
    ``p.style15`` element, are skipped instead of raising.

    Raises:
        requests.RequestException: if a request fails or times out.
        OSError: if the output directory or a chapter file cannot be written.
    """
    output_dir = 'D:/心灯录'
    base_url = 'http://www.daode.org/rdbook/xdl/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    }
    # Without a timeout, requests waits forever on a stalled connection.
    timeout = 30
    # Characters that Windows does not allow in a file name; each one is
    # replaced with '_' so an unusual chapter title cannot break open().
    unsafe_chars = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

    os.makedirs(output_dir, exist_ok=True)

    index_response = requests.get(
        url=base_url + 'index.html', headers=headers, timeout=timeout
    )
    index_response.encoding = 'gb18030'
    index_soup = BeautifulSoup(index_response.text, 'lxml')
    index_response.close()

    # The first five anchors are not downloaded (presumably navigation links
    # rather than chapters -- TODO confirm against the live index page).
    for a in index_soup.select('a')[5:]:
        href = a.get('href')
        # .string is None when the anchor has several children (or none);
        # fall back to the concatenated text of the whole tag.
        chapter_title = a.string
        if chapter_title is None:
            chapter_title = a.get_text()
        if href is None or not chapter_title:
            continue

        # lstrip removes only the leading '.' and '/' characters (any '../'
        # prefix); the link is then resolved against the book directory.
        chapter_url = base_url + href.lstrip('./')
        chapter_response = requests.get(
            url=chapter_url, headers=headers, timeout=timeout
        )
        chapter_response.encoding = 'gb18030'
        chapter_text = chapter_response.text
        # The body is fully read at this point, so release the connection
        # before parsing, whether or not the page turns out to be a chapter.
        chapter_response.close()

        content_tag = BeautifulSoup(chapter_text, 'lxml').find(
            'p', class_='style15'
        )
        if content_tag is None:
            # Not a chapter page (no body paragraph with the expected class).
            continue
        chapter_content = content_tag.text

        file_name = str(chapter_title).translate(unsafe_chars) + '.txt'
        with open(output_dir + '/' + file_name, 'w', encoding='utf-8') as fp:
            fp.write(chapter_content)

        print(chapter_title, '正文字数:' + str(len(chapter_content)), '下载完成!!!')


if __name__ == '__main__':
    main()