# 这里只爬取了三国演义整本书
from bs4 import BeautifulSoup
import urllib.request
import time
def get_string(href):
    """Fetch a single chapter page and return the plain text of its body.

    Builds a browser-like request via get_request(), downloads the page,
    and extracts the text of the 'chapter_content' div.
    """
    reply = urllib.request.urlopen(get_request(url=href))
    html = reply.read().decode('utf8')
    page = BeautifulSoup(html, 'lxml')
    chapter_div = page.find('div', class_='chapter_content')
    return chapter_div.text
def get_text(odiv):
    """Download every chapter linked inside *odiv* and save them to one file.

    Iterates over all <a> tags in the chapter-list container, fetches each
    chapter's body via get_string(), and writes "title\\ntext" entries to
    '三国演义.txt', pausing 2 seconds between requests to be polite.
    """
    # Collect all chapter anchor links from the table-of-contents div.
    on_list = odiv.find_all('a')
    # Context manager ensures the file is closed even if a request fails
    # (the original leaked the handle on any exception).
    with open('三国演义.txt', 'w', encoding='utf8') as fp:
        for on in on_list:
            title = on.string
            print("正在下载-----%s" % title)
            href = 'http://www.shicimingju.com' + on["href"]
            text = get_string(href)
            # BUG FIX: original wrote '/n' (literal slash-n) instead of the
            # newline escape '\n', so titles and chapter bodies ran together.
            fp.write(title + '\n' + text)
            print("结束下载-----%s" % title)
            # Throttle requests so we don't hammer the server.
            time.sleep(2)
def parse_content(content):
    """Parse the index-page HTML and trigger download of all chapters."""
    # Build a soup object over the raw table-of-contents HTML.
    soup = BeautifulSoup(content, 'lxml')
    # Find the div that holds the chapter links, then fetch every chapter.
    mulu_div = soup.find('div', class_='book-mulu')
    get_text(mulu_div)
def get_content(request):
    """Send *request* and return the response body decoded as UTF-8 text."""
    reply = urllib.request.urlopen(request)
    raw = reply.read()
    return raw.decode('utf8')
def get_request(url):
    """Build a urllib Request for *url* carrying a desktop-browser User-Agent.

    The fake User-Agent keeps the site from rejecting the script as a bot.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
    }
    return urllib.request.Request(url=url, headers=browser_headers)
def main():
    """Entry point: download the whole book from shicimingju.com."""
    index_url = 'http://www.shicimingju.com/book/sanguoyanyi.html'
    # Build the request for the table-of-contents page.
    index_request = get_request(index_url)
    # Fetch the index HTML, then parse it to drive the chapter downloads.
    index_html = get_content(index_request)
    parse_content(index_html)


if __name__ == '__main__':
    main()