爬取‘史书典籍’

最新推荐文章于 2024-09-14 19:55:48 发布

Gxjd_Mr_蒋

最新推荐文章于 2024-09-14 19:55:48 发布

阅读量1.1k

点赞数

分类专栏：初学爬虫　文章标签：python

本文链接：https://blog.csdn.net/qq_41053487/article/details/100513384

版权

初学爬虫专栏收录该内容

1 篇文章 0 订阅

订阅专栏

'''
http://www.shicimingju.com/book/index.html

'''

import urllib.request
import urllib.parse
from lxml import etree
import os
import time
from urllib.parse import urljoin

class Choose(object):
    """Interactive picker that turns a catalogue number into a book URL."""

    @staticmethod
    def get_url():
        """Show the catalogue, ask the user for a number and return that book's URL.

        The catalogue page is downloaded through ``ShiShuBook.get_html``; every
        entry is printed as "<code> <title>", five entries per row. The text the
        user types, followed by '、', must equal one of the scraped codes.

        Returns:
            The absolute URL of the selected book's index page.

        Raises:
            SystemExit: If the entered number matches no catalogue code.
        """
        url = 'http://www.shicimingju.com/book/'
        b = ShiShuBook(url)
        tree = etree.HTML(b.get_html(url))
        mulu_code = tree.xpath("//div[@class='bookmark-list']//h2/text()")
        mulu_bt = tree.xpath("//div[@class='bookmark-list']//a/text()")
        mulu_url = tree.xpath("//div[@class='bookmark-list']//a/@href")

        for position, (code, name) in enumerate(zip(mulu_code, mulu_bt), 1):
            print(code, name, end=' ')
            if position % 5 == 0:
                print('\n')
        print('\n')

        num = input("请输入要下载的序号：")
        snum = num + '、'
        if snum not in mulu_code:
            print('没有找到…')
            # Same effect as exit(), without relying on the `site` helper.
            raise SystemExit

        # Use the position of the matched code, so the chosen link is the one
        # that was printed next to that code even if codes are not 1..N.
        return urljoin(url, mulu_url[mulu_code.index(snum)])

class ShiShuBook(object):
    """Download one book from shicimingju.com as one text file per chapter.

    Typical use is ``ShiShuBook(book_url).run()``, or ``choose()`` to pick the
    book interactively from the site's catalogue first.
    """

    def __init__(self, url):
        """Store the book's index page and derive the site root from it.

        Args:
            url: Address of the book's table-of-contents page.
        """
        self.url = url
        # Site root (scheme + host + "/"); chapter links are resolved against it.
        self.y_url = urljoin(url, '/')

    def get_html(self, url):
        """Fetch *url* with a browser-like User-Agent and return the body as text.

        Args:
            url: Page to download.

        Returns:
            The response body decoded with the default codec (UTF-8).
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
        }
        request = urllib.request.Request(url=url, headers=headers)
        # The context manager closes the connection even if read/decode fails.
        with urllib.request.urlopen(request) as response:
            return response.read().decode()

    def title_subtitle_suburl(self, html):
        """Extract the book title and its chapter list from an index page.

        Args:
            html: Markup of the book's table-of-contents page.

        Returns:
            A tuple ``(book_title, sub_title, sub_url)`` of three lists: the
            text of ``div/h1`` nodes, the chapter link texts and the chapter
            link hrefs found under ``div.book-mulu``.
        """
        tree = etree.HTML(html)
        book_title = tree.xpath('//div/h1/text()')
        sub_title = tree.xpath('//div[@class="book-mulu"]//ul/li/a/text()')
        sub_url = tree.xpath('//div[@class="book-mulu"]//ul/li/a/@href')
        return (book_title, sub_title, sub_url)

    def get_text(self, html):
        """Return every text fragment inside the chapter's content container.

        Args:
            html: Markup of a chapter page.

        Returns:
            A list of strings, one per text node under ``div.chapter_content``.
        """
        tree = etree.HTML(html)
        return tree.xpath('//div[@class="chapter_content"]//text()')

    def save_file_content(self, book_title, sub_t, sub_u):
        """Download one chapter and write it to ``<book>/<chapter>.txt``.

        Args:
            book_title: List whose first item is the book name (used as the
                output directory).
            sub_t: Chapter title; spaces become '-' in the file name.
            sub_u: Chapter link, resolved against the site root.

        Raises:
            IndexError: If *book_title* is empty.
        """
        book = book_title[0]
        os.makedirs(book, exist_ok=True)

        sub_t = sub_t.replace(' ', '-').replace('--', '-')
        # os.path.join instead of a hard-coded backslash, so the file lands
        # inside the book directory on every platform.
        file_name = os.path.join(book, sub_t + '.txt')

        chapter_url = urljoin(self.y_url, sub_u)
        fragments = self.get_text(self.get_html(chapter_url))

        with open(file_name, 'w', encoding='utf-8') as fp:
            fp.writelines(fragments)
        print("{}  下载完成".format(sub_t))
        # Pause between chapters to avoid hammering the server.
        time.sleep(1)

    def run(self):
        """Download every chapter listed on ``self.url``.

        Raises:
            IndexError: If the index page yields no book title.
        """
        html = self.get_html(self.url)
        book_title, sub_title, sub_url = self.title_subtitle_suburl(html)
        print("开始下载 {} ，请稍侯......".format(book_title[0]))
        for chapter_title, chapter_url in zip(sub_title, sub_url):
            self.save_file_content(book_title, chapter_title, chapter_url)
        print("{} 全部下载完毕！！！".format(book_title[0]))

    def choose(self):
        """Show the site catalogue, ask for a number and download that book.

        Every entry is printed as "<code> <title>", five entries per row. The
        text the user types, followed by '、', must equal one of the scraped
        codes; the matching link becomes ``self.url`` and ``run()`` is called.

        Raises:
            SystemExit: If the entered number matches no catalogue code.
        """
        url = 'http://www.shicimingju.com/book/'
        tree = etree.HTML(self.get_html(url))
        mulu_code = tree.xpath("//div[@class='bookmark-list']//h2/text()")
        mulu_bt = tree.xpath("//div[@class='bookmark-list']//a/text()")
        mulu_url = tree.xpath("//div[@class='bookmark-list']//a/@href")

        for position, (code, name) in enumerate(zip(mulu_code, mulu_bt), 1):
            print(code, name, end=' ')
            if position % 5 == 0:
                print('\n')
        print('\n')

        num = input("请输入要下载的序号：")
        snum = num + '、'
        if snum not in mulu_code:
            print('没有找到......')
            # Same effect as exit(), without relying on the `site` helper.
            raise SystemExit

        # Use the position of the matched code, so the chosen link is the one
        # that was printed next to that code even if codes are not 1..N.
        self.url = urljoin(url, mulu_url[mulu_code.index(snum)])
        self.run()