import urllib.request
from lxml import etree
def chu_url(url, shuhao):
    """Open the page for one book and return the response.

    The address requested is ``url`` with ``shuhao`` appended verbatim, and
    the request carries a desktop Chrome ``User-Agent`` header.

    Args:
        url: Base address that the book id is appended to.
        shuhao: Book id, concatenated directly onto ``url``.

    Returns:
        The response object produced by ``urllib.request.urlopen``. It is
        returned unread and still open.
    """
    request = urllib.request.Request(
        url + shuhao,
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36",
        },
    )
    return urllib.request.urlopen(request)
def wanmei_spider(res):
    """Scrape every chapter linked from a book's index page.

    Reads the index page from ``res``, collects the chapter links found
    under the ``box_con`` div, then downloads each chapter and extracts its
    heading and body text.

    Args:
        res: Readable response for the book index page (only ``read()`` is
            called on it), e.g. the value returned by ``chu_url``.

    Returns:
        A dict with two keys: ``'章节'`` holds the heading text nodes and
        ``'内容'`` holds the body text nodes, accumulated over all chapters
        in the order their links appear on the index page. Chapter
        boundaries are not marked inside the lists. The dict is empty when
        the index page yields no chapter links.
    """
    index_tree = etree.HTML(res.read())
    chapter_links = index_tree.xpath("//div[@class='box_con']//dl//dd/a/@href")

    shus = {}
    for link in chapter_links:
        # Go through chu_url (empty suffix) so chapter requests carry the
        # same User-Agent header as the index request, and close each
        # response as soon as its body has been parsed.
        with chu_url(link, "") as chapter_res:
            chapter_tree = etree.HTML(chapter_res.read())

        # Accumulate rather than assign: plain assignment would keep only
        # the last chapter even though every chapter was downloaded.
        shus.setdefault('章节', []).extend(
            chapter_tree.xpath("//div[@class='bookname']/h1/text()")
        )
        shus.setdefault('内容', []).extend(
            chapter_tree.xpath("//div[@id='content']/text()")
        )
    return shus
def main():
    """Prompt for a book id, scrape that book, and return the scraped data.

    The book id is the path segment after the site root; for example, in
    https://www.biquge5200.com/0_9/ the book id is ``0_9``.

    Returns:
        Whatever ``wanmei_spider`` returns for the requested book.
    """
    site_root = "https://www.biquge5200.com/"
    book_id = input("请输入书号")
    index_response = chu_url(site_root, book_id)
    return wanmei_spider(index_response)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()