# coding=utf-8
import urllib.request
import ssl
import re
# Helper functions called by the script entry point
def getHtml(url):
    """Fetch ``url`` and return the response body decoded as UTF-8 text.

    Args:
        url: Address to open with ``urllib.request.urlopen``.

    Returns:
        The full response body as ``str``.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # NOTE(security): this swaps the process-wide default HTTPS context
    # factory for one that skips certificate verification, so later HTTPS
    # requests that rely on the default context are unverified as well.
    # Kept for backward compatibility; prefer a verified context if the
    # target site's certificate chain allows it.
    ssl._create_default_https_context = ssl._create_unverified_context
    # The context manager closes the response (and its underlying socket)
    # even when read() raises, so the connection is never leaked.
    with urllib.request.urlopen(url) as page:
        html = page.read()
    return html.decode('utf-8')
def scriptHtmlKind(data, reg):
    r"""Return every non-overlapping match of the pattern ``reg`` in ``data``.

    The script uses this to pull the category navigation block, and then
    the individual category links, out of the downloaded home page.

    Args:
        data: Text to search.
        reg: Regular-expression pattern.

    Returns:
        The list produced by ``findall``: whole-match strings when the
        pattern has no groups, the group text when it has one group, and
        tuples of group texts when it has several. Empty when nothing
        matches.

    Example:
        >>> scriptHtmlKind('11爱woni000', r'\d{2}[\u4e00-\u9fa5]+\w{4}\d{3}')
        ['11爱woni000']
    """
    return re.compile(reg).findall(data)
# Handle the overall listing
def kindAllBoss(url):
    """Handle one category page; for now this only prints its URL.

    Args:
        url: Address of the category page.

    Returns:
        None.
    """
    print(url)
    # TODO: fetch the category page (e.g. with getHtml(url)) and extract
    # its nodes.
if __name__ == '__main__':
    # Site root; the category paths extracted below are appended to it.
    url = 'https://www.bxwxorg.com/'
    # Download the home page.
    data = getHtml(url)
    # Isolate the <div class="nav"> block that holds the category links.
    reg = r'<div class="nav">\s*<ul>[\u4e00-\u9fa50-9a-zA-Z\<\>\\\"\s\=\:\/\/\.]*?</div>'
    kindDom = scriptHtmlKind(data, reg)
    if not kindDom:
        # Fail loudly here: without this guard the kindDom[0] lookup below
        # would die with an IndexError that says nothing about the cause.
        raise AssertionError('category navigation block not found on ' + url)
    # Each match is a (path, label) pair taken from one category link.
    reg2 = r'.com/(\w*\/)">([\u4e00-\u9fa5a-zA-Z]*)'
    kindList = scriptHtmlKind(kindDom[0], reg2)
    # Keep only the paths (the list meant to be stored); labels are unused.
    kinds = [path for path, _label in kindList]
    # Build each category URL and process that category's page.
    for kind in kinds:
        kindAllBoss(url + kind)