import os
import urllib.parse
import urllib.request
from multiprocessing.dummy import Pool

import lxml.etree
def get_start_page(url):
    """
    Request the novel index page over HTTP.

    :param url: URL to request
    :return: decoded HTML of the page
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    request = urllib.request.Request(url=url, headers=headers)
    # Context manager closes the connection even on error — the original
    # never closed the response object (socket leak).
    with urllib.request.urlopen(request) as response:
        content = response.read()
    # The page is GB2312-encoded; gbk is a superset of GB2312, so decode with gbk.
    html = content.decode('gbk')
    print(html)
    return html
def get_content_page(url):
    """
    Request a chapter page over HTTP.

    :param url: URL to request
    :return: decoded HTML of the page
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    request = urllib.request.Request(url=url, headers=headers)
    # Context manager closes the connection even on error — the original
    # never closed the response object (socket leak).
    with urllib.request.urlopen(request) as response:
        content = response.read()
    # The page is GB2312-encoded; gbk is a superset of GB2312, so decode with gbk.
    html = content.decode('gbk')
    print(html)
    return html
def load_start_page(name, encoding=None):
    """
    Load a previously saved page from disk.

    :param name: path of the saved HTML file
    :param encoding: text encoding to read with; None keeps the platform
        default (backward compatible). Pass 'gbk' for pages saved as raw
        bytes from this site.
    :return: file contents as a string
    """
    # Explicit encoding avoids platform-dependent decoding of saved pages.
    with open(name, "r", encoding=encoding) as f:
        html = f.read()
    print(html)
    return html
def xpath_parse_start_page(html):
    """
    Extract the chapter links from the index page.

    :param html: index page HTML
    :return: list of href values for the chapter pages
    """
    tree = lxml.etree.HTML(html, lxml.etree.HTMLParser())
    # Chapter links live in the cellpadding="8" table of the index page.
    hrefs = tree.xpath('//tr/td/table[@cellpadding="8"]/tbody/tr/td/a/@href')
    print(hrefs)
    return hrefs
def xpath_parse_content_page(html):
    """
    Extract the chapter title and paragraph texts from a chapter page.

    :param html: chapter page HTML
    :return: tuple of (title, list of paragraph strings); raises IndexError
        when the page has no <font> element (unchanged from the original)
    """
    tree = lxml.etree.HTML(html, lxml.etree.HTMLParser())
    # First <font> text node on the page is the chapter title.
    title = tree.xpath('//font/text()')[0]
    paragraphs = tree.xpath('//p/text()')
    return title, paragraphs
def save_content(title, contents):
    """
    Save chapter paragraphs to 动物庄园/<title>.txt.

    :param title: file name stem (the chapter title)
    :param contents: iterable of paragraph strings
    :return: None
    """
    out_dir = "动物庄园"
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, title + ".txt")
    with open(out_path, "w", encoding='utf-8') as f:
        # Fragments of length <= 2 are stray whitespace/markup residue; skip them.
        f.writelines(paragraph for paragraph in contents if len(paragraph) > 2)
def write_html(url):
    """
    Download one chapter page, parse out its title and text, and save it.

    :param url: chapter page URL
    :return: None
    """
    page = get_content_page(url)
    title, paragraphs = xpath_parse_content_page(page)
    save_content(title, paragraphs)
if __name__ == "__main__":
    start_url = 'https://www.kanunu8.com/book3/6879/'
    html = get_start_page(url=start_url)
    # html = load_start_page('animal.html')
    urls = xpath_parse_start_page(html)
    # The index page links may be relative (e.g. '131785.html'); resolve
    # them against the index URL so the workers can fetch them directly.
    # urljoin leaves already-absolute URLs untouched.
    full_urls = [urllib.parse.urljoin(start_url, u) for u in urls]
    # The pool's context manager terminates the worker threads when the
    # map completes — the original never called pool.close()/join().
    with Pool(4) as pool:
        pool.map(write_html, full_urls)