来源:https://blog.lqsos.com/archives/32.html
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# author=He
import xml.dom.minidom
import datetime
from urllib import request
from bs4 import BeautifulSoup
# Root URL of the site to crawl; also used as the prefix that identifies
# same-site links.
URL = 'https://blog.lqsos.com'
# Every same-site URL discovered so far (used as an ordered set: key == value).
URL_LIST = {}
# Request headers that imitate a desktop browser.
# The User-Agent is built from adjacent string literals; the original text
# contained stray backslashes ("\ ", "\K") that are invalid escape sequences
# and ended up verbatim in the header value.
HEADER = {
    'Cookie': 'AD_RS_COOKIE=20080917',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/58.0.3029.110 Safari/537.36',
}
def get_http(url, headers=None, charset='utf8', timeout=30):
    """
    Fetch a URL and return the response body decoded as text.

    The request is best-effort: any failure (malformed URL, network or HTTP
    error, timeout, undecodable body) yields an empty string instead of an
    exception.

    :param url: address to request
    :param headers: optional dict of request headers; None means no extra headers
    :param charset: codec used to decode the response bytes
    :param timeout: seconds to wait on blocking network operations, so one
        stalled server cannot hang the caller forever
    :return: the decoded body, or '' when the request or decoding fails
    """
    if headers is None:
        headers = {}
    try:
        req = request.Request(url=url, headers=headers)
        # The context manager closes the response (and its socket) even when
        # read() or decode() raises.
        with request.urlopen(req, timeout=timeout) as response:
            return response.read().decode(charset)
    except Exception:
        # Deliberately broad: callers treat an unreachable or unreadable page
        # as an empty document.
        return ''
def open_url(url):
    """
    Fetch a page and collect the same-site links it contains.

    Every anchor whose href is accepted by foreign_chain() is recorded both
    in the returned dict and in the module-level URL_LIST.

    :param url: address of the page to fetch and parse
    :return: dict mapping each accepted href to itself
    """
    soup = BeautifulSoup(get_http(url=url, headers=HEADER), 'html.parser')
    url_list = {}
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        # Anchors without an href attribute yield None; skip them instead of
        # passing a non-string to the prefix check.
        if not isinstance(href, str):
            continue
        if foreign_chain(href):
            url_list[href] = href
            URL_LIST[href] = href
    return url_list
def foreign_chain(url):
    """
    Check whether a link belongs to the site being crawled.

    Note that, despite the function name, a True result means the link is a
    same-site link: the test is whether the link starts with the module-level
    URL prefix.

    :param url: href value to test; non-string values such as None are rejected
    :return: True when url is a string beginning with URL, otherwise False
    """
    # Anchors without an href produce None, which has no string methods.
    if not isinstance(url, str):
        return False
    return url.startswith(URL)
# Home page: collect the same-site links found on the start URL.
home_all_url = open_url(URL)
# Pages that have already been fetched, so that no page is downloaded twice.
# open_url() only ever returns a dict of links that already passed
# foreign_chain(), so no further type or domain check is needed here.
_visited = {URL}
# First pass: open every link found on the home page.
for home_url in home_all_url:
    if home_url not in _visited:
        _visited.add(home_url)
        open_url(home_url)
# Second pass: open every link discovered so far. Iterate over a snapshot
# because open_url() keeps adding entries to URL_LIST while we loop.
URL_LIST_COPY = URL_LIST.copy()
for copy_i in URL_LIST_COPY:
    if copy_i not in _visited:
        _visited.add(copy_i)
        open_url(copy_i)
# Build the sitemap document.
doc = xml.dom.minidom.Document()
root = doc.createElement('urlset')
# Namespace and schema attributes on the root node.
root.setAttribute('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance')
root.setAttribute('xmlns', 'http://www.sitemaps.org/schemas/sitemap/0.9')
root.setAttribute('xsi:schemaLocation',
                  'http://www.sitemaps.org/schemas/sitemap/0.9 '
                  'http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd')
doc.appendChild(root)
# Compute the date once so every entry carries the same lastmod value, even
# if the run crosses midnight.
today = str(datetime.datetime.now().date())
# One <url> entry (loc, lastmod, priority) per discovered link.
for url_list_i in URL_LIST:
    nodeUrl = doc.createElement('url')
    nodeLoc = doc.createElement('loc')
    nodeLoc.appendChild(doc.createTextNode(str(url_list_i)))
    nodeLastmod = doc.createElement("lastmod")
    nodeLastmod.appendChild(doc.createTextNode(today))
    nodePriority = doc.createElement("priority")
    nodePriority.appendChild(doc.createTextNode('1.0'))
    nodeUrl.appendChild(nodeLoc)
    nodeUrl.appendChild(nodeLastmod)
    nodeUrl.appendChild(nodePriority)
    root.appendChild(nodeUrl)
# Open the file as UTF-8 so the bytes on disk match the encoding named in the
# XML declaration, and use a context manager so the file is flushed and closed.
with open('sitemap.xml', 'w', encoding='utf-8') as fp:
    doc.writexml(fp, indent='\t', addindent='\t', newl='\n', encoding="utf-8")
通过 Linux 的 crontab 命令定时运行该脚本，即可定期更新 sitemap.xml 文件。