import requests
import re
import os
import parsel
# Skeleton of the HTML page written for each saved article. The `{article}`
# placeholder is substituted through `str.format(article=...)`; it is the only
# brace pair in the template, so no escaping of literal braces is needed.
html_str = """
<!doctype html>
<html lang="en">
<head>
<meta charset = "utf-8">
<title>Document</title>
</head>
<body>
{article}
</body>
</html>
"""
# Output directory for the saved pages. The value keeps a trailing path
# separator so that a file name can be appended to it directly.
# os.path.join(..., '') appends the separator of the current platform; a
# hard-coded backslash only denotes a directory on Windows.
html_filename = os.path.join('html', '')
# exist_ok=True makes this a no-op when the directory is already present and
# avoids the race between a separate existence check and the creation call.
os.makedirs(html_filename, exist_ok=True)
# Characters that cannot appear in a file name on Windows (this set also covers
# the path separator on POSIX), plus control whitespace that may leak in from
# the page markup.
_INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')
# Upper bound, in seconds, for each HTTP request so a stalled server cannot
# hang the script indefinitely.
_REQUEST_TIMEOUT = 10


def _safe_filename(title):
    """Return *title* with characters that are illegal in file names replaced by ``_``.

    Surrounding whitespace is stripped; the result may be empty.
    """
    return _INVALID_FILENAME_CHARS.sub('_', title).strip()


url = 'https://www.chinawenwang.com/zlist-66-1.html'  # URL of the article list page
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
response = requests.get(url=url, headers=headers, timeout=_REQUEST_TIMEOUT)
# Fail loudly if the list page itself cannot be fetched: nothing can be scraped without it.
response.raise_for_status()
# Extract the URL of every article linked from the list page.
href = re.findall(r'<h2><a href="(.*?)" class="juhe-page-left-div-link">', response.text)
for link in href:
    # A single unreachable article should not abort the whole run.
    try:
        response_1 = requests.get(url=link, headers=headers, timeout=_REQUEST_TIMEOUT)
        response_1.raise_for_status()
    except requests.RequestException as exc:
        print('skipped {}: {}'.format(link, exc))
        continue
    selector = parsel.Selector(response_1.text)
    title = selector.css('.content-page-header-div h1::text').get()
    content = selector.css('.content-page-main-content-div').get()
    # `.get()` returns None when the selector matches nothing; skip such pages
    # instead of crashing on the file name or writing the text "None".
    name = _safe_filename(title) if title else ''
    if not name or content is None:
        print('skipped {}: title or content not found'.format(link))
        continue
    article = html_str.format(article=content)
    with open(os.path.join(html_filename, name + '.html'), mode='w', encoding='utf-8') as f:
        f.write(article)
    print(title)
# Result display: