爬取某付费网站的文档并保存为 HTML 文件

本文链接：https://blog.csdn.net/m0_57265868/article/details/135873733

import requests
import re
import os
import parsel


# Minimal HTML5 page skeleton used for every saved article. The `{article}`
# placeholder is filled through `str.format(article=...)` with the markup
# extracted from the article page, so the template itself must not contain
# any other literal braces.
html_str = """
<!doctype html>
<html lang="en">
<head>
    <meta charset = "utf-8">
    <title>Document</title>
</head>
<body>
{article}
</body>
</html>
"""
# Output directory for the saved pages, including the trailing separator.
# Built with os.path.join so the separator matches the current platform
# (a hard-coded backslash only works on Windows).
html_filename = os.path.join('html', '')
# exist_ok avoids the check-then-create race of exists() + mkdir().
os.makedirs(html_filename, exist_ok=True)

url = 'https://www.chinawenwang.com/zlist-66-1.html'  # URL of the article list page
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}
# A timeout keeps the script from hanging forever on a stalled connection;
# raise_for_status stops early instead of parsing an error page.
response = requests.get(url=url, headers=headers, timeout=10)
response.raise_for_status()

# Extract the article URLs from the list page.
href = re.findall(r'<h2><a href="(.*?)" class="juhe-page-left-div-link">', response.text)
for link in href:
    # A single unreachable article should not abort the whole run.
    try:
        response_1 = requests.get(url=link, headers=headers, timeout=10)
        response_1.raise_for_status()
    except requests.RequestException as exc:
        print(f'skipped {link}: {exc}')
        continue

    selector = parsel.Selector(response_1.text)
    title = selector.css('.content-page-header-div h1::text').get()
    content = selector.css('.content-page-main-content-div').get()

    # `.get()` returns None when the selector matches nothing; without this
    # guard the title concatenation raises TypeError and a missing body
    # would be written out as the literal text "None".
    if title is None or content is None:
        print(f'skipped {link}: title or content not found')
        continue

    # Replace characters that are illegal in file names (Windows is the
    # strictest) as well as path separators and control whitespace.
    safe_title = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', title).strip()
    if not safe_title:
        print(f'skipped {link}: empty title')
        continue

    article = html_str.format(article=content)
    with open(os.path.join(html_filename, safe_title + '.html'), mode='w', encoding='utf-8') as f:
        f.write(article)
    print(title)

结果展示：