python3爬取某网站文章

南下狩猎的小花猫

于 2024-09-29 16:24:07 发布

阅读量114

点赞数 1

文章标签： python 爬虫

本文链接：https://blog.csdn.net/HSJ0170/article/details/142638280

版权

声明：仅供学习参考，请勿用于非法商业用途，本人概不负责！！！

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Time    : 2024/8/23 12:41
# @Author  : 何胜金-heshengjin
# @Site    :
# @File    : http_test.py
# @Software: PyCharm
"""
虚拟virtualenv
pip install requests
pip install beautifulsoup4
"""

import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Request configuration. Put your own browser details in the headers below
# or the requests will not work; the cookie expires, so re-copy it from the
# browser by hand when that happens.
# (An alternative host, 'www.xdingdian.info', is left disabled.)

# Scheme + host of the target site.
host_http = 'https://www.gushiwen.cn'

# Headers sent with every page request. 'Accept-Encoding' and 'Host' are
# deliberately not part of the dict (they were disabled in the script).
headers = {
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/avif,image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.7'
    ),
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Cache-Control': 'max-age=0',
    'Referer': host_http + '/guwen/book.aspx?id=46653FD803893E4FE03CBAE75DE61AB8',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/127.0.0.0 Safari/537.36'
    ),
    'Cookie': (
        "login=flase; "
        "Hm_lvt_9007fab6814e892d3020a64454da5a55=1727590841; "
        "HMACCOUNT=FF82E8281B1A4114; "
        "Hm_lpvt_9007fab6814e892d3020a64454da5a55=1727594427"
    ),
    'Connection': 'keep-alive',
}

# First page to fetch (start of the book).
main_url = host_http + "/guwen/bookv_d3fab98491f4.aspx"
# Label of the link that led to the current page; the start page is treated
# as if it had been reached through a "next chapter" link.
next_text = '下一章'

# Output file the extracted text is appended to.
content_txt = "百家姓.txt"
# Scratch file that receives the raw HTML of the most recent page.
tmp_html = "temp33.html"


def _require(node, description, page_url):
    """Return *node*, or raise ``RuntimeError`` naming the missing element.

    BeautifulSoup's ``find`` returns ``None`` when nothing matches; checking
    here yields a readable message instead of an ``AttributeError`` later.
    """
    if node is None:
        raise RuntimeError('{} not found on {}'.format(description, page_url))
    return node


def _tidy_for_output(raw_text):
    """Return *raw_text* cleaned up for the output text file.

    Non-breaking spaces are removed, the corner brackets 「 and 」 become
    plain spaces, and each of the three section headings is placed on a line
    of its own.
    """
    # NOTE(review): three further replacements whose search pattern was the
    # empty string were no-ops and are not reproduced; if they were meant to
    # strip invisible characters, add those characters here explicitly.
    cleaned = raw_text.replace('\xa0', '').replace('「', ' ').replace('」', ' ')
    for heading in ('历史来源', '家族名人', '地望分布'):
        cleaned = cleaned.replace(heading, '\n' + heading + '\n')
    return cleaned


# Seconds to wait on the server before a request is abandoned.
_REQUEST_TIMEOUT = 10
# Every URL fetched so far, used to notice a "next" link that loops back.
_fetched_urls = set()

# Crawl loop: fetch the current page, append its text to the output file,
# then follow the last link of the navigation bar. The loop ends when there
# is no link to follow or the link leads to a page that was already fetched.
while True:
    print(time.strftime('%Y-%m-%d %H:%M:%S'))

    _fetched_urls.add(main_url)
    source_html = requests.get(main_url, headers=headers, timeout=_REQUEST_TIMEOUT)
    # Fail on HTTP errors rather than parsing an error page.
    source_html.raise_for_status()
    source_html.encoding = 'utf-8'
    page_text = source_html.text

    # Keep a snapshot of the latest page on disk (overwritten every pass).
    with open(tmp_html, "w", encoding="utf-8") as f:
        f.write(page_text)

    # Normalise line endings to '\n' so the extracted text never carries
    # stray carriage returns into the output file.
    html_handle = page_text.replace('\r\n', '\n').replace('\r', '\n')
    soup = BeautifulSoup(html_handle, "html.parser")

    title_text = ''
    # The heading is only emitted when this page was reached through a
    # "next chapter" link (the start page counts as one).
    if next_text == '下一章':
        sons = _require(soup.find('div', class_='sons'), 'div.sons', main_url)
        cont = _require(sons.find('div', class_='cont'), 'div.cont', main_url)
        title = _require(cont.find('h1'), 'h1 title', main_url).text
        title_text += '\n正文 '
        title_text += title.replace('\n', '').replace('\r', '')
        print(title_text)
        title_text += '\n'

    body = _require(soup.find('div', class_='contson'), 'div.contson', main_url)
    title_text += body.text

    # The link to follow is the last direct <a> child of the navigation bar.
    nav = _require(soup.find('div', class_='bookvmiddle'), 'div.bookvmiddle', main_url)
    children = nav.find_all("a", recursive=False)
    next_href = children[-1].get('href') if children else None
    if next_href:
        # urljoin resolves root-relative, page-relative and absolute hrefs
        # against the page that contained the link.
        main_url = urljoin(main_url, next_href)
        next_text = children[-1].get_text()
        print(next_text + main_url + "\n")

    # Append this page's text before deciding whether to stop, so the final
    # page is never lost.
    with open(content_txt, "a", encoding="utf-8") as fc:
        fc.write(_tidy_for_output(title_text))

    if not next_href:
        print("No link to follow; stopping.")
        break
    if main_url in _fetched_urls:
        print("Next link points to a page already fetched; stopping.")
        break

    # Pause 2 s between requests.
    time.sleep(2)