问答爬取记录

import os
import requests
from bs4 import BeautifulSoup
# HTTP request headers sent with every page fetch: a desktop Chrome user agent
# plus a logged-in session cookie.
# NOTE(review): the cookie below is a personal session credential committed to
# source; consider loading it from an environment variable or a local file.
# The value is assembled with implicit string concatenation (one cookie per
# line) so that no string literal contains a raw line break.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 '
                  'Safari/537.36',
    'cookie': (
        'q_c1=b542702612f44b088719251e9de03c20|1681794946000|1681794946000; '
        '_zap=7ff88844-6984-404a-ab66-d7c352d09d0c; '
        'd_c0=AOAXyLHDpBaPTqffZaEansMxGvvINu2S9WY=|1681794275; '
        'YD00517437729195%3AWM_TID=o2aVAx5dsEdAVQFUAQPALtZcoZyNCFqS; '
        '__snaker__id=Crd1UjkeDW0Bu2wO; '
        'l_cap_id="ZTM5MzY3ZGFhMDA2NDQwOGJkYzcxMTQ3Njk4MTk1NTg=|1682264663|6e2e6779055fc262d3ec4b02e024d20367dd122f"; '
        'r_cap_id="YTY0NDg1MmFmYjRmNDVmMTk0MjMyM2M0YTI0NjgxY2Y=|1682264663|44d8b0a1f147e35370bb28a0d28f84c4b3bd3185"; '
        'cap_id="YWY2NzM3ZTFhODRkNDE3MGFlYjRiOGI5N2Q5YjkyMjI=|1682264663|bab40ea946a1f97da259d15e2d3656c7fc5e6922"; '
        'q_c1=3c56afc024fc426eb65999ca8ebb33cd|1682264685000|1682264685000; '
        '_ga=GA1.2.685732651.1684398623; '
        '_gid=GA1.2.269526236.1684819349; '
        '_xsrf=c6770262-4121-488c-b464-855a60e7bf9e; '
        'Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1684742821,1684750791,1684818211,1684833453; '
        'tst=f; '
        'SESSIONID=kLoSvlxejUBG5q243K1vy4yj32Ry7hP0WxQXKU5p4oP; '
        'JOID=V14WAkuJ99lkFTsVIo1xxl2t_Lcy4b60NmZUaEXmoZ4hIU8kE4RitgQTPRIhKJOd0XeXMglW1RxkjslVqPwBJ6c=; '
        'osd=U1gXBkqN8dhgFD8TI4lwwlus-LY257-wN2JSaUHnpZggJU4gFYVmtwAVPBYgLJWc1XaTNAhS1Bhij81UrPoAI6Y=; '
        'captcha_session_v2=2|1:0|10:1684837077|18:captcha_session_v2|88:NEFpalhzWWRmZkdkbmgvVTlBVzAyQnVkTDRyMUcycFpRaXNnaWRkNi9BMUhxbWVRVElJOW5Scm84dnkwWU1EdA==|687c30f32ed925be500bbdf199111f626ef7ee299b8acf5253e9a280a78e08f8; '
        'gdxidpyhxdE=XyLkzrMsZsNToMu%2FzmXG2BE3yzVSsgnxT6pLq1hEN%2FRl8QlfAcbieij%5Cbam%2FuIcb7TiZjiS%2BIAgEYmv8GEEN88YhOOgC8%2BYTb27Vs3Qy9yMRvkHSfT8SR%2FqKEUck%5CUfgqM%2Fd0cCgGYfO7SK3exoR%5C52ukldTgHr6Sofeg618nI%2B6%2FBIO%3A1684837979195; '
        'YD00517437729195%3AWM_NI=dSTTyNNJMExxzhEGPU0kjRhNg2mMoenpvszYIDWlO1YRB%2F2HxbvEVJBbSFe9MF3wuAk2vEtWWGSB%2Fb1Vj5JX9HwhDegl1h8Oksvo2T90MUz4mBIHxBK4ijp43l%2BdI5wtZ3c%3D; '
        'YD00517437729195%3AWM_NIKE=9ca17ae2e6ffcda170e2e6ee88f867fbec96abd9429b868bb6c45e969a9badc16888899fb2f86a85a9c097d22af0fea7c3b92aa88d9cb5fc468e9683a6fb74bab88ba8d35ff4b5a6b3eb46a5eea3aeee6faeb0ba94f9538b9c998dce3b9b8e8ed0b144bcbebaa6d54282be9da7e8338f9afd98f84ba9eebaa6f75d95999da3ae6af38aa6d6c954fb98b989c76fa5b4ff89bb45a9adba90b160fbf083b6f940af9e87a9fb4a89eca78bf340f1b4e585cc3fb5ac96d4e237e2a3; '
        'z_c0=2|1:0|10:1684837115|4:z_c0|92:Mi4xWTJYLUJnQUFBQUFBNEJmSXNjT2tGaVlBQUFCZ0FsVk4tLUJaWlFCdGtqSUItMUF3WE1ReW9ya0hGUEtQVzh1R3Zn|8b0ec894199d741170ad9d1b05dff3e2437b77463df8e3a7557f9d259fb60ebf; '
        'Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1684837117; '
        'KLBRSID=d6f775bb0765885473b0cba3a5fa9c12|1684837117|1684833451'
    ),
}
def _safe_filename(title):
    """Return *title* as a file-name stem usable on common file systems.

    Path separators, the characters Windows rejects in file names, and ASCII
    control characters are replaced with ``_``; surrounding whitespace and
    trailing dots are dropped. Falls back to ``"untitled"`` when nothing
    usable remains.
    """
    replacements = {ord(ch): "_" for ch in '\\/:*?"<>|'}
    replacements.update({code: "_" for code in range(32)})
    # Windows also rejects names that end in a space or a dot.
    cleaned = title.strip().translate(replacements).rstrip(" .")
    return cleaned or "untitled"


def _write_new_file(stem, lines):
    """Write *lines* to ``<stem>.txt`` without overwriting an existing file.

    When the name is taken, ``_1``, ``_2``, ... is appended to the stem until
    a free name is found. Returns the name of the file that was created.
    """
    suffix = 0
    filename = f"{stem}.txt"
    while True:
        try:
            # Mode "x" fails if the file already exists, so the existence
            # check and the creation happen in a single step.
            out = open(filename, "x", encoding="utf-8")
        except FileExistsError:
            suffix += 1
            filename = f"{stem}_{suffix}.txt"
            continue
        with out:
            out.writelines(f"{line}\n" for line in lines)
        return filename


def query(_url, _headers, timeout=10):
    """Download a question/answer page and save the answer text to a file.

    The page is fetched with ``requests`` and parsed with BeautifulSoup
    (``lxml`` parser). The raw HTML, the question title, two answer-metadata
    selections and the answer text are printed, then the answer text is
    written to ``<title>.txt`` in the current directory (a numeric suffix is
    added if that name already exists).

    Args:
        _url: Address of the page to download.
        _headers: HTTP headers to send with the request.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
        ValueError: If the page has no question title or no answer body.
    """
    res = requests.get(url=_url, headers=_headers, timeout=timeout)
    # Fail here with the HTTP status instead of later on a missing element.
    res.raise_for_status()
    data = res.text
    print(data)
    soup = BeautifulSoup(data, 'lxml')

    title_tag = soup.find('h1', class_='QuestionHeader-title')
    if title_tag is None:
        raise ValueError(f"no 'QuestionHeader-title' heading found at {_url!r}")
    title = title_tag.text
    print("标题:", title)

    meta_selector = (
        '#root > div > main > div > div > div.Question-main > div.ListShortcut > div > div.Card.AnswerCard.css-0 > '
        'div > div > div > div.ContentItem-meta'
    )
    print(soup.select(meta_selector + ' > div:nth-child(3)'))
    print(soup.select(meta_selector))

    content_tag = soup.find('div', class_='RichContent-inner')
    if content_tag is None:
        raise ValueError(f"no 'RichContent-inner' answer body found at {_url!r}")
    content = content_tag.text
    print(content)

    # The title comes from the remote page, so make it a safe file name first.
    filename = _write_new_file(_safe_filename(title), content.splitlines())

    print(f"文件 {filename} 创建成功!")
# Example link: https://www.zhihu.com/question/322856732/answer/1832552633
# Strip surrounding whitespace so a copy-pasted link is requested as intended.
url = input("输入链接:").strip()
query(_url=url, _headers=headers)
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值