import os
import requests
from bs4 import BeautifulSoup
# HTTP request headers passed to ``requests.get`` for every page fetch.
#
# 'user-agent' is a desktop Chrome identification string.
# 'cookie' is a raw Cookie header value, presumably copied from a browser
# session so the site serves the full page -- TODO confirm it is required.
#
# NOTE(review): the cookie below is a hard-coded session credential. It will
# expire, and it should not live in source control; consider loading it from
# an environment variable or a local config file instead.
#
# The cookie is split into two adjacent string literals (implicit
# concatenation), because a raw line break inside a single-quoted literal is a
# syntax error.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 '
                  'Safari/537.36',
    'cookie': 'q_c1=b542702612f44b088719251e9de03c20|1681794946000|1681794946000; _zap=7ff88844-6984-404a-ab66-d7c352d09d0c; d_c0=AOAXyLHDpBaPTqffZaEansMxGvvINu2S9WY=|1681794275; YD00517437729195%3AWM_TID=o2aVAx5dsEdAVQFUAQPALtZcoZyNCFqS; __snaker__id=Crd1UjkeDW0Bu2wO; l_cap_id="ZTM5MzY3ZGFhMDA2NDQwOGJkYzcxMTQ3Njk4MTk1NTg=|1682264663|6e2e6779055fc262d3ec4b02e024d20367dd122f"; r_cap_id="YTY0NDg1MmFmYjRmNDVmMTk0MjMyM2M0YTI0NjgxY2Y=|1682264663|44d8b0a1f147e35370bb28a0d28f84c4b3bd3185"; cap_id="YWY2NzM3ZTFhODRkNDE3MGFlYjRiOGI5N2Q5YjkyMjI=|1682264663|bab40ea946a1f97da259d15e2d3656c7fc5e6922"; q_c1=3c56afc024fc426eb65999ca8ebb33cd|1682264685000|1682264685000; _ga=GA1.2.685732651.1684398623; _gid=GA1.2.269526236.1684819349; _xsrf=c6770262-4121-488c-b464-855a60e7bf9e; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1684742821,1684750791,1684818211,1684833453; tst=f; SESSIONID=kLoSvlxejUBG5q243K1vy4yj32Ry7hP0WxQXKU5p4oP; JOID=V14WAkuJ99lkFTsVIo1xxl2t_Lcy4b60NmZUaEXmoZ4hIU8kE4RitgQTPRIhKJOd0XeXMglW1RxkjslVqPwBJ6c=; osd=U1gXBkqN8dhgFD8TI4lwwlus-LY257-wN2JSaUHnpZggJU4gFYVmtwAVPBYgLJWc1XaTNAhS1Bhij81UrPoAI6Y=; captcha_session_v2=2|1:0|10:1684837077|18:captcha_session_v2|88:NEFpalhzWWRmZkdkbmgvVTlBVzAyQnVkTDRyMUcycFpRaXNnaWRkNi9BMUhxbWVRVElJOW5Scm84dnkwWU1EdA==|687c30f32ed925be500bbdf199111f626ef7ee299b8acf5253e9a280a78e08f8; gdxidpyhxdE=XyLkzrMsZsNToMu%2FzmXG2BE3yzVSsgnxT6pLq1hEN%2FRl8QlfAcbieij%5Cbam%2FuIcb7TiZjiS%2BIAgEYmv8GEEN88YhOOgC8%2BYTb27Vs3Qy9yMRvkHSfT8SR%2FqKEUck%5CUfgqM%2Fd0cCgGYfO7SK3exoR%5C52ukldTgHr6Sofeg618nI%2B6%2FBIO%3A1684837979195; YD00517437729195%3AWM_NI=dSTTyNNJMExxzhEGPU0kjRhNg2mMoenpvszYIDWlO1YRB%2F2HxbvEVJBbSFe9MF3wuAk2vEtWWGSB%2Fb1Vj5JX9HwhDegl1h8Oksvo2T90MUz4mBIHxBK4ijp43l%2BdI5wtZ3c%3D; '
              'YD00517437729195%3AWM_NIKE=9ca17ae2e6ffcda170e2e6ee88f867fbec96abd9429b868bb6c45e969a9badc16888899fb2f86a85a9c097d22af0fea7c3b92aa88d9cb5fc468e9683a6fb74bab88ba8d35ff4b5a6b3eb46a5eea3aeee6faeb0ba94f9538b9c998dce3b9b8e8ed0b144bcbebaa6d54282be9da7e8338f9afd98f84ba9eebaa6f75d95999da3ae6af38aa6d6c954fb98b989c76fa5b4ff89bb45a9adba90b160fbf083b6f940af9e87a9fb4a89eca78bf340f1b4e585cc3fb5ac96d4e237e2a3; z_c0=2|1:0|10:1684837115|4:z_c0|92:Mi4xWTJYLUJnQUFBQUFBNEJmSXNjT2tGaVlBQUFCZ0FsVk4tLUJaWlFCdGtqSUItMUF3WE1ReW9ya0hGUEtQVzh1R3Zn|8b0ec894199d741170ad9d1b05dff3e2437b77463df8e3a7557f9d259fb60ebf; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1684837117; KLBRSID=d6f775bb0765885473b0cba3a5fa9c12|1684837117|1684833451',
}
# Seconds to wait for the server before giving up; without a timeout
# ``requests.get`` can block forever on a stalled connection.
_REQUEST_TIMEOUT = 15

# Characters that are rejected in file names on at least one common
# filesystem (path separators and the Windows-reserved punctuation).
_UNSAFE_FILENAME_CHARS = frozenset('\\/:*?"<>|')


def _safe_stem(title):
    """Return *title* with characters that are unsafe in file names replaced."""
    cleaned = "".join(
        "_" if ch in _UNSAFE_FILENAME_CHARS or ord(ch) < 32 else ch
        for ch in title
    ).strip(" .")  # Windows rejects names ending in a dot or a space
    return cleaned or "untitled"


def _write_unique(stem, text):
    """Write *text* to the first unused ``stem[_N].txt`` and return that name."""
    filename = f"{stem}.txt"
    suffix = 0
    while True:
        try:
            # Mode "x" creates the file only if it does not exist yet, so the
            # existence check and the creation cannot race with each other.
            out = open(filename, "x", encoding="utf-8")
        except FileExistsError:
            # Name already taken: append an increasing numeric suffix.
            suffix += 1
            filename = f"{stem}_{suffix}.txt"
            continue
        with out:
            # splitlines() + "\n" normalises every line ending to "\n".
            out.writelines(line + "\n" for line in text.splitlines())
        return filename


def query(_url, _headers):
    """Fetch a page, print its title and content, and save the content to disk.

    The page at ``_url`` is downloaded with ``_headers`` and parsed with
    BeautifulSoup. The text of the first ``h1.QuestionHeader-title`` element is
    used as the title and the text of the first ``div.RichContent-inner``
    element as the content. The content is written to ``<title>.txt`` in the
    current working directory; if that name already exists, ``<title>_1.txt``,
    ``<title>_2.txt``, ... are tried in turn. Characters in the title that are
    not safe in file names are replaced with ``_``.

    The raw HTML, the title, two CSS-selector matches and the content are
    printed to stdout along the way.

    Args:
        _url: Address of the page to download.
        _headers: HTTP request headers to send with the request.

    Raises:
        requests.RequestException: The request failed, timed out, or the
            server answered with an error status.
        ValueError: The title or content element is missing from the page.
    """
    res = requests.get(url=_url, headers=_headers, timeout=_REQUEST_TIMEOUT)
    # Fail early on 4xx/5xx instead of parsing an error page.
    res.raise_for_status()
    data = res.text
    print(data)

    soup = BeautifulSoup(data, 'lxml')

    title_tag = soup.find('h1', class_='QuestionHeader-title')
    if title_tag is None:
        raise ValueError(f"no 'h1.QuestionHeader-title' element found at {_url!r}")
    title = title_tag.text
    print("标题:", title)

    print(soup.select(
        '#root > div > main > div > div > div.Question-main > div.ListShortcut > div > div.Card.AnswerCard.css-0 > '
        'div > div > div > div.ContentItem-meta > div:nth-child(3)'))
    print(soup.select(
        '#root > div > main > div > div > div.Question-main > div.ListShortcut > div > div.Card.AnswerCard.css-0 > '
        'div > div > div > div.ContentItem-meta'))

    content_tag = soup.find('div', class_='RichContent-inner')
    if content_tag is None:
        raise ValueError(f"no 'div.RichContent-inner' element found at {_url!r}")
    content = content_tag.text
    print(content)

    # Choose a file name derived from the title and write the content to it.
    filename = _write_unique(_safe_stem(title), content)
    print(f"文件 {filename} 创建成功!")
def main():
    """Prompt for a page link on stdin and save that page via ``query``."""
    # Example link: https://www.zhihu.com/question/322856732/answer/1832552633
    # Strip surrounding whitespace: a pasted link with a leading space is
    # rejected by requests as having no valid scheme.
    url = input("输入链接:").strip()
    if not url:
        # Nothing was entered ("no link entered"); exit instead of requesting "".
        raise SystemExit("未输入链接")
    query(_url=url, _headers=headers)


# Run only when executed as a script, so importing this module does not
# prompt for input or hit the network.
if __name__ == "__main__":
    main()
# 问答爬取记录 (Q&A scraping notes)
# 于 2023-05-23 18:23:44 首次发布 (first published 2023-05-23 18:23:44)