参考博客:基于Python知乎回答爬虫 +jieba关键字统计可视化_知乎爬虫搜索关键词_菠萝柚王子的博客-CSDN博客
1、安装依赖包
import certifi
import jieba
import numpy
import requests
from lxml import etree
from PIL import Image
from wordcloud import WordCloud
手动安装插件
1、下载插件包,解压到路径Python3\Lib\site-packages
2、进入插件包,执行 python setup.py install命令进行安装
2、爬取问题答案
def fetch_text():
    """Scrape every answer of one Zhihu question and dump them to 'anwsers.txt'.

    Side effects: HTTP GETs against the question page (to print its keywords)
    and the answer-feed API, plus a UTF-8 text file written in the working
    directory. No return value.

    NOTE(review): verify=False disables TLS certificate checking, and the
    Cookie headers below are personal session credentials — prefer
    verify=True (certifi is already imported) and externalized secrets
    once the environment allows it.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
               'Cookie': '_zap=45ffaab9-c328-4843-b252-1f521d538595; _xsrf=0c0fc9f1-b57a-43f0-a9c1-c95ec11e59fb; d_c0=AKBYoDTQaRaPTuBvfd2jHmPLHLky7s_fjJ4=|1677838132; '
                         'KLBRSID=ca494ee5d16b14b649673c122ff27291|1677897534|1677897456; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1677838134; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1677897488;'
                         ' captcha_session_v2=2|1:0|10:1677897488|18:captcha_session_v2|88:'
                         'WjlkMlJMeWNSUGQzV1M1ZzVqYVJqR0NUOGVyS1JBWlovZnYxRUVudEN6VnNUc0pVZWhwZGxsZStjWTBJd1Z2Zw==|8c57cc723eb09a7831a22c18cf6a4bcf1b906cf377457432e4abf75a67fd3e5c; '
                         'gdxidpyhxdE=8eQjbxDfuEq18dZi20NAsNH%5C6YR%2Fe0ojcGKE%2BSzWTMZ%2F4fn2DbAIOK%2FdTeKrJMVjfZhwRN3Hm00KqXIMm4RMgN4qJ7sPKWI0gl07p3C6tT9oipWWGlnI7mIQDtqrL8M%2BkSc5z4mdOzT7LOluNpqStLP9r'
                         'PqEgwypqOf7HPppLG4Kvn28%3A1677898391211; YD00517437729195%3AWM_NI=WpDYVR1YaOo%2FKnQBpVcPHYko%2F6Rhxi%2FZOqrVf9HcRCbMlsl5heAV5MD5J9tx0mLPUjUPiRx2iTB%2BQUsKsPZmeYsEE5gYbKckD4EFCv'
                         '060QUMYmbK7IXwRpdNAwcrOKhvZ3I%3D; YD00517437729195%3AWM_NIKE=9ca17ae2e6ffcda170e2e6ee98d44db69fa18ebc4d92e78ea3c54f978a9eb0d46f8fb8a197ea6d89b8a4a5f82af0fea7c3b92a95b08cbad55'
                         'af387afb0ee2589b29fa3f35ea19088baea65edea9e89fb63ed868192f15ea6e78884d149a293a494ef3facbb8fb7b84788eb86daf774f4b384b1d8629790a7b3fb3aab8c8eccd45fb195bdb1d967a787bda6c83fa5'
                         '87aeace26188ec8ba7c27ef4ea00b2d8479bb3aeacb6439ca99e91e74683a9ad93ce41acaa96d4cc37e2a3; YD00517437729195%3AWM_TID=thaY088YLfdBVERQQRbVLI09rkwOhxWv; '
                         'ariawapChangeViewPort=false; ariaFixed=true; ariaReadtype=1; ariaoldFixedStatus=false; ariaStatus=false'}
    url = 'https://www.zhihu.com/question/308447090'
    response = requests.get(url, headers=headers, verify=False)
    html = etree.HTML(response.text)
    # Question keywords come from the page's <meta itemprop="keywords"> tag.
    # (The original also parsed the question title into an unused local; dropped.)
    keyword = html.xpath("//div/meta[@itemprop='keywords']")[0].get("content")
    print(keyword)
    aheader = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0',
               'Accept': '*/*',
               'Accept-Language': 'en-US,en;q=0.5',
               'Accept-Encoding': 'gzip, deflate, br',
               # NOTE(review): Referer carries a stray trailing comma — kept
               # as-is since the server evidently accepts it; confirm intent.
               'Referer': 'https://www.zhihu.com/question/308447090,',
               'Cookie': '_zap=45ffaab9-c328-4843-b252-1f521d538595; _xsrf=0c0fc9f1-b57a-43f0-a9c1-c95ec11e59fb; d_c0=AKBYoDTQaRaPTuBvfd2jHmPLHLky7s_fjJ4=|1677838132; '
                         'KLBRSID=81978cf28cf03c58e07f705c156aa833|1677922162|1677922052; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1677838134; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1677922160; '
                         'captcha_session_v2=2|1:0|10:1677922161|18:captcha_session_v2|88:RzJCaGUwdWdQNXFxTGY4dHpSbVpvQVF4aHJ2Uk90SDRONVg5dUk0VmF0cWVaWnR5S1dJZnVOeDFIK3pZSVJkZg==|'
                         '8a6bd71d3cb5982308db4766953216ff6683fd1b7bd67324b7c55b1891fccaa0; gdxidpyhxdE=mJsIftPnWfKT9Vw8E082q719gmcIQISluGTpMDV%2BTocfs92oLE%2BujV6Xl%2FIQYxeK980UdIiYBxr7nrgP2Vv%2Bnv'
                         'LmqAWPy27YaL%2BmS9RRjQQydRA7cjoD8M%5Cf8kbaB1nMIMX45%5Cp5I48E2EjZiAOaPXbci9gI88A8r8qjVO%2BL3ohXe0lB%3A1677922765564; YD00517437729195%3AWM_NI=ruHLHxPBw1sq%2BFlGPhpu8bFoOhP%2BlZAhWu9'
                         'SimALgSDRgGW1rv9hl15B51cYlxxaY2cI87hRYbU3SXvKgMSeOBV8E%2FnFqi4unZOayj5qgj%2BGOV%2FDPh67LewEEONTU7%2BWaHQ%3D; YD00517437729195%3AWM_NIKE=9ca17ae2e6ffcda170e2e6eebadc34a9869c9be83f8de78a'
                         'a3d44e839e8facd8418fedafa5b740a399f9d2e92af0fea7c3b92ab2ab88adcb74949eb9d2f27be99aa4a8d364b4abf883c5658db8a0ccef5283b89996bc59abb38fade659888ea9a8d63bb2aaf789ef3bf79efbd2e253b0a8b882d'
                         'c3aa197f8d8eb5a9bbe9bd4d254b695a9aed26f8d9e9ad9c825ae9abf8ab77fae868182fc62f7afa98baa4d8d8ae1a5d364f6e98486f64f879182b5d83de989afa9d437e2a3; YD00517437729195%3AWM_TID=thaY088YLfdBVERQ'
                         'QRbVLI09rkwOhxWv; ariawapChangeViewPort=false; ariaFixed=true; ariaReadtype=1; ariaoldFixedStatus=false; ariaStatus=false'}
    question_id = 308447090  # Zhihu question id
    interval = 5             # answers fetched per API page
    offset = 0
    end = 1202               # total answer count (hard upper bound on paging)
    i = 1                    # running answer index written into the markers
    # 'with' guarantees the file is closed even when a request/parse raises —
    # the original closed it manually only on the happy path.
    with open('anwsers.txt', 'w', encoding="utf8") as file:
        while True:
            aurl = f'https://www.zhihu.com/api/v4/questions/{question_id}/feeds?include=content&limit={interval}&offset={offset}&order=default'
            aresponse = requests.get(aurl, headers=aheader, verify=False)
            anws = aresponse.json()["data"]
            print(len(anws))
            if not anws:
                # API exhausted earlier than `end` — stop instead of issuing
                # empty requests until the hard-coded bound is reached.
                print('结束!!!')
                break
            for anw in anws:
                content = anw["target"]["content"]
                excerpt = anw["target"]["excerpt"]
                file.write("anwser start~~~" + str(i) + "~~~\n")
                file.write(content + '\n')
                file.write(excerpt + '\n')
                file.write("anwser end~~~" + str(i) + "~~~\n")
                i += 1
            offset += interval
            if offset >= end:
                print('结束!!!')
                break
3、截取分析答案
def is_han(text):
    """Return True when every character of *text* falls inside the CJK
    Unified Ideographs range (U+4E00..U+9FFF).

    Vacuously True for an empty string, matching ``all()`` semantics.
    """
    for ch in text:
        if not ('\u4e00' <= ch <= '\u9fff'):
            return False
    return True
def count_worlds():
    """Tokenize the scraped answers, count Han-only words, and render a word cloud.

    Reads './anwsers.txt', prints the (word, count) pairs sorted by descending
    frequency, and writes the cloud image to 'result.png' using '1.png' as the
    shape mask. No return value.
    """
    # Close the input file deterministically — the original leaked the handle
    # via open(...).read().
    with open('./anwsers.txt', 'r', encoding='utf-8') as f:
        text = f.read()
    word_count = {}
    text_list = jieba.lcut(text, cut_all=True)  # full-mode segmentation
    new_text_list = []
    for content in text_list:
        # Keep only multi-character, all-Han tokens.
        if len(content) <= 1:
            continue
        if is_han(content):
            word_count[content] = word_count.get(content, 0) + 1
            new_text_list.append(content)
    generate_text = ' '.join(new_text_list)
    sort_txt = sorted(word_count.items(), key=lambda a: a[1], reverse=True)
    print(sort_txt)
    mask_pic = numpy.array(Image.open('1.png'))  # background/mask image
    wordcloud = WordCloud(font_path=r"C:/Windows/Fonts/STSONG.TTF",
                          collocations=False,
                          max_words=100,
                          min_font_size=10,
                          max_font_size=500,
                          mask=mask_pic).generate(generate_text)
    wordcloud.to_file('result.png')  # save the rendered cloud