Zhihu Crawler by Question ID

Python crawler

Load the page from the question URL, filter the content, and save the answer text and image URLs.

  • A quick-and-dirty script that grabs the answer text and images under a Zhihu question

Most of the time actually went into tweaking the layout of the saved output file.

It runs as-is; the code is as follows:

import re
import json
import requests
import urllib3

urllib3.disable_warnings()

t = 30  # upvote threshold: only keep answers with more than t upvotes

# Question 26830927: "Which place in China has the most beautiful natural scenery?"
qid = 26830927

# image URLs already seen, to avoid writing duplicates
img_urls = []


def get_answers():
    page_no = 0
    with open("answer.cache", "a", encoding="utf-8") as answer_cache:
        while True:
            print(page_no + 1)
            answer_cache.write("Page {}:\t\n".format(page_no + 1))
            is_end = get_answers_by_page(page_no, answer_cache)
            page_no += 1
            # stop when the API reports the last page, or after at most 4 pages
            if is_end or page_no >= 4:
                break


def get_answers_by_page(page_no, answer_cache):
    # page offset, determined by the limit parameter (20 answers per page)
    offset = page_no * 20
    url = "https://www.zhihu.com/api/v4/questions/{}/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment" \
          "%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky" \
          "%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count" \
          "%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info" \
          "%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp" \
          "%2Cis_labeled%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A" \
          "%5D.topics&limit=20&offset={}&platform=desktop&sort_by=default".format(qid, offset)
    headers = {
        "User-Agent": "Mozilla/5.0  (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/69.0.3497.100 Safari/537.36",
    }
    r = requests.get(url, verify=False, headers=headers)
    data = json.loads(r.content.decode("utf-8"))
    items = data["data"]
    for item in items:
        if item.get("voteup_count", 0) > t:
            print(item)
            answer_cache.write("\n\nanswer" + ":\t" + item.get("url") + "  ")
            answer_cache.write("\t\t voteup_count:\t" + str(item.get("voteup_count")) + "\n")
            answer_cache.write("author:\t" + item.get("author").get("name") + ":\t" +
                               "https://www.zhihu.com/people/" + item.get("author").get("url_token")+"\n\n")
            matched_img_url = re.findall(r'data-original="([^"]+)"', item.get("content"))
            cons = re.findall("<p>.*?</p>", item.get("content"), re.U)
            for con in cons:
                # strip simple formatting tags and turn <br/> into newlines
                for tag in ("<p>", "</p>", "<b>", "</b>"):
                    con = con.replace(tag, "")
                answer_cache.write(con.replace("<br/>", "\n") + "\n")
            for img_url in matched_img_url:
                if img_url not in img_urls:
                    img_urls.append(img_url)
                    answer_cache.write(img_url + "\n")
        if item.get("is_end"):
            return True
    return False


if __name__ == "__main__":
    get_answers()
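
The include parameter above is a hand-assembled, percent-encoded string, which is awkward to read or edit. As a sketch (not the original author's code), the same request can be built with requests' params argument, which does the URL encoding for you; the include value below is just the decoded form of the string above, and it is an assumption that Zhihu still serves this endpoint unchanged.

import requests

def fetch_answer_page(qid, page_no, limit=20):
    # decoded form of the percent-encoded include string used in the script above
    include = (
        "data[*].is_normal,admin_closed_comment,reward_info,is_collapsed,"
        "annotation_action,annotation_detail,collapse_reason,is_sticky,"
        "collapsed_by,suggest_edit,comment_count,can_comment,content,"
        "editable_content,voteup_count,reshipment_settings,comment_permission,"
        "created_time,updated_time,review_info,relevant_info,question,excerpt,"
        "relationship.is_authorized,is_author,voting,is_thanked,is_nothelp,"
        "is_labeled;data[*].mark_infos[*].url;data[*].author.follower_count,"
        "badge[*].topics"
    )
    params = {
        "include": include,
        "limit": limit,
        "offset": page_no * limit,
        "platform": "desktop",
        "sort_by": "default",
    }
    url = "https://www.zhihu.com/api/v4/questions/{}/answers".format(qid)
    headers = {"User-Agent": "Mozilla/5.0"}
    # requests percent-encodes the params values itself
    return requests.get(url, params=params, headers=headers, verify=False).json()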
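
The script only writes the image URLs into answer.cache; it never downloads them. Below is a minimal downloader sketch, assuming img_urls has been filled by get_answers(); the output directory name and the index-based file names are illustrative choices, not part of the original post.

import os
import requests

def download_images(urls, out_dir="images"):
    os.makedirs(out_dir, exist_ok=True)
    headers = {"User-Agent": "Mozilla/5.0"}
    for i, img_url in enumerate(urls):
        try:
            resp = requests.get(img_url, headers=headers, timeout=10, verify=False)
            resp.raise_for_status()
        except requests.RequestException as e:
            print("skipping", img_url, e)
            continue
        # keep the URL's file extension if it has one, default to .jpg
        ext = os.path.splitext(img_url.split("?")[0])[1] or ".jpg"
        with open(os.path.join(out_dir, "{:04d}{}".format(i, ext)), "wb") as f:
            f.write(resp.content)

download_images(img_urls)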