Java实现知乎热点小时榜爬虫

1.效果演示

1.1 热点问题列表

启动程序后,会自动列出带序号的热点问题,并等待终端输入。
(此处原为效果截图:带序号的热点问题列表)

1.2 根据序号选择想看的热点问题

输入问题序号后,程序会展示该问题下的回答内容。
(此处原为效果截图:所选问题的回答内容)

1.3 退出

输入 q 即可退出程序。
(此处原为效果截图:输入 q 后程序退出)

2.源码

2.1 pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the Zhihu hot-hour crawler example. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Project coordinates; the groupId matches the Java package org.example. -->
    <groupId>org.example</groupId>
    <artifactId>zhihu</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <!-- Compile the sources as Java 8 and emit Java 8 compatible bytecode. -->
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <!-- Read source files as UTF-8 (the Java code contains Chinese string literals). -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>

        <!-- Gson: the only third-party dependency, used to parse the JSON returned by Zhihu. -->
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>2.9.0</version>
        </dependency>

    </dependencies>

</project>

2.2 Java代码

package org.example;

import com.google.gson.Gson;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParseException;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.*;


/**
 * Console crawler for the Zhihu "hot questions of the hour" ranking.
 *
 * <p>On start-up the program downloads the ranking, prints every question with a
 * 1-based index and then repeatedly reads an index from standard input, printing the
 * answers embedded in that question's page. Entering {@code q} ends the program.
 */
public class ZhihuHotHourCrawler {
    final static String ZHIHU_HOT_URL = "https://www.zhihu.com/api/v4/creators/rank/hot?domain=0&period=hour";
    final static String QUESTION_HTML_MATCH_PREFIX = "<script id=\"js-initialData\" type=\"text/json\">";
    final static String QUESTION_HTML_MATCH_SUFFIX = "</script>";

    /** Connect/read timeout so that a stalled connection cannot hang the console forever. */
    private static final int NETWORK_TIMEOUT_MILLIS = 10_000;

    /** Console input that terminates the program. */
    private static final String QUIT_COMMAND = "q";

    /** Shared JSON parser instance, reused instead of creating one per call. */
    private static final Gson GSON = new Gson();

    /**
     * Downloads the resource at the given URL and returns its body as text.
     *
     * <p>The body is decoded as UTF-8 rather than with the platform default charset
     * (this assumes the server answers in UTF-8). Line terminators are dropped, exactly
     * as before: the lines are concatenated without a separator.
     *
     * <p>This method is best-effort: any failure is reported on standard error and
     * whatever was read so far (possibly the empty string) is returned.
     *
     * @param urlString absolute URL to fetch; {@code null} or empty yields {@code ""}
     * @return the response body without line terminators, never {@code null}
     */
    public static String getHtml(String urlString) {
        if (urlString == null || urlString.isEmpty()) {
            return "";
        }

        StringBuilder response = new StringBuilder();
        try {
            URLConnection connection = new URL(urlString).openConnection();
            connection.setConnectTimeout(NETWORK_TIMEOUT_MILLIS);
            connection.setReadTimeout(NETWORK_TIMEOUT_MILLIS);

            // try-with-resources closes the stream even when reading fails half-way.
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                String inputLine;
                while ((inputLine = in.readLine()) != null) {
                    response.append(inputLine);
                }
            }
        } catch (Exception e) {
            // Deliberately broad: callers treat an empty/partial body as "nothing fetched".
            System.err.println("Failed to fetch " + urlString);
            e.printStackTrace();
        }

        return response.toString();
    }

    /**
     * Parses the JSON returned by {@link #ZHIHU_HOT_URL} and fills the two output maps.
     *
     * <p>Every element of the top-level {@code data} array that carries a
     * {@code question} object with string {@code title} and {@code url} members is
     * recorded; malformed elements are skipped. If the input is empty, is not valid
     * JSON or has no {@code data} array, both maps are left untouched.
     *
     * @param hotHtml       raw JSON text of the ranking response
     * @param titleAndUrl   output map: question title to question URL
     * @param indexAndTitle output map: 1-based index (as a string) to question title
     */
    public static void parseHotHtml(String hotHtml, Map<String, String> titleAndUrl, Map<String, String> indexAndTitle) {
        JsonObject root = parseJsonObject(hotHtml);
        JsonElement data = (root == null) ? null : root.get("data");
        if (data == null || !data.isJsonArray()) {
            return;
        }

        int index = 1;
        for (JsonElement item : data.getAsJsonArray()) {
            if (!item.isJsonObject()) {
                continue;
            }
            JsonObject question = childObject(item.getAsJsonObject(), "question");
            String questionUrl = childString(question, "url");
            String questionTitle = childString(question, "title");
            if (questionUrl == null || questionTitle == null) {
                continue;
            }
            titleAndUrl.put(questionTitle, questionUrl);
            indexAndTitle.put(String.valueOf(index), questionTitle);
            index++;
        }
    }

    /**
     * Removes every {@code <...>} tag from the given text in a single pass.
     *
     * <p>A {@code <} that is never closed by a {@code >} is not a tag; it and the rest
     * of the text are kept verbatim. (The previous implementation threw
     * {@code StringIndexOutOfBoundsException} or looped forever on such input.)
     *
     * @param content text possibly containing HTML tags; {@code null} yields {@code ""}
     * @return the text with all complete tags removed
     */
    public static String removeHtmlTag(String content) {
        if (content == null) {
            return "";
        }

        StringBuilder text = new StringBuilder(content.length());
        int position = 0;
        while (position < content.length()) {
            int tagStartIndex = content.indexOf('<', position);
            int tagEndIndex = (tagStartIndex < 0) ? -1 : content.indexOf('>', tagStartIndex);
            if (tagEndIndex < 0) {
                // No further complete tag: keep the remainder as plain text.
                text.append(content, position, content.length());
                break;
            }
            text.append(content, position, tagStartIndex);
            position = tagEndIndex + 1;
        }
        return text.toString();
    }

    /**
     * Extracts the JSON embedded in a question page and prints each answer it contains
     * as {@code A<n>: <text>} with HTML tags removed.
     *
     * <p>The JSON is the text between {@link #QUESTION_HTML_MATCH_PREFIX} and the next
     * {@link #QUESTION_HTML_MATCH_SUFFIX}; answers are read from
     * {@code initialState.entities.answers}. If the markers or that path are missing, or
     * the JSON is invalid, a hint is printed instead of throwing.
     *
     * @param questionHtml HTML of a question page, as returned by {@link #getHtml(String)}
     */
    public static void parseQuestionHtml(String questionHtml) {
        JsonObject answers = extractAnswers(questionHtml);
        if (answers == null || answers.size() == 0) {
            System.out.println("未能获取到该问题的回答内容。");
            return;
        }

        int answerNum = 1;
        for (Map.Entry<String, JsonElement> entry : answers.entrySet()) {
            JsonElement answer = entry.getValue();
            String content = answer.isJsonObject() ? childString(answer.getAsJsonObject(), "content") : null;
            if (content == null) {
                continue;
            }
            System.out.println("A" + answerNum + ": " + removeHtmlTag(content));
            answerNum++;
        }
    }

    /**
     * Program entry point: prints the hot list, then serves index look-ups until the
     * user enters {@code q} or standard input ends.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Map<String, String> titleAndUrl = new LinkedHashMap<>();
        Map<String, String> indexAndTitle = new LinkedHashMap<>();

        parseHotHtml(getHtml(ZHIHU_HOT_URL), titleAndUrl, indexAndTitle);

        if (indexAndTitle.isEmpty()) {
            System.out.println("未获取到热点问题,请检查网络后重试。");
            return;
        }

        for (Map.Entry<String, String> entry : indexAndTitle.entrySet()) {
            System.out.println(entry.getKey() + "." + entry.getValue());
        }

        // One Scanner for the whole session; it is intentionally not closed because
        // closing it would also close System.in.
        Scanner scanner = new Scanner(System.in);
        while (true) {
            System.out.print("请输入序号:");
            if (!scanner.hasNextLine()) {
                break; // standard input was closed
            }
            String nextLine = scanner.nextLine().trim();
            if (QUIT_COMMAND.equals(nextLine)) {
                break;
            }

            String questionTitle = indexAndTitle.get(nextLine);
            if (questionTitle == null) {
                System.out.println("无效的序号,请重新输入(输入q退出)。");
                continue;
            }
            parseQuestionHtml(getHtml(titleAndUrl.get(questionTitle)));
        }
    }

    /** Returns the {@code initialState.entities.answers} object of a question page, or null. */
    private static JsonObject extractAnswers(String questionHtml) {
        if (questionHtml == null) {
            return null;
        }
        int prefixIndex = questionHtml.indexOf(QUESTION_HTML_MATCH_PREFIX);
        if (prefixIndex < 0) {
            return null;
        }
        // Search for the closing marker only after the opening marker has ended.
        int jsonStart = prefixIndex + QUESTION_HTML_MATCH_PREFIX.length();
        int suffixIndex = questionHtml.indexOf(QUESTION_HTML_MATCH_SUFFIX, jsonStart);
        if (suffixIndex < 0) {
            return null;
        }

        JsonObject root = parseJsonObject(questionHtml.substring(jsonStart, suffixIndex));
        JsonObject entities = childObject(childObject(root, "initialState"), "entities");
        return childObject(entities, "answers");
    }

    /** Parses text as a JSON object; returns null for blank, invalid or non-object input. */
    private static JsonObject parseJsonObject(String json) {
        if (json == null || json.trim().isEmpty()) {
            return null;
        }
        try {
            JsonElement root = GSON.fromJson(json, JsonElement.class);
            return (root != null && root.isJsonObject()) ? root.getAsJsonObject() : null;
        } catch (JsonParseException ignored) {
            // Invalid JSON is reported to callers as "no data" (null) rather than a crash.
            return null;
        }
    }

    /** Returns the member {@code key} of {@code parent} if it is a JSON object, else null. */
    private static JsonObject childObject(JsonObject parent, String key) {
        if (parent == null) {
            return null;
        }
        JsonElement child = parent.get(key);
        return (child != null && child.isJsonObject()) ? child.getAsJsonObject() : null;
    }

    /** Returns the member {@code key} of {@code parent} as a string if it is a JSON primitive, else null. */
    private static String childString(JsonObject parent, String key) {
        if (parent == null) {
            return null;
        }
        JsonElement child = parent.get(key);
        return (child != null && child.isJsonPrimitive()) ? child.getAsString() : null;
    }
}

3.补充

如果代码失效了(例如知乎的接口或页面结构发生了变化),欢迎留言,我有时间的话会更新代码😂。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值