垃圾百度百科分析

/**
 * Scrapes a Baidu Baike entry and groups its paragraphs under the page's
 * level-2 and level-3 headings.
 *
 * <p>The page is fetched with jsoup, every element matching {@code .level-2}
 * is treated as a section heading, and the sibling elements that follow it
 * (up to the next level-2 heading) are collected into a {@link Level2}. Progress
 * is echoed to standard output while parsing, followed by a summary.
 */
public class Main {

    /** Entry fetched when no URL is supplied on the command line. */
    private static final String DEFAULT_URL = "https://baike.baidu.com/item/%E5%A4%AA%E9%98%B3/24010";

    /** Request headers sent with the page fetch. */
    static Map<String, String> headers = buildHeaders();

    /**
     * Builds the request-header map.
     *
     * <p>A plain {@link HashMap} is populated directly instead of using
     * double-brace initialization, which would create an anonymous subclass.
     */
    private static Map<String, String> buildHeaders() {
        Map<String, String> map = new HashMap<>();
        map.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3");
        // "br" is intentionally not advertised: jsoup decodes gzip/deflate bodies,
        // but a Brotli-compressed response would reach the parser still compressed.
        map.put("Accept-Encoding", "gzip, deflate");
        map.put("Accept-Language", "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7");
        map.put("Cache-Control", "max-age=0");
        map.put("Connection", "keep-alive");
        map.put("Host", "baike.baidu.com");
        map.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36");
        return map;
    }

    /**
     * Fetches the page, parses every level-2 section and prints the result.
     *
     * @param args optional; {@code args[0]} overrides the default entry URL
     *             (the {@code Host} header still targets baike.baidu.com)
     */
    public static void main(String[] args) {
        String url = (args != null && args.length > 0) ? args[0] : DEFAULT_URL;

        try {
            Document doc = Jsoup.connect(url)
                    .headers(headers).method(Connection.Method.GET).execute().parse();

            List<Level2> level2List = new ArrayList<>();
            for (Element heading : doc.select(".level-2")) {
                level2List.add(parseLevel2(heading));
            }

            printSummary(level2List);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Collects the content that follows one level-2 heading.
     *
     * <p>Siblings are walked until the next element whose class attribute
     * contains {@code level-2}, or until there are no more siblings. Paragraphs
     * (class attribute exactly {@code para}) seen before the first level-3
     * heading belong to the level-2 section itself; later paragraphs belong to
     * the most recent level-3 heading.
     */
    private static Level2 parseLevel2(Element heading) {
        Level2 section = new Level2();
        section.name = heading.text();
        System.out.println("--------------------------" + heading.text() + "--------------------------------");

        // null until the first level-3 heading of this section is reached
        Level3 current = null;
        // The null check comes first: nextElementSibling() returns null when the
        // heading (or any later element) is the last child of its parent.
        for (Element el = heading.nextElementSibling();
             el != null && !el.attr("class").contains("level-2");
             el = el.nextElementSibling()) {
            String cssClass = el.attr("class");
            if (cssClass.contains("level-3")) {
                current = new Level3();
                current.name = el.text();
                System.out.println("三级标题------------------------------------------------");
                section.level3List.add(current);
                System.out.println(el.text());
            } else if (cssClass.equals("para")) {
                System.out.println(el.text());
                if (current == null) {
                    section.blocks.add(el.text());
                } else {
                    current.blocks.add(el.text());
                }
            }
        }
        return section;
    }

    /** Prints every parsed section with its own blocks and its level-3 children. */
    private static void printSummary(List<Level2> level2List) {
        System.out.println("+++++++++++++++++++++++++++++++++++++++");
        for (Level2 level2 : level2List) {
            System.out.println("level2 title ----------------");
            System.out.println(level2.name);
            System.out.println("level2 Blocks ----------------");
            System.out.println(level2.blocks);
            System.out.println("level3 ----------------");
            for (Level3 level3 : level2.level3List) {
                System.out.println(level3.name);
                System.out.println(level3.blocks);
            }
        }
    }
}

/**
 * One level-2 section of a page: its heading text, the paragraphs directly
 * under it, and the level-3 subsections nested inside it.
 */
class Level2 {
    /** Heading text of the section. */
    String name;

    /** Paragraph texts that belong to the section itself. */
    List<String> blocks = new ArrayList<>();

    /** Subsections of this section, in the order they were added. */
    List<Level3> level3List = new ArrayList<>();

    /** Renders the section as {@code Level2{name='...', blocks=..., level3List=...}}. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Level2{");
        text.append("name='").append(name).append('\'');
        text.append(", blocks=").append(blocks);
        text.append(", level3List=").append(level3List);
        text.append('}');
        return text.toString();
    }
}

/**
 * One level-3 subsection of a page: its heading text and the paragraphs
 * collected under it.
 */
class Level3 {
    /** Heading text of the subsection. */
    String name;

    /** Paragraph texts that belong to the subsection. */
    List<String> blocks = new ArrayList<>();

    /** Renders the subsection as {@code Level3{name='...', blocks=...}}. */
    @Override
    public String toString() {
        return String.format("Level3{name='%s', blocks=%s}", name, blocks);
    }
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值