/**
 * Fetches one Baidu Baike article with jsoup and prints its outline: every element matching
 * {@code .level-2} together with the {@code para} siblings that follow it, and every
 * {@code level-3} heading found between two level-2 headings with its own {@code para} siblings.
 */
public class Main {
    /** Address of the article that is fetched and analysed. */
    private static final String ARTICLE_URL = "https://baike.baidu.com/item/%E5%A4%AA%E9%98%B3/24010";

    /** Browser-like request headers sent with the page request. */
    static Map<String, String> headers = buildHeaders();

    /**
     * Builds the request header map.
     *
     * @return a new mutable map of header names to header values
     */
    private static Map<String, String> buildHeaders() {
        Map<String, String> result = new HashMap<>();
        result.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3");
        // NOTE(review): "br" is deliberately not advertised. jsoup appears to decode only
        // gzip/deflate response bodies, so a brotli-encoded reply would be parsed as garbage.
        // Confirm against the jsoup version in use.
        result.put("Accept-Encoding", "gzip, deflate");
        result.put("Accept-Language", "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7");
        result.put("Cache-Control", "max-age=0");
        result.put("Connection", "keep-alive");
        result.put("Host", "baike.baidu.com");
        result.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36");
        return result;
    }

    /**
     * Downloads the article, prints the sections while they are collected, then prints a
     * summary of the collected outline. An {@link IOException} raised by the request is
     * reported on standard error and the program returns normally.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        try {
            Document doc = Jsoup.connect(ARTICLE_URL)
                    .headers(headers)
                    .method(Connection.Method.GET)
                    .execute()
                    .parse();
            List<Level2> level2List = collectSections(doc);
            System.out.println("+++++++++++++++++++++++++++++++++++++++");
            printSections(level2List);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Walks the siblings that follow each {@code .level-2} element and groups them into
     * level-2 and level-3 sections, echoing headings and paragraphs to standard output.
     *
     * @param doc the parsed article
     * @return one {@code Level2} per {@code .level-2} element, in document order
     */
    private static List<Level2> collectSections(Document doc) {
        List<Level2> sections = new ArrayList<>();
        for (Element heading : doc.select(".level-2")) {
            Level2 section = new Level2();
            section.name = heading.text();
            System.out.println("--------------------------" + heading.text() + "--------------------------------");

            // Null until the first level-3 heading of this section is seen; paragraphs met
            // before that belong to the level-2 section itself.
            Level3 current = null;

            // Stop at the next level-2 heading, or when there are no more siblings. The null
            // check comes first so a heading without a following sibling is handled safely.
            for (Element el = heading.nextElementSibling();
                    el != null && !el.attr("class").contains("level-2");
                    el = el.nextElementSibling()) {
                String cssClass = el.attr("class");
                if (cssClass.contains("level-3")) {
                    current = new Level3();
                    current.name = el.text();
                    System.out.println("三级标题------------------------------------------------");
                    section.level3List.add(current);
                    System.out.println(el.text());
                } else if (cssClass.equals("para")) {
                    System.out.println(el.text());
                    if (current == null) {
                        section.blocks.add(el.text());
                    } else {
                        current.blocks.add(el.text());
                    }
                }
            }
            sections.add(section);
        }
        return sections;
    }

    /**
     * Prints the name and paragraph list of every level-2 section, followed by the name and
     * paragraph list of each of its level-3 sections.
     *
     * @param level2List the collected outline
     */
    private static void printSections(List<Level2> level2List) {
        for (Level2 level2 : level2List) {
            System.out.println("level2 title ----------------");
            System.out.println(level2.name);
            System.out.println("level2 Blocks ----------------");
            System.out.println(level2.blocks);
            System.out.println("level3 ----------------");
            for (Level3 level3 : level2.level3List) {
                System.out.println(level3.name);
                System.out.println(level3.blocks);
            }
        }
    }
}
/**
 * A level-2 section: its heading text, the paragraph texts stored directly under it, and the
 * level-3 sections nested inside it.
 */
class Level2 {
    /** Heading text of the section. */
    String name;
    /** Paragraph texts that belong to this section itself. */
    List<String> blocks = new ArrayList<>();
    /** Nested level-3 sections, in the order they were added. */
    List<Level3> level3List = new ArrayList<>();

    /**
     * Renders the section as {@code Level2{name='...', blocks=[...], level3List=[...]}}.
     *
     * @return the textual form of this section
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Level2{");
        text.append("name='").append(name).append('\'');
        text.append(", blocks=").append(blocks);
        text.append(", level3List=").append(level3List);
        text.append('}');
        return text.toString();
    }
}
/**
 * A level-3 section: its heading text and the paragraph texts stored under it.
 */
class Level3 {
    /** Heading text of the section. */
    String name;
    /** Paragraph texts that belong to this section. */
    List<String> blocks = new ArrayList<>();

    /**
     * Renders the section as {@code Level3{name='...', blocks=[...]}}.
     *
     * @return the textual form of this section
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Level3{");
        text.append("name='").append(name).append('\'');
        text.append(", blocks=").append(blocks);
        text.append('}');
        return text.toString();
    }
}
// 垃圾百度百科分析
// 最新推荐文章于 2020-01-20 14:58:25 发布