1、Jsoup是什么
Jsoup
是Java的HTML解析器,既可以解析请求URL返回的结果,也可以解析HTML片段,主要用于解析HTML内容。
pom.xml文件引入:
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.14.3</version>
</dependency>
2、Jsoup解析URL返回结果
package com.xxx.xxx.utils;
import java.io.IOException;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/**
 * HTTP request helper built on Jsoup.
 *
 * @author avatar
 * @since 2022-03-10
 */
public class JsoupHttpUtil {

    /** Value of the Accept header sent with every request. */
    private static final String ACCEPT =
            "text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01";

    /** Browser-like User-Agent header sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36";

    /**
     * Demo entry point: fetches a page and prints the parsed document.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String url = "https://www.baidu.com";
        Document doc = JsoupHttpUtil.get(url);
        if (doc == null) {
            // get() signals failure with null; dereferencing it would throw a NullPointerException.
            System.err.println("Failed to fetch " + url);
            return;
        }
        System.out.println(doc.toString());
    }

    /**
     * Sends a GET request to the given URL and parses the response.
     *
     * <p>The request carries Accept, Referer (set to {@code url} itself), User-Agent and
     * X-Requested-With headers.
     *
     * @param url the URL to request
     * @return the parsed document, or {@code null} if the request fails with an {@link IOException}
     */
    public static Document get(String url) {
        try {
            Connection conn = Jsoup.connect(url)
                    .header("Accept", ACCEPT)
                    .header("Referer", url)
                    .header("User-Agent", USER_AGENT)
                    .header("X-Requested-With", "XMLHttpRequest");
            return conn.get();
        } catch (IOException e) {
            // Keep the null-on-failure contract for existing callers, but say which URL failed.
            System.err.println("GET request failed for " + url);
            e.printStackTrace();
            return null;
        }
    }
}
3、Jsoup解析HTML片段
HTML片段:
<div class="orange">
<a href="./xxx/xxxx.do" target="_blank">
点击跳转<em>此</em><em>链</em><em>接</em>
</a>
<a>下一页</a>
</div>
代码:
// Requires: org.jsoup.Jsoup, org.jsoup.nodes.Document, org.jsoup.nodes.Element, org.jsoup.select.Elements
// The attribute must be spelled "target": the lookup below matches on that exact attribute name,
// and get(0) throws IndexOutOfBoundsException when nothing matches.
String each = "<div class=\"orange\"><a href=\"./xxx/xxxx.do\" target=\"_blank\">点击跳转<em>此</em><em>链</em><em>接</em></a><a>下一页</a></div>";
// Parse the HTML fragment into a Document object
Document div = Jsoup.parse(each);
// Find every element that has a "target" attribute and take the first one
Element a = div.getElementsByAttribute("target").get(0);
// Collect all <em> elements nested inside that <a> element
Elements em = a.getElementsByTag("em");