什么是Jsoup?
Jsoup是一个用于解析HTML的Java库,就类似XML解析器用于解析XML。Jsoup能够解析真实世界中(即使并不规范)的HTML。它的选择器语法与jQuery选择器非常相似,灵活且容易使用,可以方便地获得所需的结果。
有哪些功能?
- 查找和提取数据,使用DOM遍历或CSS选择器
- 操纵HTML元素,属性和文本
- 从URL,文件或字符串中抓取并解析HTML
- 输出整洁的HTML
- 根据安全的白名单清理用户提交的内容,以防止XSS攻击
准备工作
-
导入jar
jar包下载地址:https://jsoup.org/download
或者导入maven依赖方式:
<dependency>
  <!-- jsoup HTML parser library @ http://jsoup.org/ -->
  <groupId>org.jsoup</groupId>
  <artifactId>jsoup</artifactId>
  <version>1.11.3</version>
</dependency>
-
测试方法
jsoup有三种获取文档的方式,下面的测试类已经给出:
package com.zsx;
import java.io.File;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/**
 * Shows the three ways jsoup can obtain a {@link Document}: from a URL,
 * from a file on disk, and from an in-memory string.
 *
 * <p>Only the string-based variant is active; the URL and file variants are
 * kept as commented-out samples.
 */
public class JsoupTest {
    public static void main(String[] args) {
        // ---------- Variant 1: fetch the document from a URL ----------
        /*
        String url = "http://www.gzmssy.cn";
        try {
            Document document = Jsoup.connect(url).get();
            System.out.println(document.title()); // print the document title
        } catch (IOException e) {
            e.printStackTrace();
        }
        */

        // ---------- Variant 2: load the HTML document from a file ----------
        /*
        File html = new File("C:/Users/mssy/Desktop/index.html");
        try {
            Document document2 = Jsoup.parse(html, "utf-8");
            System.out.println(document2.title()); // print the document title
        } catch (IOException e) {
            e.printStackTrace();
        }
        */

        // ---------- Variant 3: parse the HTML document from a String ----------
        final String markup = "<html><head><title>Jsoup 标题</title></head>"
                + "<body><p>Parsed HTML into a doc.</p></body></html>";
        final Document parsed = Jsoup.parse(markup);
        final String title = parsed.title();
        System.out.println(title); // print the document title
    }
}
案例
- 获取HTML页面的fav图标
package com.zsx;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
/**
 * Fetches a page and prints the location of its favicon.
 *
 * <p>Lookup order: first a {@code <link>} in the head whose {@code href}
 * matches {@code .ico} or {@code .png}; if none exists, a
 * {@code <meta itemprop="image">} element. Prints {@code "Not found"} when
 * neither is present or the page cannot be fetched.
 */
public class JsoupFav {
    public static void main(String[] args) {
        String favImage = "Not found";
        try {
            Document page = Jsoup.connect("http://www.baidu.com").get();
            Element iconLink = page.head().select("link[href~=.*\\.(ico|png)]").first();
            if (iconLink != null) {
                // Preferred source: an icon <link> in the head.
                favImage = iconLink.attr("href");
            } else {
                // Fallback: the image advertised through a <meta itemprop="image"> element.
                Element metaImage = page.head().select("meta[itemprop=image]").first();
                if (metaImage != null) {
                    favImage = metaImage.attr("content");
                }
            }
        } catch (IOException e) {
            // A failed fetch is reported, then the default "Not found" is printed below.
            e.printStackTrace();
        }
        System.out.println(favImage);
    }
}
- 获得所有a链接
package com.zsx;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Fetches a page and prints the {@code href} and text of every anchor that
 * carries an {@code href} attribute.
 */
public class JsoupA {
    public static void main(String[] args) {
        final String url = "http://www.baidu.com";

        final Elements anchors;
        try {
            anchors = Jsoup.connect(url).get().select("a[href]");
        } catch (IOException e) {
            // Nothing to list when the page cannot be fetched.
            e.printStackTrace();
            return;
        }

        for (Element anchor : anchors) {
            String target = anchor.attr("href");
            String label = anchor.text();
            System.out.println("link : " + target);
            System.out.println("text : " + label);
        }
    }
}
- 获得所有图片信息
package com.zsx;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Fetches a page and prints {@code src}, {@code height}, {@code width} and
 * {@code alt} for every {@code <img>} whose {@code src} matches a png, jpg/jpeg
 * or gif extension (case-insensitive), followed by the total number of images.
 */
public class JsoupImage {
    public static void main(String[] args) {
        String url = "https://www.baidu.com";
        try {
            Document document = Jsoup.connect(url).get();
            Elements images = document.select("img[src~=(?i)\\.(png|jpe?g|gif)]");
            int count = 0;
            for (Element element : images) {
                // Read from the current element. Calling attr() on the Elements
                // collection would report the first match's values on every iteration.
                System.out.println("src : " + element.attr("src"));
                System.out.println("height : " + element.attr("height"));
                System.out.println("width : " + element.attr("width"));
                System.out.println("alt : " + element.attr("alt"));
                count++;
            }
            System.out.println("总张数 :" + count + " 张 ");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
- 获得表单内容
package com.zsx;
import java.io.File;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.w3c.dom.stylesheets.LinkStyle;
/**
 * Reads a local HTML file with jsoup and prints the name and value of every
 * {@code <input>} inside the form whose id is {@code loginForm}.
 *
 * @author mssy
 */
public class JsoupFormElement {
    public static void main(String[] args) {
        File html = new File("C:/Users/mssy/Desktop/jsoupFrom.html");
        try {
            Document document = Jsoup.parse(html, "utf-8");
            Element formElement = document.getElementById("loginForm");
            if (formElement == null) {
                // getElementById returns null when no element has that id;
                // report it instead of failing with a NullPointerException.
                System.err.println("Form with id 'loginForm' not found in " + html.getPath());
                return;
            }
            Elements inputElements = formElement.getElementsByTag("input");
            for (Element element : inputElements) {
                String key = element.attr("name");
                String value = element.attr("value");
                System.out.println("Param name: " + key + "\nParam value: " + value);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
jsoupFrom.html
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <!-- Always force latest IE rendering engine (even in intranet) & Chrome Frame
       Remove this if you use the .htaccess -->
  <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
  <title>jsoup Test</title>
  <meta name="description" content="">
  <meta name="author" content="Administrator">
  <!-- Viewport properties are comma-separated -->
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <!-- Replace favicon.ico & apple-touch-icon.png in the root of your domain and delete these references -->
  <link rel="shortcut icon" href="/favicon.ico">
  <link rel="apple-touch-icon" href="/apple-touch-icon.png">
</head>
<body>
  <center>
    <!-- The Java sample looks this form up by id and reads each input's name/value -->
    <form id="loginForm" action="" method="">
      用户名: <input type="text" name = "username" value="zhangsan"/>
      密 码:<input type="password" name = "password" value="123456"/>
      <input name ="sub" type="submit" value="提交"/>
    </form>
  </center>
</body>
</html>
- 消除不信任的HTML(以防止XSS)
package com.zsx;
import org.jsoup.Jsoup;
import org.jsoup.safety.Whitelist;
/**
 * Sanitizes an untrusted HTML fragment with jsoup's basic whitelist and prints
 * the cleaned markup.
 *
 * <p>NOTE(review): newer jsoup releases rename {@code Whitelist} to
 * {@code Safelist}; this sample uses the {@code Whitelist} API imported by the file.
 */
public class JsoupXSS {
    public static void main(String[] args) {
        // Input carries an inline onclick handler that should not survive cleaning.
        final String untrusted = "<p><a href='http://www.baidu.com/' onclick='sendCookiesToMe()'>Link</a></p>";
        final Whitelist policy = Whitelist.basic();
        final String sanitized = Jsoup.clean(untrusted, policy);
        System.out.println(sanitized);
    }
}