JSoup在输出解析后的HTML时, 默认会进行格式化(pretty print)并自动添加换行, 例如:
String html = "<div> <div> <a href=\"http://leetcode.com/\" "
+ "target=\"_blank\">LeetCode</a>LeetCode</div><div>";
Document doc = Jsoup.parse(html);
System.out.println(doc.select("body").html().replace("<body>", "").replace("</body>", ""));
未做任何处理时, 通过Document doc = Jsoup.parse(source)得到的文档在输出时会对HTML进行格式化。输出如下:
<div>
<div>
<a href="http://leetcode.com/" target="_blank">LeetCode</a>LeetCode
</div>
<div>
使用doc.outputSettings().indentAmount(0).prettyPrint(false)关闭自动格式化:
Document doc = Jsoup.parse(source);
doc.outputSettings().indentAmount(0).prettyPrint(false);
System.out.println(doc.select("body").html().replace("<body>", "").replace("</body>", ""));
输出:
<div> <div> <a href="http://leetcode.com/" target="_blank">LeetCode</a>LeetCode</div><div></div></div>
import org.springframework.stereotype.Service;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Service;
/**
 * Utility for neutralising hyperlinks in an HTML fragment.
 *
 * <p>Every {@code <a>} element found in the fragment gets its {@code href}
 * attribute set to {@code javascript:void(0);}; the rest of the markup is
 * serialised back without jsoup's pretty-printing.
 */
public class LinkReplacer {

    /** Value written into the {@code href} attribute of every {@code <a>} element. */
    private static final String INERT_HREF = "javascript:void(0);";

    /**
     * Sets the {@code href} of every {@code <a>} element in the fragment to
     * {@code "javascript:void(0);"} and returns the resulting body markup.
     *
     * @param source HTML fragment to process; may be {@code null} or blank
     * @return the inner HTML of the parsed document's {@code <body>} with all
     *         anchor hrefs replaced; {@code source} itself when it is blank
     *         or when processing throws
     */
    public static String replaceHref(String source) {
        if (StringUtils.isBlank(source)) {
            return source;
        }
        try {
            Document doc = Jsoup.parse(source);
            // Turn off jsoup's automatic formatting so the original whitespace
            // and line layout are kept in the output.
            doc.outputSettings().indentAmount(0).prettyPrint(false);
            for (Element anchor : doc.select("a")) {
                anchor.attr("href", INERT_HREF);
            }
            // html() already yields the *inner* HTML of <body>, so the wrapper
            // tags never appear in it. No "<body>" / "</body>" stripping is done
            // here: such a textual replace could only remove literal occurrences
            // inside the content itself (e.g. in a script block or a comment).
            return doc.select("body").html();
        } catch (Exception ignored) {
            // Best effort: if anything goes wrong, hand back the input unchanged.
            return source;
        }
    }

    /**
     * Small manual demo: prints the result of {@link #replaceHref(String)} for
     * two sample fragments.
     */
    public static void main(String[] args) {
        String source = "关于白板题目去哪里找:<a href=\"https://leetcode.com/problemset/database/\">Leetcode</a><a href=\"https://www.topcoder.com/\">TopCoder</a>, <a href=\"http://codeforces.com/\">Codeforces</a>, <a href=\"https://projecteuler.net/\">Project Euler</a> 都是不错的选择";
        // Expected: the same sentence with all four href values replaced by
        // javascript:void(0);
        System.out.println(replaceHref(source));

        source = "<div> <div> <a href=\"http://leetcode.com/\" "
                + "target=\"_blank\">LeetCode</a>LeetCode</div><div>";
        // Expected:
        // <div> <div> <a href="javascript:void(0);" target="_blank">LeetCode</a>LeetCode</div><div></div></div>
        System.out.println(replaceHref(source));
    }
}