jsoup 是一款 Java 的 HTML 解析器,可直接解析某个 URL 地址或 HTML 文本内容。
示例代码:
// Load the local file /tmp/input.html, decoding it as UTF-8.
File htmlFile = new File("/tmp/input.html");
Document page = Jsoup.parse(htmlFile, "UTF-8", "http://example.com/");

// Visit every <a> tag found under the element whose id is "content",
// reading its href attribute and its text.
for (Element anchor : page.getElementById("content").getElementsByTag("a")) {
    String href = anchor.attr("href");
    String label = anchor.text();
}
只需 2 行代码即可将 HTML 转换成纯文本:
// Parse an HTML fragment held in a string and print its text content.
- String html = "你好,我是来自<a href='http://www.story.net/' target='_blank'>gushi</a>小菜。";
- System.out.println(Jsoup.parse(html).text());
下面的示例主要介绍如何利用 jsoup 将 HTML 文档中的链接、图片以及其他引入的链接解析出来。
运行时需要传入要解析的URL地址,程序将自动下载该地址的html文档并进行解析。
- import org.apache.commons.lang.Validate;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.Jsoup;
- import org.jsoup.select.Elements;
- import java.net.URL;
- import java.io.IOException;
- /**
- * Example program to list links from a URL.
- */
- public class ListLinks {
- public static void main(String[] args) throws IOException {
- Validate.isTrue(args.length == 1, "usage: supply url to fetch");
- URL url = new URL(args[0]);
- print("Fetching %s...", url.toExternalForm());
- Document doc = Jsoup.parse(url, 3*1000);
- Elements links = doc.select("a[href]");
- Elements media = doc.select("[src]");
- Elements imports = doc.select("link[href]");
- print("/nMedia: (%d)", media.size());
- for (Element src : media) {
- if (src.tagName().equals("img"))
- print(" * %s: <%s> %sx%s (%s)",
- src.tagName(), src.attr("abs:src"), src.attr("width"), src.attr("height"),
- trim(src.attr("alt"), 20));
- else
- print(" * %s: <%s>", src.tagName(), src.attr("abs:src"));
- }
- print("/nImports: (%d)", imports.size());
- for (Element link : imports) {
- print(" * %s <%s> (%s)", link.tagName(),link.attr("abs:href"), link.attr("rel"));
- }
- print("/nLinks: (%d)", links.size());
- for (Element link : links) {
- print(" * a: <%s> (%s)", link.attr("abs:href"), trim(link.text(), 35));
- }
- }
- private static void print(String msg, Object... args) {
- System.out.println(String.format(msg, args));
- }
- private static String trim(String s, int width) {
- if (s.length() > width)
- return s.substring(0, width-1) + ".";
- else
- return s;
- }
- }