jsoup使用实例

package jsoup;

import java.io.File;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.lang3.StringEscapeUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;

//http://www.open-open.com/jsoup/
public class TestDocument {

	/**
	 * Entry point for the jsoup demos in this class.
	 *
	 * <p>Each commented-out call below runs one of the private demo methods. The
	 * active code passes a sample {@code <span>} through {@link #unhtml(String)},
	 * parses the result with jsoup, and prints the text and the {@code email}
	 * attribute of the first {@code span} element.
	 *
	 * @param args command-line arguments; not read
	 * @throws Exception declared because several of the demo methods that can be
	 *         enabled here throw checked exceptions
	 */
	public static void main(String[] args) throws Exception {

		// Uncomment one of these calls to run the corresponding demo:
		// parseBodyFragment();
		// parserHTML();
		// parseGmail();
		// download();
		// parserFromFile();
		// parseLink();
		// visitDom();
		// select();
		// parserURL();
		// Cleaner();
		// setContent();

		String markup = unhtml(
				"<span class=\"yP\" email=\"542335496@qq.com\" name=\"為妳變┳乖\"> - 為妳變┳乖</span>");
		Element firstSpan = Jsoup.parse(markup).select("span").first();
		String spanText = firstSpan.text();
		System.out.println(spanText);
		System.out.println(firstSpan.attr("email"));

		// Further sample markup kept for reference (text translated to English):
		// <span class="y2"> - Hello, I have received your email and will reply as soon
		// as possible. Wishing you progress in your studies and success at work!</span>
		// <span title="2014-10-16 6:06 PM" id=":3s"
		// aria-label="2014-10-16 6:06 PM">Oct 16</span>
	}

	public static String html(String content) {
		if (content == null)
			return "";
		String html = content;
		// html = html.replace( "'", "'");
		html = html.replaceAll("&", "&");
		html = html.replace("\"", """); // "
		html = html.replace("\t", "  ");// 替换跳格
		html = html.replace(" ", " ");// 替换空格
		html = html.replace("<", "<");
		html = html.replaceAll(">", ">");
		return html;
	}

	public static String unhtml(String content) {
		if (content == null)
			return "";
		String html = content;
		html = html.replaceAll("&","&");
		html = html.replace(""","\"");
		html = html.replace("  ","\t");// 替换跳格
		html = html.replace("- "," ");// 替换空格
		html = html.replace(" "," ");// 替换空格
		html = html.replace("<","<");
		html = html.replaceAll(">",">");
		return html;
	}
	/**
	 * Demonstrates the jsoup calls that modify a parsed document: {@code html},
	 * {@code prepend}, {@code append}, {@code wrap}, {@code text}, {@code attr} and
	 * {@code addClass}. The modified document is printed to standard output.
	 */
	private static void setContent() {
		Document doc = Jsoup.parse(
				"<p>An <a href='http://example.com/'><b>example</b><div>test</div></a><span>字体</span> <li><li>link.</p>");

		// Replace the inner HTML of the first <div>, then add one paragraph at the
		// start and one at the end of its children.
		// Expected result: <div><p>First</p><p>lorem ipsum</p><p>Last</p></div>
		Element firstDiv = doc.select("div").first();
		firstDiv.html("<p>lorem ipsum</p>")
				.prepend("<p>First</p>")
				.append("<p>Last</p>");

		// Wrap the first <span> in a list item that contains a link.
		doc.select("span").first().wrap("<li><a href='http://example.com/'></a></li>");

		// Replace the content of the first <li> with plain text, then add text before
		// and after it.
		Element firstItem = doc.select("li").first();
		firstItem.text("five > four")
				.prepend("First ")
				.append(" Last");

		// The sample markup has no element with class "masthead", so this selection is
		// empty; the calls show how to set an attribute and a class on all matches.
		doc.select("div.masthead").attr("title", "jsoup").addClass("round-box");

		System.out.println(doc);
	}

	/**
	 * Demonstrates sanitising untrusted HTML with {@code Jsoup.clean} and the basic
	 * whitelist, and prints the cleaned markup to standard output.
	 *
	 * <p>The sample input carries an {@code onclick} handler that the cleaner is
	 * expected to remove. The attribute name had been published with a look-alike
	 * first letter, so the sample did not contain a real event handler; the ASCII
	 * spelling is restored here.
	 */
	private static void Cleaner() {
		String unsafe = "<p><a href='http://example.com/' onclick='stealCookies()'>Link</a></p>";
		String safe = Jsoup.clean(unsafe, Whitelist.basic());
		System.out.println(safe);
		// expected: <p><a href="http://example.com/" rel="nofollow">Link</a></p>
	}

	/**
	 * Fetches http://www.open-open.com/ and prints the {@code href} attribute of the
	 * first {@code <a>} element twice: once as written in the page and once through
	 * the {@code abs:} attribute prefix, which asks jsoup for the absolute URL.
	 *
	 * @throws Exception if fetching the page fails
	 */
	private static void parserURL() throws Exception {
		Element firstAnchor = Jsoup.connect("http://www.open-open.com/")
				.get()
				.select("a")
				.first();

		String asWritten = firstAnchor.attr("href");
		String absolute = firstAnchor.attr("abs:href");
		System.out.println(asWritten);
		System.out.println(absolute);
	}

	/**
	 * Parses a small HTML string and prints, in this order: the text of the body, the
	 * {@code href} of the first {@code <a>} element, the text of that element, its
	 * inner HTML and its outer HTML.
	 */
	private static void select() {
		Document parsed = Jsoup.parse("<p>An <a href='http://example.com/'><b>example</b></a> link.</p>");
		Element anchor = parsed.select("a").first(); // first <a> element in the document

		System.out.println(parsed.body().text()); // text content of the body
		System.out.println(anchor.attr("href")); // link target
		System.out.println(anchor.text()); // text inside the link
		System.out.println(anchor.html()); // inner HTML of the link
		System.out.println(anchor.outerHtml()); // the link element itself, including its tags
	}

	/**
	 * Parses the local file {@code d:/login.html} as UTF-8 with the base URI
	 * {@code http://www.baidu.com/}, looks up the element whose id is {@code body},
	 * and prints the {@code href} and the text of every {@code <a>} element below it.
	 *
	 * @throws Exception if the file cannot be read or parsed
	 */
	private static void visitDom() throws Exception {
		Document doc = Jsoup.parse(new File("d:/login.html"), "UTF-8", "http://www.baidu.com/");

		Elements anchors = doc.getElementById("body").getElementsByTag("a");
		for (Element anchor : anchors) {
			String target = anchor.attr("href");
			String label = anchor.text();
			System.out.println(target);
			System.out.println(label);
		}
	}

	/**
	 * Parses a small HTML string and prints, in this order: the text of the body, the
	 * {@code href} of the first {@code <a>} element, the text of that element, its
	 * outer HTML and its inner HTML.
	 */
	private static void parseLink() {
		Document parsed = Jsoup.parse("<p>An <a href='http://example.com/'><b>example</b></a> link.</p>");
		Element anchor = parsed.select("a").first(); // first <a> element in the document

		System.out.println(parsed.body().text()); // text content of the body
		System.out.println(anchor.attr("href")); // link target
		System.out.println(anchor.text()); // text inside the link
		System.out.println(anchor.outerHtml()); // the link element itself, including its tags
		System.out.println(anchor.html()); // inner HTML of the link
	}

	/**
	 * Parses the local file {@code d:/login.html} as UTF-8 with the base URI
	 * {@code http://www.baidu.com/} and prints the resulting document to standard
	 * error.
	 *
	 * @throws Exception if the file cannot be read or parsed
	 */
	private static void parserFromFile() throws Exception {
		File source = new File("d:/login.html");
		System.err.println(Jsoup.parse(source, "UTF-8", "http://www.baidu.com/"));
	}

	/**
	 * Fetches http://www.baidu.com/ with a GET request that carries a {@code query}
	 * data parameter, a custom user agent, an {@code auth} cookie and a 3000 ms
	 * timeout, then prints the parsed document to standard output.
	 *
	 * @throws Exception if the request fails
	 */
	private static void download() throws Exception {
		Document page = Jsoup.connect("http://www.baidu.com/")
				.data("query", "Java")
				.userAgent("Mozilla")
				.cookie("auth", "token")
				.timeout(3000)
				.get();
		System.out.println(page);
	}

	/**
	 * Parses a complete HTML document held in a string and prints the parsed document
	 * to standard output.
	 */
	private static void parserHTML() {
		String markup = "<html><head><title>First parse</title></head><body><p>Parsed HTML into a doc.</p></body></html>";
		System.out.println(Jsoup.parse(markup));
	}

	/**
	 * Fetches the Google account login page, finds the element with id
	 * {@code gaia_loginform}, and prints a map from name to value for every
	 * {@code <input>} in it that has a {@code name} attribute and a non-empty
	 * {@code value} attribute.
	 *
	 * @throws Exception if the page cannot be fetched
	 */
	private static void parseGmail() throws Exception {
		Element loginForm = Jsoup
				.connect("https://accounts.google.com/ServiceLogin")
				.get()
				.getElementById("gaia_loginform");

		Map<String, String> fields = new HashMap<String, String>();
		for (Element input : loginForm.select("input[name]")) {
			String fieldName = input.attr("name");
			String fieldValue = input.attr("value");
			// Inputs without a value are left out of the map.
			if (fieldValue == null || "".equals(fieldValue)) {
				continue;
			}
			fields.put(fieldName, fieldValue);
		}
		// The original author noted "Email= Passwd=" here - presumably the inputs that
		// come back without a value.
		System.out.println(fields);
	}

	/**
	 * Parses an HTML body fragment (here with an unclosed {@code <div>}) and prints
	 * the {@code body} element of the document that jsoup builds around it.
	 */
	private static void parseBodyFragment() {
		Document shell = Jsoup.parseBodyFragment("<div><p>Lorem ipsum.</p>");
		System.out.println(shell.body());
	}

}
package jsoup;
import org.jsoup.Jsoup;
import org.jsoup.helper.Validate;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;

//http://www.open-open.com/jsoup/
/**
 * Command-line demo that fetches one URL with jsoup and lists the media elements,
 * the {@code <link>} imports and the hyperlinks found in the page.
 */
public class ListLinks {

    /**
     * Fetches the page at the URL given as the single argument and prints three
     * sections: every element with a {@code src} attribute, every {@code <link>}
     * element with an {@code href}, and every {@code <a>} element with an
     * {@code href}.
     *
     * @param args exactly one element: the URL to fetch
     * @throws IOException if the page cannot be fetched
     */
    public static void main(String[] args) throws IOException {
        Validate.isTrue(args.length == 1, "usage: supply url to fetch");
        String target = args[0];
        print("Fetching %s...", target);

        Document page = Jsoup.connect(target).get();
        Elements anchors = page.select("a[href]");
        Elements sources = page.select("[src]");
        Elements linkTags = page.select("link[href]");

        print("\nMedia: (%d)", sources.size());
        for (Element item : sources) {
            String tag = item.tagName();
            String absoluteSrc = item.attr("abs:src");
            if (!tag.equals("img")) {
                print(" * %s: <%s>", tag, absoluteSrc);
                continue;
            }
            // Images additionally report their dimensions and a shortened alt text.
            print(" * %s: <%s> %sx%s (%s)",
                    tag, absoluteSrc, item.attr("width"), item.attr("height"),
                    trim(item.attr("alt"), 20));
        }

        print("\nImports: (%d)", linkTags.size());
        for (Element linkTag : linkTags) {
            print(" * %s <%s> (%s)", linkTag.tagName(), linkTag.attr("abs:href"), linkTag.attr("rel"));
        }

        print("\nLinks: (%d)", anchors.size());
        for (Element anchor : anchors) {
            print(" * a: <%s>  (%s)", anchor.attr("abs:href"), trim(anchor.text(), 35));
        }
    }

    /**
     * Formats {@code msg} with {@code args} and prints the result as one line on
     * standard output.
     *
     * @param msg a {@link String#format(String, Object...)} format string
     * @param args the arguments referenced by the format string
     */
    private static void print(String msg, Object... args) {
        String line = String.format(msg, args);
        System.out.println(line);
    }

    /**
     * Shortens {@code s} when it is longer than {@code width}: the first
     * {@code width - 1} characters are kept and a {@code "."} is appended.
     *
     * @param s the text to shorten
     * @param width the maximum length of the result
     * @return {@code s} itself when it is at most {@code width} characters long,
     *         otherwise the shortened text
     */
    private static String trim(String s, int width) {
        return s.length() > width ? s.substring(0, width - 1) + "." : s;
    }
}
package jsoup;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

/**
 * Minimal jsoup demo: parses an HTML string and prints several views of the first
 * link it contains.
 */
public class Link {

	/**
	 * Parses a small HTML string and prints, in this order: the text of the body, the
	 * {@code href} of the first {@code <a>} element, the text of that element, its
	 * outer HTML and its inner HTML.
	 *
	 * @param args command-line arguments; not read
	 */
	public static void main(String[] args) {
		Document parsed = Jsoup.parse("<p>An <a href='http://example.com/'><b>example</b></a> link.</p>");
		Element anchor = parsed.select("a").first(); // first <a> element in the document

		System.out.println(parsed.body().text()); // text content of the body
		System.out.println(anchor.attr("href")); // link target
		System.out.println(anchor.text()); // text inside the link
		System.out.println(anchor.outerHtml()); // the link element itself, including its tags
		System.out.println(anchor.html()); // inner HTML of the link
	}
}



  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值