// 抓取百度首页网页源代码的例子 (Example: fetching the HTML source of the Baidu home page)
package com.pyc.search.crawler.node.tools;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/**
 * Minimal jsoup demo: downloads a page over HTTP and prints its HTML.
 *
 * <p>{@link #getDocument(String)} performs the fetch with a fixed User-Agent,
 * Accept header and timeout; {@link #main(String[])} runs it against the
 * Baidu home page.
 */
public class TestJsoup {

    /** User-Agent header value sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.29 Safari/537.36";

    /** Accept header value sent with every request. */
    private static final String ACCEPT_HEADER =
            "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";

    /** Value passed to jsoup's {@code timeout(int)} (20 * 1000, i.e. 20 seconds in milliseconds). */
    private static final int TIMEOUT_MILLIS = 20 * 1000;

    /**
     * Fetches {@code url} with an HTTP GET and returns the parsed document.
     *
     * @param url address of the page to download
     * @return the document produced by jsoup for the response
     * @throws IOException if jsoup reports an I/O failure while fetching
     */
    public static Document getDocument(String url) throws IOException {
        // Configure the user agent, Accept header and timeout before issuing the GET.
        return Jsoup.connect(url)
                .userAgent(USER_AGENT)
                .header("Accept", ACCEPT_HEADER)
                .timeout(TIMEOUT_MILLIS)
                .get();
    }

    /**
     * Downloads the Baidu home page and prints its HTML to standard output.
     * An {@link IOException} from the fetch is reported via its stack trace.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        final String pageHtml;
        try {
            pageHtml = getDocument("http://www.baidu.com/").html();
        } catch (IOException e) {
            e.printStackTrace();
            return;
        }
        System.out.println(pageHtml);
    }
}