使用 Jsoup 的静态方法 Jsoup.parse(String html) 把 html 字符串解析为一个新的 Document 文档,然后通过使用 Document 中的方法或者它父类 Element 和 Node 中的方法取得相关的数据。
下面以获取
http://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient/4.5.3 这个页面中所有的 table 为例子。
1、原网页界面:
2、源码:
package com.makerspace.html.jsoup;
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClientBuilder;
import org.jsoup.Jsoup;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Demonstrates fetching a web page with Apache HttpClient and printing every
 * {@code <table>} element found in it with Jsoup.
 *
 * <p>Company: www.makerspace.com
 *
 * <p>Created: 2017-08-25 15:41:47
 *
 * @author guanqin_li
 */
public class JsoupTest {

    /** Page whose tables are printed by {@link #Test()}. */
    private static final String PAGE_URL =
            "http://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient/4.5.3";

    /**
     * Downloads {@link #PAGE_URL}, parses the body with Jsoup and prints each
     * {@code <table>} element to standard output, preceded by a separator line.
     *
     * <p>Prints {@code "html str is blank !!"} when the response has no usable body.
     * Any exception raised while fetching or parsing is printed, not propagated.
     */
    public void Test() {
        // HttpClientBuilder replaces the deprecated DefaultHttpClient.
        HttpClient httpClient = HttpClientBuilder.create().build();
        try {
            HttpResponse resp = httpClient.execute(new HttpGet(PAGE_URL));
            HttpEntity entity = resp.getEntity();
            // getEntity() is null for a body-less response; treat that as blank
            // instead of failing with a NullPointerException.
            String html = (entity == null) ? "" : convertStreamToString(entity.getContent());
            if (StringUtil.isBlank(html)) {
                System.out.println("html str is blank !!");
                return;
            }
            // Parse the HTML string into a new Document and select elements by tag name.
            Document doc = Jsoup.parse(html);
            for (Element table : doc.getElementsByTag("table")) {
                System.out.println("===================================================");
                System.out.println(table.toString());
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(httpClient);
        }
    }

    /**
     * Releases the client's resources when the concrete client supports closing.
     *
     * <p>The {@code HttpClient} interface itself has no {@code close()} method, so the
     * instance is closed through {@link Closeable} when it implements it.
     *
     * @param client the client to close; ignored when it is not {@link Closeable}
     */
    private static void closeQuietly(HttpClient client) {
        if (client instanceof Closeable) {
            try {
                ((Closeable) client).close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Reads an input stream fully into a string, decoding it as UTF-8.
     *
     * <p>Every line read is terminated with {@code "\n"} in the result, regardless of
     * the line terminator used in the stream. The stream is closed before returning.
     *
     * @param is the stream to read; {@code null} yields an empty string
     * @return the decoded content, or whatever was read before an I/O error occurred
     */
    public static String convertStreamToString(InputStream is) {
        return convertStreamToString(is, StandardCharsets.UTF_8);
    }

    /**
     * Reads an input stream fully into a string using the given charset.
     *
     * <p>Reading is best-effort: an {@link IOException} is printed and the content
     * read so far is returned. The stream is closed before returning.
     *
     * @param is      the stream to read; {@code null} yields an empty string
     * @param charset the charset used to decode the bytes
     * @return the decoded content with each line terminated by {@code "\n"}
     */
    public static String convertStreamToString(InputStream is, Charset charset) {
        if (is == null) {
            return "";
        }
        StringBuilder sb = new StringBuilder();
        // Closing the reader also closes the wrapped stream.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(is, charset))) {
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line).append('\n');
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return sb.toString();
    }

    public static void main(String[] args) {
        new JsoupTest().Test();
    }
}
3、获取该页面所有 table 的结果:
===================================================
<table class="grid" width="100%">
<tbody>
<tr>
<th>License</th>
<td><span class="b lic">Apache 2.0</span></td>
</tr>
<tr>
<th>Categories</th>
<td><a href="/open-source/http-clients" class="b c">HTTP Clients</a></td>
</tr>
<tr>
<th>HomePage</th>
<td><a href="http://hc.apache.org/httpcomponents-client" rel="nofollow"> http://hc.apache.org/httpcomponents-client </a></td>
</tr>
<tr>
<th>Date</th>
<td>(Jan 21, 2017) </td>
</tr>
<tr>
<th style="width: 12em;">Files</th>
<td><a class="vbtn" href="http://central.maven.org/maven2/org/apache/httpcomponents/httpclient/4.5.3/httpclient-4.5.3.jar">Download (JAR)</a><span style="color: black; font-weight: bold;"> (696 KB) </span></td>
</tr>
<tr>
<th style="width: 12em;">Repositories</th>
<td><a class="b lic" href="/repos/central">Central</a></td>
</tr>
<tr>
<th>Used By</th>
<td><a href="/artifact/org.apache.httpcomponents/httpclient/usages"><b>5,584 artifacts</b></a></td>
</tr>
</tbody>
</table>
===================================================
<table class="grid" style="vertical-align: middle" width="100%">
<thead>
<tr>
<th style="width: 12em; text-align: center">Category/License</th>
<th style="width: 28px"></th>
<th> Group / Artifact</th>
<th style="text-align: center">Version</th>
<th style="text-align: center">Updates</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align: right"><a href="/open-source/base64-libraries" class="b c">Base64</a><br><span class="b lic">Apache 2.0</span></td>
<td>
<picture>
<source type="image/webp" srcset="https://d2j3q9yua85jt3.cloudfront.net/img/a117c37ecc0ed0750c48bd4755638e06">
<img class="im-logo" width="28" height="28" src="https://d2j3q9yua85jt3.cloudfront.net/img/c44e3998569145e628d7d13a288ba5a">
</picture></td>
<td><a href="/artifact/commons-codec">commons-codec</a> » <a href="/artifact/commons-codec/commons-codec">commons-codec</a></td>
<td style="text-align: center"><a class="vbtn release" href="/artifact/commons-codec/commons-codec/1.9"> 1.9</a></td>
<td style="text-align: center"><a class="vbtn release" href="/artifact/commons-codec/commons-codec/1.10">1.10</a></td>
</tr>
<tr>
<td style="text-align: right"><a href="/open-source/logging-frameworks" class="b c">Logging</a><br><span class="b lic">Apache 2.0</span></td>
<td>
<picture>
<source type="image/webp" srcset="https://d2j3q9yua85jt3.cloudfront.net/img/a117c37ecc0ed0750c48bd4755638e06">
<img class="im-logo" width="28" height="28" src="https://d2j3q9yua85jt3.cloudfront.net/img/bb2b8886770c45c0e0d07cc97851a6f1">
</picture></td>
<td><a href="/artifact/commons-logging">commons-logging</a> » <a href="/artifact/commons-logging/commons-logging">commons-logging</a></td>
<td style="text-align: center"><a class="vbtn release" href="/artifact/commons-logging/commons-logging/1.2"> 1.2</a></td>
<td style="text-align: center"> ✔ </td>
</tr>
<tr>
<td style="text-align: right"><a href="/open-source/http-clients" class="b c">HTTP Clients</a><br><span class="b lic">Apache 2.0</span></td>
<td>
<picture>
<source type="image/webp" srcset="https://d2j3q9yua85jt3.cloudfront.net/img/d0a90db48344c7126a6b54f1f3089347">
<img class="im-logo" width="28" height="28" src="https://d2j3q9yua85jt3.cloudfront.net/img/e722b6263952f18e235cd8e0985546a1">
</picture></td>
<td><a href="/artifact/org.apache.httpcomponents">org.apache.httpcomponents</a> » <a href="/artifact/org.apache.httpcomponents/httpcore">httpcore</a></td>
<td style="text-align: center"><a class="vbtn release" href="/artifact/org.apache.httpcomponents/httpcore/4.4.6"> 4.4.6</a></td>
<td style="text-align: center"> ✔ </td>
</tr>
</tbody>
</table>
===================================================
<table class="grid" style="vertical-align: middle" width="100%">
<thead>
<tr>
<th style="width: 12em; text-align: center">Category/License</th>
<th style="width: 28px"></th>
<th> Group / Artifact</th>
<th style="text-align: center">Version</th>
<th style="text-align: center">Updates</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align: right"><a href="/open-source/testing-frameworks" class="b c">Testing</a><br><span class="b lic">EPL 1.0</span></td>
<td>
<picture>
<source type="image/webp" srcset="https://d2j3q9yua85jt3.cloudfront.net/img/d0f73ecf7a2ecdc8f1082e67dbb5617d">
<img class="im-logo" width="28" height="28" src="https://d2j3q9yua85jt3.cloudfront.net/img/7cb2d4617d97415f562bd5711c429a95">
</picture></td>
<td><a href="/artifact/junit">junit</a> » <a href="/artifact/junit/junit">junit</a></td>
<td style="text-align: center"><a class="vbtn release" href="/artifact/junit/junit/4.11"> 4.11</a></td>
<td style="text-align: center"><a class="vbtn release" href="/artifact/junit/junit/4.12">4.12</a></td>
</tr>
<tr>
<td style="text-align: right"><a href="/open-source/mocking" class="b c">Mocking</a><br><span class="b lic">MIT</span></td>
<td>
<picture>
<source type="image/webp" srcset="https://d2j3q9yua85jt3.cloudfront.net/img/883ab1fd45b7243b298b18790e70cf3e">
<img class="im-logo" width="28" height="28" src="https://d2j3q9yua85jt3.cloudfront.net/img/ab8655889ce452e2fa4713aa13b8428f">
</picture></td>
<td><a href="/artifact/org.mockito">org.mockito</a> » <a href="/artifact/org.mockito/mockito-core">mockito-core</a></td>
<td style="text-align: center"><a class="vbtn release" href="/artifact/org.mockito/mockito-core/1.8.5"> 1.8.5</a></td>
<td style="text-align: center"><a class="vbtn release" href="/artifact/org.mockito/mockito-core/2.9.0">2.9.0</a></td>
</tr>
</tbody>
</table>
===================================================
<table class="grid" width="100%">
<thead>
<tr>
<th style="width: 16em;">License</th>
<th>URL</th>
</tr>
</thead>
<tbody>
<tr>
<td>The Apache Software License, Version 2.0</td>
<td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt" rel="nofollow"> http://www.apache.org/licenses/LICENSE-2.0.txt </a></td>
</tr>
<tr>
<td>Apache License, Version 2.0</td>
<td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt" rel="nofollow"> http://www.apache.org/licenses/LICENSE-2.0.txt </a></td>
</tr>
</tbody>
</table>
===================================================
<table class="grid" width="100%">
<thead>
<tr>
<th style="width: 16em;">Name</th>
<th>Email</th>
<th>Dev Id</th>
<th>Roles</th>
<th>Organization</th>
</tr>
</thead>
<tbody>
<tr>
<td>Ortwin Glueck</td>
<td> oglueck -at- apache.org</td>
<td>oglueck</td>
<td>Emeritus PMC</td>
<td></td>
</tr>
<tr>
<td>Oleg Kalnichevski</td>
<td> olegk -at- apache.org</td>
<td>olegk</td>
<td>Committer, PMC</td>
<td></td>
</tr>
<tr>
<td>Asankha C. Perera</td>
<td> asankha -at- apache.org</td>
<td>asankha</td>
<td>Committer, PMC Chair</td>
<td></td>
</tr>
<tr>
<td>Sebastian Bazley</td>
<td> sebb -at- apache.org</td>
<td>sebb</td>
<td>Committer, PMC</td>
<td></td>
</tr>
<tr>
<td>Erik Abele</td>
<td> erikabele -at- apache.org</td>
<td>erikabele</td>
<td>Committer, PMC</td>
<td></td>
</tr>
<tr>
<td>Ant Elder</td>
<td> antelder -at- apache.org</td>
<td>antelder</td>
<td>Committer, PMC</td>
<td></td>
</tr>
<tr>
<td>Paul Fremantle</td>
<td> pzf -at- apache.org</td>
<td>pzf</td>
<td>Committer, PMC</td>
<td></td>
</tr>
<tr>
<td>Roland Weber</td>
<td> rolandw -at- apache.org</td>
<td>rolandw</td>
<td>Emeritus PMC</td>
<td></td>
</tr>
<tr>
<td>Sam Berlin</td>
<td> sberlin -at- apache.org</td>
<td>sberlin</td>
<td>Committer</td>
<td></td>
</tr>
<tr>
<td>Sean C. Sullivan</td>
<td> sullis -at- apache.org</td>
<td>sullis</td>
<td>Committer</td>
<td></td>
</tr>
<tr>
<td>Jonathan Moore</td>
<td> jonm -at- apache.org</td>
<td>jonm</td>
<td>Committer, PMC</td>
<td></td>
</tr>
<tr>
<td>Gary Gregory</td>
<td> ggregory -at- apache.org</td>
<td>ggregory</td>
<td>Committer</td>
<td></td>
</tr>
<tr>
<td>William Speirs</td>
<td> wspeirs at apache.org</td>
<td>wspeirs</td>
<td>Committer</td>
<td></td>
</tr>
<tr>
<td>Karl Wright</td>
<td> kwright -at- apache.org</td>
<td>kwright</td>
<td>Committer</td>
<td></td>
</tr>
<tr>
<td>Francois-Xavier Bonnet</td>
<td> fx -at- apache.org</td>
<td>fx</td>
<td>Committer</td>
<td></td>
</tr>
</tbody>
</table>