通过 HttpClient 抓取到相应的内容后,需要对其进行解析,这里使用 jsoup 来解析。
maven依赖
<!-- Apache HttpClient: used by the examples below to download the page over HTTP -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.10</version>
</dependency>
<!-- jsoup: used by the examples below to parse and query the downloaded HTML -->
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
获取数据
package com.jsoup.demo.jsoup;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Downloads the cnblogs.com home page with Apache HttpClient and parses it
 * with jsoup, demonstrating element lookup by id and by class name.
 */
public class App {
    /**
     * Fetches the page, parses it into a jsoup {@link Document} and prints the
     * inner HTML of the first element carrying the class {@code titlelnk}.
     * Any failure is reported by printing the stack trace.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        HttpGet httpget = new HttpGet("https://www.cnblogs.com/");
        try {
            String res;
            // try-with-resources closes the response and then the client even
            // when reading the body fails, so no connection is leaked.
            try (CloseableHttpClient chc = HttpClients.createDefault();
                    CloseableHttpResponse chr = chc.execute(httpget)) {
                HttpEntity he = chr.getEntity();
                if (he == null) {
                    // Body-less responses have no entity; EntityUtils.toString
                    // would reject a null argument.
                    System.err.println("Response has no body: " + chr.getStatusLine());
                    return;
                }
                // "utf-8" is only the fallback used when the response does not
                // declare its own charset.
                res = EntityUtils.toString(he, "utf-8");
            }

            // Parse the downloaded HTML into a document.
            Document document = Jsoup.parse(res);

            // Lookup by id; the result is null when no such element exists.
            // It is kept only as a demonstration and is not used further.
            Element site_nav_top = document.getElementById("site_nav_top");

            // Lookup by class name; the result may be empty if the page
            // contains no element with this class.
            Elements titlelnks = document.getElementsByClass("titlelnk");
            if (titlelnks.isEmpty()) {
                System.err.println("No element with class 'titlelnk' found");
                return;
            }
            Element titlelnk = titlelnks.get(0);
            System.out.println(titlelnk.html());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
一些对 DOM 对象的操作(使用选择器查询元素)
package com.jsoup.demo.jsoup;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Downloads the cnblogs.com home page with Apache HttpClient and demonstrates
 * several jsoup CSS-selector queries on the parsed document.
 */
public class App2 {
    /**
     * Fetches the page, parses it and prints, in order: the text of every
     * post-title link, every {@code <a>} element that has an {@code href},
     * every {@code <img>} whose {@code src} ends in {@code .png}, and finally
     * the {@code href} and text of every post-title link.
     * Any failure is reported by printing the stack trace.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        HttpGet httpget = new HttpGet("https://www.cnblogs.com/");
        try {
            String res;
            // try-with-resources closes the response and then the client even
            // when reading the body fails, so no connection is leaked.
            try (CloseableHttpClient chc = HttpClients.createDefault();
                    CloseableHttpResponse chr = chc.execute(httpget)) {
                HttpEntity he = chr.getEntity();
                if (he == null) {
                    // Body-less responses have no entity; EntityUtils.toString
                    // would reject a null argument.
                    System.err.println("Response has no body: " + chr.getStatusLine());
                    return;
                }
                // "utf-8" is only the fallback used when the response does not
                // declare its own charset.
                res = EntityUtils.toString(he, "utf-8");
            }

            Document document = Jsoup.parse(res);

            // Select the post-title links once; the document is not modified
            // below, so the same result serves both loops that need it.
            Elements titleLinks = document.select("#post_list .post_item .post_item_body h3 a");

            // Text content of each selected link.
            for (Element a : titleLinks) {
                System.out.println(a.text());
            }

            // Every <a> element that has an href attribute.
            Elements hrefs = document.select("a[href]");
            for (Element href : hrefs) {
                System.out.println(href.toString());
            }

            // Every <img> whose src attribute ends with ".png".
            Elements pngs = document.select("img[src$=.png]");
            for (Element png : pngs) {
                System.out.println(png.toString());
            }

            // Attribute value and text of each post-title link; other
            // attributes can be read the same way through attr(name).
            for (Element a : titleLinks) {
                System.out.println(a.attr("href"));
                System.out.println(a.text());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}