Jsoup
jsoup 是一款 Java 的 HTML 解析器,可直接解析某个 URL 地址或 HTML 文本内容。它提供了一套非常省力的 API,可通过 DOM、CSS 以及类似于 jQuery 的操作方法来取出和操作数据。
主要功能:
1. 从一个 URL、文件或字符串中解析 HTML;
2. 使用 DOM 或 CSS 选择器来查找、取出数据;
3. 可操作 HTML 元素、属性、文本。
/**
 * Demonstrates jsoup's DOM-style lookups: by tag name, id, class name,
 * attribute name, and attribute name/value pair.
 *
 * <p>The page is downloaded with Apache HttpClient, decoded as UTF-8, parsed
 * into a jsoup {@code Document}, and the matches are printed to standard output.
 */
public class JsoupDemo02 {

    /**
     * Fetches {@code https://www.cnblogs.com/} and prints the elements found by
     * each lookup style.
     *
     * <p>I/O failures while executing the request, reading the body, or closing
     * the response are caught and their stack trace is printed.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if closing the HTTP client fails, or an unchecked
     *                   exception escapes from the HTTP or parsing libraries
     */
    public static void main(String[] args) throws Exception {
        // try-with-resources guarantees the client is closed even when the
        // parsing code below throws an unchecked exception.
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            HttpGet httpGet = new HttpGet("https://www.cnblogs.com/");

            // The response is a second resource so it is released before the client.
            try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
                // Read the response body as a UTF-8 string.
                HttpEntity entity = response.getEntity();
                String content = EntityUtils.toString(entity, "utf-8");

                // Build the DOM tree from the raw HTML.
                Document document = Jsoup.parse(content);

                // Lookup by tag name; first() yields null when nothing matched,
                // so guard instead of indexing with get(0).
                Elements elementsByTag = document.getElementsByTag("title");
                Element firstTitle = elementsByTag.first();
                if (firstTitle != null) {
                    System.out.println("网页第一个标题: " + firstTitle.text());
                } else {
                    System.out.println("未找到title元素");
                }

                // Lookup by id; getElementById returns null when the id is absent.
                Element elementById = document.getElementById("site_nav_top");
                if (elementById != null) {
                    System.out.println("id为site_nav_top的内容:" + elementById.html());
                } else {
                    System.out.println("未找到id为site_nav_top的元素");
                }

                // Lookup by class name.
                Elements elementsByClass = document.getElementsByClass("post_item");
                for (Element element : elementsByClass) {
                    System.out.println("class属性为post_item: " + element.html());
                }

                // Lookup by attribute name: every element that has a "height" attribute.
                Elements elementsByAttribute = document.getElementsByAttribute("height");
                for (Element element : elementsByAttribute) {
                    System.out.println("属性为height: " + element.toString());
                }

                // Lookup by attribute value: every element with target="_blank".
                Elements elementsByAttributeValue =
                        document.getElementsByAttributeValue("target", "_blank");
                for (Element element : elementsByAttributeValue) {
                    System.out.println("属性target的值为_blank: " + element.toString());
                }
            } catch (IOException e) {
                // ClientProtocolException is an IOException, so this single
                // handler covers both of the original catch clauses.
                e.printStackTrace();
            }
        }
    }
}
/**
 * Demonstrates jsoup's CSS-selector lookups via {@code Document.select}.
 *
 * <p>The page is downloaded with Apache HttpClient, decoded as UTF-8, parsed
 * into a jsoup {@code Document}, and the matches are printed to standard output.
 */
public class JsoupDemo03 {

    /**
     * Fetches {@code https://www.cnblogs.com/} and prints the elements matched
     * by several CSS selectors, followed by the page title.
     *
     * <p>I/O failures while executing the request, reading the body, or closing
     * the response are caught and their stack trace is printed.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if closing the HTTP client fails, or an unchecked
     *                   exception escapes from the HTTP or parsing libraries
     */
    public static void main(String[] args) throws Exception {
        // try-with-resources guarantees the client is closed even when the
        // parsing code below throws an unchecked exception.
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            HttpGet httpGet = new HttpGet("https://www.cnblogs.com/");

            // The response is a second resource so it is released before the client.
            try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
                // Read the response body as a UTF-8 string.
                HttpEntity entity = response.getEntity();
                String content = EntityUtils.toString(entity, "utf-8");

                // Build the DOM tree from the raw HTML.
                Document document = Jsoup.parse(content);

                // Descendant selector combining an id, two classes and tags:
                // links inside h3 headings of each post body under #post_list.
                Elements selectElements =
                        document.select("#post_list .post_item .post_item_body h3 a");
                for (Element element : selectElements) {
                    System.out.println("标题: " + element.text());
                    System.out.println("博客地址: " + element.attr("href"));
                    System.out.println("target: " + element.attr("target"));
                }
                System.out.println("------------");

                // Attribute-presence selector: every <a> that has an href attribute.
                Elements hrefElements = document.select("a[href]");
                for (Element element : hrefElements) {
                    System.out.println("a标签的链接: " + element.toString());
                }

                // Attribute-suffix selector: every <img> whose src ends with ".png".
                Elements imgElements = document.select("img[src$=.png]");
                for (Element element : imgElements) {
                    System.out.println("图片: " + element.toString());
                }

                // first() yields null when the page has no <title>, so guard
                // before dereferencing.
                Element title = document.getElementsByTag("title").first();
                if (title != null) {
                    System.out.println("网页标题: " + title.text());
                } else {
                    System.out.println("未找到title元素");
                }
            } catch (IOException e) {
                // ClientProtocolException is an IOException, so this single
                // handler covers both of the original catch clauses.
                e.printStackTrace();
            }
        }
    }
}