本文出自:http://blog.csdn.net/dt235201314/article/details/79003591
一、效果图
二、概述
jsoup 是一款Java 的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。
jsoup的主要功能如下:
1. 从一个URL,文件或字符串中解析HTML;
2. 使用DOM或CSS选择器来查找、取出数据;
3. 可操作HTML元素、属性、文本;
三、看代码
1.选取要抓取的网络地址 例:我的博客 http://blog.csdn.net/dt235201314
2.选取要抓取的内容
博主信息类:
public class BlogAuthor {
    //作者名字
    private String authorName;
    //访问数量
    private String visitNumber;
    //积分
    private String mark;
    //排名
    private String rank;
    //原创文章数量
    private String originalArticleNumber;
    //转载文章数量
    private String reprintArticleNumber;
    //翻译文章数量
    private String translateArticleNumber;
    //评论数量
    private String commentNumber;
    //头像链接
    private String avatarUrl;
    //我的代号
    private String code;
    //我的名言
    private String myHelloWorld;
页面文章摘要类:
public class BlogIntroduction {
    //文章标题
    private String title;
    //文章摘要
    private String description;
    //文章信息,包括阅读量,评论数,发表时间等
    private String msg;
    //文章分类
    private String category;
    //文章链接
    private String url;
3.相关操作(谷歌浏览器)
在页面上右键点击要抓取的内容,选择“检查”,这个时候浏览器会出现源码,且锁定到你选取的位置
其它位置一样。
4.代码解析
添加依赖:
// The 'compile' configuration was removed in Gradle 7.0; 'implementation' is its replacement.
implementation 'org.jsoup:jsoup:1.9.2'
/** * 获取博主的基本信息 * * @return 博主信息 */ public static BlogAuthor getBlogAutoMessage() { Document doc; BlogAuthor blogAuthor = null; Elements elements; /**作者名字*/ String authorName; /** 访问数量*/ String visitNumber; /** 积分*/ String mark; /** 排名*/ String rank; /** 原创文章数量*/ String originalArticleNumber; /** 转载文章数量*/ String reprintArticleNumber; /** 翻译文章数量*/ String translateArticleNumber; /** 评论数量*/ String commentNumber; /** 头像链接*/ String avatarUrl; /**我的代号*/ String code; /**我的名言*/ String myHelloWorld; try { doc = Jsoup.connect(BLOG_HOMEPAGE) .userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.64 Safari/537.31") //"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.64 Safari/537.31" //"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0" .timeout(10000).get(); if (doc != null) { elements = doc.select("div#blog_title").select("h2").select("a"); code = elements.first().text(); elements = doc.select("div#blog_title").select("h3"); myHelloWorld = elements.first().text(); elements = doc.select("div#blog_userface").select("a.user_name"); authorName = elements.first().text(); elements = doc.select("div#blog_userface").select("a").select("img"); avatarUrl = elements.first().attr("src"); elements = doc.select("ul#blog_rank").select("li"); visitNumber = elements.get(0).text(); mark = elements.get(1).text(); rank = elements.get(3).text(); elements = doc.select("ul#blog_statistics").select("li"); originalArticleNumber = elements.get(0).text(); reprintArticleNumber = elements.get(1).text(); translateArticleNumber = elements.get(2).text(); commentNumber = elements.get(3).text(); blogAuthor = new BlogAuthor(code,myHelloWorld,authorName, visitNumber, mark, rank, originalArticleNumber, reprintArticleNumber, translateArticleNumber, commentNumber, avatarUrl); } } catch (Exception e) { e.printStackTrace(); blogAuthor = new BlogAuthor("","","", "访问:0", "积分:0", "积分:0", "原创:0", "转载:0", 
"译文:0", "评论:0", ""); } return blogAuthor; }
/**
 * Fetches one page of the blog's time-ordered article list and builds an
 * introduction entry for every article block found on it.
 *
 * @param pages number of the list page to fetch; values below 1 are rejected
 * @return the introductions found on the page (possibly empty), or {@code null} when
 *         {@code pages} is less than 1 or greater than the value returned by
 *         {@code getBlogPages()}, or when the page cannot be downloaded
 */
public static List<BlogIntroduction> getOnePageBlogIntroductionByTime(int pages) {
    // Short-circuit keeps the original order: getBlogPages() is only consulted for pages >= 1.
    if (pages < 1 || pages > getBlogPages()) {
        return null;
    }

    Document doc;
    try {
        doc = Jsoup.connect(BLOG_HOMEPAGE + "/article/list/" + pages)
                .userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.64 Safari/537.31")
                .timeout(10000)
                .get();
    } catch (IOException e) {
        e.printStackTrace();
        // The null result on a failed download is kept for callers that test for it.
        return null;
    }

    // select() never returns null - an unmatched query yields an empty collection - so no
    // null check is needed before iterating.
    Elements blogList = doc.select("div#article_list > div");
    List<BlogIntroduction> blogIntroductionList = new ArrayList<>();
    for (Element blogItem : blogList) {
        // The heading holds both the title text and the article link; query it once.
        Elements heading = blogItem.select("div.article_title > h1");

        BlogIntroduction blogIntroduction = new BlogIntroduction();
        blogIntroduction.setTitle(heading.text());
        blogIntroduction.setDescription(blogItem.select("div.article_description").text());
        blogIntroduction.setMsg(blogItem.select("div.article_manage").text());
        blogIntroduction.setUrl(BASE_PATH
                + heading.select("span.link_title").select("a").attr("href"));
        // No category is scraped from the list page; the field is set to an empty string.
        blogIntroduction.setCategory("");
        blogIntroductionList.add(blogIntroduction);
    }
    return blogIntroductionList;
}
测试类
/**
 * Manual smoke test: scrapes the author profile and the first page of article
 * introductions through {@code JsoupUtil} and prints the results to standard output.
 */
public class JsoupUtilTest {

    /**
     * Runs both scrapers and prints every extracted value on its own line.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        BlogAuthor blogAuthor = JsoupUtil.getBlogAutoMessage();
        List<BlogIntroduction> blogs = JsoupUtil.getOnePageBlogIntroductionByTime(1);

        print(blogAuthor.getAuthorName());
        print(blogAuthor.getCommentNumber());
        print(blogAuthor.getAvatarUrl());
        print(blogAuthor.getVisitNumber());
        print(blogAuthor.getRank());
        print(blogAuthor.getMark());
        print(blogAuthor.getOriginalArticleNumber());
        print(blogAuthor.getReprintArticleNumber());
        print(blogAuthor.getTranslateArticleNumber());
        print(blogAuthor.getCode());
        print(blogAuthor.getMyHelloWorld());

        // getOnePageBlogIntroductionByTime returns null when the page is out of range or
        // cannot be downloaded; iterating the result unguarded would throw.
        if (blogs == null) {
            System.err.println("No blog introductions could be fetched for page 1.");
            return;
        }
        for (BlogIntroduction blog : blogs) {
            print(blog.getTitle());
            print(blog.getDescription());
        }
    }

    /** Prints one value with the marker prefix used for all output lines. */
    private static void print(String value) {
        System.out.println("==-->" + value);
    }
}
输出见效果图
参考内容:
Android 个人博客客户端——My CSDN 的实现(2)
四、源码下载
如果文章对你有用,欢迎star,欢迎关注。