jsoup解析HTML

13 篇文章 0 订阅
package com.test.html.jsoup;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import javax.naming.Context;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;



/**
 * Abstract base for jsoup-backed HTML page parsers.
 *
 * <p>Holds the jsoup connection and parsed document together with the data
 * extracted from a page (main title, header and sub-titles), and supplies
 * default implementations of the {@code IHtmlBehavior} operations for
 * subclasses to extend.
 */
public abstract class BasicHtmlFramework implements IHtmlBehavior{

    /**
     * List returned by {@link #getShowList()}. Subclasses do not instantiate
     * it; they assign a reference to one of their own lists.
     */
    protected List<? extends Object> showList;

    /** Parsed document; stays {@code null} until a subclass fetches one. */
    public Document jsoupDocument;
    /** Connection prepared by {@link #doGetHtmlJsoupDocument(String)}. */
    public Connection jsoupConnection;

    /** Link address of the original web page. */
    protected String strUrl;

    /** Main title of the page. */
    protected String htmlTitle;
    /** Header of the page. */
    protected String htmlHeader;
    /** Sub-titles of the page. */
    protected List<SubTitle> htmlSubTitles;

    /** Fixed-size table of frameworks; slot 0 always refers to this instance. */
    protected BasicHtmlFramework[] mBasicHtmlFrameworks = new BasicHtmlFramework[10];

    /**
     * Creates a framework without a URL and prints the registered instance
     * to standard output.
     */
    public BasicHtmlFramework() {
        // Initialise first so that toString() below sees a fully built object.
        this(null);
        System.out.println("mBasicHtmlFrameworks[0] -> " + mBasicHtmlFrameworks[0]);
    }

    /**
     * Creates a framework bound to the given page URL.
     *
     * @param strUrl link address of the page to parse; may be {@code null}
     */
    public BasicHtmlFramework(String strUrl){
        this.strUrl = strUrl;
        init();
    }

    /** Shared constructor set-up, so both constructors leave identical state. */
    private void init(){
        htmlSubTitles = new ArrayList<>();
        mBasicHtmlFrameworks[0] = this;
    }

    /**
     * Returns the list most recently assigned to {@link #showList}.
     *
     * @return the current show list, or {@code null} if none has been assigned
     */
    @Override
    public List<? extends Object> getShowList() {
        return showList;
    }

    /**
     * Records the URL and prepares a jsoup connection to it with a timeout
     * value of 5000. The page is not fetched here.
     *
     * @param urlParam address of the page to connect to
     */
    @Override
    public void doGetHtmlJsoupDocument(String urlParam) {
        setStrUrl(urlParam);
        jsoupConnection = Jsoup.connect(urlParam).timeout(5000);
    }

    /**
     * Validates the parse mode; the base class performs no parsing itself.
     *
     * @param param parse mode selector, must not be negative
     * @throws IOException if {@code param} is negative
     */
    @Override
    public void doParseHtmlDocument(int param) throws IOException {
        if (param < 0){
            // 0 is a valid mode, so the message states ">= 0" rather than "> 0".
            throw new IOException("unexpected param's value required : param greater than or equal to 0 : " + param);
        }
    }

    /**
     * Does nothing in the base class.
     *
     * @param htmlContent unused here
     */
    @Override
    public void doGetHtmlContent(String htmlContent) {
        // Intentionally empty.
    }

    /** @return link address of the original page */
    public String getStrUrl() {
        return strUrl;
    }

    /** @param strUrl link address of the original page */
    public void setStrUrl(String strUrl) {
        this.strUrl = strUrl;
    }

    /** @return main title of the page */
    public String getHtmlTitle() {
        return htmlTitle;
    }

    /** @param htmlTitle main title of the page */
    public void setHtmlTitle(String htmlTitle) {
        this.htmlTitle = htmlTitle;
    }

    /** @return header of the page */
    public String getHtmlHeader() {
        return htmlHeader;
    }

    /** @param htmlHeader header of the page */
    public void setHtmlHeader(String htmlHeader) {
        this.htmlHeader = htmlHeader;
    }

    /** @return the sub-title list itself (not a copy) */
    public List<SubTitle> getHtmlSubTitles() {
        return htmlSubTitles;
    }

    /** @param htmlSubTitles list that replaces the current sub-title list */
    public void setHtmlSubTitles(List<SubTitle> htmlSubTitles) {
        this.htmlSubTitles = htmlSubTitles;
    }


    @Override
    public String toString() {
        return "BasicHtmlFramework [htmlTitle=" + htmlTitle + ", htmlHeader="
                + htmlHeader + ", htmlSubTitles=" + htmlSubTitles + "]";
    }


    /** A sub-title of the page together with the link it points to. */
    public class SubTitle{
        private String subTitle;
        private String subTitleLink;

        /** @return the sub-title text */
        public String getSubTitle() {
            return subTitle;
        }

        /** @param subTitle the sub-title text */
        public void setSubTitle(String subTitle) {
            this.subTitle = subTitle;
        }

        /** @return the link as the string it was stored with */
        public String getSubTitleLink() {
            return subTitleLink;
        }

        /** @param subTitleLink the link as a string */
        public void setSubTitleLink(String subTitleLink) {
            this.subTitleLink = subTitleLink;
        }

        /**
         * Converts the stored link into a {@link URL}.
         *
         * @return the parsed URL, or {@code null} when the link is not a
         *         well-formed URL (the stack trace is printed in that case)
         */
        public URL getUrlFromLink(){
            try {
                return new URL(subTitleLink);
            } catch (MalformedURLException e) {
                // Best effort: report the problem and let the caller handle null.
                e.printStackTrace();
                return null;
            }
        }

    }

}

以上代码定义了本次解析 HTML 所用的抽象基类 BasicHtmlFramework，它实现了 IHtmlBehavior 接口。

package com.test.html.jsoup;

import java.io.IOException;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;


/**
 * Concrete {@link BasicHtmlFramework} that fetches a page with jsoup and
 * extracts either its navigation titles or its book-type list, depending on
 * the mode passed to {@link #doParseHtmlDocument(int)}.
 */
public class RongShuXiaHtmlFramework extends BasicHtmlFramework{

    /** Books list; nothing in this class adds to it. */
    private List<AllShowBook> books = new LinkedList<>();
    /** Book types collected by {@link #handleHtmlBookTypes()}. */
    private List<RSXBookTypes> bookTypes = new LinkedList<>();
    /** Text of the {@code h2} heading found above the book-type list. */
    private String h2TitleBookType;

    /**
     * Creates a framework bound to the given page URL.
     *
     * @param strUrl link address of the page to parse
     */
    public RongShuXiaHtmlFramework(String strUrl) {
        super(strUrl);
    }

    /** Creates a framework without a URL. */
    public RongShuXiaHtmlFramework() {
    }

    /** @return the book-type list itself (not a copy) */
    public List<? extends RSXBookTypes> getBookTypes(){
        return bookTypes;
    }

    /** @return the books list itself (not a copy) */
    public List<? extends RSXBookTypes> getBooks(){
        return books;
    }

    /** @return the heading text of the book-type section, or {@code null} if not parsed */
    public String getBookTypeTitle(){
        return h2TitleBookType;
    }

    /**
     * Parses the book types: reads the first element with class
     * {@code showContentLeft}, stores its {@code h2} text, then turns every
     * {@code li} under the first nested element with class {@code clear} into
     * an {@link RSXBookTypes} entry and publishes the list as {@code showList}.
     *
     * <p>Returns without changing the lists when either container element is
     * missing from the document.
     */
    private synchronized void handleHtmlBookTypes(){
        // first() yields null when nothing matches, so guard before use.
        Element container = jsoupDocument.getElementsByAttributeValue("class", "showContentLeft").first();
        if (container == null){
            return;
        }
        h2TitleBookType = container.select("h2").text();

        Element typeList = container.getElementsByAttributeValue("class", "clear").first();
        if (typeList == null){
            return;
        }

        // NOTE(review): entries are appended on every call, so parsing twice
        // duplicates them - confirm whether callers rely on accumulation.
        for (Element item : typeList.getElementsByTag("li")){
            String name = item.getElementsByTag("a").text();
            String link = item.select("a").attr("href").trim();
            String count = item.getElementsByTag("span").text();

            // Cut the count off the anchor text. An empty count must be skipped:
            // indexOf("") is 0 and would otherwise erase the whole name.
            if (!count.isEmpty()){
                int countStart = name.indexOf(count);
                if (countStart >= 0){
                    name = name.substring(0, countStart);
                }
            }

            RSXBookTypes bookType = new RSXBookTypes();
            bookType.typeBookCount = count;
            bookType.typeBookLink = link;
            bookType.typeBookName = name;
            bookTypes.add(bookType);
        }
        showList = bookTypes;
    }

    /**
     * Parses the titles: stores the document {@code <title>} text, then turns
     * every {@code li} under the element with id {@code nav} into a
     * {@link SubTitle} and publishes the sub-title list as {@code showList}.
     *
     * <p>Only the main title is updated when the {@code nav} element is
     * missing from the document.
     */
    private synchronized void handleHtmlTitle(){
        htmlTitle = jsoupDocument.head().getElementsByTag("title").text().trim();

        // getElementById yields null when the id is absent.
        Element nav = jsoupDocument.getElementById("nav");
        if (nav == null){
            return;
        }

        for (Element item : nav.getElementsByTag("li")){
            String title = item.getElementsByTag("a").text();
            String link = item.select("a").attr("href").trim();

            SubTitle subTitle = new SubTitle();
            // Entries without an href fall back to the page's own URL.
            subTitle.setSubTitleLink(link.isEmpty() ? strUrl : link);
            subTitle.setSubTitle(title);
            htmlSubTitles.add(subTitle);
        }
        showList = htmlSubTitles;
    }

    /**
     * Prepares the connection through the superclass and fetches the page
     * with a GET request.
     *
     * <p>When the fetch fails the stack trace is printed and
     * {@code jsoupDocument} is left {@code null}, so a later parse call does
     * not work on the document of a previously fetched URL.
     *
     * @param urlParam address of the page to fetch
     */
    @Override
    public void doGetHtmlJsoupDocument(String urlParam) {
        super.doGetHtmlJsoupDocument(urlParam);
        if (jsoupConnection == null){
            return;
        }
        // Drop any earlier document: strUrl already points at the new page.
        jsoupDocument = null;
        try {
            jsoupDocument = jsoupConnection.get();// a POST request might be used here instead
        } catch (IOException e) {
            // Best effort: report the failure and leave jsoupDocument null.
            e.printStackTrace();
        }
    }

    /**
     * Parses the fetched document according to the requested mode.
     * Does nothing when no document has been fetched or the mode is unknown.
     *
     * @param param {@code 0} to parse the titles, {@code 1} to parse the book types
     */
    @Override
    public synchronized void doParseHtmlDocument(int param) {
        if (jsoupDocument == null){
            return;
        }

        switch (param) {
        case 0:
            handleHtmlTitle();
            break;
        case 1:
            handleHtmlBookTypes();
            break;
        default:
            break;
        }
    }

    /**
     * Does nothing in this class.
     *
     * @param htmlContent unused here
     */
    @Override
    public void doGetHtmlContent(String htmlContent) {
        // Intentionally empty.
    }

    /** One book type: its display name, its count text and its link. */
    public class RSXBookTypes{
        public String typeBookCount;
        public String typeBookName;
        public String typeBookLink;
    }

    /** A book entry that adds a picture link to the book-type fields. */
    public class AllShowBook extends RSXBookTypes{
        public String picLink;
    }

}

以上是具体实现类 RongShuXiaHtmlFramework 的完整内容。
相关DEMO下载:
http://download.csdn.net/detail/tangzhide/9647865

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值