HtmlParser，详细解释下对于页面上的所有节点解析。

最新推荐文章于 2015-11-23 22:04:00 发布

manzuo245756

最新推荐文章于 2015-11-23 22:04:00 发布

阅读量3k

点赞数

分类专栏： java解析类文章标签： properties string class html解析器 html null

本文链接：https://blog.csdn.net/manzuo245756/article/details/5891244

版权

java解析类专栏收录该内容

2 篇文章 0 订阅

订阅专栏

首先说一下，当初刚接触htmlparser的时候，我也是晕头转向的，不过经过2天的深入研究，终于写出了一些简单的代码，才发现这个jar包挺好用。这里把我以前在网上找不到的内容分享出来。先简单介绍一下什么是HtmlParser：它是一个开源项目，是专门用来解析html页面的包，功能很强大，现在的版本已经是2.0+了。好，废话不多说，直接进入正题。htmlparser对html的操作有好几种方式，这里简单介绍2种：

第一种，用标签过滤

/**
     * Extracts image addresses from the page at {@code url}.
     *
     * <p>Only {@code <img>} tags carrying the attribute {@code hspace="2"} are
     * considered. For each such tag the text between {@code src="} and the
     * closing quote that precedes the {@code width} attribute is taken as the
     * image address, handed to {@code saveImg.saveToFile} together with
     * {@code tuanPath}, and added to the returned list. Tags whose text does
     * not contain both markers, or that yield an empty address, are skipped.
     *
     * <p>NOTE: the substring extraction is tailored to one particular page
     * layout ({@code src="..." width=...}) and is not a general solution;
     * {@code ImageTag.getImageURL()} is the more accurate alternative.
     *
     * @param url address of the page to parse (network URL or local file)
     * @return the extracted image addresses, in document order; never null
     * @throws ParserException if the page cannot be fetched or parsed
     * @throws IOException declared for the image-saving step
     */
    public List<String> getHtmlImage(String url) throws ParserException, IOException {
        List<String> image = new ArrayList<String>();

        // The url is the address to parse; it may be remote or local.
        Parser parser = new Parser(url);
        parser.setEncoding("UTF-8"); // page encoding

        // AndFilter combines two NodeFilters, both of which must match:
        // the first selects the tag name, the second requires an attribute
        // (name, value) pair such as class / style / id / name and its value.
        // Example: <img hspace=2 src="http://www.baidu.com/bug.jpg">img.jpg</img>
        // yields the src value here; to read the inner text "img.jpg" instead,
        // use node.toPlainTextString().trim().
        AndFilter img = new AndFilter(new TagNameFilter("img"), new HasAttributeFilter("hspace", "2"));

        // Walk every node that passed the filter.
        NodeList nodelist = parser.extractAllNodesThatMatch(img);
        for (int i = 0; i < nodelist.size(); i++) {
            String text = nodelist.elementAt(i).getText();

            int srcAt = text.indexOf("src=");
            int widthAt = text.indexOf("width");
            if (srcAt == -1 || widthAt == -1) {
                // Without both markers the offsets below would be meaningless
                // (and could throw StringIndexOutOfBoundsException).
                continue;
            }

            int begin = srcAt + 5;  // skip past: src="
            int end = widthAt - 2;  // stop before: " width
            if (begin >= end) {
                // Markers in unexpected order, or an empty address.
                continue;
            }

            String temp = text.substring(begin, end);
            // Save and record only addresses that were actually extracted.
            this.saveImg.saveToFile(temp, this.tuanPath);
            image.add(temp);
        }
        return image;
    }

第二种，使用NodeVisitor解析html。个人觉得这种比较实用

package com.tz.html;

import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.tags.HeadingTag;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.NodeVisitor;
import com.tz.tools.PropertiesTools;
/**
* HTML解析
* @author Sunweikun
*
*/
public class TagSoupHtml {
    private PropertiesTools pts;
    private List<String> imagesList=new ArrayList<String>();
    private List<String> priceList=new ArrayList<String>();
    private List<String> valueList=new ArrayList<String>();
    private List<String> titleList=new ArrayList<String>();
    private List<String> numberList=new ArrayList<String>();
    private List<String> cityList=new ArrayList<String>();
    private List<String> urlList=new ArrayList<String>();
    /**
     * html解析器
     */
    public void getVisitorAll() {
        try {

//初始化一个Parser
Parser parser = new Parser();

//下列的循环是以配置文件方式写入，在后续会介绍读取配置文件的代码
for (int i = 0; i < this.pts.getHTMLURL().length; i++) {

//声明Properties 来接收已经写好的读取方法
final Properties p = this.pts.getProperties(this.pts.getHTMLURL()[i], "html");

//设置参数，URL
parser.setURL(p.getProperty("url"));

//根据URL的编码来动态过去编码
parser.setEncoding(parser.getEncoding());

                //开始进入正题，NodeVistor来解析，使用本身的内部类
                NodeVisitor visitor = new NodeVisitor() {
                    public void visitTag(Tag tag) {

//这句判断的意思是，如果我传入的images-tag里的值等于页面的节点开始解析,
if (tag.getTagName().equals(p.getProperty("images-tag"))) {

//如果标签等于images-tag之后并且传入的方式images-manner，例如标签的class style 的与获取的属性

//值images-class相匹配开始进行以下操作
if (p.getProperty("images-class").equals(tag.getAttribute(p.getProperty("images-manner")))) {

//如果这个节点下才是我们要取的数据比如<div class="hah"><img style="with:100px">asd.jpg</img>

                                //</div>这样的话就需要取下一个节点开始遍历
                                NodeList nodelist = tag.getChildren();
                                for (int i = 0; i < nodelist.size(); i++) {

//把遍历后的节点用Node接收
Node node = (Node) nodelist.elementAt(i);

//先要判断下边还有没有节点，如果没有
if(node.getChildren()==null){

//就判断这个节点的标签是否等于ImageTag 这个类就是对应Img标签的
if (node instanceof ImageTag) {

//取到所有Img的节点
ImageTag image = (ImageTag) nodelist .elementAt(i);

                                            //加入list里的是imagesURL地址这种方式比截取字符串更为准确
                                            imagesList.add(image.getImageURL());
                                        }
                                    }else{

//这个else就是代表着上个节点不为空，证明还有节点例如<div class="hah">

                                        // <a href="......." ><img style="with:100px">asd.jpg</img></a>这就是节点下还有节点
                                        NodeList list = node.getChildren();
                                        for (int j = 0; j < list.size(); j++) {

                                             //那么继续用Node接收相对应上个节点下的所有节点
                                            Node img = (Node) list.elementAt(j);
                                            if (img instanceof ImageTag) {
                                                ImageTag image = (ImageTag) list.elementAt(j);
                                                imagesList.add(image.getImageURL());
                                            }
                                        }
                                    }
                                }
                            }
                        }

                        //以下内容大同小异，。
                        if(tag.getTagName().equals(p.getProperty("title-tag"))){
                            if (p.getProperty("title-class").equals(tag.getAttribute(p.getProperty("title-manner")))) {
                                NodeList list = tag.getChildren();
                                for (int i = 0; i < list.size(); i++) {
                                    Node node = (Node) list.elementAt(i);
                                    if (node instanceof HeadingTag) {
                                        HeadingTag head = (HeadingTag) list.elementAt(i);
                                        titleList.add(head.toPlainTextString().trim());
                                        urlList.add(p.getProperty("url"));
                                    }
                                }
                            }
                        }
                        if (tag.getTagName().equals(p.getProperty("price-tag"))) {
                            if (p.getProperty("price-class").equals(tag.getAttribute(p.getProperty("price-manner")))) {
                                int begin= tag.toPlainTextString().indexOf("¥");
                                String price=null;
                                if(begin!=-1){
                                    price=tag.toPlainTextString().substring(begin + 1).trim();
                                }else{
                                    price=tag.toPlainTextString().substring(tag.toPlainTextString().indexOf("￥")+1);
                                }
                                priceList.add(price);
                            }
                        }
                        if (tag.getTagName().equals(p.getProperty("value-tag"))) {
                            if (p.getProperty("value-class").equals(tag.getAttribute(p.getProperty("value-manner")))) {
                                int begin= tag.getParent().toPlainTextString().indexOf("¥");
                                String value=null;
                                if(begin!=-1){
                                    value=tag.getParent().toPlainTextString().substring(begin + 1).trim();
                                }else{
                                    value=tag.getParent().toPlainTextString().substring(tag.getParent().toPlainTextString().indexOf("￥")+1).trim();
                                }
                                valueList.add(value);
                            }
                        }
                        if (tag.getTagName().equals(p.getProperty("number-tag"))) {
                            if (p.getProperty("number-class").equals(tag.getAttribute(p.getProperty("number-manner")))) {
                                String number=tag.toPlainTextString().trim();
                                numberList.add(number);
                            }
                        }
                        if(tag.getTagName().equals(p.getProperty("city-tag"))){
                            if(p.getProperty("city-class").equals(tag.getAttribute(p.getProperty("city-manner")))){
                               String city=null;
                               if(tag.toPlainTextString().length()==2 || tag.toPlainTextString().length()==3){
                                   city = tag.toPlainTextString().trim();
                               }else{
                                   city = tag.toPlainTextString().substring(0, 3).trim();
                               }
                               cityList.add(city);
                            }
                        }


                    }
                };

                parser.visitAllNodesWith(visitor);
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }

    }

    public void setPts(PropertiesTools pts) {
        this.pts = pts;
    }
    public List<String> getPriceList() {
        return priceList;
    }

    public void setPriceList(List<String> priceList) {
        this.priceList = priceList;
    }

    public List<String> getValueList() {
        return valueList;
    }

    public void setValueList(List<String> valueList) {
        this.valueList = valueList;
    }

    public List<String> getTitleList() {
        return titleList;
    }

    public void setTitleList(List<String> titleList) {
        this.titleList = titleList;
    }

    public List<String> getNumberList() {
        return numberList;
    }

    public void setNumberList(List<String> numberList) {
        this.numberList = numberList;
    }

    public List<String> getCityList() {
        return cityList;
    }

    public void setCityList(List<String> cityList) {
        this.cityList = cityList;
    }

    public List<String> getImagesList() {
        return imagesList;
    }

    public void setImagesList(List<String> imagesList) {
        this.imagesList = imagesList;
    }

    public List<String> getUrlList() {
        return urlList;
    }

    public void setUrlList(List<String> urlList) {
        this.urlList = urlList;

}
}

这就是第二种解析方式，也很简单吧。下面我贴出读取属性文件的类，以及属性文件里的内容，这样我想大家都能看得很清楚

package com.tz.tools;

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import java.util.Set;

import org.apache.commons.lang.StringUtils;

/**
 * Helpers for reading the scraper's {@code .properties} configuration files,
 * plus two small string/list utilities.
 *
 * <p>All file paths are relative to the working directory. Read failures are
 * printed and otherwise tolerated: callers get an empty {@link Properties}
 * or an empty array instead of an exception.
 *
 * @author Sunweikun
 */
public class PropertiesTools {

    /** Name of the main configuration file listing the per-site files. */
    private static final String MAIN_CONFIG = "config.properties";

    /**
     * Loads {@code <manner>/<path>.properties}.
     *
     * @param path file name without directory and without the
     *             {@code .properties} extension
     * @param manner directory holding the file, {@code xml} or {@code html}
     * @return the loaded properties; empty if the file could not be read
     */
    public Properties getProperties(String path, String manner) {
        return loadQuietly(manner + "/" + path + ".properties");
    }

    /**
     * Reads the comma-separated {@code xmlName} entry of the main
     * configuration file.
     *
     * @return the configured names; an empty array if the file or the entry
     *         is missing
     */
    public String[] getXMLURL() {
        return readNameList("xmlName");
    }

    /**
     * Reads the comma-separated {@code htmlName} entry of the main
     * configuration file.
     *
     * @return the configured names; an empty array if the file or the entry
     *         is missing
     */
    public String[] getHTMLURL() {
        return readNameList("htmlName");
    }

    /**
     * Tells whether a string is null, empty, or blank once trimmed.
     *
     * @param astr the string to test, may be null
     * @return {@code true} if nothing but whitespace remains after
     *         {@link String#trim()}
     */
    public static boolean isTrimEmpty(String astr) {
        if (astr == null) {
            return true;
        }
        // trim() only strips characters <= ' '; the loop additionally treats
        // remaining Unicode whitespace (e.g. the ideographic space) as blank,
        // which is the result StringUtils.isBlank gave on the trimmed string.
        String trimmed = astr.trim();
        for (int i = 0; i < trimmed.length(); i++) {
            if (!Character.isWhitespace(trimmed.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Removes duplicate entries from {@code list} in place, keeping the first
     * occurrence of each value and the original order.
     *
     * @param list the list to de-duplicate; a null list is left alone
     */
    public static void removeDuplicateWithOrder(List<String> list) {
        if (list == null) {
            return;
        }
        Set<String> seen = new HashSet<String>();
        List<String> unique = new ArrayList<String>();
        for (String element : list) {
            // Set.add returns false for values already encountered.
            if (seen.add(element)) {
                unique.add(element);
            }
        }
        list.clear();
        list.addAll(unique);
    }

    /** Splits the given entry of the main configuration file on commas. */
    private static String[] readNameList(String key) {
        String names = loadQuietly(MAIN_CONFIG).getProperty(key);
        if (names == null) {
            // Missing file or missing key: nothing is configured.
            return new String[0];
        }
        return names.split(",");
    }

    /**
     * Loads a properties file, printing (not throwing) any I/O error and
     * always closing the underlying stream.
     */
    private static Properties loadQuietly(String fileName) {
        Properties p = new Properties();
        InputStream in = null;
        try {
            in = new BufferedInputStream(new FileInputStream(fileName));
            p.load(in);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if close fails after reading.
                }
            }
        }
        return p;
    }

}

配置文件内容：我写了多个网站的解析配置，这里只贴其中一个，文件名为nuomi.properties
#备注:要抓取的标签名称必须大写,配置文件主要以3种方式来配，标签名称，标签方式，标签属性及样式

#网站名称
url=http://www.nuomi.com
#要取的标签
title-tag=DIV
#要取的标签里的 class 或者 style 里的内容
title-class=deal-main
#以什么方式比如<div class="deal-title"> 或 <div style="width:00px">
title-manner=class
#图片
images-tag=DIV
images-class=product-pic
images-manner=class
#原始价格
value-tag=STRONG
value-class=original
value-manner=class
#购买人数
number-tag=SPAN
number-class=num
number-manner=class
#现在价格
price-tag=P
price-class=cur-price
price-manner=class
#城市
city-tag=DIV
city-class=area
city-manner=class

------------------------------------------

这里是config.properties，它是主配置文件

#写入新加的配置文件名称以,号取文件名称对xml操作24juan,aibang,didatuan,ftuan,lashou,meituan,pintuan,sohu,tuanbao,tuanku,wowo
xmlName=24juan,aibang,didatuan,ftuan,lashou,meituan,pintuan,sohu,tuanku,tuanbao,wowo
#写入新加的配置对html操作
htmlName=nuomi,xinlang

#24juan,aibang,didatuan,ftuan,lashou,meituan,pintuan,sohu,tuanku,tuanbao,wowo,

很久都没有写博客了，这次换了个博客地址来写，因为以前的博客都丢了，这是我丢了博客之后第一次写博客，相隔也有半年多了。哪里写得不好，各位大虾可以批评，欢迎各种反对意见；如有不满意之处，请加本人QQ（395141222）批评指正。