HtmlParser:详细讲解如何解析页面上的所有节点。

首先说一下,当初刚接触 HtmlParser 的时候,我也晕头转向,不过经过两天的深入研究,终于写出了一些简单的代码,才发现这个 jar 包挺好用。这里分享的是我以前在网上找不到的内容。先简单介绍一下什么是 HtmlParser:它是一个开源项目,是专门用来解析 HTML 页面的包,功能很强大,目前版本已经是 2.0 以上。好,废话不多说,直接进入正题。HtmlParser 对 HTML 的操作方式有好几种,这里简单介绍两种:

第一种,用标签过滤

 /**
     * 取IMG
     *
     * @param url
     * @throws ParserException
     * @throws IOException
     */
    public List<String> getHtmlImage(String url) throws ParserException, IOException {
        List<String> image = new ArrayList<String>();

        //这里的url就是要传入的地址,网络以及本地
        Parser parser = new Parser(url);
        parser.setEncoding("UTF-8");//编码设置。

        //标签过滤,andFilter 见名字就知道,是与什么进行过滤,首先声明AndFliter,这个对象里有两个参数,都是NodeFilter类型的,

        //里面的第一个参数传入的是一个标签,第二个参数对象呢是获取的属性进行过滤,也有两个参数,都是String类型的,第一个参数是

        //以什么方式传入,方式例如标签里的class stype id name 等等。后边的就是传入以第一个参数对应的值

        //例如<img hspace=2 src="http://www.baidu.com/bug.jpg">img.jpg</img> 本代码会解析出src里的值,如果想要把标签

        //里的img.jpg解析出来用String temp=node.toPlainTextString().trim();
        AndFilter img = new AndFilter(new TagNameFilter("img"), new HasAttributeFilter("hspace", "2"));

        //把过滤后的内容加入NodeList里进行遍历
        NodeList nodelist = parser.extractAllNodesThatMatch(img);
        for (int i = 0; i < nodelist.size(); i++) {
            Node node = nodelist.elementAt(i);

            //下列两行代码因本需求的需要,进行的截取字符串,后续介绍不需要这么截取,因为它不通用。
            int begin = node.getText().indexOf("src=") + 5;
            int end = node.getText().indexOf("width") - 2;
            String temp = node.getText().substring(begin, end);
            this.saveImg.saveToFile(temp,this.tuanPath);
            if (temp != null && temp != "" && !temp.equals("")) { 
                image.add(temp);
            }
        }
        return image;
    }

 

第二种,使用 NodeVisitor 解析 HTML。个人觉得这种方式比较实用。

 

package com.tz.html;

import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.tags.HeadingTag;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.NodeVisitor;
import com.tz.tools.PropertiesTools;
/**
 * HTML解析
 * @author Sunweikun
 *
 */
public class TagSoupHtml {
    private PropertiesTools pts;
    private List<String> imagesList=new ArrayList<String>();
    private List<String> priceList=new ArrayList<String>();
    private List<String> valueList=new ArrayList<String>();
    private List<String> titleList=new ArrayList<String>();
    private List<String> numberList=new ArrayList<String>();
    private List<String> cityList=new ArrayList<String>();
    private List<String> urlList=new ArrayList<String>();
    /**
     * html解析器
     */
    public void getVisitorAll() {
        try {

            //初始化一个Parser
            Parser parser = new Parser();

            //下列的循环是以配置文件方式写入,在后续会介绍读取配置文件的代码
            for (int i = 0; i < this.pts.getHTMLURL().length; i++) {

                //声明Properties 来接收已经写好的读取方法
                final Properties p = this.pts.getProperties(this.pts.getHTMLURL()[i], "html");

                //设置参数,URL
                parser.setURL(p.getProperty("url"));

                //根据URL的编码来动态过去编码
                parser.setEncoding(parser.getEncoding());

                //开始进入正题,NodeVistor来解析,使用本身的内部类
                NodeVisitor visitor = new NodeVisitor() {
                    public void visitTag(Tag tag) {      

                        //这句判断的意思是,如果我传入的images-tag里的值等于页面的节点开始解析,
                        if (tag.getTagName().equals(p.getProperty("images-tag"))) { 

                         //如果标签等于images-tag之后并且 传入的方式images-manner,例如 标签的class style 的与获取的属性

                           //值images-class相匹配开始进行以下操作
                            if (p.getProperty("images-class").equals(tag.getAttribute(p.getProperty("images-manner")))) {

                                //如果这个节点下才是我们要取的数据 比如<div class="hah"><img style="with:100px">asd.jpg</img>

                                //</div>这样的话就需要取下一个节点开始遍历
                                NodeList nodelist = tag.getChildren();
                                for (int i = 0; i < nodelist.size(); i++) {

                                    //把遍历后的节点用Node接收
                                    Node node = (Node) nodelist.elementAt(i);

                                    //先要判断下边还有没有节点,如果没有
                                     if(node.getChildren()==null){

                                        //就判断这个节点的标签是否等于ImageTag 这个类就是对应Img标签的
                                        if (node instanceof ImageTag) {

                                            //取到所有Img的节点
                                            ImageTag image = (ImageTag) nodelist .elementAt(i);

                                            //加入list里的是imagesURL地址这种方式比截取字符串更为准确
                                            imagesList.add(image.getImageURL());
                                        }
                                    }else{

                                        //这个else就是代表着上个节点不为空,证明还有节点例如<div class="hah">

                                        // <a href="......." ><img style="with:100px">asd.jpg</img></a>这就是节点下还有节点
                                        NodeList list = node.getChildren();
                                        for (int j = 0; j < list.size(); j++) {

                                             //那么继续用Node接收 相对应上个节点下的所有节点
                                            Node img = (Node) list.elementAt(j);
                                            if (img instanceof ImageTag) {
                                                ImageTag image = (ImageTag) list.elementAt(j);
                                                imagesList.add(image.getImageURL());     
                                            }     
                                        }
                                    }
                                }
                            }
                        }

                        //以下内容大同小异,。
                        if(tag.getTagName().equals(p.getProperty("title-tag"))){
                            if (p.getProperty("title-class").equals(tag.getAttribute(p.getProperty("title-manner")))) {
                                NodeList list = tag.getChildren();
                                for (int i = 0; i < list.size(); i++) {
                                    Node node = (Node) list.elementAt(i);
                                    if (node instanceof HeadingTag) {
                                        HeadingTag head = (HeadingTag) list.elementAt(i);
                                        titleList.add(head.toPlainTextString().trim());
                                        urlList.add(p.getProperty("url"));
                                    }
                                }
                            }
                        }
                        if (tag.getTagName().equals(p.getProperty("price-tag"))) {
                            if (p.getProperty("price-class").equals(tag.getAttribute(p.getProperty("price-manner")))) {                         
                                int begin= tag.toPlainTextString().indexOf("¥");
                                String price=null;
                                if(begin!=-1){
                                    price=tag.toPlainTextString().substring(begin + 1).trim();
                                }else{
                                    price=tag.toPlainTextString().substring(tag.toPlainTextString().indexOf("¥")+1);
                                }
                                priceList.add(price);
                            }
                        }
                        if (tag.getTagName().equals(p.getProperty("value-tag"))) {
                            if (p.getProperty("value-class").equals(tag.getAttribute(p.getProperty("value-manner")))) {
                                int begin= tag.getParent().toPlainTextString().indexOf("¥");
                                String value=null;
                                if(begin!=-1){
                                    value=tag.getParent().toPlainTextString().substring(begin + 1).trim();
                                }else{
                                    value=tag.getParent().toPlainTextString().substring(tag.getParent().toPlainTextString().indexOf("¥")+1).trim();
                                }
                                valueList.add(value);
                            }
                        }
                        if (tag.getTagName().equals(p.getProperty("number-tag"))) {
                            if (p.getProperty("number-class").equals(tag.getAttribute(p.getProperty("number-manner")))) {
                                String number=tag.toPlainTextString().trim();
                                numberList.add(number);
                            }
                        }
                        if(tag.getTagName().equals(p.getProperty("city-tag"))){
                            if(p.getProperty("city-class").equals(tag.getAttribute(p.getProperty("city-manner")))){
                               String city=null;
                               if(tag.toPlainTextString().length()==2 || tag.toPlainTextString().length()==3){
                                   city = tag.toPlainTextString().trim();             
                               }else{
                                   city = tag.toPlainTextString().substring(0, 3).trim();
                               }
                               cityList.add(city);
                            }
                        }        
                       
                       
                    } 
                };
               
                parser.visitAllNodesWith(visitor);
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }

    }

    public void setPts(PropertiesTools pts) {
        this.pts = pts;
    }
    public List<String> getPriceList() {
        return priceList;
    }

    public void setPriceList(List<String> priceList) {
        this.priceList = priceList;
    }

    public List<String> getValueList() {
        return valueList;
    }

    public void setValueList(List<String> valueList) {
        this.valueList = valueList;
    }

    public List<String> getTitleList() {
        return titleList;
    }

    public void setTitleList(List<String> titleList) {
        this.titleList = titleList;
    }

    public List<String> getNumberList() {
        return numberList;
    }

    public void setNumberList(List<String> numberList) {
        this.numberList = numberList;
    }

    public List<String> getCityList() {
        return cityList;
    }

    public void setCityList(List<String> cityList) {
        this.cityList = cityList;
    }
   
    public List<String> getImagesList() {
        return imagesList;
    }

    public void setImagesList(List<String> imagesList) {
        this.imagesList = imagesList;
    }

    public List<String> getUrlList() {
        return urlList;
    }

    public void setUrlList(List<String> urlList) {
        this.urlList = urlList;

    }  
}

这就是第二种解析方式,也很简单吧。下面我贴出读取属性文件的类,以及属性文件里的内容,这样我想大家都能看得很清楚。

package com.tz.tools;

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import java.util.Set;

import org.apache.commons.lang.StringUtils;

/**
 * Reads the scraper's {@code .properties} configuration files and offers two small
 * string/list helpers.
 *
 * <p>Loading is best-effort: an I/O failure is printed and an empty result is
 * returned instead of an exception.
 *
 * @author Sunweikun
 */
public class PropertiesTools {

    /** Main configuration file, resolved against the working directory. */
    private static final String MAIN_CONFIG = "config.properties";

    /**
     * Loads {@code <manner>/<path>.properties}.
     *
     * @param path file name without the {@code .properties} extension
     * @param manner directory holding the file: {@code xml} or {@code html}
     * @return the loaded properties; empty when the file cannot be read
     */
    public Properties getProperties(String path, String manner) {
        return load(manner + "/" + path + ".properties");
    }

    /**
     * Reads the comma-separated {@code xmlName} entry of {@code config.properties}.
     *
     * @return the configured names; an empty array when the entry or file is missing
     */
    public String[] getXMLURL() {
        return readNames("xmlName");
    }

    /**
     * Reads the comma-separated {@code htmlName} entry of {@code config.properties}.
     *
     * @return the configured names; an empty array when the entry or file is missing
     */
    public String[] getHTMLURL() {
        return readNames("htmlName");
    }

    /**
     * Tells whether a string is empty once surrounding whitespace is removed.
     *
     * @param astr the string to test, may be {@code null}
     * @return {@code true} for {@code null}, for the empty string, and for a string
     *         whose trimmed form contains only whitespace characters
     */
    public static boolean isTrimEmpty(String astr) {
        if (astr == null) {
            return true;
        }
        // Same rule as the commons-lang isBlank() call this replaces: every remaining
        // character must satisfy Character.isWhitespace.
        String trimmed = astr.trim();
        for (int i = 0; i < trimmed.length(); i++) {
            if (!Character.isWhitespace(trimmed.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Removes duplicate entries from the list in place, keeping the first occurrence
     * of each value and the original order.
     *
     * @param list the list to de-duplicate; {@code null} is ignored
     */
    public static void removeDuplicateWithOrder(List<String> list) {
        if (list == null) {
            return;
        }
        Set<String> seen = new HashSet<String>();
        List<String> unique = new ArrayList<String>(list.size());
        for (String element : list) {
            // Set.add returns false when the value has already been seen.
            if (seen.add(element)) {
                unique.add(element);
            }
        }
        list.clear();
        list.addAll(unique);
    }

    /** Splits one comma-separated entry of the main configuration file. */
    private static String[] readNames(String key) {
        String raw = load(MAIN_CONFIG).getProperty(key);
        if (raw == null || raw.trim().length() == 0) {
            return new String[0];
        }
        return raw.split(",");
    }

    /** Loads a properties file and always closes the underlying stream. */
    private static Properties load(String fileName) {
        Properties p = new Properties();
        InputStream in = null;
        try {
            in = new BufferedInputStream(new FileInputStream(fileName));
            p.load(in);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if close() fails after reading.
                }
            }
        }
        return p;
    }

}

下面是配置文件的内容。我写了多个站点的解析配置,这里只展示其中一个,文件名为 nuomi.properties。
#备注:要抓取的标签名称必须大写,配置文件主要以3种方式来配,标签名称,标签方式,标签属性及样式

#网站名称
url=http://www.nuomi.com
#要取的标签
title-tag=DIV
#要取的标签里的 class 或者 style 里的内容
title-class=deal-main
#以什么方式 比如<div class="deal-title"> 或  <div style="width:00px">
title-manner=class
#图片
images-tag=DIV
images-class=product-pic
images-manner=class
#原始价格
value-tag=STRONG
value-class=original
value-manner=class
#购买人数
number-tag=SPAN
number-class=num
number-manner=class
#现在价格
price-tag=P
price-class=cur-price
price-manner=class
#城市
city-tag=DIV
city-class=area
city-manner=class

------------------------------------------

下面是 config.properties,它是主配置文件。

#写入新加的配置文件名称以,号取文件名称 对xml操作24juan,aibang,didatuan,ftuan,lashou,meituan,pintuan,sohu,tuanbao,tuanku,wowo
xmlName=24juan,aibang,didatuan,ftuan,lashou,meituan,pintuan,sohu,tuanku,tuanbao,wowo
#写入新加的配置对html操作
htmlName=nuomi,xinlang

 

#24juan,aibang,didatuan,ftuan,lashou,meituan,pintuan,sohu,tuanku,tuanbao,wowo,

 

 

 

很久没有写博客了,这次换了个博客地址来写,因为以前的博客都丢了。这是我丢了博客之后第一次写博客,相隔也有半年多了。哪里写得不好,各位大虾可以批评,欢迎各种反对意见;如果有不满意之处,请加本人 QQ(395141222)批评指正。

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 7
    评论
评论 7
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值