初步使用HTMLParser工具包【部分代码来自网络】

先上代码:
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.FileInputStream;
import java.io.File;
import java.net.HttpURLConnection;
import java.net.URL;

import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.visitors.TextExtractingVisitor;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;

/**
 * Small demo of the HTMLParser library: reads an HTML file from disk, selects
 * every {@code <tr>} whose {@code class} attribute is {@code bgcolor1} or
 * {@code bgcolor2}, and prints each row's HTML followed by the concatenated
 * plain text of all matched rows.
 *
 * @author www.baizeju.com
 */
public class TestHtmlParser {
    /** Charset name used to decode the input file. */
    private static final String ENCODE = "GBK";

    /** File parsed when no path is supplied on the command line. */
    private static final String DEFAULT_FILE =
            "D:\\28GAME\\BeiJing10\\GuanFang\\temp\\2015-1-1.html";

    /**
     * Prints a message after encoding it to {@link #ENCODE} bytes and decoding
     * those bytes with the JVM's {@code file.encoding} charset.
     *
     * <p>NOTE(review): when {@code file.encoding} differs from {@link #ENCODE}
     * this round trip alters non-ASCII text; behavior is kept as-is because the
     * intent is not visible here. Only referenced by the optional text-extraction
     * path, which is currently disabled.
     *
     * @param szMsg the message to print
     */
    private static void message(String szMsg) {
        try {
            System.out.println(
                    new String(szMsg.getBytes(ENCODE), System.getProperty("file.encoding")));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads a whole text file, decoding it with {@link #ENCODE}.
     *
     * <p>Every line is terminated with {@code '\n'} in the result, regardless of
     * the line separator used in the file.
     *
     * @param szFileName path of the file to read
     * @return the file content, or an empty string if the file could not be read
     *         (the failure is reported on {@code System.err})
     */
    public static String openFile(String szFileName) {
        StringBuilder content = new StringBuilder();
        // Both resources are declared separately so the file stream is closed
        // even if constructing the reader fails (e.g. unsupported charset).
        try (FileInputStream in = new FileInputStream(new File(szFileName));
             BufferedReader reader = new BufferedReader(new InputStreamReader(in, ENCODE))) {
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append('\n');
            }
            return content.toString();
        } catch (Exception e) {
            // Callers rely on "" as the failure value; keep that contract but
            // do not hide the cause.
            System.err.println("Failed to read " + szFileName + ": " + e);
            return "";
        }
    }

    /**
     * Parses the input file and prints the matching table rows.
     *
     * @param args optional; {@code args[0]} is the HTML file to parse. When absent,
     *             {@link #DEFAULT_FILE} is used.
     */
    public static void main(String[] args) {
        String fileName = (args != null && args.length > 0) ? args[0] : DEFAULT_FILE;

        String szContent = openFile(fileName);
        if (szContent.isEmpty()) {
            System.err.println("No content read from " + fileName + "; nothing to parse.");
            return;
        }

        try {
            // Alternatives: Parser.createParser(szContent, ENCODE), or a Parser
            // built from an HttpURLConnection to fetch the page over HTTP.
            // NOTE(review): Parser(String) is handed the page content here rather
            // than a path or URL — confirm the library version in use accepts that.
            Parser parser = new Parser(szContent);

            // Extract the data rows of the table: <tr class="bgcolor1"> or
            // <tr class="bgcolor2">.
            NodeFilter filter1 = new HasAttributeFilter("class", "bgcolor1");
            NodeFilter filter2 = new HasAttributeFilter("class", "bgcolor2");
            NodeFilter filter3 = new OrFilter(filter1, filter2);
            NodeFilter filter = new AndFilter(new TagNameFilter("tr"), filter3);

            // Collect the nodes accepted by the filter.
            NodeList nodelist = parser.parse(filter);
            Node[] nodes = nodelist.toNodeArray();

            StringBuilder buftext = new StringBuilder();
            for (Node node : nodes) {
                System.out.println(node.toHtml());
                System.out.println("---------------------------");
                // Accumulate the plain text of every matched row.
                String line = node.toPlainTextString();
                if (line != null) {
                    buftext.append(line);
                }
            }
            System.out.println(buftext.toString());
        } catch (Exception e) {
            // Previously swallowed silently; report it so failures are visible.
            System.err.println("Failed to parse " + fileName);
            e.printStackTrace();
        }
    }
}


测试的文本:


<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
  <head>
    


<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />

<meta http-equiv="X-UA-Compatible" content="IE=edge" />

<meta name="apple-itunes-app" content="app-id=427927518" />
<meta property="qc:admins" content="2012211377645053116375" />



<title>PK拾开奖信息 - 百度乐彩 - PK拾|开奖公告|开奖结果|开奖查询|历史开奖</title>
    <meta name="description" content="PK拾开奖信息提供PK拾开奖结果,开奖公告,历史开奖的详情" />
    <meta name="keywords" content="PK拾,PK拾投注,开奖公告,开奖结果,开奖查询,历史开奖" />
    <link rel="stylesheet" href="http://static.lecai.com/css/lottery/draw/detail.css?v=2.9.103" type="text/css" media="screen" />
    <link rel="stylesheet" href="http://static.lecai.com/css/lottery/draw/list.css?v=2.9.103" type="text/css" media="screen" />
 </head>
  <body>
   
                    <table id="draw_list">
                        <thead>
                            <tr>
                                <td class="td1">开奖日期</td>
                                <td class="td2">期号</td>
                                <td class="td3">开奖号码</td>
                                <td class="td4">本期销量</td>
                            </tr>
                        </thead>
                        <tbody>
<tr class="bgcolor1"><td class="td1">2015-01-01</td>
                                <td class="td2">466997</td><td class="td3">
             <span class="result">
                    <span class="ball_1">06</span>
                    <span class="ball_1">03</span>
                    <span class="ball_1">01</span>
                    <span class="ball_1">07</span>
                    <span class="ball_1">10</span>
                    <span class="ball_1">04</span>
                    <span class="ball_1">08</span>
                    <span class="ball_1">09</span>
                    <span class="ball_1">02</span>
                    <span class="ball_1">05</span>
            </span></td>
                                <td class="td4">0</td>
</tr>
<tr class="bgcolor2">
                                <td class="td1">2015-01-01</td>
                                <td class="td2">466996</td>
                                <td class="td3">
<span class="result">
                    <span class="ball_1">01</span>
                    <span class="ball_1">06</span>
                    <span class="ball_1">04</span>
                    <span class="ball_1">02</span>
                    <span class="ball_1">05</span>
                    <span class="ball_1">07</span>
                    <span class="ball_1">10</span>
                    <span class="ball_1">03</span>
                    <span class="ball_1">09</span>
                    <span class="ball_1">08</span>
</span></td>
                               <td class="td4">0</td>
</tr>
                                                    </tbody>
                    </table>
  </body>
</html>



结果:


<tr class="bgcolor1"><td class="td1">2015-01-01</td>
                                <td class="td2">466997</td><td class="td3">
             <span class="result">
                    <span class="ball_1">06</span>
                    <span class="ball_1">03</span>
                    <span class="ball_1">01</span>
                    <span class="ball_1">07</span>
                    <span class="ball_1">10</span>
                    <span class="ball_1">04</span>
                    <span class="ball_1">08</span>
                    <span class="ball_1">09</span>
                    <span class="ball_1">02</span>
                    <span class="ball_1">05</span>
            </span></td>
                                <td class="td4">0</td>
</tr>
---------------------------
<tr class="bgcolor2">
                                <td class="td1">2015-01-01</td>
                                <td class="td2">466996</td>
                                <td class="td3">
<span class="result">
                    <span class="ball_1">01</span>
                    <span class="ball_1">06</span>
                    <span class="ball_1">04</span>
                    <span class="ball_1">02</span>
                    <span class="ball_1">05</span>
                    <span class="ball_1">07</span>
                    <span class="ball_1">10</span>
                    <span class="ball_1">03</span>
                    <span class="ball_1">09</span>
                    <span class="ball_1">08</span>
</span></td>
                               <td class="td4">0</td>
</tr>
---------------------------
2015-01-01
                                466997
             
                    06
                    03
                    01
                    07
                    10
                    04
                    08
                    09
                    02
                    05
            
                                0


                                2015-01-01
                                466996
                                


                    01
                    06
                    04
                    02
                    05
                    07
                    10
                    03
                    09
                    08


                               0


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
按DOM模型解析html文件的工具包 以下是源码列表: META-INF/MANIFEST.MF META-INF/maven/org.htmlparser/htmlparser/pom.properties META-INF/maven/org.htmlparser/htmlparser/pom.xml org.htmlparser.Parser.class org.htmlparser.PrototypicalNodeFactory.class org.htmlparser.beans.BeanyBaby.class org.htmlparser.beans.FilterBean.class org.htmlparser.beans.HTMLLinkBean.class org.htmlparser.beans.HTMLTextBean.class org.htmlparser.beans.LinkBean.class org.htmlparser.beans.StringBean.class org.htmlparser.filters.AndFilter.class org.htmlparser.filters.CssSelectorNodeFilter.class org.htmlparser.filters.HasAttributeFilter.class org.htmlparser.filters.HasChildFilter.class org.htmlparser.filters.HasParentFilter.class org.htmlparser.filters.HasSiblingFilter.class org.htmlparser.filters.IsEqualFilter.class org.htmlparser.filters.LinkRegexFilter.class org.htmlparser.filters.LinkStringFilter.class org.htmlparser.filters.NodeClassFilter.class org.htmlparser.filters.NotFilter.class org.htmlparser.filters.OrFilter.class org.htmlparser.filters.RegexFilter.class org.htmlparser.filters.StringFilter.class org.htmlparser.filters.TagNameFilter.class org.htmlparser.http.HttpHeader.class org.htmlparser.sax.Attributes.class org.htmlparser.sax.Feedback.class org.htmlparser.sax.Locator.class org.htmlparser.sax.XMLReader.class org.htmlparser.scanners.CompositeTagScanner.class org.htmlparser.scanners.JspScanner.class org.htmlparser.scanners.ScriptDecoder.class org.htmlparser.scanners.ScriptScanner.class org.htmlparser.scanners.StyleScanner.class org.htmlparser.tags.AppletTag.class org.htmlparser.tags.BaseHrefTag.class org.htmlparser.tags.BlockquoteTag.class org.htmlparser.tags.BodyTag.class org.htmlparser.tags.Bullet.class org.htmlparser.tags.BulletList.class org.htmlparser.tags.CompositeTag.class org.htmlparser.tags.DefinitionList.class org.htmlparser.tags.DefinitionListBullet.class org.htmlparser.tags.Div.class org.htmlparser.tags.DoctypeTag.class org.htmlparser.tags.FormTag.class org.htmlparser.tags.FrameSetTag.class 
org.htmlparser.tags.FrameTag.class org.htmlparser.tags.HeadTag.class org.htmlparser.tags.HeadingTag.class org.htmlparser.tags.Html.class org.htmlparser.tags.ImageTag.class org.htmlparser.tags.InputTag.class org.htmlparser.tags.JspTag.class org.htmlparser.tags.LabelTag.class org.htmlparser.tags.LinkTag.class org.htmlparser.tags.MetaTag.class org.htmlparser.tags.ObjectTag.class org.htmlparser.tags.OptionTag.class org.htmlparser.tags.ParagraphTag.class org.htmlparser.tags.ProcessingInstructionTag.class org.htmlparser.tags.ScriptTag.class org.htmlparser.tags.SelectTag.class org.htmlparser.tags.Span.class org.htmlparser.tags.StyleTag.class org.htmlparser.tags.TableColumn.class org.htmlparser.tags.TableHeader.class org.htmlparser.tags.TableRow.class org.htmlparser.tags.TableTag.class org.htmlparser.tags.TextareaTag.class org.htmlparser.tags.TitleTag.class org.htmlparser.util.CharacterReference.class org.htmlparser.util.CharacterReferenceEx.class org.htmlparser.util.DefaultParserFeedback.class org.htmlparser.util.FeedbackManager.class org.htmlparser.util.IteratorImpl.class org.htmlparser.util.NodeTreeWalker.class org.htmlparser.util.ParserFeedback.class org.htmlparser.util.ParserUtils.class org.htmlparser.util.Translate.class org.htmlparser.visitors.HtmlPage.class org.htmlparser.visitors.LinkFindingVisitor.class org.htmlparser.visitors.ObjectFindingVisitor.class org.htmlparser.visitors.StringFindingVisitor.class org.htmlparser.visitors.TagFindingVisitor.class org.htmlparser.visitors.TextExtractingVisitor.class org.htmlparser.visitors.UrlModifyingVisitor.class org/htmlparser/beans/images/Chain16.gif org/htmlparser/beans/images/Chain32.gif org/htmlparser/beans/images/Knot16.gif org/htmlparser/beans/images/Knot32.gif
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值