初步使用HTMLParser工具包【部分代码来自网络】


先上代码:

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.FileInputStream;
import java.io.File;
import java.net.HttpURLConnection;
import java.net.URL;

import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.visitors.TextExtractingVisitor;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;

/**
* @author www.baizeju.com
*/
/**
 * Minimal HTMLParser demo: reads an HTML page from disk, selects every
 * {@code <tr>} element whose {@code class} attribute is {@code bgcolor1} or
 * {@code bgcolor2}, prints each match as HTML, and finally prints the plain
 * text of all matches concatenated together.
 */
public class TestHtmlParser {
    /**
     * Charset name used to decode the input file.
     * NOTE(review): this must match the real encoding of the file on disk; a
     * page saved as UTF-8 would be garbled when decoded as GBK - confirm.
     */
    private static final String ENCODE = "GBK";

    /** File parsed when no path is supplied on the command line. */
    private static final String DEFAULT_FILE =
            "D:\\28GAME\\BeiJing10\\GuanFang\\temp\\2015-1-1.html";

    /**
     * Prints a message to standard output.
     *
     * <p>The text is printed as-is: a Java {@code String} is already Unicode
     * and {@code System.out} applies the platform encoding itself, so
     * round-tripping through GBK bytes would only corrupt the text on
     * platforms whose default charset is not GBK.
     *
     * @param szMsg the text to print
     */
    private static void message(String szMsg) {
        System.out.println(szMsg);
    }

    /**
     * Reads a whole text file, decoding it with {@link #ENCODE}.
     *
     * <p>Every line is terminated with {@code '\n'} in the result, regardless
     * of the line separator used in the file.
     *
     * @param szFileName path of the file to read
     * @return the file content, or an empty string if the file could not be
     *         read (the cause is reported on standard error)
     */
    public static String openFile(String szFileName) {
        StringBuilder content = new StringBuilder();
        // Both resources are declared separately so the file handle is closed
        // even if constructing the decoding reader fails.
        try (FileInputStream in = new FileInputStream(new File(szFileName));
             BufferedReader reader = new BufferedReader(new InputStreamReader(in, ENCODE))) {
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append('\n');
            }
            return content.toString();
        } catch (Exception e) {
            // Callers rely on "" as the failure value; keep that contract but
            // do not hide the reason.
            System.err.println("Failed to read " + szFileName + ": " + e);
            return "";
        }
    }

    /**
     * Entry point.
     *
     * @param args optional; {@code args[0]} is the path of the HTML file to
     *             parse. When absent, {@link #DEFAULT_FILE} is used.
     */
    public static void main(String[] args) {
        String fileName = (args != null && args.length > 0) ? args[0] : DEFAULT_FILE;
        String szContent = openFile(fileName);
        if (szContent.isEmpty()) {
            System.err.println("No HTML content loaded from " + fileName + "; nothing to parse.");
            return;
        }

        try {
            // The file content itself is handed to the parser.
            // Alternative ways to build the parser:
            //   Parser parser = Parser.createParser(szContent, ENCODE);
            //   Parser parser = new Parser((HttpURLConnection)
            //           (new URL("http://127.0.0.1:8080/HTMLParserTester.html")).openConnection());
            Parser parser = new Parser(szContent);

            // To dump all text of the page instead of filtering rows:
            //   TextExtractingVisitor visitor = new TextExtractingVisitor();
            //   parser.visitAllNodesWith(visitor);
            //   message(visitor.getExtractedText());

            // Extract the table data: <tr> elements with class bgcolor1 or bgcolor2.
            NodeFilter oddRows = new HasAttributeFilter("class", "bgcolor1");
            NodeFilter evenRows = new HasAttributeFilter("class", "bgcolor2");
            NodeFilter anyDataRow = new OrFilter(oddRows, evenRows);
            NodeFilter filter = new AndFilter(new TagNameFilter("tr"), anyDataRow);

            // Collect the nodes accepted by the filter.
            NodeList nodelist = parser.parse(filter);
            Node[] nodes = nodelist.toNodeArray();

            StringBuilder buftext = new StringBuilder();
            for (Node node : nodes) {
                System.out.println(node.toHtml());
                System.out.println("---------------------------");
                String line = node.toPlainTextString();
                if (line != null) {
                    buftext.append(line);
                }
            }
            // Plain text of all matched rows, concatenated.
            System.out.println(buftext.toString());
        } catch (Exception e) {
            // Top-level boundary of a demo program: report the failure
            // instead of silently swallowing it.
            e.printStackTrace();
        }
    }
}


测试的文本:


<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
  <head>
    


<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />

<meta http-equiv="X-UA-Compatible" content="IE=edge" />

<meta name="apple-itunes-app" content="app-id=427927518" />
<meta property="qc:admins" content="2012211377645053116375" />



<title>PK拾开奖信息 - 百度乐彩 - PK拾|开奖公告|开奖结果|开奖查询|历史开奖</title>
    <meta name="description" content="PK拾开奖信息提供PK拾开奖结果,开奖公告,历史开奖的详情" />
    <meta name="keywords" content="PK拾,PK拾投注,开奖公告,开奖结果,开奖查询,历史开奖" />
    <link rel="stylesheet" href="http://static.lecai.com/css/lottery/draw/detail.css?v=2.9.103" type="text/css" media="screen" />
    <link rel="stylesheet" href="http://static.lecai.com/css/lottery/draw/list.css?v=2.9.103" type="text/css" media="screen" />
 </head>
  <body>
   
                    <table id="draw_list">
                        <thead>
                            <tr>
                                <td class="td1">开奖日期</td>
                                <td class="td2">期号</td>
                                <td class="td3">开奖号码</td>
                                <td class="td4">本期销量</td>
                            </tr>
                        </thead>
                        <tbody>
<tr class="bgcolor1"><td class="td1">2015-01-01</td>
                                <td class="td2">466997</td><td class="td3">
             <span class="result">
                    <span class="ball_1">06</span>
                    <span class="ball_1">03</span>
                    <span class="ball_1">01</span>
                    <span class="ball_1">07</span>
                    <span class="ball_1">10</span>
                    <span class="ball_1">04</span>
                    <span class="ball_1">08</span>
                    <span class="ball_1">09</span>
                    <span class="ball_1">02</span>
                    <span class="ball_1">05</span>
            </span></td>
                                <td class="td4">0</td>
</tr>
<tr class="bgcolor2">
                                <td class="td1">2015-01-01</td>
                                <td class="td2">466996</td>
                                <td class="td3">
<span class="result">
                    <span class="ball_1">01</span>
                    <span class="ball_1">06</span>
                    <span class="ball_1">04</span>
                    <span class="ball_1">02</span>
                    <span class="ball_1">05</span>
                    <span class="ball_1">07</span>
                    <span class="ball_1">10</span>
                    <span class="ball_1">03</span>
                    <span class="ball_1">09</span>
                    <span class="ball_1">08</span>
</span></td>
                               <td class="td4">0</td>
</tr>
                                                    </tbody>
                    </table>
  </body>
</html>



结果:


<tr class="bgcolor1"><td class="td1">2015-01-01</td>
                                <td class="td2">466997</td><td class="td3">
             <span class="result">
                    <span class="ball_1">06</span>
                    <span class="ball_1">03</span>
                    <span class="ball_1">01</span>
                    <span class="ball_1">07</span>
                    <span class="ball_1">10</span>
                    <span class="ball_1">04</span>
                    <span class="ball_1">08</span>
                    <span class="ball_1">09</span>
                    <span class="ball_1">02</span>
                    <span class="ball_1">05</span>
            </span></td>
                                <td class="td4">0</td>
</tr>
---------------------------
<tr class="bgcolor2">
                                <td class="td1">2015-01-01</td>
                                <td class="td2">466996</td>
                                <td class="td3">
<span class="result">
                    <span class="ball_1">01</span>
                    <span class="ball_1">06</span>
                    <span class="ball_1">04</span>
                    <span class="ball_1">02</span>
                    <span class="ball_1">05</span>
                    <span class="ball_1">07</span>
                    <span class="ball_1">10</span>
                    <span class="ball_1">03</span>
                    <span class="ball_1">09</span>
                    <span class="ball_1">08</span>
</span></td>
                               <td class="td4">0</td>
</tr>
---------------------------
2015-01-01
                                466997
             
                    06
                    03
                    01
                    07
                    10
                    04
                    08
                    09
                    02
                    05
            
                                0


                                2015-01-01
                                466996
                                


                    01
                    06
                    04
                    02
                    05
                    07
                    10
                    03
                    09
                    08


                               0


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值