先上代码:
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.FileInputStream;
import java.io.File;
import java.net.HttpURLConnection;
import java.net.URL;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.visitors.TextExtractingVisitor;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
/**
 * Small demonstration of the HTMLParser library (org.htmlparser): reads a saved HTML page
 * from disk and prints every {@code <tr>} element whose {@code class} attribute is
 * {@code bgcolor1} or {@code bgcolor2}, first as HTML and then as plain text.
 *
 * @author www.baizeju.com
 */
public class TestHtmlParser {
    /**
     * Charset name used to decode the input file and to encode text in
     * {@link #message(String)}.
     * NOTE(review): confirm that the page being parsed is really stored in this encoding.
     */
    private static final String ENCODE = "GBK";

    /** Page parsed by {@link #main(String[])} when no command-line argument is given. */
    private static final String DEFAULT_FILE = "D:\\28GAME\\BeiJing10\\GuanFang\\temp\\2015-1-1.html";

    /**
     * Prints a message to standard output after re-encoding it: the text is encoded to bytes
     * with {@link #ENCODE}, and those bytes are decoded with the charset named by the
     * {@code file.encoding} system property. Any failure is reported via a stack trace.
     * NOTE(review): presumably a workaround for a console whose encoding differs from the
     * data; confirm before relying on it. Only referenced from commented-out code.
     *
     * @param szMsg the text to print
     */
    private static void message(String szMsg) {
        try {
            System.out.println(new String(szMsg.getBytes(ENCODE), System.getProperty("file.encoding")));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads a whole text file, decoding it with {@link #ENCODE}. Each line read is followed
     * by a single {@code '\n'} in the result, whatever line terminator the file used.
     *
     * @param szFileName path of the file to read
     * @return the file content, or an empty string if the file could not be read (the
     *         failure is reported on standard error)
     */
    public static String openFile(String szFileName) {
        FileInputStream in = null;
        try {
            in = new FileInputStream(new File(szFileName));
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, ENCODE));
            // A builder keeps the accumulation linear; repeated String += copies the
            // whole content on every line.
            StringBuilder content = new StringBuilder();
            String szTemp;
            while ((szTemp = reader.readLine()) != null) {
                content.append(szTemp).append('\n');
            }
            return content.toString();
        } catch (Exception e) {
            // Callers rely on "" for failure, so keep that contract but make the cause visible.
            e.printStackTrace();
            return "";
        } finally {
            // Closing the underlying stream releases the file handle even when building
            // the reader or reading a line failed part-way.
            if (in != null) {
                try {
                    in.close();
                } catch (Exception ignored) {
                    // Nothing useful can be done if close itself fails.
                }
            }
        }
    }

    /**
     * Parses an HTML file and prints the matching table rows.
     *
     * @param args optional; {@code args[0]} is the path of the HTML file to parse. When
     *             absent, {@link #DEFAULT_FILE} is used.
     */
    public static void main(String[] args) {
        String szFileName = (args != null && args.length > 0) ? args[0] : DEFAULT_FILE;
        String szContent = openFile(szFileName);
        if (szContent.length() == 0) {
            System.err.println("Nothing to parse: no content could be read from " + szFileName);
            return;
        }
        try {
            // Alternative ways of obtaining a parser, kept for reference:
            //Parser parser = Parser.createParser(szContent, ENCODE);
            //Parser parser = new Parser( (HttpURLConnection) (new URL("http://127.0.0.1:8080/HTMLParserTester.html")).openConnection() );
            Parser parser = new Parser(szContent);

            // Alternative: dump all text in the page instead of filtering rows.
            /*TextExtractingVisitor visitor = new TextExtractingVisitor();
            parser.visitAllNodesWith(visitor);
            String textInPage = visitor.getExtractedText();
            message(textInPage);*/

            // Extract the data rows of the table: <tr> tags with class bgcolor1 or bgcolor2.
            NodeFilter filter1 = new HasAttributeFilter("class", "bgcolor1");
            NodeFilter filter2 = new HasAttributeFilter("class", "bgcolor2");
            NodeFilter filter3 = new OrFilter(filter1, filter2);
            NodeFilter filter = new AndFilter(new TagNameFilter("tr"), filter3);

            // Collect the nodes accepted by the filter and convert them to an array.
            NodeList nodelist = parser.parse(filter);
            Node[] nodes = nodelist.toNodeArray();

            StringBuilder buftext = new StringBuilder();
            for (int i = 0; i < nodes.length; i++) {
                // Print the row's HTML, then accumulate its plain text for the summary.
                System.out.println(nodes[i].toHtml());
                System.out.println("---------------------------");
                String line = nodes[i].toPlainTextString();
                if (line != null) {
                    buftext.append(line);
                }
            }
            String body = buftext.toString();
            System.out.println(body); // plain text of all matched rows
        } catch (Exception e) {
            // Previously swallowed silently; surface parser failures instead.
            e.printStackTrace();
        }
    }
}
测试的文本:
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<meta name="apple-itunes-app" content="app-id=427927518" />
<meta property="qc:admins" content="2012211377645053116375" />
<title>PK拾开奖信息 - 百度乐彩 - PK拾|开奖公告|开奖结果|开奖查询|历史开奖</title>
<meta name="description" content="PK拾开奖信息提供PK拾开奖结果,开奖公告,历史开奖的详情" />
<meta name="keywords" content="PK拾,PK拾投注,开奖公告,开奖结果,开奖查询,历史开奖" />
<link rel="stylesheet" href="http://static.lecai.com/css/lottery/draw/detail.css?v=2.9.103" type="text/css" media="screen" />
<link rel="stylesheet" href="http://static.lecai.com/css/lottery/draw/list.css?v=2.9.103" type="text/css" media="screen" />
</head>
<body>
<table id="draw_list">
<thead>
<tr>
<td class="td1">开奖日期</td>
<td class="td2">期号</td>
<td class="td3">开奖号码</td>
<td class="td4">本期销量</td>
</tr>
</thead>
<tbody>
<tr class="bgcolor1"><td class="td1">2015-01-01</td>
<td class="td2">466997</td><td class="td3">
<span class="result">
<span class="ball_1">06</span>
<span class="ball_1">03</span>
<span class="ball_1">01</span>
<span class="ball_1">07</span>
<span class="ball_1">10</span>
<span class="ball_1">04</span>
<span class="ball_1">08</span>
<span class="ball_1">09</span>
<span class="ball_1">02</span>
<span class="ball_1">05</span>
</span></td>
<td class="td4">0</td>
</tr>
<tr class="bgcolor2">
<td class="td1">2015-01-01</td>
<td class="td2">466996</td>
<td class="td3">
<span class="result">
<span class="ball_1">01</span>
<span class="ball_1">06</span>
<span class="ball_1">04</span>
<span class="ball_1">02</span>
<span class="ball_1">05</span>
<span class="ball_1">07</span>
<span class="ball_1">10</span>
<span class="ball_1">03</span>
<span class="ball_1">09</span>
<span class="ball_1">08</span>
</span></td>
<td class="td4">0</td>
</tr>
</tbody>
</table>
</body>
</html>
结果:
<tr class="bgcolor1"><td class="td1">2015-01-01</td>
<td class="td2">466997</td><td class="td3">
<span class="result">
<span class="ball_1">06</span>
<span class="ball_1">03</span>
<span class="ball_1">01</span>
<span class="ball_1">07</span>
<span class="ball_1">10</span>
<span class="ball_1">04</span>
<span class="ball_1">08</span>
<span class="ball_1">09</span>
<span class="ball_1">02</span>
<span class="ball_1">05</span>
</span></td>
<td class="td4">0</td>
</tr>
---------------------------
<tr class="bgcolor2">
<td class="td1">2015-01-01</td>
<td class="td2">466996</td>
<td class="td3">
<span class="result">
<span class="ball_1">01</span>
<span class="ball_1">06</span>
<span class="ball_1">04</span>
<span class="ball_1">02</span>
<span class="ball_1">05</span>
<span class="ball_1">07</span>
<span class="ball_1">10</span>
<span class="ball_1">03</span>
<span class="ball_1">09</span>
<span class="ball_1">08</span>
</span></td>
<td class="td4">0</td>
</tr>
---------------------------
2015-01-01
466997
06
03
01
07
10
04
08
09
02
05
0
2015-01-01
466996
01
06
04
02
05
07
10
03
09
08
0