用正则来匹配的确很强大,但如果要解析的是网页,用 HtmlParser 会更方便。由于抓下来的信息里中文是以 Unicode 转义形式出现的,所以还要用到 Apache 的一个包(commons-lang3 中的 StringEscapeUtils)把它还原成可读文字,以下是代码:
import java.net.URL;
import org.apache.commons.lang3.StringEscapeUtils;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
/**
 * Fetches an HTML page and prints the plain text of every table row and
 * table cell it contains to standard output.
 *
 * <p>Cell text is passed through an HTML unescape step so that character
 * references such as {@code &#20013;} are printed as the characters they
 * stand for.
 */
public class GetURLText {

    /** Page fetched by the no-argument {@link #getText()}. */
    private static final String DEFAULT_URL =
            "http://localhost:8080/TestXFace/TestHtmlPaser/ABC.jsp";

    /**
     * Fetches the default page and prints its table contents.
     *
     * @throws Exception if the page cannot be opened or parsed
     */
    public void getText() throws Exception {
        getText(DEFAULT_URL);
    }

    /**
     * Fetches the page at {@code urlString} and prints, for every table on it,
     * the plain text of each row followed by the decoded text of each cell.
     *
     * @param urlString absolute URL of the page to fetch
     * @throws Exception if the URL is malformed or the page cannot be opened or parsed
     */
    public void getText(String urlString) throws Exception {
        Parser parser = new Parser(new URL(urlString).openConnection());
        parser.setEncoding("UTF-8");

        // Keep only the <table> tags found by the parser.
        NodeFilter tableFilter = new NodeClassFilter(TableTag.class);
        NodeList tables = parser.parse(tableFilter);
        System.out.println(tables);

        for (int i = 0; i < tables.size(); i++) {
            // The filter above only lets TableTag nodes through, so the cast is safe.
            TableTag table = (TableTag) tables.elementAt(i);
            for (TableRow row : table.getRows()) {
                System.out.println("<tr> :" + row.toPlainTextString());
                for (TableColumn cell : row.getColumns()) {
                    // Decode HTML character references into real characters.
                    // The original called escapeHtml3 here, which encodes instead
                    // of decoding (e.g. "&#20013;" became "&amp;#20013;").
                    String cellText = StringEscapeUtils.unescapeHtml4(cell.toPlainTextString());
                    System.out.println("<td> :" + cellText + "</td>"); // content of the <td> tag
                }
            }
        }
    }

    /**
     * Runs the extraction against the default page.
     *
     * @param args ignored
     * @throws Exception if the page cannot be opened or parsed
     */
    public static void main(String[] args) throws Exception {
        GetURLText getURLText = new GetURLText();
        getURLText.getText();
    }
}