在用htmlparser解析网页内容时,发现iframe没有tag类,于是参照源码自己增加并注册使用。
1、创建iFrameTag类:
// HTMLParser Library - A java-based parser for HTML
// http://htmlparser.org
// Copyright (C) 2006 Somik Raha
//
// Revision Control Information
//
// $URL: https://svn.sourceforge.net/svnroot/htmlparser/trunk/parser/src/main/java/org/htmlparser/tags/FrameTag.java $
// $Author: derrickoswald $
// $Date: 2006-09-16 10:44:17 -0400 (Sat, 16 Sep 2006) $
// $Revision: 4 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the Common Public License; either
// version 1.0 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// Common Public License for more details.
//
// You should have received a copy of the Common Public License
// along with this library; if not, the license is available from
// the Open Source Initiative (OSI) website:
// http://opensource.org/licenses/cpl1.0.php
package an;
import org.htmlparser.nodes.TagNode;
/**
* Identifies a frame tag
*/
public class iFrameTag
extends
TagNode
{
/**
* The set of names handled by this tag.
*/
private static final String[] mIds = new String[] {"iframe"};
private static final String mEndTagEnders[] = {
"iframe"
};
/**
* Create a new frame tag.
*/
public iFrameTag ()
{
}
/**
* Return the set of names handled by this tag.
* @return The names to be matched that create tags of this type.
*/
public String[] getIds ()
{
return (mIds);
}
public String[] getEndTagEnders()
{
return mEndTagEnders;
}
/**
* Returns the location of the frame.
* @return The contents of the SRC attribute converted to an absolute URL.
*/
public String getFrameLocation ()
{
String ret;
ret = getAttribute ("SRC");
if (null == ret)
ret = "";
else if (null != getPage ())
ret = getPage ().getAbsoluteURL (ret);
return (ret);
}
/**
* Sets the location of the frame.
* @param url The new frame location.
*/
public void setFrameLocation (String url)
{
setAttribute ("SRC", url);
}
/**
* Get the <code>NAME</code> attribute, if any.
* @return The value of the <code>NAME</code> attribute,
* or <code>null</code> if the attribute doesn't exist.
*/
public String getFrameName()
{
return (getAttribute ("NAME"));
}
/**
* Return a string representation of the contents of this <code>FRAME</code> tag suitable for debugging.
* @return A string with this tag's contents.
*/
public String toString()
{
return "FRAME TAG : Frame " +getFrameName() + " at "+getFrameLocation()+"; begins at : "+getStartPosition ()+"; ends at : "+getEndPosition ();
}
}
2、注册并使用
package an;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.beans.StringBean;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.FormTag;
import org.htmlparser.tags.FrameSetTag;
import org.htmlparser.tags.FrameTag;
import org.htmlparser.tags.InputTag;
import org.htmlparser.util.NodeList;
/*提取网页html源码中form\frame\iframe\input\frameset,查看表单参数
* iframe要自定义tag类,iFrameTag.java
* */
public List<String> getHtml(String url) throws Exception{
List<String> htmlList=new ArrayList<String>();//初始化
//生成一个解析器对象,用网页的 url 作为参数
Parser parser = new Parser(url);
if(parser.getEncoding().equals("ISO-8859-1"))
parser.setEncoding("UTF-8");
//设置节点过滤
NodeFilter formFilter = new NodeClassFilter(FormTag.class);
NodeFilter frameFilter = new NodeClassFilter(FrameTag.class);
NodeFilter inputFilter = new NodeClassFilter(InputTag.class);
NodeFilter framesetFilter = new NodeClassFilter(FrameSetTag.class);
//注册iframetag
PrototypicalNodeFactory p=new PrototypicalNodeFactory();
p.registerTag(new iFrameTag());
parser.setNodeFactory(p);
NodeFilter iframeFilter = new NodeClassFilter(iFrameTag.class);
//添加过滤条件
OrFilter lastFilter = new OrFilter();
lastFilter.setPredicates(new NodeFilter[] { formFilter,frameFilter,inputFilter,framesetFilter ,iframeFilter});
NodeList nodeList = parser.parse(lastFilter);
for(int i = 0; i <= nodeList.size(); i++) {
if(nodeList.elementAt(i) instanceof FormTag) {
FormTag tag = (FormTag) nodeList.elementAt(i);
htmlList.add(tag.getText());
}
if(nodeList.elementAt(i) instanceof InputTag) {
InputTag tag = (InputTag) nodeList.elementAt(i);
htmlList.add(tag.getText());
}
if(nodeList.elementAt(i) instanceof FrameTag){
FrameTag tag = (FrameTag) nodeList.elementAt(i);
htmlList.add(tag.getText());
}
if(nodeList.elementAt(i) instanceof FrameSetTag){
FrameSetTag tag = (FrameSetTag) nodeList.elementAt(i);
htmlList.add(tag.getText());
}
if(nodeList.elementAt(i) instanceof iFrameTag){
iFrameTag tag = (iFrameTag) nodeList.elementAt(i);
htmlList.add(tag.getText());
}
}
return htmlList;
}
public static void main(String[]args) throws Exception{
UrlParser fd=new UrlParser();
List<String> list=fd.getHtml("http://mail.189.cn");
for(int i=0;i<list.size();i++){
System.out.println(list.get(i));
}
}