以下是示例代码
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.HttpURLConnection;
import java.net.URL;
import org.htmlparser.Node;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.Tag;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.ScriptTag;
import org.htmlparser.tags.StyleTag;
import org.htmlparser.util.ParserException;
import com.lietu.tag.CnTagMaker.WordWeight;
public class TagExt {
private static StringBuffer body ;
private static String title ;
/**
 * Entry point: opens an HTTP connection to a web page, hands it to
 * {@code parseHTML}, and prints the static {@code body} buffer.
 *
 * <p>The page address is taken from {@code args[0]}; when no argument is
 * supplied a built-in sample address is used instead.
 *
 * @param args optional command-line arguments; {@code args[0]} is the URL to parse
 * @throws Exception if the URL is malformed, the connection cannot be opened,
 *     or {@code parseHTML} reports a parser error
 */
public static void main(String[] args) throws Exception {
    String path;
    if (args.length == 0) {
        // Default test address. Other pages previously used for testing:
        //   http://www.ibm.com/developerworks/cn/webservices/0901_haoxf_humantask/
        //   http://developers.sun.com.cn/Java/xref_index.html
        //   http://www.sina.com.cn/
        //   http://www.ibm.com/
        //   http://hao861002.iteye.com/blog/301581
        path = "http://mil.news.sina.com.cn/2009-01-09/0839538126.html";
    } else {
        path = args[0];
    }

    // Build the URL and open the network connection.
    URL url = new URL(path);
    HttpURLConnection httpUrl = (HttpURLConnection) url.openConnection();
    try {
        // Parse the page; parseHTML (re)creates the static body buffer printed below.
        parseHTML(httpUrl);
        // Print the parsed content.
        System.out.println("body=" + body);
    } finally {
        // Release the connection even when parsing throws.
        httpUrl.disconnect();
    }
}
/**
* 解析网页内容
* @param uc 传入一个 HttpURLConnection 链接对象
* @throws ParserException
*/
public static void parseHTML(HttpURLConnection uc) throws ParserException {
/** 声明节点 **/
Node node;
String stringText;
body = new StringBuffer();
/** 从 head 头获取网页编码格式。该方式取决于服务器是否设置 charSet 值,如果没有,该方式将无法获取 charSet 值 **/
String contentType = uc.getContentType();
String charSet = getCharset (contentType);
Lexer lexer = null ;
if (charSet == null ) {
charSet = "UTF-8" ;
}
try {
lexer = new Lexer( new Page(uc.getInputStream(), charSet));
} catch (Exception e) {
e.printStackTrace();
return ;
}
/** 对网页内容进行解析 **/
lexer.setNodeFactory( new PrototypicalNodeFactory());
/** 设置开关,决定网页是否重新解析 **/
boolean tryAgain = false ;
while ( null != (node = lexer.nextNode())) {
/** 以下是判断节点的类型,并作相应的处理 **/
if (node instanceof ScriptTag) {
while ( null != (node = lexer.nextNode())) {
if (node instanceof Tag) {
Tag tag = (Tag) node;
if (tag.isEndTag() && "SCRIPT" .equals(tag.getTagName())) {
break ;
}
}
}
if ( null == node)
break ;
} else if (node instanceof StyleTag) {
while ( null != (node = lexer.nextNode())) {
if (node instanceof Tag) {
Tag tag = (Tag) node;
if (tag.isEndTag())
break ;
}
}
if ( null == node)
break ;
} else if (node instanceof TextNode) {
stringText = node.toPlainTextString();
if ( "" .equals( title ))
continue
(未完,接第 2 部分)
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.HttpURLConnection;
import java.net.URL;
import org.htmlparser.Node;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.Tag;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.ScriptTag;
import org.htmlparser.tags.StyleTag;
import org.htmlparser.util.ParserException;
import com.lietu.tag.CnTagMaker.WordWeight;
public class TagExt {
private static StringBuffer body ;
private static String title ;
/**
 * Entry point: opens an HTTP connection to a web page, hands it to
 * {@code parseHTML}, and prints the static {@code body} buffer.
 *
 * <p>The page address is taken from {@code args[0]}; when no argument is
 * supplied a built-in sample address is used instead.
 *
 * @param args optional command-line arguments; {@code args[0]} is the URL to parse
 * @throws Exception if the URL is malformed, the connection cannot be opened,
 *     or {@code parseHTML} reports a parser error
 */
public static void main(String[] args) throws Exception {
    String path;
    if (args.length == 0) {
        // Default test address. Other pages previously used for testing:
        //   http://www.ibm.com/developerworks/cn/webservices/0901_haoxf_humantask/
        //   http://developers.sun.com.cn/Java/xref_index.html
        //   http://www.sina.com.cn/
        //   http://www.ibm.com/
        //   http://hao861002.iteye.com/blog/301581
        path = "http://mil.news.sina.com.cn/2009-01-09/0839538126.html";
    } else {
        path = args[0];
    }

    // Build the URL and open the network connection.
    URL url = new URL(path);
    HttpURLConnection httpUrl = (HttpURLConnection) url.openConnection();
    try {
        // Parse the page; parseHTML (re)creates the static body buffer printed below.
        parseHTML(httpUrl);
        // Print the parsed content.
        System.out.println("body=" + body);
    } finally {
        // Release the connection even when parsing throws.
        httpUrl.disconnect();
    }
}
/**
* 解析网页内容
* @param uc 传入一个 HttpURLConnection 链接对象
* @throws ParserException
*/
public static void parseHTML(HttpURLConnection uc) throws ParserException {
/** 声明节点 **/
Node node;
String stringText;
body = new StringBuffer();
/** 从 head 头获取网页编码格式。该方式取决于服务器是否设置 charSet 值,如果没有,该方式将无法获取 charSet 值 **/
String contentType = uc.getContentType();
String charSet = getCharset (contentType);
Lexer lexer = null ;
if (charSet == null ) {
charSet = "UTF-8" ;
}
try {
lexer = new Lexer( new Page(uc.getInputStream(), charSet));
} catch (Exception e) {
e.printStackTrace();
return ;
}
/** 对网页内容进行解析 **/
lexer.setNodeFactory( new PrototypicalNodeFactory());
/** 设置开关,决定网页是否重新解析 **/
boolean tryAgain = false ;
while ( null != (node = lexer.nextNode())) {
/** 以下是判断节点的类型,并作相应的处理 **/
if (node instanceof ScriptTag) {
while ( null != (node = lexer.nextNode())) {
if (node instanceof Tag) {
Tag tag = (Tag) node;
if (tag.isEndTag() && "SCRIPT" .equals(tag.getTagName())) {
break ;
}
}
}
if ( null == node)
break ;
} else if (node instanceof StyleTag) {
while ( null != (node = lexer.nextNode())) {
if (node instanceof Tag) {
Tag tag = (Tag) node;
if (tag.isEndTag())
break ;
}
}
if ( null == node)
break ;
} else if (node instanceof TextNode) {
stringText = node.toPlainTextString();
if ( "" .equals( title ))
continue
(未完,接第 2 部分)