package org.sam.util;
import java.net.URL;
import junit.framework.TestCase;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.beans.LinkBean;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.RegexFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.InputTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.OptionTag;
import org.htmlparser.tags.SelectTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;
import org.htmlparser.visitors.NodeVisitor;
import org.htmlparser.visitors.ObjectFindingVisitor;
import org.junit.Test;
public class HtmlToolCase extends TestCase {
private Parser parser;
//private static final Logger logger = Logger.getLogger(HtmlToolCase.class);
private static String ENCODE = "GBK";
@Test
public void setUp(){
parser = new Parser();
/*
* HTMLParser的核心模块是org.htmlparser.Parser:
* 以下为构造函数
* public Parser ();
* public Parser (Lexer lexer, ParserFeedback fb);
* public Parser (URLConnection connection, ParserFeedback fb) throws ParserException;
* public Parser (String resource, ParserFeedback feedback) throws ParserException;
* public Parser (String resource) throws ParserException;
* public Parser (Lexer lexer);
* public Parser (URLConnection connection) throws ParserException;
* public static Parser createParser (String html, String charset);静态方法
*/
try {
parser.setURL("http://www.csdn.net/");
parser.setEncoding(ENCODE);
} catch (ParserException pe) {
System.err.println(pe.getMessage());
}
}
public void msg( String msg ) {
try{
System.out.println(new String(msg.getBytes(ENCODE), System.getProperty("file.encoding")));
} catch(Exception e ){
System.err.println(ENCODE + "|" + msg);
}
}
/*
* ObjectFindingVisitor: 用来找出所有指定类型的节点,采用getTags()来获取结果
*/
public void testObjectFindingVisitor() {
ObjectFindingVisitor ofv = new ObjectFindingVisitor(ImageTag.class);
try {
parser.visitAllNodesWith(ofv);
} catch (ParserException pe) {
System.err.println(pe.getMessage());
}
Node[] nodes = ofv.getTags();
for(int i=0; i<nodes.length; i++) {
ImageTag it = (ImageTag)nodes[i];//图片节点
msg("------>Image(" + (i+1) + ")toHtml=" + it.toHtml());
msg("------>Image(" + (i+1) + ")toPlainTextString=" + it.toPlainTextString());
msg("------>Image(" + (i+1) + ")toTagHtml=" + it.toTagHtml());
msg("------>Image(" + (i+1) + ")toHtml-TRUE=" + it.toHtml(true));
msg("------>Image(" + (i+1) + ")toHtml-FALSE=" + it.toHtml(false));
msg("------>Image(" + (i+1) + ")Text=" + it.getText());
msg("------>Image(" + (i+1) + ")URL=" + it.getImageURL());
msg("------>Image(" + (i+1) + ")Location=" + it.extractImageLocn());
msg("------>Image(" + (i+1) + ")src=" + it.getAttribute("src"));
}
}
/*
* 判断类Filter-----》 TagNameFilter:标签名过虑
* HasAttributeFilter
* HasChildFilter
* HasParentFilter
* HasSiblingFilter
* IsEqualFilter
*/
public void testTagNameFilter() {
NodeFilter nf = new TagNameFilter("img");
try {
NodeList nl = parser.extractAllNodesThatMatch(nf);
for(int i=0; i<nl.size(); i++) {
msg("------>Image(" + (i+1) + ")toHtml=" + nl.elementAt(i).toHtml());
}
} catch (ParserException pe) {
System.err.println(pe.getMessage());
}
}
/*
* 逻辑运算Filter-----》 OrFilter:Filter或关系
* AndFilter
* NotFilter
* XorFilter
*/
public void testOrFilter() {
NodeFilter input = new NodeClassFilter(InputTag.class);
NodeFilter image = new NodeClassFilter(ImageTag.class);
NodeFilter select = new NodeClassFilter(SelectTag.class);
OrFilter of = new OrFilter();
of.setPredicates(new NodeFilter[]{input, image, select});
try {
NodeList nl = parser.extractAllNodesThatMatch(of);
for(int i=0; i<nl.size(); i++) {
if(nl.elementAt(i) instanceof InputTag) {
InputTag it = (InputTag) nl.elementAt(i);
msg("------>InputTag(" + (i+1) + ")name=" + it.getTagName() + " | value=" + it.getAttribute("value"));
} else if(nl.elementAt(i) instanceof ImageTag) {
ImageTag it = (ImageTag) nl.elementAt(i);
msg("------>Image(" + (i+1) + ")toHtml=" + it.toHtml());
} else if(nl.elementAt(i) instanceof SelectTag) {
SelectTag st = (SelectTag) nl.elementAt(i);
msg("------>Image(" + (i+1) + ")toHtml=" + st.toHtml());
NodeList childList = st.getChildren();
for(int k=0; k<childList.size(); k++) {
OptionTag ot = (OptionTag) childList.elementAt(k);
msg("------>OptionTag(" + (i+1) + "-" + (k+1) + ")value=" + ot.getValue() + " | text=" + ot.getOptionText());
}
} else {
msg("------>Unknown(" + (i+1) + ")toHtml=" + nl.elementAt(i).toHtml());
}
}
} catch (ParserException pe) {
System.err.println(pe.getMessage());
}
}
/*
* 其他Filter:
* NodeClassFilter:
* StringFilter
* LinkStringFilter
* LinkRegexFilter
* RegexFilter
* CssSelectorNodeFilter
*/
public void testNodeClassFilter() {
NodeFilter nf = new NodeClassFilter(LinkTag.class);//a标签
try {
NodeList nl = parser.extractAllNodesThatMatch(nf);
for(int i=0; i<nl.size(); i++) {
LinkTag lt = (LinkTag) nl.elementAt(i);
msg("------>LinkTag(" + (i+1) + ")toHtml=" + lt.toHtml());
msg("------>LinkTag(" + (i+1) + ")extractLink=" + lt.extractLink());
}
} catch (ParserException pe) {
System.err.println(pe.getMessage());
}
}
/*
* RegexFilter: HTMLParser 的 RegexFilter 用法示例
*/
public void testRegexFilter() {
RegexFilter rfDate = new RegexFilter("\\d{4}[\\/-]\\d{1,2}[\\/-]\\d{1,2}");//日期
RegexFilter rfURL = new RegexFilter("(http:|https:|ftp:)//[^[A-Za-z0-9\\._\\?%&+\\-=/#]]*");//url(很不全面)
OrFilter of = new OrFilter();
of.setPredicates(new NodeFilter[]{rfDate, rfURL});
try {
NodeList nl = parser.extractAllNodesThatMatch(of);
for(int i=0; i<nl.size(); i++) {
msg("------>Regex(" + (i+1) + ")toHtml=" + nl.elementAt(i).toHtml());
}
} catch (ParserException pe) {
System.err.println(pe.getMessage());
}
}
/*
* 对<table><tr><td></td></tr></table>的解析
*/
public void testTable() {
NodeFilter table = new NodeClassFilter(TableTag.class);
OrFilter of = new OrFilter();
of.setPredicates(new NodeFilter[]{table});
try {
NodeList nl = parser.extractAllNodesThatMatch(of);
for(int i=0; i<nl.size(); i++) {
if(nl.elementAt(i) instanceof TableTag) {
TableTag tt = (TableTag) nl.elementAt(i);
TableRow[] trs = tt.getRows();
for(int k=0; k<trs.length; k++) {
TableRow tr = trs[k];
msg("------>tr(" + (i+1) + "-" + (k+1) + ")toHtml=" + tr.toHtml());
TableColumn[] tcs = tr.getColumns();
for(TableColumn tc : tcs) {
msg("------>------>td-toHtml=" + tc.toHtml());
}
}
}
}
} catch (ParserException pe) {
System.err.println(pe.getMessage());
}
}
/*
* 测试HtmlPage的用法
*/
public void testHtmlPage() {
HtmlPage hp = new HtmlPage(parser);
try {
parser.visitAllNodesWith(hp);
msg("HtmlPage------>title=" + hp.getTitle());
NodeList nl = hp.getBody();
for(NodeIterator ni=nl.elements(); ni.hasMoreNodes();) {
Node n = ni.nextNode();
msg("Node------>Class=" + n.getClass() + "------>Text=" + n.getText());
}
} catch (ParserException pe) {
System.err.println(pe.getMessage());
}
}
/*
* 采用bean方式访问html
*/
public void testLinkBean() {
LinkBean lb = new LinkBean();
lb.setURL("http://www.csdn.net");
URL[] urls = lb.getLinks();
for(int i=0; i<urls.length; i++) {
URL url = urls[i];
System.err.println("第" + (i+1) + "个超链接:" + url);
Parser p = new Parser();
try {
p.setURL(url.toString());
p.setEncoding(ENCODE);
NodeVisitor nv = new NodeVisitor() {
public void visitTag(Tag t) {
msg("Tag------>name=" + t.getTagName() + "------>Text=" + t.getText());
}
};
parser.visitAllNodesWith(nv);
} catch (ParserException pe) {
System.err.println(pe.getMessage());
}
}
}
/*
* 采用Visitor方式访问html
*/
public void testNodeVisitor() {
NodeVisitor nv = new NodeVisitor() {
public void visitTag(Tag t) {
msg("Tag------>name=" + t.getTagName() + "------>Text=" + t.getText());
}
};
try {
parser.visitAllNodesWith(nv);
} catch (ParserException pe) {
System.err.println(pe.getMessage());
}
}
}
/*
对于树型结构进行遍历的函数,这些函数最容易理解:
Node getParent ():取得父节点
NodeList getChildren ():取得子节点的列表
Node getFirstChild ():取得第一个子节点
Node getLastChild ():取得最后一个子节点
Node getPreviousSibling ():取得前一个兄弟
Node getNextSibling ():取得下一个兄弟节点
取得Node内容的函数:
String getText ():取得文本
String toPlainTextString():取得纯文本信息。
String toHtml () :取得HTML信息(原始HTML)
String toHtml (boolean verbatim):取得HTML信息(原始HTML)
String toString ():取得字符串信息(原始HTML)
Page getPage ():取得这个Node对应的Page对象
int getStartPosition ():取得这个Node在HTML页面中的起始位置
int getEndPosition ():取得这个Node在HTML页面中的结束位置
用于Filter过滤的函数:
void collectInto (NodeList list, NodeFilter filter):基于filter的条件对于这个节点进行过滤,符合条件的节点放到list中。
用于Visitor遍历的函数:
void accept (NodeVisitor visitor):对这个Node应用visitor
用于修改内容的函数,这类用得比较少:
void setPage (Page page):设置这个Node对应的Page对象
void setText (String text):设置文本
void setChildren (NodeList children):设置子节点列表
其他函数:
void doSemanticAction ():执行这个Node对应的操作
Object clone ():接口Clone的抽象函数
*/
package org.sam.util;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.ObjectFindingVisitor;
/**
 * Downloads the images referenced by an HTML page into a local directory.
 *
 * @author ssh_kobe Date: 2011-12-09
 */
public class SnatchImg {
    /** Encoding applied to the HTML parser. */
    private static final String ENCODE = "GBK";
    /** Size in bytes of the copy buffer. */
    private static final int SIZE = 1024;
    /** Downloads smaller than this many bytes (10 KB) are discarded. */
    private static final long MIN_BYTES = 10L * SIZE;

    /**
     * Sample entry point: grabs the images of a fixed page into E:/img/.
     *
     * <p>The original also derived a desktop path from the "user.name" and
     * "java.io.tmpdir" properties but never used it. That computation threw
     * StringIndexOutOfBoundsException whenever the temp directory did not
     * contain the user name, so it has been removed.
     *
     * @param ss ignored
     */
    public static void main(String[] ss) {
        catchImage("http://fj.sina.com.cn/news/p/p/2011-11-02/1056114622_5.html", "E:/img/");
    }

    /**
     * Downloads one file over HTTP(S) into a directory.
     *
     * <p>The file keeps the last path segment of the URL as its name; a
     * numeric suffix is added if that name is already taken. Files smaller
     * than 10 KB are deleted after download. Failures are reported on
     * standard error; no exception is thrown for I/O problems.
     *
     * @param urlPath  absolute URL of the file; null or unusable values are
     *                 reported as a path error
     * @param savePath target directory, including the trailing separator
     *                 (the file name is appended to it directly)
     */
    public static void saveFile(String urlPath, String savePath) {
        if (urlPath == null || (!urlPath.startsWith("http://") && urlPath.lastIndexOf("/") < 0)) {
            System.err.println("文件:" + urlPath + " 路径错误!");
            return;
        }
        String fileName = urlPath.substring(urlPath.lastIndexOf("/") + 1);
        if (fileName.length() == 0) {
            // URL ends with '/', so there is no file name to save under.
            System.err.println("文件:" + urlPath + " 路径错误!");
            return;
        }
        HttpURLConnection conn = null;
        try {
            URL url = new URL(urlPath);
            java.net.URLConnection raw = url.openConnection();
            if (!(raw instanceof HttpURLConnection)) {
                // e.g. ftp: or file: URLs; the original failed here with a ClassCastException.
                System.err.println("文件:" + urlPath + " 路径错误!");
                return;
            }
            conn = (HttpURLConnection) raw; // open the remote connection
            conn.setDoInput(true);
            conn.setRequestMethod("GET");
            conn.setConnectTimeout(60000); // 1 minute
            if (conn.getResponseCode() == 200) {
                // Best effort: if the directory cannot be created, opening
                // the output file below fails and is reported there.
                new File(savePath).mkdirs();
                File outFile = new File(reName(savePath, fileName));
                long written = copyToFile(conn, outFile);
                if (written < MIN_BYTES) { // smaller than 10 KB
                    outFile.delete();
                } else {
                    System.out.println("文件名:" + outFile.getName());
                }
            }
        } catch (FileNotFoundException fnfe) {
            System.err.println("文件:" + urlPath + " 不存在!");
        } catch (IOException ioe) {
            System.err.println("读取文件:" + urlPath + " 失败!");
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
    }

    /**
     * Copies the response body of a connection into a file.
     *
     * <p>Both streams are closed on every path, and a partially written file
     * is removed if the copy fails.
     *
     * @param conn    connection whose response body is read
     * @param outFile file to write
     * @return number of bytes written
     * @throws IOException if reading or writing fails
     */
    private static long copyToFile(HttpURLConnection conn, File outFile) throws IOException {
        BufferedInputStream bis = null;
        FileOutputStream fos = null;
        boolean complete = false;
        try {
            bis = new BufferedInputStream(conn.getInputStream());
            fos = new FileOutputStream(outFile);
            byte[] buff = new byte[SIZE];
            long total = 0;
            int len;
            while ((len = bis.read(buff)) != -1) {
                fos.write(buff, 0, len);
                // Count bytes, not read() calls: a read may return fewer
                // than SIZE bytes, so the call count is not a size in KB.
                total += len;
            }
            fos.flush();
            complete = true;
            return total;
        } finally {
            closeQuietly(fos);
            closeQuietly(bis);
            if (!complete) {
                outFile.delete();
            }
        }
    }

    /**
     * Closes a resource, ignoring failures raised by close() itself.
     *
     * @param resource resource to close; may be null
     */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Nothing useful can be done if close() fails during cleanup.
        }
    }

    /**
     * Parses a page, collects its image tags and downloads each image.
     *
     * <p>Original author's note: this approach does not work in many cases.
     *
     * @param url  address of the HTML page
     * @param save target directory, including the trailing separator
     */
    public static void catchImage(String url, String save) {
        Parser parser = new Parser();
        ObjectFindingVisitor ofv = new ObjectFindingVisitor(ImageTag.class);
        try {
            parser.setURL(url);
            parser.setEncoding(ENCODE);
            parser.visitAllNodesWith(ofv);
        } catch (ParserException pe) {
            System.err.println(pe.getMessage());
        }
        Node[] nodes = ofv.getTags();
        for (int i = 0; i < nodes.length; i++) {
            ImageTag it = (ImageTag) nodes[i]; // image node
            String imgSrc = resolveImageUrl(url, it.getImageURL());
            if (imgSrc != null) {
                saveFile(imgSrc, save);
            }
        }
    }

    /**
     * Resolves an image reference against the page URL.
     *
     * <p>Uses java.net.URL resolution instead of string concatenation, so
     * absolute URLs of any scheme are kept as they are, while relative and
     * protocol-relative references keep the page's scheme and port.
     *
     * @param pageUrl  address of the page the image was found on
     * @param imageUrl image reference taken from the tag
     * @return the absolute URL as a string, or null if it cannot be built
     */
    private static String resolveImageUrl(String pageUrl, String imageUrl) {
        if (imageUrl == null || imageUrl.length() == 0) {
            return null;
        }
        try {
            return new URL(new URL(pageUrl), imageUrl).toString();
        } catch (MalformedURLException e) {
            System.err.println("文件:" + imageUrl + " 路径错误!");
            return null;
        }
    }

    /**
     * Picks a file name that does not collide with an existing file.
     *
     * <p>If the name is taken, "(1)", "(2)", ... is inserted before the
     * first dot until a free name is found. Existence is tested per
     * candidate, so a missing directory no longer causes a
     * NullPointerException as the directory listing in the original did.
     *
     * @param path directory, including the trailing separator
     * @param name desired file name
     * @return path concatenated with the chosen file name
     */
    private static String reName(String path, String name) {
        String pre = name;
        String ext = "";
        int dot = name.indexOf(".");
        if (dot >= 0) {
            ext = name.substring(dot);
            pre = name.substring(0, dot);
        }
        String candidate = name;
        int i = 1;
        while (new File(path + candidate).exists()) {
            candidate = pre + "(" + (i++) + ")" + ext;
        }
        return path + candidate;
    }
}
package org.sam.util;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.ObjectFindingVisitor;
/**
 * Downloads the images referenced by an HTML page into a local directory,
 * skipping files that already exist there.
 *
 * @author ssh_kobe Date: 2011-12-09
 */
public class SnatchImg {
    /** Encoding applied to the HTML parser. */
    private static final String ENCODE = "GBK";
    /** Size in bytes of the copy buffer. */
    private static final int SIZE = 1024;
    /** Downloads smaller than this many bytes (10 KB) are discarded. */
    private static final long MIN_BYTES = 10L * SIZE;

    /**
     * Sample entry point. The page URL is an empty placeholder in this
     * sample; the images are saved to E:/img/.
     *
     * @param ss ignored
     */
    public static void main(String[] ss) {
        catchImage("", "E:/img/");
    }

    /**
     * Downloads one file over HTTP(S) into a directory unless a file with
     * the same name is already there.
     *
     * <p>The file keeps the last path segment of the URL as its name. Files
     * smaller than 10 KB are deleted after download. No connect timeout is
     * set. Failures are reported on standard error; no exception is thrown
     * for I/O problems.
     *
     * @param urlPath  absolute URL of the file; null or unusable values are
     *                 reported as a path error
     * @param savePath target directory, including the trailing separator
     *                 (the file name is appended to it directly)
     * @return "ok" when the response was downloaded (even if it was then
     *         discarded as too small); "fail" when the file already exists
     *         or the download did not succeed. The original returned "ok"
     *         for failed downloads as well.
     */
    public static String saveFile(String urlPath, String savePath) {
        if (urlPath == null || (!urlPath.startsWith("http://") && urlPath.lastIndexOf("/") < 0)) {
            System.err.println("文件:" + urlPath + " 路径错误!");
            return "fail";
        }
        String fileName = urlPath.substring(urlPath.lastIndexOf("/") + 1);
        if (fileName.length() == 0) {
            // URL ends with '/', so there is no file name to save under.
            System.err.println("文件:" + urlPath + " 路径错误!");
            return "fail";
        }
        // reName returns a different name only when the plain name is taken,
        // i.e. the file is already there. Checking before connecting avoids
        // a needless request and the connection the original left open on
        // this path.
        String lastName = reName(savePath, fileName);
        if (!lastName.equals(savePath + fileName)) {
            return "fail";
        }
        HttpURLConnection conn = null;
        try {
            long start = System.currentTimeMillis();
            URL url = new URL(urlPath);
            java.net.URLConnection raw = url.openConnection();
            if (!(raw instanceof HttpURLConnection)) {
                // e.g. ftp: or file: URLs; the original failed here with a ClassCastException.
                System.err.println("文件:" + urlPath + " 路径错误!");
                return "fail";
            }
            conn = (HttpURLConnection) raw; // open the remote connection
            conn.setDoInput(true);
            conn.setRequestMethod("GET");
            if (conn.getResponseCode() != 200) {
                return "fail";
            }
            // Best effort: if the directory cannot be created, opening the
            // output file below fails and is reported there.
            new File(savePath).mkdirs();
            File outFile = new File(lastName);
            long written = copyToFile(conn, outFile);
            if (written < MIN_BYTES) { // smaller than 10 KB
                outFile.delete();
            } else {
                System.out.println("文件名:" + outFile.getName() + " 用时ms:" + (System.currentTimeMillis() - start));
            }
            return "ok";
        } catch (FileNotFoundException fnfe) {
            System.err.println("文件:" + urlPath + " 不存在!");
            return "fail";
        } catch (IOException ioe) {
            System.err.println("读取文件:" + urlPath + " 失败!");
            return "fail";
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
    }

    /**
     * Copies the response body of a connection into a file.
     *
     * <p>Both streams are closed on every path, and a partially written file
     * is removed if the copy fails.
     *
     * @param conn    connection whose response body is read
     * @param outFile file to write
     * @return number of bytes written
     * @throws IOException if reading or writing fails
     */
    private static long copyToFile(HttpURLConnection conn, File outFile) throws IOException {
        BufferedInputStream bis = null;
        FileOutputStream fos = null;
        boolean complete = false;
        try {
            bis = new BufferedInputStream(conn.getInputStream());
            fos = new FileOutputStream(outFile);
            byte[] buff = new byte[SIZE];
            long total = 0;
            int len;
            while ((len = bis.read(buff)) != -1) {
                fos.write(buff, 0, len);
                // Count bytes, not read() calls: a read may return fewer
                // than SIZE bytes, so the call count is not a size in KB.
                total += len;
            }
            fos.flush();
            complete = true;
            return total;
        } finally {
            closeQuietly(fos);
            closeQuietly(bis);
            if (!complete) {
                outFile.delete();
            }
        }
    }

    /**
     * Closes a resource, ignoring failures raised by close() itself.
     *
     * @param resource resource to close; may be null
     */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Nothing useful can be done if close() fails during cleanup.
        }
    }

    /**
     * Parses a page, collects its image tags and downloads each image.
     *
     * <p>Original author's note: this approach does not work in many cases.
     *
     * @param url  address of the HTML page
     * @param save target directory, including the trailing separator
     */
    public static void catchImage(String url, String save) {
        Parser parser = new Parser();
        ObjectFindingVisitor ofv = new ObjectFindingVisitor(ImageTag.class);
        try {
            parser.setURL(url);
            parser.setEncoding(ENCODE);
            parser.visitAllNodesWith(ofv);
        } catch (ParserException pe) {
            System.err.println(pe.getMessage());
        }
        Node[] nodes = ofv.getTags();
        for (int i = 0; i < nodes.length; i++) {
            ImageTag it = (ImageTag) nodes[i]; // image node
            String imgSrc = resolveImageUrl(url, it.getImageURL());
            if (imgSrc != null) {
                saveFile(imgSrc, save);
            }
        }
    }

    /**
     * Resolves an image reference against the page URL.
     *
     * <p>Uses java.net.URL resolution instead of string concatenation, so
     * absolute URLs of any scheme are kept as they are, while relative and
     * protocol-relative references keep the page's scheme and port.
     *
     * @param pageUrl  address of the page the image was found on
     * @param imageUrl image reference taken from the tag
     * @return the absolute URL as a string, or null if it cannot be built
     */
    private static String resolveImageUrl(String pageUrl, String imageUrl) {
        if (imageUrl == null || imageUrl.length() == 0) {
            return null;
        }
        try {
            return new URL(new URL(pageUrl), imageUrl).toString();
        } catch (MalformedURLException e) {
            System.err.println("文件:" + imageUrl + " 路径错误!");
            return null;
        }
    }

    /**
     * Picks a file name that does not collide with an existing file.
     *
     * <p>If the name is taken, "(1)", "(2)", ... is inserted before the
     * first dot until a free name is found. Existence is tested per
     * candidate, so a missing directory no longer causes a
     * NullPointerException as the directory listing in the original did.
     *
     * @param path directory, including the trailing separator
     * @param name desired file name
     * @return path concatenated with the chosen file name
     */
    private static String reName(String path, String name) {
        String pre = name;
        String ext = "";
        int dot = name.indexOf(".");
        if (dot >= 0) {
            ext = name.substring(dot);
            pre = name.substring(0, dot);
        }
        String candidate = name;
        int i = 1;
        while (new File(path + candidate).exists()) {
            candidate = pre + "(" + (i++) + ")" + ext;
        }
        return path + candidate;
    }
}
****************************************************************************************************************************另外
package org.sam.util;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.ObjectFindingVisitor;
/**
 * Downloads the images referenced by an HTML page into a local directory.
 *
 * @author ssh_kobe Date: 2011-12-09
 */
public class SnatchImg {
    /** Encoding applied to the HTML parser. */
    private static final String ENCODE = "GBK";
    /** Size in bytes of the copy buffer. */
    private static final int SIZE = 1024;
    /** Downloads smaller than this many bytes (10 KB) are discarded. */
    private static final long MIN_BYTES = 10L * SIZE;

    /**
     * Sample entry point: grabs the images of a fixed page into E:/img/.
     *
     * <p>The original also derived a desktop path from the "user.name" and
     * "java.io.tmpdir" properties but never used it. That computation threw
     * StringIndexOutOfBoundsException whenever the temp directory did not
     * contain the user name, so it has been removed.
     *
     * @param ss ignored
     */
    public static void main(String[] ss) {
        catchImage(
                "http://fj.sina.com.cn/news/p/p/2011-11-02/1056114622_5.html",
                "E:/img/");
    }

    /**
     * Downloads one file over HTTP(S) into a directory.
     *
     * <p>The file keeps the last path segment of the URL as its name; a
     * numeric suffix is added if that name is already taken. Files smaller
     * than 10 KB are deleted after download. Failures are reported on
     * standard error; no exception is thrown for I/O problems.
     *
     * @param urlPath  absolute URL of the file; null or unusable values are
     *                 reported as a path error
     * @param savePath target directory, including the trailing separator
     *                 (the file name is appended to it directly)
     */
    public static void saveFile(String urlPath, String savePath) {
        boolean unusable = urlPath == null
                || (!urlPath.startsWith("http://") && urlPath.lastIndexOf("/") < 0);
        if (unusable) {
            System.err.println("文件:" + urlPath + " 路径错误!");
            return;
        }
        String fileName = urlPath.substring(urlPath.lastIndexOf("/") + 1);
        if (fileName.length() == 0) {
            // URL ends with '/', so there is no file name to save under.
            System.err.println("文件:" + urlPath + " 路径错误!");
            return;
        }
        HttpURLConnection conn = null;
        try {
            URL url = new URL(urlPath);
            java.net.URLConnection raw = url.openConnection();
            if (!(raw instanceof HttpURLConnection)) {
                // e.g. ftp: or file: URLs; the original failed here with a
                // ClassCastException.
                System.err.println("文件:" + urlPath + " 路径错误!");
                return;
            }
            conn = (HttpURLConnection) raw; // open the remote connection
            conn.setDoInput(true);
            conn.setRequestMethod("GET");
            conn.setConnectTimeout(60000); // 1 minute
            if (conn.getResponseCode() == 200) {
                // Best effort: if the directory cannot be created, opening
                // the output file below fails and is reported there.
                new File(savePath).mkdirs();
                File outFile = new File(reName(savePath, fileName));
                long written = copyToFile(conn, outFile);
                if (written < MIN_BYTES) { // smaller than 10 KB
                    outFile.delete();
                } else {
                    System.out.println("文件名:" + outFile.getName());
                }
            }
        } catch (FileNotFoundException fnfe) {
            System.err.println("文件:" + urlPath + " 不存在!");
        } catch (IOException ioe) {
            System.err.println("读取文件:" + urlPath + " 失败!");
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
    }

    /**
     * Copies the response body of a connection into a file.
     *
     * <p>Both streams are closed on every path, and a partially written file
     * is removed if the copy fails.
     *
     * @param conn    connection whose response body is read
     * @param outFile file to write
     * @return number of bytes written
     * @throws IOException if reading or writing fails
     */
    private static long copyToFile(HttpURLConnection conn, File outFile)
            throws IOException {
        BufferedInputStream bis = null;
        FileOutputStream fos = null;
        boolean complete = false;
        try {
            bis = new BufferedInputStream(conn.getInputStream());
            fos = new FileOutputStream(outFile);
            byte[] buff = new byte[SIZE];
            long total = 0;
            int len;
            while ((len = bis.read(buff)) != -1) {
                fos.write(buff, 0, len);
                // Count bytes, not read() calls: a read may return fewer
                // than SIZE bytes, so the call count is not a size in KB.
                total += len;
            }
            fos.flush();
            complete = true;
            return total;
        } finally {
            closeQuietly(fos);
            closeQuietly(bis);
            if (!complete) {
                outFile.delete();
            }
        }
    }

    /**
     * Closes a resource, ignoring failures raised by close() itself.
     *
     * @param resource resource to close; may be null
     */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Nothing useful can be done if close() fails during cleanup.
        }
    }

    /**
     * Parses a page, collects its image tags and downloads each image.
     *
     * <p>Original author's note: this approach does not work in many cases.
     *
     * @param url  address of the HTML page
     * @param save target directory, including the trailing separator
     */
    public static void catchImage(String url, String save) {
        Parser parser = new Parser();
        ObjectFindingVisitor ofv = new ObjectFindingVisitor(ImageTag.class);
        try {
            parser.setURL(url);
            parser.setEncoding(ENCODE);
            parser.visitAllNodesWith(ofv);
        } catch (ParserException pe) {
            System.err.println(pe.getMessage());
        }
        Node[] nodes = ofv.getTags();
        for (int i = 0; i < nodes.length; i++) {
            ImageTag it = (ImageTag) nodes[i]; // image node
            String imgSrc = resolveImageUrl(url, it.getImageURL());
            if (imgSrc != null) {
                saveFile(imgSrc, save);
            }
        }
    }

    /**
     * Resolves an image reference against the page URL.
     *
     * <p>Uses java.net.URL resolution instead of string concatenation, so
     * absolute URLs of any scheme are kept as they are, while relative and
     * protocol-relative references keep the page's scheme and port.
     *
     * @param pageUrl  address of the page the image was found on
     * @param imageUrl image reference taken from the tag
     * @return the absolute URL as a string, or null if it cannot be built
     */
    private static String resolveImageUrl(String pageUrl, String imageUrl) {
        if (imageUrl == null || imageUrl.length() == 0) {
            return null;
        }
        try {
            return new URL(new URL(pageUrl), imageUrl).toString();
        } catch (MalformedURLException e) {
            System.err.println("文件:" + imageUrl + " 路径错误!");
            return null;
        }
    }

    /**
     * Picks a file name that does not collide with an existing file.
     *
     * <p>If the name is taken, "(1)", "(2)", ... is inserted before the
     * first dot until a free name is found. Existence is tested per
     * candidate, so a missing directory no longer causes a
     * NullPointerException as the directory listing in the original did.
     *
     * @param path directory, including the trailing separator
     * @param name desired file name
     * @return path concatenated with the chosen file name
     */
    private static String reName(String path, String name) {
        String pre = name;
        String ext = "";
        int dot = name.indexOf(".");
        if (dot >= 0) {
            ext = name.substring(dot);
            pre = name.substring(0, dot);
        }
        String candidate = name;
        int i = 1;
        while (new File(path + candidate).exists()) {
            candidate = pre + "(" + (i++) + ")" + ext;
        }
        return path + candidate;
    }
}