HTML Parser 用于解析 HTML,并从中提取你所需的信息。
Java版:
http://htmlparser.sourceforge.net/
http://www.ibm.com/developerworks/cn/java/l-html-parser/
HTML Parser 的使用范例:
package com.amigo.htmlparser;
import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import org.htmlparser.filters.*;
import org.htmlparser.*;
import org.htmlparser.nodes.*;
import org.htmlparser.tags.*;
import org.htmlparser.util.*;
import org.htmlparser.visitors.*;
/**
 * Example of using the HTML Parser library (org.htmlparser) on a downloaded page.
 *
 * <p>{@link #main(String[])} downloads one page as text and hands it to two
 * independent demonstrations: {@link #readByHtml(String)}, which uses the
 * {@code HtmlPage} visitor, and {@link #readTextAndLinkAndTitle(String)}, which
 * uses node filters.
 */
public class HTMLParserTest {

    /** Charset name used to decode the HTTP response and passed to the parser. */
    private static final String CHARSET = "utf8";

    /**
     * Downloads the sample page and runs both parsing demonstrations on it.
     *
     * @param args unused
     * @throws Exception if the download or the parsing fails
     */
    public static void main(String args[]) throws Exception {
        String path = "http://www.blogjava.net/amigoxie";
        String result = fetchPage(path);
        readByHtml(result);
        readTextAndLinkAndTitle(result);
    }

    /**
     * Reads the whole resource at {@code address} into a string, with every
     * line terminated by {@code "\n"}.
     *
     * @param address the URL to read
     * @return the decoded content of the resource
     * @throws IOException if the connection cannot be opened or read
     */
    private static String fetchPage(String address) throws IOException {
        URL url = new URL(address);
        URLConnection conn = url.openConnection();
        // The connection is only read from, so the doOutput flag is left at its
        // default (false); enabling it can turn the request into a POST on some
        // HTTP implementations.
        InputStream inputStream = conn.getInputStream();
        try {
            BufferedReader in = new BufferedReader(new InputStreamReader(inputStream, CHARSET));
            StringBuffer sb = new StringBuffer();
            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                sb.append(inputLine);
                sb.append("\n");
            }
            return sb.toString();
        } finally {
            // Closing the underlying stream releases the connection even when
            // decoding or reading fails part-way through.
            inputStream.close();
        }
    }

    /**
     * Parses {@code content} with an {@code HtmlPage} visitor and prints the
     * page title followed by the trimmed text of the page body.
     *
     * @param content the HTML text to parse
     * @throws Exception if parsing fails
     */
    public static void readByHtml(String content) throws Exception {
        Parser myParser = Parser.createParser(content, CHARSET);
        HtmlPage visitor = new HtmlPage(myParser);
        myParser.visitAllNodesWith(visitor);

        System.out.println(visitor.getTitle());

        NodeList body = visitor.getBody();
        // println (not print) so that whatever is printed next starts on its
        // own line instead of being appended to the last line of the body text.
        System.out.println(body.asString().trim());
    }

    /**
     * Parses {@code result} keeping only text nodes, link tags and title tags,
     * and prints one line per node: the text of a text node, the link of a
     * link tag, or the title of a title tag. Nodes whose value is null, empty
     * or whitespace-only are skipped.
     *
     * @param result the HTML text to parse
     * @throws Exception if parsing fails
     */
    public static void readTextAndLinkAndTitle(String result) throws Exception {
        Parser parser = Parser.createParser(result, CHARSET);

        NodeFilter textFilter = new NodeClassFilter(TextNode.class);
        NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
        NodeFilter titleFilter = new NodeClassFilter(TitleTag.class);
        OrFilter lastFilter = new OrFilter();
        lastFilter.setPredicates(new NodeFilter[] { textFilter, linkFilter, titleFilter });

        NodeList nodelist = parser.parse(lastFilter);
        Node[] nodes = nodelist.toNodeArray();
        for (int i = 0; i < nodes.length; i++) {
            Node node = nodes[i];
            // Declared per iteration so a value from a previous node can never
            // be printed again for a node that yields nothing.
            String line = null;
            if (node instanceof TextNode) {
                line = ((TextNode) node).getText();
            } else if (node instanceof LinkTag) {
                line = ((LinkTag) node).getLink();
            } else if (node instanceof TitleTag) {
                line = ((TitleTag) node).getTitle();
            }
            if (isTrimEmpty(line)) {
                continue;
            }
            System.out.println(line);
        }
    }

    /**
     * Tells whether a string has no content once surrounding whitespace is
     * removed.
     *
     * @param astr the string to test, may be null
     * @return true if {@code astr} is null, empty, or trims to an empty string
     */
    public static boolean isTrimEmpty(String astr) {
        return astr == null || astr.trim().length() == 0;
    }

    /**
     * Tells whether a string is null or has length zero. Note that, despite
     * the name, a whitespace-only string is NOT considered blank here; use
     * {@link #isTrimEmpty(String)} for that.
     *
     * @param astr the string to test, may be null
     * @return true if {@code astr} is null or empty
     */
    public static boolean isBlank(String astr) {
        return astr == null || astr.length() == 0;
    }
}
PHP版:
http://sourceforge.net/projects/simplehtmldom/
http://sourceforge.net/projects/html-parser/
// Fetch $url, retrying until some content is returned or three attempts have
// been made. $url is not set in this snippet; it is presumably defined by the
// surrounding script - confirm before use.
$content = '';
$i = 0;
// Continue only while nothing has been fetched AND attempts remain. The loose
// comparison with '' also matches the false that file_get_contents() returns
// on failure.
while ($content == '' and $i < 3) {
    // @ suppresses the warning file_get_contents() raises when the fetch fails.
    $content = @file_get_contents($url);
    $i++;
}
// Give up only when every attempt failed, not merely because the third
// attempt was reached (which may well have succeeded).
if ($content == '') exit("next");
.NET版:
http://download.csdn.net/source/737172