import java.io.IOException;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;

import org.apache.html.dom.HTMLDocumentImpl;
import org.cyberneko.html.parsers.DOMFragmentParser;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
/**
 * Extracts the plain text from an HTML fragment.
 *
 * <p>The markup is parsed into a DOM fragment with NekoHTML and the text is
 * then collected from that fragment by <code>getText</code>. The input is
 * handed to the parser as a character stream, so no byte/charset conversion
 * takes place and the result does not depend on the platform default
 * encoding.
 *
 * <p>Parsing is best-effort: if the parser reports an error, its stack trace
 * is printed and the text of whatever nodes were built before the failure is
 * returned.
 *
 * @param content the HTML markup to strip; <code>null</code> is treated as
 *                empty input
 * @return the extracted text, never <code>null</code>
 * @throws UnsupportedEncodingException never thrown by this implementation;
 *         still declared so that existing callers keep compiling
 */
public String extractTextFromHTML(String content)
        throws UnsupportedEncodingException {
    if (content == null) {
        return "";
    }
    DOMFragmentParser parser = new DOMFragmentParser();
    DocumentFragment fragment = new HTMLDocumentImpl().createDocumentFragment();
    try {
        // Feed characters, not bytes: the earlier version encoded the string
        // with the platform charset and then re-decoded the parser output
        // from Windows-1252 to GBK, which only worked on GBK platforms.
        parser.parse(new InputSource(new StringReader(content)), fragment);
    } catch (IOException e) {
        // Best-effort: report and fall through with the partial fragment.
        e.printStackTrace();
    } catch (SAXException se) {
        se.printStackTrace();
    }
    // StringBuffer (not StringBuilder) because getText takes a StringBuffer.
    StringBuffer text = new StringBuffer();
    this.getText(text, fragment);
    return text.toString();
}
/**
 * Appends to <code>sb</code> the value of every text node in the subtree
 * rooted at <code>node</code>, visiting the node itself first and then its
 * children in order, recursively.
 *
 * @param sb   buffer that receives the text
 * @param node root of the subtree to scan
 */
private void getText(StringBuffer sb, Node node) {
    final boolean isText = Node.TEXT_NODE == node.getNodeType();
    if (isText) {
        sb.append(node.getNodeValue());
    }
    final NodeList kids = node.getChildNodes();
    if (kids == null) {
        return;
    }
    // Child count is read once up front, then each child is descended into.
    for (int idx = 0, count = kids.getLength(); idx < count; idx++) {
        getText(sb, kids.item(idx));
    }
}
import org.apache.html.dom.HTMLDocumentImpl;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
/**
 * Extracts the plain text from an HTML fragment.
 *
 * <p>The markup is parsed into a DOM fragment with NekoHTML and the text is
 * then collected from that fragment by <code>getText</code>. The input is
 * handed to the parser as a character stream, so no byte/charset conversion
 * takes place and the result does not depend on the platform default
 * encoding.
 *
 * <p>Parsing is best-effort: if the parser reports an error, its stack trace
 * is printed and the text of whatever nodes were built before the failure is
 * returned.
 *
 * @param content the HTML markup to strip; <code>null</code> is treated as
 *                empty input
 * @return the extracted text, never <code>null</code>
 * @throws UnsupportedEncodingException never thrown by this implementation;
 *         still declared so that existing callers keep compiling
 */
public String extractTextFromHTML(String content)
        throws UnsupportedEncodingException {
    if (content == null) {
        return "";
    }
    DOMFragmentParser parser = new DOMFragmentParser();
    DocumentFragment fragment = new HTMLDocumentImpl().createDocumentFragment();
    try {
        // Feed characters, not bytes: the earlier version encoded the string
        // with the platform charset and then re-decoded the parser output
        // from Windows-1252 to GBK, which only worked on GBK platforms.
        parser.parse(new InputSource(new StringReader(content)), fragment);
    } catch (IOException e) {
        // Best-effort: report and fall through with the partial fragment.
        e.printStackTrace();
    } catch (SAXException se) {
        se.printStackTrace();
    }
    // StringBuffer (not StringBuilder) because getText takes a StringBuffer.
    StringBuffer text = new StringBuffer();
    this.getText(text, fragment);
    return text.toString();
}
/**
 * Appends to <code>sb</code> the value of every text node in the subtree
 * rooted at <code>node</code>, visiting the node itself first and then its
 * children in order, recursively.
 *
 * @param sb   buffer that receives the text
 * @param node root of the subtree to scan
 */
private void getText(StringBuffer sb, Node node) {
    final boolean isText = Node.TEXT_NODE == node.getNodeType();
    if (isText) {
        sb.append(node.getNodeValue());
    }
    final NodeList kids = node.getChildNodes();
    if (kids == null) {
        return;
    }
    // Child count is read once up front, then each child is descended into.
    for (int idx = 0, count = kids.getLength(); idx < count; idx++) {
        getText(sb, kids.item(idx));
    }
}
1. NekoHTML 1.9.6.1 版本用到了 JDK 5 才提供的 Arrays.hashCode 等方法；为兼容 JDK 1.4，
这里采用 NekoHTML 1.9.6 版本。
2. 需要 xerces.jar 支持（代码中的 org.apache.html.dom.HTMLDocumentImpl 来自 Xerces）。
3. 参考链接：
http://hi.baidu.com/walkandsing/blog/item/f5743634c6ba2e3a5bb5f5e5.html