import org.htmlparser.Parser;
import org.htmlparser.beans.StringBean;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.parserapplications.StringExtractor;
import org.htmlparser.tags.BodyTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
- public class GetContent {
- public void getContentUsingStringBean(String url) {
- StringBean sb = new StringBean();
- sb.setLinks(true);
- sb.setCollapse(true);
- sb.setReplaceNonBreakingSpaces(true);// If true regular space
- sb.setURL("http://www.blogjava.net/51AOP/archive/2006/07/19/59064.html");
- System.out.println("The Content is :\n" + sb.getStrings());
- }
- public void getContentUsingStringExtractor(String url, boolean link) {
- StringExtractor se = new StringExtractor(url);
- String text = null;
- try {
- text = se.extractStrings(link);
- System.out.println("The content is :\n" + text);
- } catch (ParserException e) {
- e.printStackTrace();
- }
- }
- public void getContentUsingParser(String url) {
- NodeList nl;
- try {
- Parser p = new Parser(url);
- nl = p.parse(new NodeClassFilter(BodyTag.class));
- BodyTag bt = (BodyTag) nl.elementAt(0);
- System.out.println(bt.toPlainTextString());
- } catch (ParserException e) {
- e.printStackTrace();
- }
- }
- public static void main(String[] args) {
- GetContent g = new GetContent();
- // g.getContentUsingStringBean("");
- // g.getContentUsingParser("http://www.blogjava.net/51AOP/archive/2006/07/19/59064.html");
- g.getContentUsingStringExtractor("http://www.sina.com.cn/", false);
- }
htmlparser提取网页正文
最新推荐文章于 2015-01-14 10:22:53 发布