package atest; import org.htmlparser.Node; import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.filters. * ; import org.htmlparser.tags.Div; import org.htmlparser.tags.LinkTag; import org.htmlparser.util. * ; import com.jdon.controller.events.EventModel; import cn.vetech.framework.base.dao.B_class_dao; import cn.vetech.framework.base.dao.B_class_dao_sql; import cn.vetech.framework.dao.SqlMapDaoTemplateFactory; import cn.vetech.framework.news.dao.B_news_dao; import cn.vetech.framework.news.dao.B_news_dao_sql; import cn.vetech.framework.news.model.B_news; import cn.vetech.framework.news.service.B_news_service; import cn.vetech.framework.news.service.B_news_service_imp; import cn.vetech.framework.util.VeDate; /** */ /** * 抓取中华培训网规则 1.抓取chinahtml/zixunzhongxin目录下的文件 * 说明:循环网址下所有链接,根据一定规则过滤掉一部分链接,读出页面指定节点下的内容 * @author sam.zhang * */ public class TestParser ... { private B_news_dao b_news_dao; private B_class_dao b_class_dao; private B_news_service b_news_service; private String TRAINING_URL = "http://www.china-training.com"; // 中华培训网网址 private static String TRAINING_ZXURL = "http://www.china-training.com/newpage/zxzx.asp"; // 中华培训网资讯根网址 private String FILEPATH = "chinahtml"; // 中华培训网放置新闻的根文件夹名 private String ZXZX = "zixunzhongxin"; // 存放资讯中心的目录 public TestParser() ...{ SqlMapDaoTemplateFactory sqldao = new SqlMapDaoTemplateFactory(); b_news_dao = new B_news_dao_sql(sqldao); b_class_dao = new B_class_dao_sql(sqldao); b_news_service = new B_news_service_imp(b_news_dao, b_class_dao, null, null); } /** *//** * 得到指定网址下所有链接 * * @param url */ public void getAllUrls(String url) ...{ NodeList nodeList = null; try ...{ Parser p = new Parser(url); // nodeList = p.parse(new TagNameFilter( "A ")); // 使用TagNameFilter(两种写法都可以) nodeList = p.parse(new NodeClassFilter(LinkTag.class)); // 使用NodeClassFilter } catch (ParserException e) ...{ e.printStackTrace(); } if (nodeList != null && nodeList.size() > 0) ...{ for (int i = 0; i < nodeList.size(); i++) ...{ String u = ((LinkTag) nodeList.elementAt(i)).getLink() .toString(); if (isIndesOf(u) != -1) ...{ int num = ((LinkTag) nodeList.elementAt(i)).getLink() .toString().split("/").length; if (num > 2) ...{ String str_1 = ((LinkTag) nodeList.elementAt(i)) .getLink().toString().split("/")[1]; String str_2 = ((LinkTag) nodeList.elementAt(i)) .getLink().toString().split("/")[2]; if (str_1 != "" && str_2 != "" && FILEPATH.equals(str_1) && ZXZX.equals(str_2)) ...{ String str_utl = ((LinkTag) nodeList.elementAt(i)) .getLink().toString(); str_utl = TRAINING_URL + str_utl; run(str_utl); } } } } } } /** *//** * 得到根网址的实际内容 * * @param str */ public static void run(String str) ...{ try ...{ NodeList nodelist; Parser parser = new Parser(str); //主要对象(传入网址) NodeFilter divFilter = new NodeClassFilter(Div.class); //创建一个div对象 OrFilter lastFilter = new OrFilter(); lastFilter.setPredicates(new NodeFilter[] ...{ divFilter }); nodelist = parser.parse(lastFilter); Node[] nodes = nodelist.toNodeArray(); String line = ""; String title = ""; // 标题 String content = ""; // 内容 String source = ""; // 来源 for (int i = 0; i < nodes.length; i++) ...{ Node node = nodes[i]; if (node instanceof Div) ...{ Div textnode = (Div) node; line = textnode.getAttribute("id"); if ("news_title".equals(line)) ...{ // ...得到新闻标题(纯文本) title = textnode.toPlainTextString(); } // ...得到文章正文(源码) if ("news_content".equals(line)) ...{ content = textnode.getStringText(); } // ...得到文章来源(纯文本) if ("news_date".equals(line)) ...{ source = isTrim(textnode.toPlainTextString()); if (isIndesOfSource(source) != -1) ...{ source = isTrimSource(source.split(":")[1]); if