最近开发了一个爬取新闻的小虫子,使用了htmlparser,感觉还不错。
package org.snowfish.Parser;
import java.net.HttpURLConnection;
import java.net.URL;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.*;
import org.htmlparser.tags.*;
import org.htmlparser.util.*;
import org.snowfish.News.News;
import org.snowfish.Utils.PrintUtil;
public class IFengNewsGetter implements NewsGetter {
//常用tag过滤器
static TagNameFilter divFilter = new TagNameFilter("div");
static TagNameFilter h1Filter = new TagNameFilter("h1");
static TagNameFilter h2Filter = new TagNameFilter("h2");
static TagNameFilter pFilter = new TagNameFilter("p");
static TagNameFilter spanFilter = new TagNameFilter("span");
/**
*解析新闻方法
* url:新闻列表页地址
* date:新闻起始时间
*/
public News[] parseNews(String url, String date) {
List
listOfNews = new ArrayList
(); //新闻对象列表
try {
/* 解析新闻列表页中列表 */
Parser parser = new Parser((HttpURLConnection) (new URL(url)).openConnection());
NodeFilter nf = new AndFilter(divFilter, new HasAttributeFilter("class", "comListBox"));
NodeList newsList = parser.parse(nf);
NodeList titleList = newsList.extractAllNodesThatMatch(h2Filter, true);
NodeList timeList = newsList.extractAllNodesThatMatch(pFilter, true);
NodeFilter contentAllFilter = new AndFilter(divFilter, new HasAttributeFilter("id", "artical"));
AndFilter contentFilter = new AndFilter(divFilter, new HasAttributeFilter("id", "main_content"));
AndFilter datePublished = new AndFilter(spanFilter, new HasAttributeFilter("itemprop", "datePublished"));
AndFilter publisher = new AndFilter(spanFilter, new HasAttributeFilter("itemprop", "publisher"));
for (int i = 0; i < titleList.size(); i++) {
/* 解析新闻列表中各条新闻标题、时间及链接 */
String time = timeList.elementAt(i).toPlainTextString();
if(date!=""&&date!=null){
DateFormat fmt = new SimpleDateFormat("yyyy/MM/dd HH:mm");//yyyyMMddHHmmssSSSS
Date newsBegin = fmt.parse(date);
Date newsTime = fmt.parse(time);
if (!newsBegin.before(newsTime))
break; // 时间比较,目标新闻时间小于等于起始时间则终止解析
}
LinkTag lt = (LinkTag) titleList.elementAt(i).getChildren().elementAt(0);
String title = lt.toPlainTextString();
/* 将列表页中信息存入临时对象 */
News tmp = new News();
tmp.setTitle(title);
tmp.setTime(time);
tmp.setUrl(lt.getLink());
/* 解析单条新闻中编辑信息及内容 */
parser.setURL(lt.getLink());
NodeList contentAll = parser.parse(contentAllFilter);
tmp.setDatePublished(
contentAll.extractAllNodesThatMatch(datePublished, true).elementAt(0).toPlainTextString());
tmp.setPublisher(contentAll.extractAllNodesThatMatch(publisher, true).elementAt(0).getChildren()
.elementAt(0).toPlainTextString());
NodeList para = contentAll.extractAllNodesThatMatch(contentFilter, true).elementAt(0).getChildren();
String tmpContent = "";
for (int j = 0; j < para.size(); j++) {
Node ttmp = para.elementAt(j);
/* 新闻中图片单独处理*/
if (ttmp.toHtml().contains("detailPic")) {
ImageTag it = (ImageTag) ttmp.getChildren().elementAt(0);
tmpContent+="
"+it.toHtml().trim()+"\n";
continue;
} else if (ttmp.toHtml().contains("picIntro")) {
tmpContent+="
"+ttmp.toPlainTextString().trim()+"\n";
continue;
}
/* 新闻中正文 */
tmpContent += ttmp.toPlainTextString().trim()+"\n";
}
/* 正文格式调整 */
tmpContent = tmpContent.trim().replaceAll(" ", " ").replaceAll(" ", "\n ");
/* 正文内容存入临时对象 */
tmp.setContent(tmpContent);
listOfNews.add(tmp);
}
News[] list = (News[]) listOfNews.toArray(new News[0]);
return list;
} catch (Exception e) {
PrintUtil.messageln("ERRRRRRRRRRRRRRRRRRRRRRRRRRRRRR");
PrintUtil.messageln(e.getMessage());
}
return null;
}
}
package org.snowfish.News;
import org.snowfish.Utils.HTMLReplacer;
public class News {

    // All fields start as "" (see the constructor), so getters never return null.
    private String title;
    private String time;
    private String content;
    private String url;
    private String datePublished;
    private String publisher;

    /** Creates an empty news item with every field initialised to "". */
    public News() {
        title = "";
        time = "";
        content = "";
        url = "";
        datePublished = "";
        publisher = "";
    }

    /** Headline. HTML entity codes are decoded when the value is assigned. */
    public void setTitle(String title) {
        this.title = HTMLReplacer.replaceAllSpecialSymbols(title);
    }

    public String getTitle() {
        return title;
    }

    /** List-page timestamp string, stored verbatim. */
    public void setTime(String time) {
        this.time = time;
    }

    public String getTime() {
        return time;
    }

    /** Article body. HTML entity codes are decoded when the value is assigned. */
    public void setContent(String content) {
        this.content = HTMLReplacer.replaceAllSpecialSymbols(content);
    }

    public String getContent() {
        return content;
    }

    /** Link to the full article, stored verbatim. */
    public void setUrl(String url) {
        this.url = url;
    }

    public String getUrl() {
        return url;
    }

    /** Published-at string from the article page, stored verbatim. */
    public void setDatePublished(String datePublished) {
        this.datePublished = datePublished;
    }

    public String getDatePublished() {
        return datePublished;
    }

    /** Publisher name. HTML entity codes are decoded when the value is assigned. */
    public void setPublisher(String publisher) {
        this.publisher = HTMLReplacer.replaceAllSpecialSymbols(publisher);
    }

    public String getPublisher() {
        return publisher;
    }
}
package org.snowfish.Utils;
public class HTMLReplacer {

    /**
     * Decodes the HTML character-entity references that appear in scraped news
     * text into their literal characters (CJK punctuation variants where the
     * original code mapped them that way, e.g. laquo/raquo to 《》).
     *
     * FIX: in the original every call was an identity mapping such as
     * replaceAll("&", "&") because the entity names (&amp;, &lt;, ...) had been
     * rendered away; the entity table is reconstructed here. Literal
     * {@code replace} is used instead of regex {@code replaceAll}, and
     * {@code &amp;} is decoded last so that input like "&amp;lt;" yields
     * "&lt;" rather than being double-decoded to "<".
     *
     * @param s string to decode; {@code null} is returned unchanged
     * @return the decoded string
     */
    public static String replaceAllSpecialSymbols(String s) {
        if (s == null) {
            return null; // robustness: tolerate null input
        }
        s = s.replace("&quot;", "\"");
        s = s.replace("&lt;", "<");
        s = s.replace("&gt;", ">");
        s = s.replace("&nbsp;", " ");
        s = s.replace("&laquo;", "\u300A");   // « -> 《
        s = s.replace("&raquo;", "\u300B");   // » -> 》
        s = s.replace("&lsquo;", "\u2018");   // ‘
        s = s.replace("&rsquo;", "\u2019");   // ’
        s = s.replace("&ldquo;", "\u201C");   // “
        s = s.replace("&rdquo;", "\u201D");   // ”
        s = s.replace("&sect;", "\u00A7");    // §
        s = s.replace("&hellip;", "\u2026");  // …
        s = s.replace("&middot;", "\u00B7");  // ·
        s = s.replace("&permil;", "\u2030");  // ‰
        s = s.replace("&mdash;", "\u2014");   // —
        s = s.replace("&bull;", "\u00B7");    // bullet -> · (as in the original mapping)
        // NOTE(review): the last pre-mangling mapping targeted 〇 (U+3007);
        // &Oslash; is the most plausible source entity — confirm against VCS.
        s = s.replace("&Oslash;", "\u3007");
        s = s.replace("&amp;", "&"); // must be last (see Javadoc)
        return s;
    }
}
package org.snowfish.Utils;
public class PrintUtil {

    // Target console encoding; the message is transcoded from GBK bytes to the
    // JVM's file.encoding before printing. FIX: made final (it is a constant).
    private static final String ENCODE = "GBK";

    /**
     * Prints {@code szMsg} (no trailing newline) after transcoding it from
     * {@link #ENCODE} to the platform encoding.
     *
     * FIX: the original empty catch silently dropped the whole message when the
     * transcode failed; fall back to printing the raw string instead.
     *
     * @param szMsg message to print
     */
    public static void message(String szMsg) {
        try {
            System.out.print(new String(szMsg.getBytes(ENCODE), System.getProperty("file.encoding")));
        } catch (Exception e) {
            System.out.print(szMsg); // best effort: never lose the message
        }
    }

    /**
     * Same as {@link #message(String)} but appends a newline.
     *
     * @param szMsg message to print
     */
    public static void messageln(String szMsg) {
        try {
            System.out.println(new String(szMsg.getBytes(ENCODE), System.getProperty("file.encoding")));
        } catch (Exception e) {
            System.out.println(szMsg); // best effort: never lose the message
        }
    }
}
package org.snowfish.Parser;
import org.snowfish.News.News;
/**
 * A site-specific news scraper: given a list-page URL and a cut-off time,
 * produces the parsed {@link News} items.
 */
public interface NewsGetter {
/**
 * Parses the news list page at {@code url} and returns the items newer than
 * {@code date}.
 *
 * @param url  address of the news list page
 * @param date cut-off time string; implementations in this file accept
 *             "yyyy/MM/dd HH:mm" and treat null/empty as "no cut-off"
 * @return the parsed news items
 */
News[] parseNews(String url,String date);
}
使用方式:
/* 解析URL中2016/09/12 16:52之后的新闻 */
News[] data = new IFengNewsGetter().parseNews("http://news.ifeng.com/listpage/7129/1/list.shtml","2016/09/12 16:52");
for (News i : data) {
    PrintUtil.messageln(i.getTitle());
}