HtmlParser 解析搜索页面


package com.safetys.crawler.util;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

import com.safetys.framework.exception.ApplicationAccessException;
/**
 * Aggregates search results from Baidu and Google.
 *
 * <p>Every result entry is a single string of the form
 * {@code linkText + "zzc@cheng" + url}.
 *
 * @author zhaozhi3758
 * date:2011-04-19
 */
public class Crawler {

    /** Separator placed between the link text and the URL in each result entry. */
    private static final String splitStr = "zzc@cheng";

    /** Connection and socket timeout, in milliseconds, for the HttpClient fetch. */
    private static final int TIMEOUT_MILLIS = 5000;

    /** Charset used to decode fetched pages and to URL-encode the keyword. */
    private String encoding = "gbk";

    // The fields below are public in the original code and are kept public so
    // that existing callers accessing them directly keep working; prefer the
    // getters/setters.

    /** Search mode: "keyword" searches by keyword, "specifyUrl" searches a given URL. */
    public String searchMode;

    /**
     * Baidu search URL template. Must contain the placeholders ${keyword} and
     * ${searchNum}, e.g. "http://www.baidu.com/s?rn=${searchNum}&wd=${keyword}".
     */
    public String baiduUrl;

    /**
     * Google search URL template. Must contain the placeholders ${keyword} and
     * ${searchNum}, e.g.
     * "http://www.google.com.hk/search?hl=zh-CN&source=hp&q=${keyword}&num=${searchNum}&aq=f&aqi=&aql=&oq=&gs_rfai=".
     */
    public String googleUrl;

    /** Search keyword (raw, not URL-encoded). */
    public String keyword;

    /** Total number of results requested; each engine is asked for half of it. */
    public int searchNum = 0;

    /** URL to crawl when searching by a specified URL. */
    public String specifyUrl;

    /**
     * Fetches the Baidu result page and extracts the outbound links found inside
     * every table that carries an {@code id} attribute.
     *
     * @return the extracted entries; empty when the URL template is unset or the
     *         page cannot be opened or parsed
     */
    public List<String> crawlerBaidu() {
        List<String> result = new ArrayList<String>();
        String url = getBaiduUrl();
        if (url == null) {
            System.err.println("baiduUrl is not set; skipping Baidu search");
            return result;
        }

        Parser myParser = new Parser();
        try {
            myParser.setURL(url);
            // NOTE(review): this re-applies the parser's own encoding and looks like
            // a no-op; it may have been meant to pass the configured encoding — confirm.
            myParser.setEncoding(myParser.getEncoding());
        } catch (ParserException e1) {
            e1.printStackTrace();
            // Without a usable connection there is nothing to parse.
            return result;
        }

        try {
            NodeList nodeList = myParser.parse(new NodeClassFilter(TableTag.class));
            if (nodeList == null) {
                return result;
            }
            // Strictly less than size(): index size() is past the last element.
            for (int i = 0; i < nodeList.size(); i++) {
                if (nodeList.elementAt(i) instanceof TableTag) {
                    TableTag tag = (TableTag) nodeList.elementAt(i);
                    if (tag.getAttribute("id") != null) {
                        result.addAll(getBaiduLink(tag.getChildrenHTML()));
                    }
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return result;
    }

    /**
     * Extracts the outbound links from an HTML fragment of a Baidu result table.
     *
     * <p>A link is kept when its text is not "百度快照", its URL does not contain
     * "baidu" and its URL starts with "http".
     *
     * @param s HTML fragment to scan
     * @return entries of the form {@code linkText + splitStr + url}
     */
    private List<String> getBaiduLink(String s) {
        List<String> result = new ArrayList<String>();
        if (s == null) {
            return result;
        }
        Parser myParser = Parser.createParser(s, encoding);
        try {
            NodeList nodeList = myParser.parse(new NodeClassFilter(LinkTag.class));
            if (nodeList == null) {
                return result;
            }
            for (int l = 0; l < nodeList.size(); l++) {
                LinkTag link = (LinkTag) nodeList.elementAt(l);
                String urlLink = link.extractLink();
                String linkName = link.getLinkText();
                if (urlLink == null || linkName == null) {
                    continue;
                }
                if (!linkName.equals("百度快照")
                        && urlLink.indexOf("baidu") == -1
                        && urlLink.startsWith("http")) {
                    String entry = linkName + splitStr + urlLink;
                    System.out.println("baidu--->" + entry);
                    result.add(entry);
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return result;
    }

    /**
     * Fetches the Google result page and extracts the outbound links.
     *
     * <p>A link is kept when its URL does not contain "google" and starts with
     * "http".
     *
     * @return entries of the form {@code linkText + splitStr + url}; empty when
     *         the page could not be fetched or parsed
     */
    private List<String> crawlerGoogle() {
        List<String> result = new ArrayList<String>();
        String htmlstr = getUrlHtmlByHttpClient(getGoogleUrl());
        if (htmlstr == null) {
            // The fetch failed (already reported) or the URL template is unset.
            return result;
        }
        try {
            Parser parser = Parser.createParser(htmlstr, encoding);
            // Select every <A> tag in the page.
            NodeList nodes = parser.extractAllNodesThatMatch(new TagNameFilter("A"));
            if (nodes == null) {
                return result;
            }
            for (int i = 0; i < nodes.size(); i++) {
                if (!(nodes.elementAt(i) instanceof LinkTag)) {
                    // Skip unexpected nodes instead of aborting the whole loop.
                    continue;
                }
                LinkTag tag = (LinkTag) nodes.elementAt(i);
                String link = tag.getLink();
                if (link != null && link.indexOf("google") == -1 && link.startsWith("http")) {
                    String entry = tag.getLinkText() + splitStr + link;
                    System.out.println("google--->" + entry);
                    result.add(entry);
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return result;
    }

    /**
     * Downloads a page with HttpClient and returns its body decoded with the
     * configured encoding.
     *
     * <p>A non-200 status is reported on stderr but the body is still read.
     *
     * @param url address to fetch; {@code null} yields {@code null}
     * @return the page text with lines separated by '\n', or {@code null} when
     *         the request fails or the response has no body
     */
    private String getUrlHtmlByHttpClient(String url) {
        if (url == null) {
            return null;
        }
        HttpClient httpClient = new HttpClient();
        httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(TIMEOUT_MILLIS);
        GetMethod getMethod = new GetMethod(url);
        getMethod.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, TIMEOUT_MILLIS);
        getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
                new DefaultHttpMethodRetryHandler());

        BufferedReader br = null;
        try {
            int statusCode = httpClient.executeMethod(getMethod);
            if (statusCode != HttpStatus.SC_OK) {
                System.err.println("Method failed: "
                        + getMethod.getStatusLine());
            }
            InputStream bodyIs = getMethod.getResponseBodyAsStream();
            if (bodyIs == null) {
                return null;
            }
            // Change the encoding if Chinese text comes out garbled.
            br = new BufferedReader(new InputStreamReader(bodyIs, encoding));
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                // Keep the line break: gluing lines together would fuse markup
                // that is split across lines (e.g. "<a" + "href=...").
                sb.append(line).append('\n');
            }
            return sb.toString();
        } catch (HttpException e) {
            System.out.println("Please check your http address!");
            e.printStackTrace();
            return null;
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing the reader fails.
                }
            }
            getMethod.releaseConnection();
        }
    }

    /**
     * Single entry point for crawling.
     *
     * <p>In "keyword" mode the Baidu and Google results are merged, duplicates
     * are removed and the order of first appearance is kept. The "specifyUrl"
     * mode is not implemented and yields an empty list, as does any other mode.
     *
     * @return the de-duplicated result entries
     * @throws ApplicationAccessException when {@code searchMode} is null or empty
     */
    public List<String> crawler() throws ApplicationAccessException {
        if (null == searchMode || searchMode.equals("")) {
            throw new ApplicationAccessException("searchMode is null");
        }
        // LinkedHashSet removes duplicates without scrambling the result order.
        Set<String> unique = new LinkedHashSet<String>();
        if (searchMode.equals("specifyUrl")) {
            // Search by specified URL: not implemented.
        } else if (searchMode.equals("keyword")) {
            unique.addAll(crawlerBaidu());
            unique.addAll(crawlerGoogle());
        }
        return new ArrayList<String>(unique);
    }

    /**
     * Fills the ${keyword} and ${searchNum} placeholders of a URL template with
     * the URL-encoded keyword and half of {@code searchNum}.
     *
     * @param template URL template, may be {@code null}
     * @return the expanded URL, or {@code null} when the template is {@code null}
     */
    private String expandTemplate(String template) {
        if (template == null) {
            return null;
        }
        return template.replace("${keyword}", getKeyword())
                .replace("${searchNum}", String.valueOf(searchNum / 2));
    }

    /**
     * @return the Baidu search URL with placeholders filled in, or {@code null}
     *         when no template has been set
     */
    public String getBaiduUrl() {
        return expandTemplate(baiduUrl);
    }

    public void setBaiduUrl(String baiduUrl) {
        this.baiduUrl = baiduUrl;
    }

    public String getEncoding() {
        return encoding;
    }

    public void setEncoding(String encoding) {
        this.encoding = encoding;
    }

    /**
     * @return the Google search URL with placeholders filled in, or {@code null}
     *         when no template has been set
     */
    public String getGoogleUrl() {
        return expandTemplate(googleUrl);
    }

    public void setGoogleUrl(String googleUrl) {
        this.googleUrl = googleUrl;
    }

    /**
     * @return the keyword URL-encoded with the configured encoding; an empty
     *         string when the keyword is unset or the encoding is unsupported
     */
    public String getKeyword() {
        if (keyword == null) {
            return "";
        }
        String key = "";
        try {
            key = URLEncoder.encode(keyword, encoding);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return key;
    }

    public void setKeyword(String keyword) {
        this.keyword = keyword;
    }

    public String getSearchMode() {
        return searchMode;
    }

    public void setSearchMode(String searchMode) {
        this.searchMode = searchMode;
    }

    public int getSearchNum() {
        return searchNum;
    }

    public void setSearchNum(int searchNum) {
        this.searchNum = searchNum;
    }

    public String getSpecifyUrl() {
        return specifyUrl;
    }

    public void setSpecifyUrl(String specifyUrl) {
        this.specifyUrl = specifyUrl;
    }

    /** Manual smoke test: runs a keyword search and prints the URLs and results. */
    public static void main(String[] args) throws ApplicationAccessException {
        Crawler cl = new Crawler();
        cl.setEncoding("gbk");
        cl.setSearchNum(10);
        cl.setKeyword("面包");
        cl.setSearchMode("keyword");
        cl.setBaiduUrl("http://www.baidu.com/s?rn=${searchNum}&wd=${keyword}");
        cl.setGoogleUrl("http://www.google.com.hk/search?hl=zh-CN&source=hp&q=${keyword}&num=${searchNum}&aq=f&aqi=&aql=&oq=&gs_rfai=");
        System.out.println("=====>" + cl.getBaiduUrl());
        System.out.println("=====>" + cl.getGoogleUrl());
        System.out.println(cl.crawler());
    }
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值