search-demo演示了如何利用Java来调用百度搜索和谷歌搜索,更多细节请到GitHub上查看search-demo。
自己没有搜索引擎,又想要大规模的数据源,怎么办?可以对百度搜索和谷歌搜索善加利用,以小搏大,站在巨人的肩膀上。有很多应用场景可以很巧妙地借助百度搜索和谷歌搜索来实现,比如网站的新闻采集,技术、品牌的新闻跟踪,知识库的收集,人机问答系统等。我之前做的一个准确率达百分之九十几的人机问答系统,其数据源的一部分就是充分利用了百度搜索和谷歌搜索。在此演示的技术的基础上,可以很容易地扩展到其他搜索引擎,也可以借鉴这里使用的NekoHTML+XPath或JSoup+CSSPath技术,轻松获取页面中自定义的内容。
- package org.apdplat.demo.search;
- import java.io.IOException;
- import java.io.InputStream;
- import java.io.UnsupportedEncodingException;
- import java.net.URLEncoder;
- import java.util.ArrayList;
- import java.util.List;
- import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
- import org.apache.commons.httpclient.HttpClient;
- import org.apache.commons.httpclient.HttpStatus;
- import org.apache.commons.httpclient.methods.GetMethod;
- import org.apache.commons.httpclient.params.HttpMethodParams;
- import org.json.JSONArray;
- import org.json.JSONException;
- import org.json.JSONObject;
- import org.slf4j.Logger;
- import org.slf4j.LoggerFactory;
- public class GoogleSearcher implements Searcher{
- private static final Logger LOG = LoggerFactory.getLogger(GoogleSearcher.class);
- @Override
- public List<Webpage> search(String url) {
- List<Webpage> webpages = new ArrayList<>();
- try {
- HttpClient httpClient = new HttpClient();
- GetMethod getMethod = new GetMethod(url);
- httpClient.executeMethod(getMethod);
- getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
- new DefaultHttpMethodRetryHandler());
- int statusCode = httpClient.executeMethod(getMethod);
- if (statusCode != HttpStatus.SC_OK) {
- LOG.error("搜索失败: " + getMethod.getStatusLine());
- return null;
- }
- InputStream in = getMethod.getResponseBodyAsStream();
- byte[] responseBody = Tools.readAll(in);
- String response = new String(responseBody, "UTF-8");
- LOG.debug("搜索返回数据:" + response);
- JSONObject json = new JSONObject(response);
- String totalResult = json.getJSONObject("responseData").getJSONObject("cursor").getString("estimatedResultCount");
- int totalResultCount = Integer.parseInt(totalResult);
- LOG.info("搜索返回记录数: " + totalResultCount);
- JSONArray results = json.getJSONObject("responseData").getJSONArray("results");
- LOG.debug("搜索结果:");
- for (int i = 0; i < results.length(); i++) {
- Webpage webpage = new Webpage();
- JSONObject result = results.getJSONObject(i);
- //提取标题
- String title = result.getString("titleNoFormatting");
- LOG.debug("标题:" + title);
- webpage.setTitle(title);
- //提取摘要
- String summary = result.get("content").toString();
- summary = summary.replaceAll("<b>", "");
- summary = summary.replaceAll("</b>", "");
- summary = summary.replaceAll("\\.\\.\\.", "");
- LOG.debug("摘要:" + summary);
- webpage.setSummary(summary);
- //从URL中提取正文
- String _url = result.get("url").toString();
- webpage.setUrl(_url);
- String content = Tools.getHTMLContent(_url);
- LOG.debug("正文:" + content);
- webpage.setContent(content);
- webpages.add(webpage);
- }
- } catch (IOException | JSONException | NumberFormatException e) {
- LOG.error("执行搜索失败:", e);
- }
- return webpages;
- }
- public static void main(String args[]) {
- String query = "杨尚川";
- try {
- query = URLEncoder.encode(query, "UTF-8");
- } catch (UnsupportedEncodingException e) {
- LOG.error("url构造失败", e);
- return;
- }
- String url = "http://ajax.googleapis.com/ajax/services/search/web?start=0&rsz=large&v=1.0&q=" + query;
- Searcher searcher = new GoogleSearcher();
- List<Webpage> webpages = searcher.search(url);
- if (webpages != null) {
- int i = 1;
- for (Webpage webpage : webpages) {
- LOG.info("搜索结果 " + (i++) + " :");
- LOG.info("标题:" + webpage.getTitle());
- LOG.info("URL:" + webpage.getUrl());
- LOG.info("摘要:" + webpage.getSummary());
- LOG.info("正文:" + webpage.getContent());
- LOG.info("");
- }
- } else {
- LOG.error("没有搜索到结果");
- }
- }
- }