java调用百度搜索+Jsoup实现网络资源收集

Jsoup核心jar包:Jsoup核心jar包下载地址
java代码:
抽象搜索资源的实体:Webpage

package com.sinosoft.lhresource.search.common;

/**
 * Mutable value holder describing one collected web resource: its title,
 * link, short summary and extracted body text. Every property starts out
 * as {@code null} until the corresponding setter is called.
 */
public class Webpage {

    /** Title of the page. */
    private String title;

    /** Link pointing at the page. */
    private String url;

    /** Short abstract of the page. */
    private String summary;

    /** Main body text of the page. */
    private String content;

    /** @return the title, or {@code null} if none has been set */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the link, or {@code null} if none has been set */
    public String getUrl() {
        return this.url;
    }

    /** @param url the link to store */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the summary, or {@code null} if none has been set */
    public String getSummary() {
        return this.summary;
    }

    /** @param summary the summary to store */
    public void setSummary(String summary) {
        this.summary = summary;
    }

    /** @return the body text, or {@code null} if none has been set */
    public String getContent() {
        return this.content;
    }

    /** @param content the body text to store */
    public void setContent(String content) {
        this.content = content;
    }
}

通过资源链接获取资源内容:TextExtract.java、Tools.java


package com.sinosoft.lhresource.search.common;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Extracts the main body text of an HTML page from the distribution of line lengths.
 *
 * <p>The markup is stripped first. The remaining text is split into lines, and the
 * whitespace-free lengths of every {@value #BLOCKS_WIDTH} consecutive lines are summed
 * into "blocks". A run of lines starting at a block longer than the threshold and
 * ending at an empty block is treated as body text.
 *
 * <p>All per-call working state is local to the call. Only the threshold (and the
 * unused topic flag) are shared static settings.
 */
public class TextExtract {

    private static final Logger LOG = LoggerFactory.getLogger(TextExtract.class);

    /** Number of consecutive lines whose lengths are summed into one block. */
    private static final int BLOCKS_WIDTH = 3;

    /** Lines shorter than this (after whitespace removal) are left out of an extracted span. */
    private static final int MIN_LINE_LENGTH = 5;

    /**
     * A block must be longer than this for a body span to start there.
     * Per the original author's note: raise it when blocks of news headlines leak into
     * the result (precision goes up, recall goes down); lower it to allow single-sentence
     * bodies at the cost of more noise.
     */
    private static int threshold = 86;

    /**
     * Last value passed as {@code _flag} to {@link #parse(String, boolean)}.
     * NOTE(review): nothing in this class reads it.
     */
    private static boolean flag = false;

    /**
     * Replaces the block-length threshold used by all subsequent extractions.
     *
     * @param value new threshold
     */
    public static void setthreshold(int value) {
        threshold = value;
    }

    /**
     * Extracts the body text without any topic-page check; equivalent to
     * {@code parse(_html, false)}.
     *
     * @param _html HTML source of the page
     *
     * @return extracted body text, possibly empty
     */
    public static String parse(String _html) {
        return parse(_html, false);
    }

    /**
     * Extracts the body text of the given HTML.
     *
     * @param _html HTML source of the page; {@code null} yields an empty string
     * @param _flag intended to request a topic-page check. NOTE(review): the value is only
     *              stored; the check (and the "unkown" result mentioned in the original
     *              documentation) is not implemented in this class
     *
     * @return extracted body text, possibly empty
     */
    public static String parse(String _html, boolean _flag) {
        flag = _flag;
        if (_html == null) {
            return "";
        }
        String plain = preProcess(_html);
        LOG.debug(plain);
        return getText(plain);
    }

    /**
     * Strips everything that is not visible text: the doctype, comments, scripts, styles,
     * character entities (replaced by a space) and finally all remaining tags.
     * Conditional comments such as
     * {@code <!--[if !IE]>|xGv00|9900d21eb16fa4350a3001b3974a9415<![endif]-->}
     * are covered by the comment rule.
     */
    private static String preProcess(String html) {
        return html
                .replaceAll("(?is)<!DOCTYPE.*?>", "")
                .replaceAll("(?is)<!--.*?-->", "")              // remove html comment
                .replaceAll("(?is)<script.*?>.*?</script>", "") // remove javascript
                .replaceAll("(?is)<style.*?>.*?</style>", "")   // remove css
                .replaceAll("&.{2,5};|&#.{2,5};", " ")          // remove special char
                .replaceAll("(?is)<.*?>", "");
    }

    /**
     * Returns the length of the block at {@code index}, or 0 when the index lies past the
     * last block, so look-ahead near the end of the page cannot run out of bounds.
     */
    private static int blockLengthAt(int[] blockLengths, int index) {
        return index < blockLengths.length ? blockLengths[index] : 0;
    }

    /** Scans the tag-free text line by line and concatenates every detected body span. */
    private static String getText(String plain) {
        // Arrays.asList gives a fixed-size list, which is enough: entries are only replaced.
        List<String> lines = Arrays.asList(plain.split("\n"));
        for (int i = 0; i < lines.size(); i++) {
            lines.set(i, lines.get(i).replaceAll("\\s+", ""));
        }

        // blockLengths[i] = total length of lines i .. i + BLOCKS_WIDTH - 1.
        int[] blockLengths = new int[Math.max(0, lines.size() - BLOCKS_WIDTH)];
        for (int i = 0; i < blockLengths.length; i++) {
            int wordsNum = 0;
            for (int j = i; j < i + BLOCKS_WIDTH; j++) {
                wordsNum += lines.get(j).length();
            }
            blockLengths[i] = wordsNum;
            LOG.debug("{}", wordsNum);
        }

        StringBuilder text = new StringBuilder();
        int start = -1;
        int end = -1;
        boolean spanOpen = false;
        boolean spanClosed = false;

        for (int i = 0; i < blockLengths.length - 1; i++) {
            if (!spanOpen && blockLengths[i] > threshold) {
                // A span starts only if at least one of the next three blocks has text.
                // Blocks past the end count as empty (previously this look-ahead threw
                // IndexOutOfBoundsException near the end of the page).
                if (blockLengthAt(blockLengths, i + 1) != 0
                        || blockLengthAt(blockLengths, i + 2) != 0
                        || blockLengthAt(blockLengths, i + 3) != 0) {
                    spanOpen = true;
                    start = i;
                    continue;
                }
            }
            if (spanOpen && (blockLengths[i] == 0 || blockLengths[i + 1] == 0)) {
                end = i;
                spanClosed = true;
            }
            if (!spanClosed) {
                continue;
            }

            LOG.debug("{}\t\t{}", start + 1, end + 1);
            StringBuilder span = new StringBuilder();
            for (int ii = start; ii <= end; ii++) {
                String line = lines.get(ii);
                if (line.length() >= MIN_LINE_LENGTH) {
                    span.append(line).append("\n");
                }
            }
            String str = span.toString();
            LOG.debug(str);
            if (str.contains("Copyright") || str.contains("版权所有")) {
                // Behaviour kept from the original: the span is neither emitted nor reset,
                // so it stays open and keeps being rejected on every following iteration;
                // nothing after a copyright span is extracted.
                continue;
            }
            text.append(str);
            spanOpen = false;
            spanClosed = false;
        }
        return text.toString();
    }
}



package com.sinosoft.lhresource.search.common;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.URL;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Small I/O helpers: downloading a page and extracting its body text, copying a
 * stream into a file, and reading a stream fully into memory.
 */
public class Tools {

    private static final Logger LOG = LoggerFactory.getLogger(Tools.class);

    /**
     * Downloads {@code url} as UTF-8 and returns its extracted body text.
     *
     * @param url address of the page
     * @return body text produced by {@link TextExtract#parse(String)}, or {@code null}
     *         when the page could not be fetched or parsed
     */
    public static String getHTMLContent(String url) {
        return getHTMLContent(url, "utf-8");
    }

    /**
     * Downloads {@code url}, decodes it with {@code encoding} and returns its extracted
     * body text. Any failure is logged at debug level and reported as {@code null}.
     *
     * @param url      address of the page
     * @param encoding charset name used to decode the response
     * @return body text produced by {@link TextExtract#parse(String)}, or {@code null}
     *         on any failure
     */
    public static String getHTMLContent(String url, String encoding) {
        // The stream is its own resource so it is closed even if creating the reader
        // fails (e.g. unsupported encoding). Previously neither was ever closed.
        try (InputStream in = new URL(url).openStream();
             BufferedReader reader = new BufferedReader(new InputStreamReader(in, encoding))) {
            StringBuilder html = new StringBuilder();
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                html.append(line).append("\n");
            }
            return TextExtract.parse(html.toString());
        } catch (Exception e) {
            LOG.debug("解析URL失败:" + url, e);
        }
        return null;
    }

    /**
     * Reads {@code in} completely and writes the bytes to {@code outFile}.
     * {@code in} is closed before returning whether or not the copy succeeded;
     * I/O failures are logged, not thrown.
     *
     * @param in      source stream
     * @param outFile destination file
     */
    public static void copyFile(InputStream in, File outFile) {
        try {
            byte[] data = readAll(in);
            try (OutputStream out = new FileOutputStream(outFile)) {
                out.write(data, 0, data.length);
            }
        } catch (IOException ex) {
            LOG.error("文件操作失败", ex);
        } finally {
            try {
                if (in != null) {
                    in.close();
                }
            } catch (IOException ex) {
                LOG.error("文件操作失败", ex);
            }
        }
    }

    /**
     * Reads {@code in} until it reports no more data and returns everything read.
     * The stream is not closed here. If reading fails, the error is logged and the
     * bytes read up to that point are returned.
     *
     * @param in source stream
     * @return the bytes read, possibly empty
     */
    public static byte[] readAll(InputStream in) {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try {
            byte[] buffer = new byte[1024];
            for (int n; (n = in.read(buffer)) > 0;) {
                out.write(buffer, 0, n);
            }
        } catch (IOException e) {
            LOG.error("读取失败", e);
        }
        return out.toByteArray();
    }
}

自定义检索接口:Searcher.java

package com.sinosoft.lhresource.search.common;

import java.util.List;

/**
 * Contract for a keyword search that yields a list of {@link Webpage} hits.
 */
public interface Searcher {

    /**
     * Searches for {@code keyword} without specifying a page.
     *
     * @param keyword search term
     * @return the matching pages
     */
    List<Webpage> search(String keyword);

    /**
     * Searches for {@code keyword} and returns the requested result page.
     *
     * @param keyword search term
     * @param page    result page to fetch
     * @return the matching pages on that result page
     */
    List<Webpage> search(String keyword, int page);
}

自定义处理百度检索接口:BaiduSearcher.java,以及为其提供默认实现的抽象类:AbstractBaiduSearcher.java


package com.sinosoft.lhresource.search.common;

import java.util.List;

/**
 * A {@link Searcher} extended with Baidu's vertical searches: news, Tieba,
 * Zhidao and Wenku. Each vertical has a plain and a paged variant.
 */
public interface BaiduSearcher extends Searcher {

    /**
     * News search.
     *
     * @param keyword search term
     * @return the matching pages
     */
    List<Webpage> searchNews(String keyword);

    /**
     * News search (paged).
     *
     * @param keyword search term
     * @param page    result page to fetch
     * @return the matching pages on that result page
     */
    List<Webpage> searchNews(String keyword, int page);

    /**
     * Tieba search.
     *
     * @param keyword search term
     * @return the matching pages
     */
    List<Webpage> searchTieba(String keyword);

    /**
     * Tieba search (paged).
     *
     * @param keyword search term
     * @param page    result page to fetch
     * @return the matching pages on that result page
     */
    List<Webpage> searchTieba(String keyword, int page);

    /**
     * Zhidao search.
     *
     * @param keyword search term
     * @return the matching pages
     */
    List<Webpage> searchZhidao(String keyword);

    /**
     * Zhidao search (paged).
     *
     * @param keyword search term
     * @param page    result page to fetch
     * @return the matching pages on that result page
     */
    List<Webpage> searchZhidao(String keyword, int page);

    /**
     * Wenku search.
     *
     * @param keyword search term
     * @return the matching pages
     */
    List<Webpage> searchWenku(String keyword);

    /**
     * Wenku search (paged).
     *
     * @param keyword search term
     * @param page    result page to fetch
     * @return the matching pages on that result page
     */
    List<Webpage> searchWenku(String keyword, int page);
}


package com.sinosoft.lhresource.search.common;

import java.util.List;

/**
 * Skeleton {@link BaiduSearcher}: every single-argument vertical search delegates to its
 * paged variant with page 1, and every paged variant rejects the call until a subclass
 * overrides it. The two {@code search} methods inherited from {@link Searcher} are left
 * abstract.
 */
public abstract class AbstractBaiduSearcher implements BaiduSearcher {

    /**
     * News search; delegates to {@link #searchNews(String, int)} with page 1.
     *
     * @param keyword search term
     * @return result of the paged variant
     */
    @Override
    public List<Webpage> searchNews(String keyword) {
        return searchNews(keyword, 1);
    }

    /**
     * News search (paged). Not implemented here.
     *
     * @param keyword search term
     * @param page    result page to fetch
     * @return never returns normally in this class
     * @throws UnsupportedOperationException always, unless overridden
     */
    @Override
    public List<Webpage> searchNews(String keyword, int page) {
        // UnsupportedOperationException is a RuntimeException, so callers that caught
        // the previous raw RuntimeException keep working.
        throw new UnsupportedOperationException("未实现");
    }

    /**
     * Tieba search; delegates to {@link #searchTieba(String, int)} with page 1.
     *
     * @param keyword search term
     * @return result of the paged variant
     */
    @Override
    public List<Webpage> searchTieba(String keyword) {
        return searchTieba(keyword, 1);
    }

    /**
     * Tieba search (paged). Not implemented here.
     *
     * @param keyword search term
     * @param page    result page to fetch
     * @return never returns normally in this class
     * @throws UnsupportedOperationException always, unless overridden
     */
    @Override
    public List<Webpage> searchTieba(String keyword, int page) {
        throw new UnsupportedOperationException("未实现");
    }

    /**
     * Zhidao search; delegates to {@link #searchZhidao(String, int)} with page 1.
     *
     * @param keyword search term
     * @return result of the paged variant
     */
    @Override
    public List<Webpage> searchZhidao(String keyword) {
        return searchZhidao(keyword, 1);
    }

    /**
     * Zhidao search (paged). Not implemented here.
     *
     * @param keyword search term
     * @param page    result page to fetch
     * @return never returns normally in this class
     * @throws UnsupportedOperationException always, unless overridden
     */
    @Override
    public List<Webpage> searchZhidao(String keyword, int page) {
        throw new UnsupportedOperationException("未实现");
    }

    /**
     * Wenku search; delegates to {@link #searchWenku(String, int)} with page 1.
     *
     * @param keyword search term
     * @return result of the paged variant
     */
    @Override
    public List<Webpage> searchWenku(String keyword) {
        return searchWenku(keyword, 1);
    }

    /**
     * Wenku search (paged). Not implemented here.
     *
     * @param keyword search term
     * @param page    result page to fetch
     * @return never returns normally in this class
     * @throws UnsupportedOperationException always, unless overridden
     */
    @Override
    public List<Webpage> searchWenku(String keyword, int page) {
        throw new UnsupportedOperationException("未实现");
    }
}

百度搜索+Jsoup实现资源收集:JSoupBaiduSearcher.java


package com.sinosoft.lhresource.search.common;

import java.io.IOException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Baidu web search implemented by fetching the result page with Jsoup and picking the
 * title, link and abstract of each hit out of the HTML with CSS selectors.
 *
 * <p>Only title, URL and summary are filled in; the page body is not downloaded here.
 */
public class JSoupBaiduSearcher extends AbstractBaiduSearcher {

    private static final Logger LOG = LoggerFactory.getLogger(JSoupBaiduSearcher.class);

    /** Number of hits on one Baidu result page. */
    private static final int PAGE_SIZE = 10;

    /** Matches every character that is not an ASCII digit. */
    private static final Pattern NON_DIGITS = Pattern.compile("[^0-9]");

    /**
     * Searches the first result page; equivalent to {@code search(keyword, 1)}.
     *
     * @param keyword search term
     * @return see {@link #search(String, int)}
     */
    @Override
    public List<Webpage> search(String keyword) {
        return search(keyword, 1);
    }

    /**
     * Fetches one Baidu result page and converts its hits into {@link Webpage} objects.
     *
     * @param keyword search term; it is URL-encoded before being sent
     * @param page    1-based result page
     * @return the hits that had both a title and a summary; an empty list when the keyword
     *         is blank or the request failed; {@code null} when Baidu reports no results
     *         (kept for compatibility with existing callers that test for {@code null})
     */
    @Override
    public List<Webpage> search(String keyword, int page) {
        List<Webpage> webpages = new ArrayList<>();
        if (keyword == null || keyword.trim().isEmpty()) {
            LOG.error("搜索关键词为空");
            return webpages;
        }
        // Baidu's "pn" parameter is the offset of the first hit, not a page number:
        // page 1 -> 0, page 2 -> 10, page 3 -> 20, i.e. (page - 1) * PAGE_SIZE.
        int offset = (page - 1) * PAGE_SIZE;
        try {
            // Encode the keyword so non-ASCII text, spaces, '&', '#' etc. cannot corrupt
            // the query string. UnsupportedEncodingException is an IOException and is
            // handled by the catch below.
            String url = "http://www.baidu.com/s?pn=" + offset
                    + "&wd=" + URLEncoder.encode(keyword, "UTF-8");
            Document document = Jsoup.connect(url).get();

            int total = getBaiduSearchResultCount(document);
            if (total < 1) {
                return null;
            }
            // Fewer hits than one full page: only look for that many entries.
            int len = Math.min(total, PAGE_SIZE);
            for (int i = 0; i < len; i++) {
                Webpage webpage = extractResult(document, offset + i + 1);
                if (webpage != null) {
                    webpages.add(webpage);
                }
            }
        } catch (IOException ex) {
            LOG.error("搜索出错", ex);
        }
        return webpages;
    }

    /**
     * Reads the hit whose container element has id {@code resultId} from the result page.
     *
     * @param document parsed result page
     * @param resultId id Baidu gives the hit's container (1 for the first hit overall)
     * @return the hit, or {@code null} when no title or no summary could be found for it
     */
    private Webpage extractResult(Document document, int resultId) {
        String container = "html body div div div div#content_left div#" + resultId + ".result.c-container";
        String titleCssQuery = container + " h3.t a";
        String summaryCssQuery = container + " div.c-abstract";
        LOG.debug("titleCssQuery:" + titleCssQuery);
        LOG.debug("summaryCssQuery:" + summaryCssQuery);

        Element titleElement = document.select(titleCssQuery).first();
        if (titleElement == null) {
            // Baidu Baike entries use the "result-op" container. The selector follows
            // the current hit's id; it used to be fixed to div#1, so every missing slot
            // re-read the first hit.
            String baike = "html body div#out div#in div#wrapper div#container div#content_left div#"
                    + resultId + ".result-op";
            titleCssQuery = baike + " h3.t a";
            summaryCssQuery = baike + " div p";
            LOG.debug("处理百度百科 titleCssQuery:" + titleCssQuery);
            LOG.debug("处理百度百科 summaryCssQuery:" + summaryCssQuery);
            titleElement = document.select(titleCssQuery).first();
        }
        String titleText = "";
        String href = "";
        if (titleElement != null) {
            titleText = titleElement.text();
            href = titleElement.attr("href");
        }
        LOG.debug(titleText);

        Element summaryElement = document.select(summaryCssQuery).first();
        if (summaryElement == null) {
            // Baidu Zhidao entries keep the abstract in a <font> element instead.
            summaryCssQuery = summaryCssQuery.replace("div.c-abstract", "font");
            LOG.debug("处理百度知道 summaryCssQuery:" + summaryCssQuery);
            summaryElement = document.select(summaryCssQuery).first();
        }
        String summaryText = summaryElement == null ? "" : summaryElement.text();
        LOG.debug(summaryText);

        if (titleText.trim().isEmpty() || summaryText.trim().isEmpty()) {
            LOG.error("获取搜索结果列表项出错:" + titleText + " - " + summaryText);
            return null;
        }
        Webpage webpage = new Webpage();
        webpage.setTitle(titleText);
        webpage.setUrl(href);
        webpage.setSummary(summaryText);
        return webpage;
    }

    /**
     * Reads the number of hits from the result page by taking the text of the
     * {@code div.nums} element (for example "百度为您找到相关结果约13,200个") and keeping
     * only its digits.
     *
     * @param document parsed result page
     * @return the hit count; 0 when the element is missing or contains no digits;
     *         {@link Integer#MAX_VALUE} when the number does not fit in an {@code int}
     */
    private int getBaiduSearchResultCount(Document document) {
        String cssQuery = "html body div div div div.nums";
        LOG.debug("total cssQuery: " + cssQuery);
        Element totalElement = document.select(cssQuery).first();
        if (totalElement == null) {
            // Page layout differs from what the selector expects (or no hits at all).
            LOG.error("未找到搜索结果数元素:" + cssQuery);
            return 0;
        }
        String totalText = totalElement.text();
        LOG.info("搜索结果文本:" + totalText);

        String digits = NON_DIGITS.matcher(totalText).replaceAll("");
        if (digits.isEmpty()) {
            return 0;
        }
        int total;
        try {
            total = Integer.parseInt(digits);
        } catch (NumberFormatException tooLarge) {
            // The string is all digits, so the only way to fail is overflow.
            total = Integer.MAX_VALUE;
        }
        LOG.info("搜索结果数:" + total);
        return total;
    }

    /** Demo: fetches the second result page for a sample keyword and logs every hit. */
    public static void main(String[] args) {
        int page = 2;
        Searcher searcher = new JSoupBaiduSearcher();
        List<Webpage> webpages = searcher.search("六扇门", page);
        if (webpages != null) {
            int i = 1;
            LOG.info("搜索结果 当前第 " + page + " 页,页面大小为:" + webpages.size() + " 共有结果数:" + webpages.size());
            for (Webpage webpage : webpages) {
                LOG.info("搜索结果 " + (i++) + " :");
                LOG.info("标题:" + webpage.getTitle());
                LOG.info("URL:" + webpage.getUrl());
                LOG.info("摘要:" + webpage.getSummary());
                LOG.info("正文:" + webpage.getContent());
                LOG.info("");
            }
        } else {
            LOG.error("没有搜索到结果");
        }
    }
}

  • 1
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值