2013-04-23期-创建索引

文件

┳
┣com.searchengine.entity.ContentObject.java (内容对象)
┣com.searchengine.util.WebContent.java(取得网页内容)
┗com.searchengine.util.PackContentObject.java (对象转换)

ContentObject.java代码

/**
 * Content object handled by the search engine: a title, the body text and a
 * creation time.
 *
 * @author JLC
 * @version 1.0
 */
public class ContentObject {
    /**
     * Content title.
     */
    private String title;
    /**
     * Full content text.
     */
    private String content;
    /**
     * Creation time; defaults to the moment this object is instantiated.
     */
    private Date createDate = new Date();

    /**
     * @return the content title, or {@code null} if none has been set
     */
    public String getTitle() {
        return title;
    }

    /**
     * @param title the content title
     */
    public void setTitle(String title) {
        this.title = title;
    }

    /**
     * @return the full content text, or {@code null} if none has been set
     */
    public String getContent() {
        return content;
    }

    /**
     * @param content the full content text
     */
    public void setContent(String content) {
        this.content = content;
    }

    /**
     * @return the creation time
     */
    public Date getCreateDate() {
        return createDate;
    }

    /**
     * @param createDate the creation time
     */
    public void setCreateDate(Date createDate) {
        this.createDate = createDate;
    }
}

PackContentObject.java代码

/**
 * Converts between {@link ContentObject} and Lucene {@code Document} instances.
 *
 * @author JLC
 */
public class PackContentObject {
    /**
     * Converts a content object into a Lucene document with the stored fields
     * {@code title}, {@code content} and {@code createDate} (epoch milliseconds).
     *
     * @param ct the content object to convert
     * @return a new document holding the three fields
     */
    public static Document convertContentToDoc(ContentObject ct){
        Document doc = new Document();
        // Field.Store.YES stores the value so it can be read back from search hits.
        doc.add(new StringField("title", ct.getTitle(), Field.Store.YES));
        doc.add(new TextField("content", ct.getContent(), Field.Store.YES));
        // Index the content's own creation time so that a document -> content ->
        // document round trip keeps the original timestamp; fall back to "now"
        // only when the content carries no date.
        Date created = ct.getCreateDate();
        long createdMillis = (created != null) ? created.getTime() : new Date().getTime();
        doc.add(new LongField("createDate", createdMillis, Field.Store.YES));
        return doc;
    }
    /**
     * Converts a Lucene document back into a content object.
     *
     * @param doc the document to read the {@code title}, {@code content} and
     *            {@code createDate} fields from
     * @return a new content object populated from the document; when the document
     *         has no {@code createDate} value the content object keeps its default
     *         creation time
     */
    public static ContentObject convertDocToContent(Document doc){
        ContentObject ct = new ContentObject();
        ct.setTitle(doc.get("title"));
        ct.setContent(doc.get("content"));
        // doc.get yields null for a missing field; parsing null would throw.
        String storedMillis = doc.get("createDate");
        if (storedMillis != null) {
            ct.setCreateDate(new Date(Long.parseLong(storedMillis)));
        }
        return ct;
    }

}

WebContent.java代码

/**
 * Downloads web pages and extracts pieces of their HTML with regular expressions.
 */
public class WebContent {
    /** Charset used to decode a page when the caller does not name one. */
    private static final String DEFAULT_CHARSET = "gbk";

    // Patterns are compiled once. Tag names are matched case-insensitively
    // because HTML tag names are not case-sensitive.
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<title>.*?</title>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<a[^>]*href=(\"([^\"]*)\"|\'([^\']*)\'|([^\\s>]*))[^>]*>(.*?)</a>",
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
    private static final Pattern SCRIPT_PATTERN =
            Pattern.compile("<script.*?</script>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
    private static final Pattern STYLE_PATTERN =
            Pattern.compile("<style.*?</style>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
    private static final Pattern TAG_PATTERN = Pattern.compile("<.*?>");

    /**
     * Reads the whole content of a web page, decoding it as GBK.
     *
     * @param htmlurl address of the page
     * @return the page text with line terminators removed
     * @throws IOException if the URL is malformed or the page cannot be read
     */
    public String getOneHtml(final String htmlurl) throws IOException {
        return getOneHtml(htmlurl, DEFAULT_CHARSET);
    }

    /**
     * Reads the whole content of a web page using the given charset.
     *
     * @param htmlurl address of the page
     * @param charsetName name of the charset used to decode the page bytes
     * @return the page text with line terminators removed
     * @throws IOException if the URL is malformed, the charset is unsupported
     *         or the page cannot be read
     */
    public String getOneHtml(final String htmlurl, final String charsetName) throws IOException {
        final StringBuilder sb = new StringBuilder();
        final java.io.InputStream stream = new URL(htmlurl).openStream();
        try {
            final BufferedReader in = new BufferedReader(new InputStreamReader(stream, charsetName));
            String line;
            // readLine strips the line terminator, so lines are joined back to back.
            while ((line = in.readLine()) != null) {
                sb.append(line);
            }
        } finally {
            // Always release the connection, including when reading fails midway.
            stream.close();
        }
        return sb.toString();
    }

    /**
     * Extracts the page title.
     *
     * @param s HTML text
     * @return the text of every {@code <title>} element found, concatenated and
     *         with tags removed; an empty string when there is none
     */
    public String getTitle(final String s) {
        final StringBuilder title = new StringBuilder();
        final Matcher ma = TITLE_PATTERN.matcher(s);
        while (ma.find()) {
            title.append(ma.group());
        }
        return outTag(title.toString());
    }

    /**
     * Extracts hyperlinks.
     *
     * @param s HTML text
     * @return every complete {@code <a ... href=...>...</a>} element found, in document order
     */
    public List<String> getLink(final String s) {
        return findAll(LINK_PATTERN, s);
    }

    /**
     * Extracts script code.
     *
     * @param s HTML text
     * @return every {@code <script>...</script>} element found, in document order
     */
    public List<String> getScript(final String s) {
        return findAll(SCRIPT_PATTERN, s);
    }

    /**
     * Extracts CSS.
     *
     * @param s HTML text
     * @return every {@code <style>...</style>} element found, in document order
     */
    public List<String> getCSS(final String s) {
        return findAll(STYLE_PATTERN, s);
    }

    /**
     * Removes markup tags.
     *
     * @param s HTML text
     * @return the text with every {@code <...>} span removed
     */
    public String outTag(final String s) {
        return TAG_PATTERN.matcher(s).replaceAll("");
    }

    /**
     * Fetches a page and turns it into a content object.
     *
     * @author JLC
     * @param url address of the page
     * @return a content object carrying the page title and its tag-free text; when
     *         the page cannot be fetched or processed the failure is reported on
     *         standard error and the returned object has no title or content set
     */
    public ContentObject getContentFromSite(String url){
        ContentObject cobj = new ContentObject();
        try {
            String html = getOneHtml(url);
            // getTitle already strips the tags, so no second outTag pass is needed.
            String title = getTitle(html);
            html = html.replaceAll("(<br>)+?", "\n"); // turn <br> into line breaks
            html = html.replaceAll("<p><em>.*?</em></p>", "");
            cobj.setTitle(title);
            cobj.setContent(outTag(html));
        } catch (final Exception e) {
            // Best effort: keep returning the empty object, but say why it is empty.
            System.err.println("Failed to fetch content from " + url + ": " + e);
        }
        return cobj;
    }

    /** Collects every match of {@code pattern} in {@code s}, in order of appearance. */
    private static List<String> findAll(final Pattern pattern, final String s) {
        final List<String> matches = new ArrayList<String>();
        final Matcher ma = pattern.matcher(s);
        while (ma.find()) {
            matches.add(ma.group());
        }
        return matches;
    }
}

mainTest.java测试代码

/**
 * Test driver: fetches a few news pages and adds them to the search index.
 */
public class mainTest {
    /** Pages fetched and indexed by this driver. */
    private static final String[] SITE_URLS = {
        "http://news.163.com",
        "http://news.sohu.com/"
    };

    /**
     * Fetches the configured pages and converts each one into a Lucene document.
     *
     * @author JLC
     * @return one document per page that could be converted; a page whose
     *         conversion fails is reported on standard error and skipped
     */
    public static List<Document>  getWebContentDocuments(){
        List<Document> docs = new ArrayList<Document>();
        WebContent wc = new WebContent();
        for (String siteUrl : SITE_URLS) {
            try {
                docs.add(PackContentObject.convertContentToDoc(wc.getContentFromSite(siteUrl)));
            } catch (RuntimeException e) {
                // One bad page must not prevent the remaining pages from being indexed.
                System.err.println("Skipping " + siteUrl + ": " + e);
            }
        }
        return docs;
    }
    /**
     * Creates the index: adds every fetched document, then commits and refreshes once.
     */
    public static void createSearchEngineData(){
        SearchEngineCore se = SearchObject.getInstance().getLuceneContext("search");
        List<Document> docList = getWebContentDocuments();
        int added = 0;
        for (Document doc : docList) {
            try {
                se.getTw().addDocument(doc);
                added++;
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        if (added == 0) {
            return; // nothing new to commit
        }
        try {
            // NOTE(review): assumes commitIndex/refreshData operate on the whole
            // writer, so a single call after all adds is enough - confirm against
            // SearchEngineCore.
            se.commitIndex();
            se.refreshData();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Entry point: builds the index data.
     *
     * @param args unused
     */
    public static void main(String args[]){
        // create the index data
        createSearchEngineData();
    }

}

运行 main 方法后，可以看到索引已经创建完成。

结果


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值