2013-04-23期-创建索引

最新推荐文章于 2024-08-18 19:55:42 发布

传奇1949

最新推荐文章于 2024-08-18 19:55:42 发布

阅读量656

点赞数

分类专栏： lucene4.X 文章标签： lucene 搜索 索引 搜索引擎

lucene4.X 专栏收录该内容

14 篇文章 0 订阅

订阅专栏

┳
┣com.searchengine.entity.ContentObject.java (内容对象)
┣com.searchengine.util.WebContent.java（取得网页内容）
┗com.searchengine.util.PackContentObject.java (对象转换)

ContentObject.java代码

/**
 * Content object handled by the search engine: a title, the body text and a
 * creation time.
 *
 * @author JLC
 * @version 1.0
 */
public class ContentObject {
    /**
     * Title of the content.
     */
    private String title;
    /**
     * Full text of the content.
     */
    private String content;
    /**
     * Creation time; initialised to the moment this object is constructed.
     */
    private Date createDate = new Date();

    /**
     * @return the title, or {@code null} if none has been set
     */
    public String getTitle() {
        return title;
    }

    /**
     * @param title the title to store
     */
    public void setTitle(String title) {
        this.title = title;
    }

    /**
     * @return the body text, or {@code null} if none has been set
     */
    public String getContent() {
        return content;
    }

    /**
     * @param content the body text to store
     */
    public void setContent(String content) {
        this.content = content;
    }

    /**
     * @return the creation time currently held by this object
     */
    public Date getCreateDate() {
        return createDate;
    }

    /**
     * @param createDate the creation time to store
     */
    public void setCreateDate(Date createDate) {
        this.createDate = createDate;
    }
}

PackContentObject.java代码

/**
 * Converts between {@link ContentObject} instances and Lucene {@code Document}s.
 *
 * @author JLC
 */
public class PackContentObject {
    /** Name of the document field that holds the title. */
    private static final String FIELD_TITLE = "title";
    /** Name of the document field that holds the body text. */
    private static final String FIELD_CONTENT = "content";
    /** Name of the document field that holds the creation time in epoch milliseconds. */
    private static final String FIELD_CREATE_DATE = "createDate";

    /**
     * Converts a content object into a Lucene document.
     *
     * @param ct the content to convert
     * @return a document with stored {@code title}, {@code content} and
     *         {@code createDate} fields
     */
    public static Document convertContentToDoc(ContentObject ct){
        Document doc = new Document();
        // Field.Store.YES keeps the original value in the index so it can be read back.
        doc.add(new StringField(FIELD_TITLE, ct.getTitle(), Field.Store.YES));
        doc.add(new TextField(FIELD_CONTENT, ct.getContent(), Field.Store.YES));
        // Index the content's own creation time so it survives a round trip through
        // convertDocToContent; fall back to the current time only when none is set.
        Date created = ct.getCreateDate();
        long createdMillis = (created != null) ? created.getTime() : System.currentTimeMillis();
        doc.add(new LongField(FIELD_CREATE_DATE, createdMillis, Field.Store.YES));
        return doc;
    }

    /**
     * Converts a Lucene document back into a content object.
     *
     * @param doc the document to read the stored fields from
     * @return a content object populated from the document; when the document has
     *         no stored {@code createDate}, the creation time is left at the value
     *         a new {@code ContentObject} starts with
     * @throws NumberFormatException if the stored {@code createDate} is not a number
     */
    public static ContentObject convertDocToContent(Document doc){
        ContentObject ct = new ContentObject();
        ct.setTitle(doc.get(FIELD_TITLE));
        ct.setContent(doc.get(FIELD_CONTENT));
        String storedMillis = doc.get(FIELD_CREATE_DATE);
        if (storedMillis != null) {
            ct.setCreateDate(new Date(Long.parseLong(storedMillis)));
        }
        return ct;
    }

}

WebContent.java代码

/**
 * Downloads web pages and pulls pieces out of their HTML with regular expressions.
 */
public class WebContent {
    /** Charset used to decode every fetched page. */
    private static final String PAGE_CHARSET = "gbk";

    /**
     * HTML tag names are case-insensitive, and DOTALL lets an element span line
     * breaks when the caller passes text that still contains them.
     */
    private static final int TAG_FLAGS = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;

    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<title>.*?</title>", TAG_FLAGS);
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<a[^>]*href=(\"([^\"]*)\"|\'([^\']*)\'|([^\\s>]*))[^>]*>(.*?)</a>", TAG_FLAGS);
    private static final Pattern SCRIPT_PATTERN =
            Pattern.compile("<script.*?</script>", TAG_FLAGS);
    private static final Pattern STYLE_PATTERN =
            Pattern.compile("<style.*?</style>", TAG_FLAGS);
    /** Any single tag that opens and closes on one line. */
    private static final Pattern ANY_TAG_PATTERN = Pattern.compile("<.*?>");
    /** A paragraph consisting solely of an emphasised run. */
    private static final Pattern EMPHASIS_PARAGRAPH_PATTERN =
            Pattern.compile("<p><em>.*?</em></p>");

    /**
     * Reads the whole page at the given URL.
     *
     * <p>The page is decoded as GBK and its lines are concatenated without their
     * line terminators.
     *
     * @param htmlurl address of the page to fetch
     * @return the page text with line terminators removed
     * @throws IOException if the URL is malformed or the page cannot be read
     */
    public String getOneHtml(final String htmlurl) throws IOException {
        final URL url = new URL(htmlurl);
        final StringBuilder page = new StringBuilder();
        final java.io.InputStream raw = url.openStream();
        try {
            final BufferedReader in =
                    new BufferedReader(new InputStreamReader(raw, PAGE_CHARSET));
            String line;
            while ((line = in.readLine()) != null) {
                page.append(line);
            }
        } finally {
            // Closing the underlying stream releases the connection even when
            // decoding or reading fails part-way through.
            raw.close();
        }
        return page.toString();
    }

    /**
     * Extracts the page title.
     *
     * @param s HTML text to search
     * @return the text of every {@code <title>} element, concatenated, with the
     *         tags removed; empty if there is none
     */
    public String getTitle(final String s) {
        final StringBuilder joined = new StringBuilder();
        for (final String element : findAll(TITLE_PATTERN, s)) {
            joined.append(element);
        }
        return outTag(joined.toString());
    }

    /**
     * Extracts the hyperlinks.
     *
     * @param s HTML text to search
     * @return every {@code <a ... href=...>...</a>} element, markup included
     */
    public List<String> getLink(final String s) {
        return findAll(LINK_PATTERN, s);
    }

    /**
     * Extracts the script elements.
     *
     * @param s HTML text to search
     * @return every {@code <script>...</script>} element, markup included
     */
    public List<String> getScript(final String s) {
        return findAll(SCRIPT_PATTERN, s);
    }

    /**
     * Extracts the style sheets.
     *
     * @param s HTML text to search
     * @return every {@code <style>...</style>} element, markup included
     */
    public List<String> getCSS(final String s) {
        return findAll(STYLE_PATTERN, s);
    }

    /**
     * Removes tags from the given text.
     *
     * @param s text that may contain tags
     * @return the text with every {@code <...>} run removed
     */
    public String outTag(final String s) {
        return ANY_TAG_PATTERN.matcher(s).replaceAll("");
    }

    /**
     * Fetches a page and turns it into a content object.
     *
     * <p>This method never throws: when the page cannot be fetched or processed
     * the failure is reported on standard error and the returned object is left
     * without a title or content.
     *
     * @author JLC
     * @param url address of the page to fetch
     * @return the content object, populated only if the fetch succeeded
     */
    public ContentObject getContentFromSite(String url){
        ContentObject cobj = new ContentObject();
        try {
            String html = getOneHtml(url);
            String title = getTitle(html);
            html = html.replace("<br>", "\n"); // turn explicit breaks into newlines
            html = EMPHASIS_PARAGRAPH_PATTERN.matcher(html).replaceAll("");
            cobj.setTitle(title);
            cobj.setContent(outTag(html));
        } catch (final Exception e) {
            // Best effort by design, but the failure must not disappear silently.
            System.err.println("Failed to fetch " + url + ": " + e);
        }
        return cobj;
    }

    /** Collects every match of the pattern in the text, in order of appearance. */
    private static List<String> findAll(final Pattern pattern, final String text) {
        final List<String> matches = new ArrayList<String>();
        final Matcher matcher = pattern.matcher(text);
        while (matcher.find()) {
            matches.add(matcher.group());
        }
        return matches;
    }
}

mainTest.java测试代码

/**
 * Demo entry point: fetches a few news pages and adds them to the "search" index.
 */
public class mainTest {
    /** Pages fetched and indexed by this demo. */
    private static final String[] SITE_URLS = {
        "http://news.163.com",
        "http://news.sohu.com/"
    };

    /**
     * Fetches every demo page and converts it into a Lucene document.
     *
     * <p>A page that cannot be converted is reported on standard error and
     * skipped, so one unreachable site does not abort the whole batch.
     *
     * @author JLC
     * @return the documents that could be built, in the order of the demo URLs
     */
    public static List<Document> getWebContentDocuments(){
        List<Document> docs = new ArrayList<Document>();
        WebContent wc = new WebContent();
        for (String url : SITE_URLS) {
            try {
                docs.add(PackContentObject.convertContentToDoc(wc.getContentFromSite(url)));
            } catch (IllegalArgumentException e) {
                // A failed fetch yields a content object without title or content,
                // and the Lucene field constructors reject null values.
                System.err.println("Skipping " + url + ": " + e.getMessage());
            }
        }
        return docs;
    }

    /**
     * Builds the index: adds every fetched document, then commits and refreshes
     * once for the whole batch.
     */
    public static void createSearchEngineData(){
        SearchEngineCore se = SearchObject.getInstance().getLuceneContext("search");
        int added = 0;
        for (Document doc : getWebContentDocuments()) {
            try {
                se.getTw().addDocument(doc);
                added++;
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        if (added == 0) {
            return; // nothing new to commit
        }
        try {
            // One commit covers every document added above; committing after
            // each document only repeats the same work.
            se.commitIndex();
            se.refreshData();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args){
        // Create the index data.
        createSearchEngineData();
    }

}

运行 main 方法后，可以看到索引已经创建完成。

传奇1949

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
2013-04-23期-创建索引

┳┣com.searchengine.entity.ContentObject.java (内容对象)┣com.searchengine.util.WebContent.java（取得网页内容）┗com.searchengine.util.PackContentObject(对象转换)ContentObject.java代码/** * Class Name: Content
复制链接

扫一扫

专栏目录