使用lucene 来创建一个知识库

最新推荐文章于 2024-08-29 10:53:37 发布

iteye_4865

最新推荐文章于 2024-08-29 10:53:37 发布

阅读量170

点赞数

分类专栏： java 文章标签： lucene Apache J# F# JSP

java 专栏收录该内容

30 篇文章 0 订阅

订阅专栏

将常用的一些 doc、txt、html 文档建立索引，关键在于：lucene 只需要把 doc、html 等文件剥离成普通的、含有有效信息的字符串即可。

仿照 demo 建立 DOCDocument.java，其 getTextContent() 利用 POI 的 WordExtractor.getText() 抽取文本；再抽取出基类 FileDocument，

并根据后缀名动态载入对应的“后缀名大写 + Document”类（例如 doc 对应 DOCDocument.class）。

基类FileDocument：

package bts.jsp.kbase;


import java.io.*;
import java.util.Map;
import java.util.HashMap;
import java.util.Arrays;

import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;


/**
 * Base class for the per-extension text extractors of the knowledge base.
 * <p/>
 * For every extension listed in {@code KbaseConfig.TYPES} one subclass named
 * {@code <EXTENSION IN UPPER CASE>Document} (for example {@code DOCDocument})
 * is loaded reflectively from the {@code bts.jsp.kbase} package. A subclass
 * only has to implement {@link #getTextContent(String)}; building the Lucene
 * document and caching the extracted text is done here.
 */
public abstract class FileDocument {
    /** Extractor instances keyed by the entries of {@code KbaseConfig.TYPES}. */
    static Map<String, FileDocument> DocmentMap;

    static {
        try {
            DocmentMap = init();
        } catch (Exception e) {
            // Fail class initialisation instead of calling System.exit(1):
            // the cause is preserved and a hosting container (if any) is not
            // taken down together with this class.
            throw new ExceptionInInitializerError(e);
        }
    }

    /**
     * Instantiates one extractor per configured extension.
     *
     * @return map from extension (as listed in {@code KbaseConfig.TYPES}) to its extractor
     * @throws Exception if an extractor class is missing or cannot be instantiated
     */
    private static Map<String, FileDocument> init() throws Exception {
        Map<String, FileDocument> map = new HashMap<String, FileDocument>();
        for (String t : KbaseConfig.TYPES) {
            map.put(t, (FileDocument) Class.forName("bts.jsp.kbase." + t.toUpperCase() + "Document").newInstance());
        }

        return map;
    }

    /**
     * Finds the extractor responsible for a path.
     *
     * @param path file path whose extension selects the extractor
     * @return the extractor, or {@code null} when the file type is not accepted
     */
    private static FileDocument extractorFor(String path) {
        if (!KbaseConfig.accceptFile(path)) {
            return null;
        }
        // accceptFile() compares the trimmed, lower-cased extension, so the
        // lookup has to normalise the same way; otherwise "X.DOC" passes the
        // filter but misses the map.
        String subtype = path.substring(path.lastIndexOf(".") + 1).trim().toLowerCase();
        return DocmentMap.get(subtype);
    }

    /**
     * Extracts the text of a file directly from the original document,
     * bypassing the text cache. Kept because {@code SearchFiles} calls it.
     *
     * @param path path of the original document
     * @return the extracted text, or {@code null} when the file type is not accepted
     */
    public static String getCommonContent(String path) {
        FileDocument extractor = extractorFor(path);
        if (extractor == null) {
            return null;
        }
        return extractor.getTextContent(path);
    }

    /**
     * Reads the cached plain-text copy of a document.
     *
     * @param path path of the original document; the cache location is
     *             derived through {@code KbaseConfig.getCacheStringPath}
     * @return the cached text with every line terminated by {@code "\n"};
     *         whatever was read so far (possibly empty) if reading fails
     */
    public static String getCacheStringContent(String path) {
        String stringPath = KbaseConfig.getCacheStringPath(path);
        System.out.println("get cache :" + stringPath);
        StringBuilder sb = new StringBuilder();
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(stringPath));
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line).append("\n");
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close in finally so a failed read does not leak the file handle.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // nothing sensible to do when closing a reader fails
                }
            }
        }
        return sb.toString();
    }

    /**
     * Extracts the text of a document and writes it to the text cache.
     * Failing to write the cache is reported but does not prevent the
     * extracted text from being returned.
     *
     * @param path path of the original document
     * @return the text returned by {@link #getTextContent(String)}
     */
    private String cacheAndGetStringContent(String path) {
        String content = this.getTextContent(path);
        String stringPath = KbaseConfig.getCacheStringPath(path);
        System.out.println(stringPath);

        // Let File work out the parent directory; slicing at the last "/"
        // throws when the cache path contains no separator.
        File parent = new File(stringPath).getParentFile();
        if (parent != null && !parent.exists()) {
            parent.mkdirs();
        }

        PrintWriter pw = null;
        try {
            pw = new PrintWriter(stringPath);
            pw.println(content);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (pw != null) {
                pw.close();
            }
        }
        return content;
    }

    /**
     * Builds the Lucene document for a file, choosing the extractor by extension.
     *
     * @param path path of the file to index
     * @return the document, or {@code null} when the file type is not accepted
     *         or building the document fails
     */
    public static Document getCommonDocument(String path) {
        FileDocument extractor = extractorFor(path);
        if (extractor == null) {
            return null;
        }
        try {
            return extractor.Document(new File(path));
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }


    /**
     * Extracts the plain text of the file at {@code path}.
     *
     * @param path path of the original document
     * @return the extracted text
     */
    public abstract String getTextContent(String path);

    /**
     * Makes a Lucene document for a file with the fields {@code path},
     * {@code title}, {@code modified} and {@code contents}.
     *
     * @param f the file to index
     * @return the populated document
     * @throws java.io.FileNotFoundException declared for callers; not thrown by this base implementation
     */
    public Document Document(File f)
            throws java.io.FileNotFoundException {

        // make a new, empty document
        Document doc = new Document();

        // "path": stored and indexed (i.e. searchable) as a single,
        // untokenized term.
        doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED));

        // "title": the bare file name, stored and indexed untokenized.
        doc.add(new Field("title", f.getName(), Field.Store.YES, Field.Index.NOT_ANALYZED));


        // "modified": File.lastModified() rendered as a decimal string,
        // stored and indexed untokenized.
        doc.add(new Field("modified",
                f.lastModified() + "",
                Field.Store.YES, Field.Index.NOT_ANALYZED));

        // "contents": the extracted text, analyzed with term vectors
        // (positions and offsets) but not stored in the index. The text is
        // also written to the cache file so it can be re-read later.
        String content = cacheAndGetStringContent(f.getPath());
        doc.add(new Field("contents", content, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));

        // return the document
        return doc;
    }

}

举例，对于doc 后缀名，有类 DOCDocument

package bts.jsp.kbase;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.poi.hwpf.extractor.WordExtractor;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.File;
import java.io.FileInputStream;

/**
 * Created by IntelliJ IDEA.
 * User: yiminghe
 * Date: 2008-12-11
 * Time: 15:32:09
 * To change this template use File | Settings | File Templates.
 */
public class DOCDocument extends FileDocument {
    /**
     * Makes a document for a File.
     * <p/>
     * The document has three fields:
     * <ul>
     * <li><code>path</code>--containing the pathname of the file, as a stored,
     * untokenized field;
     * <li><code>modified</code>--containing the last modified date of the file as
     * a field as created by <a
     * href="lucene.document.DateTools.html">DateTools</a>; and
     * <li><code>contents</code>--containing the full contents of the file, as a
     * Reader field;
     */


    public String getTextContent(String path) {
        String content = "";
        try {
            WordExtractor wordExtractor = new WordExtractor(new FileInputStream(path));
            content = wordExtractor.getText();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return content;
    }


}

接着要考虑的问题是：如何避免重复索引，以及文件修改后如何重新索引。这就要用到文件的 lastModified（最后修改时间）属性：每次索引完，都将已索引的文件及其最后修改时间保存下来；下次索引前先做检查，只有文件被更新或是新增时才真正索引。

也就是每次都找出应该删掉哪些，应该加入哪些（索引的更新是通过先删除再加入实现的）

package bts.jsp.kbase;

import bts.roi.BtsManager;

import java.io.*;
import java.util.ArrayList;

/**
 * Configuration values and bookkeeping helpers for the knowledge-base indexer:
 * where the index, the documents and the text cache live, which file types are
 * indexed, and which files have to be removed from / added to the index since
 * the previous run.
 * <p/>
 * Created by yiminghe, 2008-12-11 17:29:45 (IntelliJ IDEA).
 */
public class KbaseConfig {
    // File that records the files that have already been indexed.
    static final String INDEXEDFILES = BtsManager.getProperty("Bts.INDEXEDFILES");
    // Directory in which the index is stored.
    static final File INDEX_DIR = new File(BtsManager.getProperty("Bts.INDEX_DIR"));

    // Root directory of the real documents.
    static final String DATA_DIR = BtsManager.getProperty("Bts.DATA_DIR");

    // Root directory of the plain-text copies of the real documents.
    static final String DATA_STRING_DIR = BtsManager.getProperty("Bts.DATA_STRING_DIR");

    // File extensions (lower case, without the dot) that get indexed.
    static String[] TYPES = {"html", "htm", "txt", "doc", "ppt", "xls", "pdf"};

    static {
        File f = new File(DATA_STRING_DIR);
        if (!f.exists()) f.mkdirs();
    }

    /**
     * Writes the list of indexed files, one entry per line with every column
     * followed by a tab.
     *
     * @param data entries to persist; each entry is {@code {path, lastModified}}
     */
    public static void saveIndexedFiles(ArrayList<String[]> data) {
        PrintWriter pw = null;
        try {
            pw = new PrintWriter(INDEXEDFILES);
            for (int i = 0; i < data.size(); i++) {
                String[] d = data.get(i);
                for (int j = 0; j < d.length; j++) {
                    pw.print(d[j] + "\t");
                }
                pw.println();
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close in finally so a failure while writing does not leak the file handle.
            if (pw != null) {
                pw.close();
            }
        }

    }

    /**
     * Maps the path of a real document to the path of its cached text copy.
     *
     * @param path path of the real document
     * @return {@code path} with backslashes turned into {@code /} and every
     *         occurrence of {@code DATA_DIR} replaced by {@code DATA_STRING_DIR}
     */
    static String getCacheStringPath(String path) {
        // Literal replacement on purpose: replaceAll() would interpret
        // DATA_DIR as a regular expression (and '$' / '\' in the replacement
        // specially), which breaks for Windows paths or names with regex
        // metacharacters. DATA_DIR gets the same separator normalisation as
        // the path so that the two can actually match.
        String normalizedPath = path.replace('\\', '/');
        String normalizedDataDir = KbaseConfig.DATA_DIR.replace('\\', '/');
        return normalizedPath.replace(normalizedDataDir, KbaseConfig.DATA_STRING_DIR);
    }

    /**
     * Reads the list written by {@link #saveIndexedFiles(ArrayList)}.
     *
     * @return the stored entries; empty when the file does not exist. Blank
     *         lines and lines with fewer than two columns are skipped.
     */
    public static ArrayList<String[]> loadIndexedFiles() {
        ArrayList<String[]> data = new ArrayList<String[]>();
        if (!new File(INDEXEDFILES).exists()) {
            return data;
        }
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(INDEXEDFILES));
            String line;
            while ((line = reader.readLine()) != null) {
                if ((line = line.trim()).equals("")) continue;
                String[] d = line.split("\t");
                // getDeleted()/getAdded() read columns 0 and 1; a truncated
                // line would make them fail with an index error.
                if (d.length < 2) continue;
                data.add(d);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // nothing sensible to do when closing a reader fails
                }
            }
        }
        return data;
    }


    /**
     * Recursively lists the indexable files below a directory.
     *
     * @param dir directory (or single file) to scan
     * @return one {@code {absolutePath, lastModified}} entry per readable file found
     */
    public static ArrayList<String[]> getCurrentFiles(String dir) {
        ArrayList<String[]> d = new ArrayList<String[]>();
        getCurrentFiles(dir, d);
        return d;
    }


    /**
     * Linear search for a value in an array.
     *
     * @param array array to search
     * @param value value to look for; it is trimmed before comparing
     * @return index of the first equal element, or -1
     */
    private static int indexArray(String[] array, String value) {
        value = value.trim();
        for (int i = 0; i < array.length; i++) {
            if (array[i].equals(value))
                return i;
        }
        return -1;
    }

    /**
     * Tells whether a file should be indexed, judged by its extension.
     *
     * @param path file path
     * @return {@code true} when the text after the last dot, lower-cased, is
     *         one of {@link #TYPES}
     */
    static boolean accceptFile(String path) {
        int index = path.lastIndexOf(".");
        if (index == -1) return false;
        String subtype = path.substring(index + 1);
        return indexArray(TYPES, subtype.toLowerCase()) != -1;
    }

    /**
     * Recursive worker of {@link #getCurrentFiles(String)}.
     *
     * @param dir  directory or file to process
     * @param data list that receives the {@code {absolutePath, lastModified}} entries
     */
    private static void getCurrentFiles(String dir, ArrayList<String[]> data) {
        File f = new File(dir);
        if (f.isDirectory()) {
            File[] fs = f.listFiles(new FileFilter() {
                public boolean accept(File pathname) {
                    return pathname.isDirectory() || accceptFile(pathname.getAbsolutePath());
                }
            });

            // listFiles() returns null when the directory cannot be read.
            if (fs == null) return;

            for (int i = 0; i < fs.length; i++) {
                getCurrentFiles(fs[i].getAbsolutePath(), data);
            }
            return;
        }

        if (!f.canRead()) return;

        String[] d = new String[2];
        d[0] = f.getAbsolutePath();
        d[1] = f.lastModified() + "";
        data.add(d);

    }

    /**
     * Finds the first entry with the given path.
     *
     * @param entries {@code {path, lastModified}} entries
     * @param path    path to look for
     * @return index of the first matching entry, or -1
     */
    private static int indexOfPath(ArrayList<String[]> entries, String path) {
        for (int i = 0; i < entries.size(); i++) {
            if (entries.get(i)[0].equals(path)) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Collects the paths of {@code from} that are missing in {@code against}
     * or carry a different last-modified value there.
     * <p/>
     * Any difference counts, not only a newer timestamp: a file replaced by a
     * copy with an older timestamp has changed too and must be re-indexed.
     *
     * @param from    entries to examine
     * @param against entries to compare with
     * @return paths that are missing or changed, in the order of {@code from}
     */
    private static ArrayList<String> missingOrChanged(ArrayList<String[]> from, ArrayList<String[]> against) {
        ArrayList<String> result = new ArrayList<String>();
        for (int i = 0; i < from.size(); i++) {
            String[] entry = from.get(i);
            String path = entry[0];
            int match = indexOfPath(against, path);
            if (match == -1) {
                result.add(path);
                continue;
            }
            long lm = Long.parseLong(entry[1]);
            long otherLm = Long.parseLong(against.get(match)[1]);
            if (lm != otherLm) {
                result.add(path);
            }
        }
        return result;
    }

    /**
     * Determines which previously indexed files have to be removed from the index.
     *
     * @param original entries recorded by the previous indexing run
     * @param newData  entries currently on disk
     * @return paths that were modified or no longer exist
     */
    public static ArrayList<String> getDeleted(ArrayList<String[]> original, ArrayList<String[]> newData) {
        // modified, or already deleted
        return missingOrChanged(original, newData);
    }


    /**
     * Determines which files currently on disk have to be (re-)added to the index.
     *
     * @param original entries recorded by the previous indexing run
     * @param newData  entries currently on disk
     * @return paths that were modified or are new
     */
    public static ArrayList<String> getAdded(ArrayList<String[]> original, ArrayList<String[]> newData) {
        // modified, or new
        return missingOrChanged(newData, original);
    }
}

其他的查询、删除操作都和 demo 差不多，只是在查询结果上加了 highlighter（关键词高亮）的应用：

package bts.jsp.kbase;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.*;
import java.util.Date;
import java.util.ArrayList;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.FilterIndexReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.HitCollector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TopDocCollector;
import org.apache.lucene.search.highlight.*;

/**
 * Search entry point of the knowledge base, derived from the Lucene
 * command-line search demo and extended with hit highlighting.
 */
public class SearchFiles {

    /**
     * Use the norms from one field for all fields.  Norms are read into memory,
     * using a byte of memory per document per searched field.  This can cause
     * search of large collections with a large number of fields to run out of
     * memory.  If all of the fields contain only a single token, then the norms
     * are all identical, then single norm vector may be shared.
     */
    private static class OneNormsReader extends FilterIndexReader {
        private String field;

        public OneNormsReader(IndexReader in, String field) {
            super(in);
            this.field = field;
        }

        public byte[] norms(String field) throws IOException {
            return in.norms(this.field);
        }
    }

    private SearchFiles() {
    }

    /**
     * Runs a query against the index in {@code KbaseConfig.INDEX_DIR}.
     *
     * @param field   default field for terms in the query string
     * @param queries query string in Lucene query-parser syntax
     * @param start   index of the first hit to return; a negative value only
     *                runs the streaming search and yields no result object
     * @param limit   maximum number of hits to return
     * @return the requested page of hits, or {@code null} when {@code start} is negative
     * @throws Exception if the index cannot be read or the query cannot be parsed
     */
    public static KbaseFiles search(String field, String queries, int start, int limit) throws Exception {

        IndexReader reader = IndexReader.open(KbaseConfig.INDEX_DIR);
        try {
            IndexSearcher searcher = new IndexSearcher(reader);
            try {
                Analyzer analyzer = new StandardAnalyzer();
                QueryParser parser = new QueryParser(field, analyzer);
                Query query = parser.parse(queries);

                KbaseFiles files = null;
                if (start >= 0) {
                    files = doPagingSearch(analyzer, searcher, query, start, limit);
                } else {
                    doStreamingSearch(searcher, query);
                }
                return files;
            } finally {
                searcher.close();
            }
        } finally {
            // The reader is closed explicitly as well as the searcher, so the
            // index files are released after every search.
            reader.close();
        }

    }

    /**
     * Runs the query with a custom HitCollector that visits every matching
     * document and deliberately does nothing with it.
     * <p/>
     * This simulates the streaming search use case, where all hits are supposed to
     * be processed, regardless of their relevance.
     *
     * @param searcher searcher to run the query on
     * @param query    query to execute
     * @throws IOException if the index cannot be read
     */
    public static void doStreamingSearch(final IndexSearcher searcher, Query query) throws IOException {
        HitCollector streamingHitCollector = new HitCollector() {

            // called once per matching document; intentionally a no-op
            public void collect(int doc, float score) {
            }

        };

        searcher.search(query, streamingHitCollector);
    }

    /**
     * Returns one page of hits, each with a highlighted fragment of its text.
     * <p/>
     * The top {@code start + limit} hits are collected and the hits from
     * {@code start} onwards are returned. The text used for highlighting is
     * read from the text cache written at indexing time, because the
     * {@code contents} field is not stored in the index.
     *
     * @param analyzer analyzer used to tokenize the text for highlighting
     * @param searcher searcher to run the query on
     * @param query    query to execute
     * @param start    index of the first hit to return
     * @param limit    maximum number of hits to return
     * @return the page of hits together with the total number of matches
     * @throws IOException if the index cannot be read
     */
    public static KbaseFiles doPagingSearch(Analyzer analyzer, IndexSearcher searcher, Query query,
                                            int start, int limit) throws IOException {
        // Collect just enough docs to reach the end of the requested page.
        TopDocCollector collector = new TopDocCollector(start + limit);
        searcher.search(query, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;
        int numTotalHits = collector.getTotalHits();

        int end = Math.min(numTotalHits, start + limit);
        SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter("<em>", "</em>");
        Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));

        KbaseFiles fileResult = new KbaseFiles();
        fileResult.setTotal(numTotalHits);
        ArrayList<KbaseFile> files = new ArrayList<KbaseFile>();
        for (int i = start; i < end; i++) {
            Document doc = searcher.doc(hits[i].doc);
            String path = doc.get("path");
            if (path == null) {
                // a document without a path cannot be presented; skip it
                continue;
            }

            String title = doc.get("title");
            // FileDocument.getCommonContent is commented out in FileDocument;
            // the cached text is what the indexer leaves behind for this purpose.
            String contents = FileDocument.getCacheStringContent(path);
            String highLightText = highlighter.getBestFragment(analyzer, "contents", contents);

            String modified = stripMillis(doc.get("modified"));
            KbaseFile file = new KbaseFile(title, path, modified, highLightText);
            files.add(file);
        }
        fileResult.setFiles(files);
        return fileResult;

    }

    /**
     * Drops the last three digits of a millisecond timestamp string, turning
     * it into seconds.
     *
     * @param modified decimal millisecond timestamp, may be {@code null}
     * @return the timestamp without its last three digits; {@code "0"} when
     *         it has three digits or fewer; {@code null} for {@code null}
     */
    private static String stripMillis(String modified) {
        if (modified == null) {
            return null;
        }
        // Unguarded substring(0, length - 3) throws for values shorter than
        // three characters, e.g. "0".
        if (modified.length() <= 3) {
            return "0";
        }
        return modified.substring(0, modified.length() - 3);
    }

}

iteye_4865

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
使用lucene 来创建一个知识库

将常用的一些 doc txt html文档索引，关键是 lucene 只需要将 doc html 剥离成普通的还有有效信息的字符串即可。仿照demo建立 DOCDocument.java ,getText() 利用 POI ,抽取基类 FileDocument,动态载入后缀名 + Document .class , 基类File...
复制链接

扫一扫

专栏目录