通过lucene实现文章全文检索

通过lucene实现文章全文检索


之前也学习过solr。solr是在lucene的基础上设计的,帮我们封装好了一些api,但总感觉使用起来不够灵活,所以在之前公司的一个小项目中尝试直接使用lucene作为全文检索的支持。

文章实体类如下

public class Article {
    private Integer id;

    private String title;
    
    private Integer author;

    private Integer type;

    private String isrecommend;

    private Integer typenumber;

    private Date postmodified;

    private Date postdate;

}

/**
 * Field-only holder for an article's content record.
 *
 * <p>No constructors or accessors are visible in this excerpt, and none of the
 * other code shown here reads this class.
 */
public class ArticleContent {
    private Integer id;

    // NOTE(review): presumably a language code such as "cn" (the value LuceneUtil.setLang tests for) - confirm.
    private String lang;

    private String title;

    // NOTE(review): presumably the id of the owning Article - confirm against the schema.
    private Integer articleid;

    private String content;
}
    

lucene工具类,分词器使用的是IK分词。由于前端传到后台的是用编辑器生成的html代码,所以需要用HTMLStripCharFilter过滤器将html标签过滤掉,不然会产生大量冗余的内容,影响查询速率。


/**
 * Lucene helper that indexes, searches and deletes articles.
 *
 * <p>Call {@link #setLang(String)} (or {@link #setPATH(String)} and
 * {@link #setAnalyzer(Analyzer)}) before any other method: the index directory
 * and the analyzer are both {@code null} until then.
 *
 * <p>NOTE(review): the language, path and analyzer are mutable instance state.
 * If this component is shared between concurrent requests, callers need to
 * coordinate {@code setLang} with the subsequent operation - confirm how the
 * bean is scoped and used.
 */
@Component("luceneUtil")
public class LuceneUtil {
    /** Maximum number of hits fetched per search; paging happens inside this window. */
    private static final int MAX_HITS = 200;
    /** Size of a highlight fragment and of the plain-text fallback excerpt, in characters. */
    private static final int FRAGMENT_SIZE = 100;

    private String lang;
    private String PATH = null;
    private Analyzer analyzer = null;

    private Analyzer analyzerCn = new IKAnalyzer(true);
    private Analyzer analyzerEn = new StandardAnalyzer();

    /**
     * Opens a searcher over the index directory at {@code PATH}.
     *
     * <p>The caller owns the underlying reader and should close it via
     * {@code searcher.getIndexReader().close()} when done.
     *
     * @return a new searcher, or {@code null} if the index could not be opened
     */
    public IndexSearcher createIndexSearcher() {
        try {
            FSDirectory directory = FSDirectory.open(FileSystems.getDefault().getPath(PATH));
            return new IndexSearcher(DirectoryReader.open(directory));
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Opens a writer over the index directory at {@code PATH} using the current analyzer.
     *
     * <p>The caller must close (or roll back) the returned writer; an open
     * writer holds the index write lock.
     *
     * @return a new writer, or {@code null} if the index could not be opened
     */
    public IndexWriter createIndexWriter() {
        try {
            FSDirectory directory = FSDirectory.open(FileSystems.getDefault().getPath(PATH));
            IndexWriterConfig writerConfig = new IndexWriterConfig(getAnalyzer());
            return new IndexWriter(directory, writerConfig);
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Indexes a single article.
     *
     * @param article the article to index
     * @return a map whose {@code "state"} entry is {@code "0"} on success and {@code "1"} on failure
     */
    public Map<String, String> createIndex(Article article) {
        ArrayList<Article> single = new ArrayList<Article>();
        single.add(article);
        return indexAll(single);
    }

    /**
     * Indexes a list of articles in one commit.
     *
     * <p>Documents are built exactly as in {@link #createIndex(Article)}: HTML is
     * stripped from the content and the title is boosted.
     *
     * @param list the articles to index
     * @return a map whose {@code "state"} entry is {@code "0"} on success and {@code "1"} on failure
     */
    public Map<String, String> createIndexByList(List<Article> list) {
        return indexAll(list);
    }

    /**
     * Full-text search over title and content with highlighting and paging.
     * Page numbers start at 0.
     *
     * @param cp  zero-based page number
     * @param ps  page size, must be positive
     * @param key query string in Lucene query-parser syntax
     * @return the requested page, or {@code null} if the index could not be
     *         opened or the query could not be parsed/executed
     * @throws IllegalArgumentException if {@code cp} is null or negative, or {@code ps} is null or not positive
     */
    public PageUtil<Article> searcherByKey(Integer cp, Integer ps, String key) {
        if (cp == null || cp < 0) {
            throw new IllegalArgumentException("cp must be >= 0 but was " + cp);
        }
        if (ps == null || ps <= 0) {
            throw new IllegalArgumentException("ps must be > 0 but was " + ps);
        }
        IndexSearcher indexSearcher = createIndexSearcher();
        if (indexSearcher == null) {
            return null;
        }
        String[] fields = { "title", "content" };
        MultiFieldQueryParser parser = new MultiFieldQueryParser(fields, analyzer);
        ArrayList<Article> arrayList = new ArrayList<Article>();
        try {
            Query query = parser.parse(key);
            ScoreDoc[] scoreDocs = indexSearcher.search(query, MAX_HITS).scoreDocs;

            // Highlighting: wrap matched terms in a red <font> tag.
            QueryScorer scorer = new QueryScorer(query);
            SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<font color=red>", "</font>");
            Highlighter highlighter = new Highlighter(formatter, scorer);
            // Number of characters returned per fragment.
            highlighter.setTextFragmenter(new SimpleFragmenter(FRAGMENT_SIZE));

            int total = scoreDocs.length;
            // long arithmetic so that large page numbers cannot overflow into negative offsets.
            int start = (int) Math.min((long) cp * ps, (long) total);
            int end = (int) Math.min((long) start + ps, (long) total);
            Integer allPage = (int) (((long) total + ps - 1) / ps);

            for (int i = start; i < end; i++) {
                Document hit = indexSearcher.doc(scoreDocs[i].doc);
                Integer id = Integer.valueOf(hit.get("id"));
                String title = hit.get("title");
                String content = hit.get("content");

                if (title != null) {
                    String highlighted = highlighter.getBestFragment(analyzer, "title", title);
                    if (highlighted != null) {
                        title = highlighted;
                    }
                }
                if (content == null) {
                    content = "";
                } else {
                    String highlighted = highlighter.getBestFragment(analyzer, "content", content);
                    // No match inside the content: fall back to a leading excerpt,
                    // clamped so short bodies do not overrun the string.
                    content = highlighted != null
                            ? highlighted
                            : content.substring(0, Math.min(content.length(), FRAGMENT_SIZE));
                }
                arrayList.add(new Article(id, title, content));
            }
            return new PageUtil<Article>(cp, ps, arrayList, allPage);
        } catch (ParseException | IOException | InvalidTokenOffsetsException e) {
            e.printStackTrace();
        } finally {
            // The reader was opened by createIndexSearcher() solely for this search.
            try {
                indexSearcher.getIndexReader().close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return null;
    }

    /**
     * Deletes every document whose {@code id} field equals the given id.
     *
     * <p>The id is matched as an exact term because {@code id} is indexed as an
     * un-analyzed {@code StringField}.
     *
     * @param id the article id as stored in the index; {@code null} is ignored
     */
    public void deleteIndex(String id) {
        if (id == null) {
            return;
        }
        IndexWriter indexWriter = createIndexWriter();
        if (indexWriter == null) {
            return;
        }
        boolean closed = false;
        try {
            indexWriter.deleteDocuments(new org.apache.lucene.index.Term("id", id));
            // close() commits the pending delete.
            indexWriter.close();
            closed = true;
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (!closed) {
                rollbackQuietly(indexWriter);
            }
        }
    }

    public String getLang() {
        return lang;
    }

    /**
     * Selects the index directory and analyzer for the given language.
     *
     * <p>{@code "cn"} selects the Chinese index and the IK analyzer; any other
     * value (including {@code null}) selects the English index and the
     * standard analyzer.
     *
     * @param lang language code
     */
    public void setLang(String lang) {
        this.lang = lang;
        if ("cn".equals(lang)) {
            this.PATH = "../article";
            this.analyzer = analyzerCn;
        } else {
            this.PATH = "../articleEn";
            this.analyzer = analyzerEn;
        }
    }

    public String getPATH() {
        return PATH;
    }

    public void setPATH(String pATH) {
        PATH = pATH;
    }

    public Analyzer getAnalyzer() {
        return analyzer;
    }

    public void setAnalyzer(Analyzer analyzer) {
        this.analyzer = analyzer;
    }

    /** Adds all articles with one writer and one commit; rolls back on failure. */
    private Map<String, String> indexAll(List<Article> articles) {
        HashMap<String, String> result = new HashMap<String, String>();
        result.put("state", "1");
        if (articles == null) {
            return result;
        }
        IndexWriter indexWriter = createIndexWriter();
        if (indexWriter == null) {
            return result;
        }
        boolean closed = false;
        try {
            for (Article article : articles) {
                indexWriter.addDocument(buildDocument(article));
            }
            indexWriter.commit();
            indexWriter.close();
            closed = true;
            result.put("state", "0");
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (!closed) {
                // Release the write lock without committing a partial batch.
                rollbackQuietly(indexWriter);
            }
        }
        return result;
    }

    /** Builds the Lucene document (id, boosted title, HTML-stripped content) for one article. */
    private Document buildDocument(Article article) throws IOException {
        Document document = new Document();
        document.add(new StringField("id", String.valueOf(article.getId()), Field.Store.YES));
        String title = article.getTitle();
        TextField titleField = new TextField("title", title == null ? "" : title, Field.Store.YES);
        // Weight title matches above content matches.
        titleField.setBoost(4F);
        document.add(titleField);
        document.add(new TextField("content", stripHtml(article.getContent()), Field.Store.YES));
        return document;
    }

    /** Removes HTML markup plus CR, TAB and LF characters from editor-generated content. */
    private String stripHtml(String html) throws IOException {
        if (html == null) {
            return "";
        }
        // Extra mapping filter, mainly to drop line breaks.
        // NOTE(review): mapping line breaks to "" may join words that were only
        // separated by a block-level tag - confirm whether " " is wanted for English content.
        NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
        builder.add("\r", ""); // carriage return
        builder.add("\t", ""); // horizontal tab
        builder.add("\n", ""); // line feed
        StringBuilder text = new StringBuilder();
        try (CharFilter filter = new MappingCharFilter(builder.build(),
                new HTMLStripCharFilter(new StringReader(html)))) {
            char[] buffer = new char[10240];
            int count;
            while ((count = filter.read(buffer)) != -1) {
                text.append(buffer, 0, count);
            }
        }
        return text.toString();
    }

    /** Rolls back and closes the writer, logging instead of propagating any failure. */
    private void rollbackQuietly(IndexWriter indexWriter) {
        try {
            indexWriter.rollback();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}

阅读更多
版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/momomoniqwer/article/details/80346395
文章标签: java
个人分类: lucene
想对作者说点什么? 我来说一句

没有更多推荐了,返回首页

不良信息举报

通过lucene实现文章全文检索

最多只允许输入30个字

加入CSDN,享受更精准的内容推荐,与500万程序员共同成长!
关闭
关闭