springboot6==ElasticSearch基础 LUCENE7 demo从14万条商品数据中找出相关度最高的10条数据

最新推荐文章于 2024-08-16 20:21:37 发布

一个java开发

最新推荐文章于 2024-08-16 20:21:37 发布

阅读量978

点赞数

分类专栏： springboot 文章标签： spring spring boot java

本文链接：https://blog.csdn.net/hebian1994/article/details/121444014

版权

springboot 专栏收录该内容

13 篇文章 2 订阅

订阅专栏

感谢这篇文章，让我一下就看懂了，牛皮：

搜索引擎技术系列教材（四）- lucene - 向Lucene中导入14万条产品数据

==================================================================

内容：

14 万条原始数据存储在 TXT 文件中，读取后存入 List；再通过 Lucene 为 List 中的每条数据建立索引，并将索引保存在 Directory（本例使用内存中的 RAMDirectory）中；最后根据用户输入的关键字，查出匹配度最高的 10 条数据。

主要代码：

package com.how2java;

import java.io.IOException;
import java.io.StringReader;
import java.util.List;
import java.util.Scanner;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * Console demo: builds an in-memory Lucene index from a product text file and
 * then answers keyword queries typed on standard input, printing the top hits
 * with the matched terms of the "name" field highlighted.
 *
 * <p>Overall flow:
 * <ol>
 *   <li>Collect the data (here: products read from a text file into a list).</li>
 *   <li>Build the index from that data.</li>
 *   <li>Read a keyword from the user.</li>
 *   <li>Turn the keyword into a query.</li>
 *   <li>Run the query against the index.</li>
 *   <li>Show the results to the user.</li>
 * </ol>
 */
public class TestLucene {

    /** Product data file used when no path is supplied on the command line. */
    private static final String DEFAULT_DATA_FILE =
            "E:\\ideaMyProject\\demo-LUCENE\\demo3_14w\\140k_products.txt";

    /** Maximum number of hits fetched and displayed for each query. */
    private static final int HITS_PER_PAGE = 10;

    /**
     * Builds the index once, then loops reading keywords until standard input
     * reaches end-of-file.
     *
     * @param args optional; {@code args[0]} overrides the product data file path
     * @throws Exception if the index cannot be built or a search fails
     */
    public static void main(String[] args) throws Exception {
        String dataFile = args.length > 0 ? args[0] : DEFAULT_DATA_FILE;

        // Resources are closed in reverse order: scanner, reader, index, analyzer.
        try (IKAnalyzer analyzer = new IKAnalyzer();          // 1. Chinese tokenizer
             Directory index = createIndex(analyzer, dataFile); // 2. index
             // The index is never modified after creation, so a single reader
             // can serve every query instead of being reopened per keyword.
             IndexReader reader = DirectoryReader.open(index);
             Scanner s = new Scanner(System.in)) {

            IndexSearcher searcher = new IndexSearcher(reader);

            while (true) {
                System.out.print("请输入查询关键字：");
                if (!s.hasNextLine()) {
                    // End of input (Ctrl-D / Ctrl-Z / closed pipe): leave cleanly
                    // instead of failing with NoSuchElementException.
                    break;
                }
                String keyword = s.nextLine();
                System.out.println("当前关键字是：" + keyword);

                if (keyword.trim().isEmpty()) {
                    // The classic QueryParser rejects an empty query; just prompt again.
                    continue;
                }

                // 3. Build the query; a syntax error must not discard the index.
                Query query;
                try {
                    query = new QueryParser("name", analyzer).parse(keyword);
                } catch (ParseException e) {
                    System.out.println("无法解析查询关键字：" + e.getMessage());
                    continue;
                }

                // 4. Fetch the most relevant documents.
                ScoreDoc[] hits = searcher.search(query, HITS_PER_PAGE).scoreDocs;

                // 5. Display them.
                showSearchResults(searcher, hits, query, analyzer);
            }
        }
    }

    /**
     * Prints one line per hit: rank, relevance score, then every stored field.
     * The "name" field is printed with query matches wrapped in a red
     * {@code <span>}; all other fields are printed verbatim.
     *
     * @param searcher searcher used to load the stored documents
     * @param hits     hits to display, in rank order
     * @param query    query whose terms are highlighted
     * @param analyzer analyzer used to re-tokenize the "name" value for highlighting
     * @throws Exception if a document cannot be loaded or highlighting fails
     */
    private static void showSearchResults(IndexSearcher searcher, ScoreDoc[] hits, Query query, IKAnalyzer analyzer) throws Exception {
        SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>");
        Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));

        System.out.println("找到 " + hits.length + " 个命中.");
        System.out.println("序号\t匹配度得分\t结果");
        for (int i = 0; i < hits.length; ++i) {
            ScoreDoc scoreDoc = hits[i];
            Document d = searcher.doc(scoreDoc.doc);

            // Rank of this hit (1-based).
            System.out.print((i + 1));
            // Relevance score.
            System.out.print("\t" + scoreDoc.score);
            System.out.print("内容");
            // Stored field values of the hit.
            for (IndexableField f : d.getFields()) {
                String value = f.stringValue();
                if ("name".equals(f.name())) {
                    TokenStream tokenStream = analyzer.tokenStream(f.name(), new StringReader(value));
                    String fragment = highlighter.getBestFragment(tokenStream, value);
                    // getBestFragment returns null when no fragment scores above
                    // zero; fall back to the plain value rather than printing "null".
                    System.out.print("\t" + (fragment != null ? fragment : value));
                } else {
                    System.out.print("\t" + value);
                }
            }
            System.out.println("<br>");
        }
    }

    /**
     * Builds the index from the default product data file.
     *
     * @param analyzer analyzer used to tokenize the indexed fields
     * @return the populated in-memory directory
     * @throws IOException if reading the data or writing the index fails
     */
    private static Directory createIndex(IKAnalyzer analyzer) throws IOException {
        return createIndex(analyzer, DEFAULT_DATA_FILE);
    }

    /**
     * Loads all products from {@code fileName} into a list and indexes each one
     * into a new {@link RAMDirectory}, printing progress whenever the completed
     * percentage changes.
     *
     * @param analyzer analyzer used to tokenize the indexed fields
     * @param fileName path of the product data file
     * @return the populated in-memory directory; the caller is responsible for closing it
     * @throws IOException if reading the data or writing the index fails
     */
    private static Directory createIndex(IKAnalyzer analyzer, String fileName) throws IOException {
        Directory index = new RAMDirectory();
        // try-with-resources guarantees the writer is closed (and its lock
        // released) even when adding a document fails part-way through.
        try (IndexWriter writer = new IndexWriter(index, new IndexWriterConfig(analyzer))) {
            List<Product> products = ProductUtil.file2list(fileName);
            int total = products.size();
            int count = 0;
            int oldPer = 0;
            for (Product p : products) {
                addDoc(writer, p);
                count++;
                // Multiply in long so very large data sets cannot overflow int.
                int per = (int) (count * 100L / total);
                if (per != oldPer) {
                    oldPer = per;
                    System.out.printf("索引中，总共要添加 %d 条记录，当前添加进度是： %d%% %n", total, per);
                }
            }
        }
        return index;
    }

    /**
     * Converts one product into a Lucene document and adds it to the index.
     * Every attribute is stored so it can be printed with the search results.
     *
     * @param w writer the document is added to
     * @param p product to index
     * @throws IOException if the writer fails to add the document
     */
    private static void addDoc(IndexWriter w, Product p) throws IOException {
        Document doc = new Document();
        doc.add(new TextField("id", String.valueOf(p.getId()), Field.Store.YES));
        doc.add(new TextField("name", p.getName(), Field.Store.YES));
        doc.add(new TextField("category", p.getCategory(), Field.Store.YES));
        doc.add(new TextField("price", String.valueOf(p.getPrice()), Field.Store.YES));
        doc.add(new TextField("place", p.getPlace(), Field.Store.YES));
        doc.add(new TextField("code", p.getCode(), Field.Store.YES));
        w.addDocument(doc);
    }
}

测试查找"鞋子"相关的10条数据，神奇~

一个java开发

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
springboot6==ElasticSearch基础 LUCENE7 demo从14万条商品数据中找出相关度最高的10条数据

感谢这篇文章，让我一下就看懂了，牛皮：搜索引擎技术系列教材（四）- lucene - 向Lucene中导入14万条产品数据==================================================================内容：14万条原始数据存储在TXT中，取出后存到list中，通过LUCENE利用list中数据建立索引，将索引存在Directory中，然后找出匹配度最高的10条数据。主要代码：package com.how2java;im.
复制链接

扫一扫

专栏目录