lucene7.6.0中文分词+权重设置

学JAVA的李先生

于 2022-08-30 14:43:27 发布

阅读量854

点赞数 2

分类专栏： java 搜索引擎分词检索文章标签：中文分词 lucene java

本文链接：https://blog.csdn.net/weixin_46422238/article/details/126602960

版权

java 同时被 3 个专栏收录

2 篇文章 0 订阅

订阅专栏

搜索引擎

2 篇文章 0 订阅

订阅专栏

分词检索

2 篇文章 0 订阅

订阅专栏

引言

最近项目需要使用到中文分词，所以想到了使用lucene+ik分词来实现

步骤

导入依赖

在 pom.xml 中引入以下依赖：

        <!-- Lucene core library -->
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-core</artifactId>
			<version>7.6.0</version>
		</dependency>
		<!-- Lucene query parser -->
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-queryparser</artifactId>
			<version>7.6.0</version>
		</dependency>
		<!-- Lucene default analyzer library -->
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-analyzers-common</artifactId>
			<version>7.6.0</version>
		</dependency>
		<!-- Lucene highlighter -->
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-highlighter</artifactId>
			<version>7.6.0</version>
		</dependency>
		<!-- IK analyzer -->

		<dependency>
			<groupId>com.github.magese</groupId>
			<artifactId>ik-analyzer</artifactId>
			<version>7.6.0</version>
		</dependency>

建立字典

在配置文件中配置字典路径

# Index directory paths; both keys are read by LucenceType.load().
lucence.CUSTOMER_ALL=d:\\indexDir\\customer
lucence.SALE_PRODUCT_ALL=d:\\indexDir\\saleproduct

因为需要生成多个索引（即上文所说的“字典”），而且不同环境下索引的存放路径各不相同，所以把路径写在配置文件中，由下面的类在加载时读取：

/**
 * Holds the index directory paths read from {@code wxArticleConfiguration.properties}.
 *
 * <p>The static fields are filled in by {@link #load()}, which runs once when this
 * class is initialized and may be called again to re-read the file.
 */
@Data
public class LucenceType {

    /** Value of the {@code lucence.CUSTOMER_ALL} property. */
    public static String CUSTOMER_ALL;
    /** Value of the {@code lucence.SALE_PRODUCT_ALL} property. */
    public static String SALE_PRODUCT_ALL;

    // Populate the paths as soon as the class is initialized.
    static {
        load();
    }

    /**
     * Reads both path properties from the configuration file into the static fields.
     *
     * @throws RuntimeException wrapping the {@code ConfigurationException} if the
     *         properties file cannot be loaded
     */
    public static void load() {
        final Configuration props;
        try {
            props = new PropertiesConfiguration("wxArticleConfiguration.properties");
        } catch (ConfigurationException e) {
            throw new RuntimeException("获取配置文件失败，", e);
        }
        CUSTOMER_ALL = props.getString("lucence.CUSTOMER_ALL");
        SALE_PRODUCT_ALL = props.getString("lucence.SALE_PRODUCT_ALL");
    }

}

编写生成字典的主方法

/**
     * Builds the customer index from every customer returned by the DAO.
     *
     * <p>Each customer becomes one document with three stored fields:
     * {@code customerId} and {@code name} are indexed as single untokenized terms
     * (StringField), while {@code namelike} holds the same name as a tokenized
     * TextField, so a query can match either the whole name or its tokens.
     *
     * @return {@code "success"} when indexing finished, {@code "erro"} if any exception was thrown
     */
    @ResponseBody
    @RequestMapping("/createCustomerIndex")
    public String createCustomerIndex()  {
        try {
            List<TSaleCustomer> customers = tSaleCustomerDao.tSaleCustomerList(new HashMap<>());
            Collection<Document> documents = new ArrayList<>();
            for (TSaleCustomer customer : customers) {
                String idText = customer.getId() + "";
                String nameText = customer.getName() + "";

                Document entry = new Document();
                entry.add(new StringField("customerId", idText, Field.Store.YES));
                // The name is stored twice: tokenized for partial matches ...
                entry.add(new TextField("namelike", nameText, Field.Store.YES));
                // ... and untokenized so the complete name can still be matched.
                entry.add(new StringField("name", nameText, Field.Store.YES));
                documents.add(entry);
            }
            LucenceUtils.createIndex(documents, LucenceType.CUSTOMER_ALL);

            String doneMessage = "----创建索引完成----";
            log.info(doneMessage);
            System.out.println(doneMessage);
            return "success";
        } catch (Exception e) {
            log.error("----创建索引异常----", e);
            System.out.println(e);
            return "erro";
        }
    }

执行完成之后，索引目录下会生成若干索引文件（例如 segments_N、write.lock 以及 .cfs/.cfe/.si 等文件），这就表示索引（字典）生成成功。

查询数据

    @RequestMapping("/tSaleCustomerList2")
    public String[] luceneTest(@RequestBody Map<String,Object> map) throws Exception {
        int pageNow= org.apache.commons.collections.MapUtils.getIntValue(map,"page");
        int pageSize= org.apache.commons.collections.MapUtils.getIntValue(map,"limit");
        String keyword= org.apache.commons.collections.MapUtils.getString(map,"keyword");

        Directory directory = FSDirectory.open(FileSystems.getDefault().getPath(LucenceType.CUSTOMER_ALL));
        // 索引读取工具
        IndexReader reader = DirectoryReader.open(directory);

        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        // 索引搜索工具
        IndexSearcher searcher = new IndexSearcher(reader);

        // 指定多默认域数组
        String[] arr=new String[]{"name","namelike"};
        //搜索时设置权重
        //如果不设置权重有可能全字匹配会在分词匹配后面
        Map<String,Float> boosts = new HashMap<String,Float>();
        boosts.put("name", 50.0f);//权重默认是1

        boosts.put("namelike", 10.0f);
        MultiFieldQueryParser queryParser = new MultiFieldQueryParser(arr, new IKAnalyzer(),boosts);//指定搜索权重
        Query parse = queryParser.parse(keyword);//查询所有默认域里有关键字的文档
        builder.add(parse, BooleanClause.Occur.MUST);
        BooleanQuery booleanQuery = builder.build();
        //分页查询
        TopDocs topDocs = searcher.search(booleanQuery, pageSize*pageNow);

        // 获取总条数
        System.out.println("本次搜索共找到" + topDocs.totalHits + "条数据");
        // 获取得分文档对象（ScoreDoc）数组.SocreDoc中包含：文档的编号、文档的得分
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;

        int start = (pageNow -1)*pageSize ;
        int end = pageSize*pageNow;
        String[] customerIds=new String[topDocs.scoreDocs.length];
        for(int i=start;i<end&&i<scoreDocs.length;i++){
            if(scoreDocs[i]!=null){
                Document doc2 = reader.document(scoreDocs[i].doc);
                customerIds[i]=doc2.get("customerId");
            }
        }
        //可以根据这些id去查询详情后返回
        return customerIds
    }

更新数据

/**
     * Re-indexes a single customer in the Lucene customer index.
     *
     * <p>The document whose {@code customerId} term equals the customer's id is replaced
     * by a freshly built one (or added if none exists yet).
     *
     * @param customerId id used to load the customer from the DAO
     * @return {@code "success"} when the index was updated, {@code "erro"} when the customer
     *         was not found or any exception was thrown
     */
    public String updateCustomerIndex(String customerId)  {
        try {
            TSaleCustomer customer = tSaleCustomerDao.findByCustomerId(customerId);
            if (customer == null) {
                // Report the missing customer explicitly instead of through an incidental NPE.
                log.error("----更新索引异常----" + customerId + " (customer not found)");
                return "erro";
            }
            String idText = customer.getId() + "";
            String nameText = customer.getName() + "";

            // StringField is indexed as one untokenized term; TextField is tokenized.
            Document document1 = new Document();
            document1.add(new StringField("customerId", idText, Field.Store.YES));
            document1.add(new TextField("namelike", nameText, Field.Store.YES));
            document1.add(new StringField("name", nameText, Field.Store.YES));

            // try-with-resources releases the writer (and its write.lock) even when the
            // update fails; otherwise every later update would fail to obtain the lock.
            // The analyzer is the same one the search side parses queries with, so that
            // index-time and query-time tokens of "namelike" agree.
            try (Directory directory = FSDirectory.open(FileSystems.getDefault().getPath(LucenceType.CUSTOMER_ALL));
                 IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new IKAnalyzer()))) {
                // Replace the document whose customerId term matches this customer.
                writer.updateDocument(new Term("customerId", idText), document1);
                writer.commit();
            }
            log.info("----索引更新完成----"+customerId);
            return "success";
        } catch (Exception e) {
            log.error("----更新索引异常----"+customerId,e);
            return "erro";
        }

    }