Lucene7.0与HanLP分词器整合索引数据库建立索引文件

HanLP官网:http://hanlp.linrunsoft.com/

GitHub地址:https://github.com/hankcs/HanLP

HanLP插件地址:https://github.com/hankcs/hanlp-lucene-plugin


需要以下jar包



package com.kyd.demo.hanLP;

import java.io.IOException;
import java.nio.file.Paths;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.junit.Test;

import com.hankcs.lucene.HanLPAnalyzer;
import com.hankcs.lucene.HanLPIndexAnalyzer;

/**
 * Builds a Lucene index file from the rows of a database table using the HanLP
 * analyzers, and contains two JUnit tests that print how each HanLP analyzer
 * tokenizes a sample sentence.
 *
 * @author zhengzhen
 *
 */
public class JdbcIndexDemo {
	/**
	 * Reads every row returned by the query, converts it to a Lucene
	 * {@link Document} and writes it to the index directory {@code xxxx_index}.
	 * The index is opened in {@link OpenMode#CREATE} mode, so an existing index
	 * in that directory is replaced once the new one is committed.
	 *
	 * <p>Any failure is printed to standard error; nothing is rethrown.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		try {
			Class.forName("com.mysql.jdbc.Driver");
		} catch (ClassNotFoundException e1) {
			e1.printStackTrace();
			return;
		}

		// NOTE(review): connection settings are hard-coded demo values; move them
		// to configuration before using this outside of a demo.
		String url = "jdbc:mysql://192.168.100.69:3306/xxxx?useUnicode=true&characterEncoding=utf8&autoReconnect=true&failOverReadOnly=false";
		String password = "root";
		String userName = "root";
		String sql = "select * from xxxx";

		// Resources are closed in reverse declaration order: the analyzer and the
		// index directory first, then the JDBC result set, statement and connection.
		try (
				Connection conn = DriverManager.getConnection(url, userName, password);
				PreparedStatement sta = conn.prepareStatement(sql);
				ResultSet rs = sta.executeQuery();
				// 1. Directory where the index files are stored.
				Directory directory = FSDirectory.open(Paths.get("xxxx_index"));
				// 2. Analyzer used to tokenize the text fields.
				Analyzer analyzer = new HanLPIndexAnalyzer()
				) {
			// 3. Writer configuration: always build a fresh index.
			IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
			indexWriterConfig.setOpenMode(OpenMode.CREATE);

			// 4. Index writer; closing it releases the index write lock.
			try (IndexWriter indexWriter = new IndexWriter(directory, indexWriterConfig)) {
				try {
					// 5. One document per row.
					while (rs.next()) {
						Document document = new Document();

						// getLong yields 0 for SQL NULL, so unitId is always present.
						document.add(new StringField("unitId", Long.toString(rs.getLong("unitId")), Store.YES));

						// The "title" column is indexed under the field name "sectionName".
						addTextField(document, "sectionName", rs.getString("title"));
						addTextField(document, "unitName", rs.getString("unitName"));
						addTextField(document, "courseName", rs.getString("courseName"));
						addStringField(document, "startPage", rs.getString("startPage"));
						// NOTE(review): column is "startEndPage" but the field is "endPage";
						// confirm the column name against the table schema.
						addStringField(document, "endPage", rs.getString("startEndPage"));

						indexWriter.addDocument(document);
					}
					indexWriter.commit();
				} catch (Exception e) {
					// Discard the partially built index instead of letting close() commit it.
					indexWriter.rollback();
					throw e;
				}
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Adds a stored, analyzed {@link TextField} to the document when the value is not null.
	 *
	 * @param document document to add the field to
	 * @param name     field name
	 * @param value    field value; the field is skipped when this is {@code null}
	 */
	private static void addTextField(Document document, String name, String value) {
		if (value != null) {
			IndexableField field = new TextField(name, value, Store.YES);
			document.add(field);
		}
	}

	/**
	 * Adds a stored, non-analyzed {@link StringField} to the document when the value is not null.
	 *
	 * @param document document to add the field to
	 * @param name     field name
	 * @param value    field value; the field is skipped when this is {@code null}
	 */
	private static void addStringField(Document document, String name, String value) {
		if (value != null) {
			IndexableField field = new StringField(name, value, Store.YES);
			document.add(field);
		}
	}

	/**
	 * Prints each character of {@code text} followed by its index on one line, then
	 * one line per token produced by {@code analyzer}: the term, its start offset,
	 * its end offset and its position increment.
	 *
	 * @param analyzer analyzer used to tokenize the text; not closed by this method
	 * @param text     text to tokenize
	 * @throws IOException if the token stream fails
	 */
	private static void printTokens(Analyzer analyzer, String text) throws IOException {
		for (int i = 0; i < text.length(); ++i) {
			System.out.print(text.charAt(i) + "" + i + " ");
		}
		System.out.println();

		// TokenStream contract: reset() -> incrementToken()* -> end() -> close().
		try (TokenStream tokenStream = analyzer.tokenStream("field", text)) {
			// Attribute instances are reused for every token, so fetch them once.
			CharTermAttribute termAttr = tokenStream.getAttribute(CharTermAttribute.class);
			// Character offsets of the token in the original text.
			OffsetAttribute offsetAttr = tokenStream.getAttribute(OffsetAttribute.class);
			// Position increment relative to the previous token.
			PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);

			tokenStream.reset();
			while (tokenStream.incrementToken()) {
				System.out.println(termAttr + " " + offsetAttr.startOffset() + " " + offsetAttr.endOffset() + " " + positionAttr.getPositionIncrement());
			}
			tokenStream.end();
		}
	}

	/**
	 * HanLPAnalyzer
	 * This analyzer does not split long words; for example "中华人民共和国" is a
	 * long word and is kept as a single token.
	 * @throws IOException if the token stream fails
	 */
	@Test
	public void hanLPAnalyzerTest() throws IOException {
		String text = "中华人民共和国很辽阔";
		try (Analyzer analyzer = new HanLPAnalyzer()) {
			printTokens(analyzer, text);
		}
		/* Output:
		 * 中0 华1 人2 民3 共4 和5 国6 很7 辽8 阔9 
		 * 中华人民共和国 0 7 1
		 * 很 7 8 1
		 * 辽阔 8 10 1
		 */
	}

	/**
	 * HanLPIndexAnalyzer
	 * This analyzer additionally splits long words: "中华人民共和国" is emitted as
	 * "中华人民共和国", "中华", "人民" and so on.
	 * @throws IOException if the token stream fails
	 */
	@Test
	public void hanLPIndexAnalyzerTest() throws IOException {
		String text = "中华人民共和国很辽阔";
		try (Analyzer analyzer = new HanLPIndexAnalyzer()) {
			printTokens(analyzer, text);
		}
		/* Output:
		 * 中0 华1 人2 民3 共4 和5 国6 很7 辽8 阔9 
		 * 中华人民共和国 0 7 1
		 * 中华人民 0 4 1
		 * 中华 0 2 1
		 * 华人 1 3 1
		 * 人民共和国 2 7 1
		 * 人民 2 4 1
		 * 共和国 4 7 1
		 * 共和 4 6 1
		 * 很 7 8 1
		 * 辽阔 8 10 1
		 */
	}
}


  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 3
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值