边学边记(一) lucene索引创建

最新推荐文章于 2021-06-03 12:49:23 发布

一洽客服系统

最新推荐文章于 2021-06-03 12:49:23 发布

阅读量1.5k

点赞数

分类专栏： Lucene 文章标签： lucene创建索引 lucene学习教程

本文链接：https://blog.csdn.net/a276202460/article/details/5621157

版权

Lucene 专栏收录该内容

10 篇文章 0 订阅

订阅专栏

看过几天的lucene，中间又需要研究jboss 就把lucene的学习落下了。今天重新开始学习lucene 只研究应用不研究算法

搞明白lucene的应用后再研究切词、索引算法。

lucene在使用上无外乎两点：一是索引的创建（IndexWriter），其中用到Analyzer；二是索引的查询（IndexSearcher），其中也用到Analyzer。查询的策略体现在Query的使用上：Query相当于给lucene的一条查询指令，和DB的sql有点类似，lucene根据指令获得Result。

先创建一个索引，待接下来几天研究Query的使用情况。

早前学习lucene的时候写过几行代码，copy过来改改，温故一下。

编写Document类

/****************
 *
 *Create Class:TestDocument.java
 *Author:a276202460
 *Create at:2010-5-24
 */
package com.rich.lucene.document;


import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.Random;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;

import com.rich.lucene.util.IndexKeys;

/**
 * Produces randomly filled Lucene {@link Document}s for building a test index.
 *
 * <p>Each document carries a random int field, a random "type" and "biz"
 * keyword, two date fields stored as epoch milliseconds, and a content field
 * made of 1000 words drawn at random from a fixed vocabulary.
 *
 * <p>Instances are not safe for concurrent use: {@link #getDocument()} mutates
 * the shared {@link Calendar} and the shared {@link NumericField} instances.
 */
public class TestDocument {
	/** Exclusive upper bound of the random value stored in the numeric field. */
	private final int maxnum = 10000;

	/** Parser for the two base-date literals below. */
	private final DateFormat df = new SimpleDateFormat("MM/dd/yyyy");

	/*
	 * minstartdate and minenddate are the earliest values the start-date and
	 * end-date fields of a generated document can take.
	 */
	private final String minstartdate = "01/01/2001";

	private final String minenddate = "01/01/2006";

	/*
	 * Exclusive upper bound, in days, of the random offset added to the base dates.
	 */
	private final int maxdaterange = 200;

	/** Scratch calendar reused for every date computation. */
	private final Calendar calendar = Calendar.getInstance();

	private Date startdate = null;

	private Date enddate = null;

	private final Random random = new Random();

	/*
	 * Candidate values for the randomly chosen "type" field.
	 */
	private final String[] types = { "jsp", "java", "sp", "txt" };

	/*
	 * Candidate values for the randomly chosen "biz" field.
	 */
	private final String[] bizes = { "db", "ws", "jms", "lucene" };

	/*
	 * Vocabulary the random content field is assembled from.
	 */
	private final String[] contentseeds = {"java","lucene","jsp","web","swing","awt","swt","tomcat","weblogic",
			"websphere","resin","jboss","eclipse","netbean","oracle","mysql","sybase","sqlserver","database",
			"datasource","axis","jms","servlet","maven","jdbc","jbpm","myeclipse","java3D"};

	/*
	 * Numeric field, prepared for range queries (the original author notes that
	 * without a numeric field a range query compares values as strings).
	 * Constructor arguments: the field name (comparable to a table column name),
	 * whether the value is stored, and whether the field is indexed; a field
	 * that is not indexed cannot be searched on.
	 *
	 * The same instance is reused for every document; only its value changes.
	 */
	NumericField numfield = new NumericField(IndexKeys.TEST_NUMBERIC,Field.Store.YES,true);

	/*
	 * The two date fields are numeric as well so that they can be range-queried.
	 */
	NumericField startdatefield = new NumericField(IndexKeys.TEST_STARTDATE,Field.Store.YES,true);
	NumericField enddatefield = new NumericField(IndexKeys.TEST_ENDDATE,Field.Store.YES,true);

	/**
	 * Parses the two base dates.
	 *
	 * @throws RuntimeException if a base-date literal cannot be parsed; the
	 *         underlying {@link ParseException} is attached as the cause
	 */
	public TestDocument() {
		try {
			startdate = df.parse(minstartdate);
			enddate = df.parse(minenddate);
		} catch (ParseException e) {
			// Keep the cause so the failing literal and position are not lost.
			throw new RuntimeException("Can't init date value", e);
		}
	}

	/**
	 * Builds one randomly filled document for indexing.
	 *
	 * @return a new document with the numeric, type, biz, start-date, end-date
	 *         and content fields populated
	 */
	public Document getDocument() {
		Document doc = new Document();

		doc.add(numfield.setIntValue(random.nextInt(maxnum)));
		doc.add(new Field(IndexKeys.TEST_TYPE, types[random
				.nextInt(types.length)], Field.Store.YES,
				Field.Index.NOT_ANALYZED));
		doc.add(new Field(IndexKeys.TEST_BIZ, bizes[random
				.nextInt(bizes.length)], Field.Store.YES,
				Field.Index.NOT_ANALYZED));

		// Start date: base start date plus a random number of days.
		calendar.setTime(startdate);
		calendar.add(Calendar.DATE, random.nextInt(maxdaterange));
		doc.add(startdatefield.setLongValue(calendar.getTimeInMillis()));

		// End date: base end date plus an independently drawn number of days.
		calendar.setTime(enddate);
		calendar.add(Calendar.DATE, random.nextInt(maxdaterange));
		doc.add(enddatefield.setLongValue(calendar.getTimeInMillis()));

		// The buffer never leaves this method, so the unsynchronized builder suffices.
		StringBuilder content = new StringBuilder();
		/*
		 * Content is 1000 random words, each followed by a single space.
		 */
		for (int i = 0; i < 1000; i++) {
			content.append(contentseeds[random.nextInt(contentseeds.length)]);
			content.append(" ");
		}
		doc.add(new Field(IndexKeys.TEST_CONTENT, content.toString(),
				Field.Store.YES, Field.Index.ANALYZED));
		return doc;
	}
}

IndexKeys代码：

package com.rich.lucene.util;

/**
 * Central holder for the string constants shared by the indexing examples:
 * Lucene field names, index directory locations and a file-name prefix.
 */
public class IndexKeys {

	// ---- Index locations -------------------------------------------------

	/** Base directory under which the text indexes are created. */
	public static final String TXT_INDEX_BASE_DIR = "D://lucenetest//indexs//txtindex";

	/** Base directory for HTML indexes (not referenced by the code shown here). */
	public static final String HTML_INDEX_BASE_DIR = "D://lucenetest//indexs//htmlindex";

	/** File-name prefix, presumably that of Lucene's segments files - confirm against callers. */
	public static final String INDEX_SEGMENTS_PREFIX = "segments";

	// ---- Field names for file-based documents ----------------------------
	// NOTE(review): these look like field names for indexing files on disk;
	// no caller is visible here, so verify before relying on that reading.

	public static final String FILE_PATH = "path";

	public static final String FILE_MODIFIED = "modified";

	public static final String FILE_CONTENTS = "contents";

	public static final String FILE_TYPE = "filetype";

	// ---- Field names of the randomly generated test documents -------------

	/** Name of the random integer field. */
	public static final String TEST_NUMBERIC = "numberic";

	/** Name of the analyzed free-text content field. */
	public static final String TEST_CONTENT = "testcontent";

	/** Name of the un-analyzed "type" keyword field. */
	public static final String TEST_TYPE = "testtype";

	/** Name of the un-analyzed "biz" keyword field. */
	public static final String TEST_BIZ = "testbiz";

	/** Name of the start-date field (stored as a long). */
	public static final String TEST_STARTDATE = "teststartdate";

	/** Name of the end-date field (stored as a long). */
	public static final String TEST_ENDDATE = "testenddate";

}

创建索引，有了索引就可以为接下来这几天Query的研究做准备

/****************
 *
 *Create Class:IndexPrepare.java
 *Author:a276202460
 *Create at:2010-5-24
 */
package com.rich.lucene.test;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import com.rich.lucene.document.TestDocument;
import com.rich.lucene.util.IndexKeys;
 
/**
 * Command-line entry point that builds a test index of randomly generated
 * documents, to be used later for experimenting with different queries.
 */
public class IndexPrepare {

	/**
	 * Number of random documents written to the index. The original author
	 * reports that one million documents took about 7.37 GB of disk and a long
	 * time on their machine; lower this value for a quicker run. The large
	 * count is meant for comparing query performance on a big index.
	 */
	private static final int DOCUMENT_COUNT = 1000000;

	/**
	 * Creates the index under {@code IndexKeys.TXT_INDEX_BASE_DIR + "//index2"},
	 * fills it, optimizes it and prints the elapsed time in milliseconds.
	 * Any existing index at that location is replaced, because the writer is
	 * always opened in create mode.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		String indexdir = IndexKeys.TXT_INDEX_BASE_DIR + "//index2";
		File indexfile = new File(indexdir);
		Directory directory = null;
		TestDocument testdocument = new TestDocument();
		IndexWriter writer = null;
		try {
			// mkdirs() also creates missing parent directories, and its result is
			// checked; the isDirectory() re-check tolerates a concurrent creation.
			if (!indexfile.exists() && !indexfile.mkdirs() && !indexfile.isDirectory()) {
				throw new IOException("Cannot create index directory: " + indexfile);
			}
			directory = FSDirectory.open(indexfile);
			long starttime = System.currentTimeMillis();
			// create == true: start a fresh index on every run.
			boolean create = true;
			writer = new IndexWriter(directory, new StandardAnalyzer(Version.LUCENE_CURRENT), create, IndexWriter.MaxFieldLength.LIMITED);
			writer.setRAMBufferSizeMB(500);
			for (int i = 0; i < DOCUMENT_COUNT; i++) {
				writer.addDocument(testdocument.getDocument());
			}

			writer.optimize();
			System.out.println(">>>>>>>>"+(System.currentTimeMillis() - starttime));
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			// Close the writer before the directory it writes into.
			if (writer != null) {
				try {
					writer.close();
				} catch (IOException e) {
					// Also covers CorruptIndexException, an IOException subclass.
					e.printStackTrace();
				}
			}
			if (directory != null) {
				try {
					directory.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}

	}

}

今天的任务就这么多了。先复习下Query的代码，明天研究Query的查询。