之前看过几天lucene,中间因为需要研究jboss,就把lucene的学习落下了。今天重新开始学习lucene,先只研究应用,不研究算法;
等搞明白lucene的应用之后,再研究切词和索引算法。
lucene在使用上无外乎两点:一是索引的创建(IndexWriter),其中会用到Analyzer;二是索引的查询(IndexSearcher),其中也会用到Analyzer。查询的策略体现在Query的使用上:Query相当于交给lucene的一条查询指令,和DB的sql有点类似,lucene根据指令返回Result。
先创建一个索引,待接下来几天研究Query的使用情况。
早前学习lucene的时候写过几行代码,copy过来改一改,温故一下。
编写Document类
/****************
*
*Create Class:TestDocument.java
*Author:a276202460
*Create at:2010-5-24
*/
package com.rich.lucene.document;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.Random;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
import com.rich.lucene.util.IndexKeys;
/**
 * Generates Lucene {@link Document}s filled with random test data so that an
 * index can be built for experimenting with queries.
 *
 * <p>Each generated document carries: a random integer, a random "type" and
 * "biz" keyword, a start date and an end date (stored as epoch milliseconds)
 * and a content field made of randomly chosen words.
 *
 * <p>NOTE(review): the three {@link NumericField} members and the
 * {@link Calendar} are shared by every call to {@link #getDocument()}, so an
 * instance is meant to be used from a single thread, and each returned
 * document should be handed to the index writer before the next call
 * overwrites the shared field values.
 */
public class TestDocument {
    /** Exclusive upper bound of the random integer field. */
    private final int maxnum = 10000;
    /** Number of random words put into the content field. */
    private final int contentwords = 1000;
    private final DateFormat df = new SimpleDateFormat("MM/dd/yyyy");
    /*
     * minstartdate and minenddate are the earliest values of the start-date
     * and end-date fields written to the index.
     */
    private final String minstartdate = "01/01/2001";
    private final String minenddate = "01/01/2006";
    /*
     * Exclusive upper bound, in days, of the random offset added to the
     * minimum dates.
     */
    private final int maxdaterange = 200;
    private final Calendar calendar = Calendar.getInstance();
    private final Date startdate;
    private final Date enddate;
    private final Random random = new Random();
    /*
     * Candidate values for the random "type" field of a document.
     */
    private final String[] types = { "jsp", "java", "sp", "txt" };
    /*
     * Candidate values for the random "biz" field of a document.
     */
    private final String[] bizes = { "db", "ws", "jms", "lucene" };
    /*
     * Vocabulary the random content field is assembled from.
     */
    private final String[] contentseeds = { "java", "lucene", "jsp", "web", "swing", "awt", "swt", "tomcat",
            "weblogic", "websphere", "resin", "jboss", "eclipse", "netbean", "oracle", "mysql", "sybase",
            "sqlserver", "database", "datasource", "axis", "jms", "servlet", "maven", "jdbc", "jbpm",
            "myeclipse", "java3D" };

    /*
     * Numeric field intended for numeric range queries (per the original
     * author's note, a non-numeric field would be compared as strings).
     * Constructor arguments: the field name (comparable to a column name in a
     * table), the Store flag (whether the value is stored in the index), and
     * the index flag (whether the field is indexed; an unindexed field cannot
     * be searched).
     * Left package-private and non-final, as in the original, so that any
     * same-package code using these members keeps compiling.
     */
    NumericField numfield = new NumericField(IndexKeys.TEST_NUMBERIC, Field.Store.YES, true);
    /*
     * The two date fields are numeric as well, so they can be range-queried.
     */
    NumericField startdatefield = new NumericField(IndexKeys.TEST_STARTDATE, Field.Store.YES, true);
    NumericField enddatefield = new NumericField(IndexKeys.TEST_ENDDATE, Field.Store.YES, true);

    /**
     * Parses the two minimum dates.
     *
     * @throws RuntimeException if either date literal cannot be parsed; the
     *         underlying {@link ParseException} is attached as the cause
     */
    public TestDocument() {
        try {
            startdate = df.parse(minstartdate);
            enddate = df.parse(minenddate);
        } catch (ParseException e) {
            // Chain the cause instead of printing it and throwing a bare exception.
            throw new RuntimeException("Can't init date value", e);
        }
    }

    /**
     * Builds one randomly populated document for indexing.
     *
     * @return a new document holding the numeric, type, biz, start-date,
     *         end-date and content fields
     */
    public Document getDocument() {
        Document doc = new Document();
        doc.add(numfield.setIntValue(random.nextInt(maxnum)));
        doc.add(new Field(IndexKeys.TEST_TYPE, types[random.nextInt(types.length)],
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(IndexKeys.TEST_BIZ, bizes[random.nextInt(bizes.length)],
                Field.Store.YES, Field.Index.NOT_ANALYZED));

        // Start date: minimum start date plus 0..maxdaterange-1 days.
        calendar.setTime(startdate);
        calendar.add(Calendar.DATE, random.nextInt(maxdaterange));
        doc.add(startdatefield.setLongValue(calendar.getTimeInMillis()));

        // End date: minimum end date plus 0..maxdaterange-1 days.
        calendar.setTime(enddate);
        calendar.add(Calendar.DATE, random.nextInt(maxdaterange));
        doc.add(enddatefield.setLongValue(calendar.getTimeInMillis()));

        // The buffer never leaves this method, so the unsynchronized
        // StringBuilder is sufficient.
        StringBuilder content = new StringBuilder();
        for (int i = 0; i < contentwords; i++) {
            content.append(contentseeds[random.nextInt(contentseeds.length)]);
            content.append(" ");
        }
        doc.add(new Field(IndexKeys.TEST_CONTENT, content.toString(),
                Field.Store.YES, Field.Index.ANALYZED));
        return doc;
    }
}
IndexKeys代码:
package com.rich.lucene.util;
/**
 * Shared string constants: index field names and index directory locations.
 */
public class IndexKeys {

    // ---- index locations on disk ----

    /** Base directory of the text index. */
    public static final String TXT_INDEX_BASE_DIR = "D://lucenetest//indexs//txtindex";

    /** Base directory of the HTML index. */
    public static final String HTML_INDEX_BASE_DIR = "D://lucenetest//indexs//htmlindex";

    /** Prefix constant named after index "segments" files. */
    public static final String INDEX_SEGMENTS_PREFIX = "segments";

    // ---- field names with the FILE_ prefix ----

    public static final String FILE_PATH = "path";

    public static final String FILE_MODIFIED = "modified";

    public static final String FILE_CONTENTS = "contents";

    public static final String FILE_TYPE = "filetype";

    // ---- field names with the TEST_ prefix (random test documents) ----

    /** Name of the random integer field. */
    public static final String TEST_NUMBERIC = "numberic";

    /** Name of the random-words content field. */
    public static final String TEST_CONTENT = "testcontent";

    /** Name of the "type" keyword field. */
    public static final String TEST_TYPE = "testtype";

    /** Name of the "biz" keyword field. */
    public static final String TEST_BIZ = "testbiz";

    /** Name of the start-date field. */
    public static final String TEST_STARTDATE = "teststartdate";

    /** Name of the end-date field. */
    public static final String TEST_ENDDATE = "testenddate";
}
创建索引,有了索引就可以为接下来这几天Query的研究做准备
/****************
*
*Create Class:IndexPrepare.java
*Author:a276202460
*Create at:2010-5-24
*/
package com.rich.lucene.test;
import java.io.File;
import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import com.rich.lucene.document.TestDocument;
import com.rich.lucene.util.IndexKeys;
/**
 * Command-line entry point that builds a test index from randomly generated
 * {@link TestDocument} documents, for later query experiments.
 */
public class IndexPrepare {
    /**
     * Creates the index under {@code IndexKeys.TXT_INDEX_BASE_DIR + "//index2"},
     * adds 1,000,000 random documents, optimizes the index and prints the
     * elapsed time in milliseconds. Failures are reported by printing the
     * stack trace; the writer and the directory are closed in all cases.
     *
     * @param args not used
     */
    public static void main(String[] args) {
        String indexdir = IndexKeys.TXT_INDEX_BASE_DIR + "//index2";
        File indexfile = new File(indexdir);
        Directory directory = null;
        TestDocument testdocument = new TestDocument();
        IndexWriter writer = null;
        try {
            // The target is a nested path, so create missing parents as well,
            // and fail loudly instead of ignoring an unsuccessful creation.
            if (!indexfile.exists() && !indexfile.mkdirs()) {
                throw new IOException("Can't create index directory: " + indexdir);
            }
            directory = FSDirectory.open(indexfile);
            long starttime = System.currentTimeMillis();
            // The "create" argument is always true, exactly as in the original
            // code, where the flag was initialised to true and never changed.
            writer = new IndexWriter(directory, new StandardAnalyzer(Version.LUCENE_CURRENT), true,
                    IndexWriter.MaxFieldLength.LIMITED);
            writer.setRAMBufferSizeMB(500);
            /*
             * Author's note: with 1,000,000 records the index took about
             * 7.37 GB on the author's machine and a fairly long time to build;
             * lower the document count if needed. The large count is meant for
             * comparing the performance of different queries on a big index.
             */
            for (int i = 0; i < 1000000; i++) {
                writer.addDocument(testdocument.getDocument());
            }
            writer.optimize();
            System.out.println(">>>>>>>>" + (System.currentTimeMillis() - starttime));
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                } catch (IOException e) {
                    // Also covers CorruptIndexException, which the original
                    // caught separately ahead of IOException.
                    e.printStackTrace();
                }
            }
            if (directory != null) {
                try {
                    directory.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
今天的任务就这么多了。先复习一下Query的代码,明天研究Query的查询。