TestIndex.java
package org.lucene.test;
import java.io.File;
import org.junit.Test;
import org.lucene.util.FileIndexUtil;
import org.lucene.util.IndexUtil;
import org.lucene.util.SearcherUtil;
/**
 * Manual smoke tests for the Lucene/Tika helper classes.
 *
 * <p>The tests make no assertions; they run the helpers against fixed paths
 * on the {@code D:} drive and print results to the console.
 *
 * @author 半仙儿
 */
public class TestIndex {
    /** Word document used by the single-file indexing and Tika parsing tests. */
    private static final String SAMPLE_DOC = "D:/lucene/example2/职位JD.doc";

    /**
     * Runs {@link IndexUtil#index()}.
     */
    @Test
    public void testIndex() {
        new IndexUtil().index();
    }

    /**
     * Parses the sample document with {@link IndexUtil#fileToTxt(File)} and
     * prints the returned text to the console.
     */
    @Test
    public void testTika01() {
        IndexUtil indexUtil = new IndexUtil();
        File sample = new File(SAMPLE_DOC);
        String text = indexUtil.fileToTxt(sample);
        System.out.println(text);
    }

    /**
     * Parses the sample document with {@link IndexUtil#tikaTool(File)} (the
     * second parsing approach) and prints the returned text to the console.
     */
    @Test
    public void testTika02() {
        IndexUtil indexUtil = new IndexUtil();
        File sample = new File(SAMPLE_DOC);
        String text = indexUtil.tikaTool(sample);
        System.out.println(text);
    }

    /**
     * Runs {@link FileIndexUtil#index(boolean)} with {@code true}, i.e. asks
     * for the existing index contents to be deleted before indexing.
     */
    @Test
    public void testIndex03() {
        final boolean rebuild = true;
        FileIndexUtil.index(rebuild);
    }

    /**
     * Runs {@link SearcherUtil#searcher01()} against the index built by
     * {@link FileIndexUtil}.
     */
    @Test
    public void testSearcher01() {
        new SearcherUtil().searcher01();
    }
}
FileIndexUtil.java
package org.lucene.util;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import org.apache.commons.io.FilenameUtils;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;
/**
 * Builds a Lucene index in {@code d:/lucene/files} from the files found in
 * {@code d:/lucene/example2}, using Tika to extract text and metadata.
 *
 * @author 半仙儿
 */
public class FileIndexUtil {
    /**
     * Index directory shared with searchers. Remains {@code null} when the
     * directory could not be opened in the static initializer.
     */
    private static Directory directory = null;
    static {
        try {
            directory = FSDirectory.open(new File("d:/lucene/files"));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns the shared index directory.
     *
     * @return the directory opened at class-load time, or {@code null} if
     *         opening it failed
     */
    public static Directory getDirectory() {
        return directory;
    }

    /**
     * Builds the Lucene document describing one file.
     *
     * <p>The document carries these fields: {@code content} (text extracted by
     * Tika, supplied as a reader), {@code title} (file base name, analyzed),
     * {@code filename}, {@code type} (file extension), {@code path} (absolute
     * path), {@code page} (value of the {@code xmpTPg:NPage} metadata key, or
     * 0 when it is missing or not an integer), {@code date} (last-modified
     * timestamp) and {@code size} (file length in whole KiB).
     *
     * @param f the file to describe
     * @return a new document for {@code f}
     * @throws IOException if the file cannot be opened or Tika cannot start
     *                     reading it
     */
    public static Document generatorDocument(File f) throws IOException {
        Document doc = new Document();
        Metadata metadata = new Metadata();
        doc.add(new Field("content", new Tika().parse(new FileInputStream(f),
                metadata)));
        doc.add(new Field("title", FilenameUtils.getBaseName(f.getName()),
                Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field("filename", f.getName(), Field.Store.YES,
                Field.Index.NOT_ANALYZED));
        // File type, taken from the extension.
        doc.add(new Field("type", FilenameUtils.getExtension(f.getName()),
                Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
        doc.add(new Field("path", f.getAbsolutePath(), Field.Store.YES,
                Field.Index.NOT_ANALYZED));

        int page = 0;
        try {
            page = Integer.parseInt(metadata.get("xmpTPg:NPage"));
        } catch (NumberFormatException ignored) {
            // Page count is optional: a missing (null) or non-numeric value
            // leaves the default of 0.
        }
        // Page count.
        doc.add(new NumericField("page", Field.Store.YES, true)
                .setIntValue(page));
        doc.add(new NumericField("date", Field.Store.YES, true)
                .setLongValue(f.lastModified()));
        // Divide before narrowing so lengths of 2 GiB or more do not overflow
        // the int cast.
        doc.add(new NumericField("size", Field.Store.YES, true)
                .setIntValue((int) (f.length() / 1024)));
        return doc;
    }

    /**
     * Indexes every regular file directly inside {@code d:/lucene/example2}.
     *
     * <p>Each file is converted with {@link #generatorDocument(File)} and
     * added to the index exactly once. Sub-directories are skipped. Any
     * exception is printed and ends the run; the writer is always closed.
     *
     * @param hasNew when {@code true}, all existing documents are deleted
     *               from the index before the files are added
     */
    public static void index(boolean hasNew) {
        IndexWriter writer = null;
        try {
            writer = new IndexWriter(directory, new IndexWriterConfig(
                    Version.LUCENE_35, new MMSegAnalyzer()));
            if (hasNew) {
                writer.deleteAll();
            }
            File sourceDir = new File("d:/lucene/example2");
            File[] files = sourceDir.listFiles();
            if (files == null) {
                // listFiles() returns null when the path is not a readable
                // directory; report it instead of failing with an NPE.
                System.err.println("Cannot list files in "
                        + sourceDir.getAbsolutePath());
                return;
            }
            for (File f : files) {
                if (!f.isFile()) {
                    // A directory cannot be opened as a stream and would
                    // abort the whole run.
                    continue;
                }
                // generatorDocument already adds every field; adding them
                // again here would duplicate each field in the document.
                writer.addDocument(generatorDocument(f));
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                if (writer != null) {
                    writer.close();
                }
            } catch (CorruptIndexException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
IndexUtil.java
package org.lucene.util;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;
/**
 * Single-file indexing and Tika text-extraction helpers.
 *
 * @author 半仙儿
 */
public class IndexUtil {
    /**
     * Rebuilds the index in {@code d:/lucene/file2} with one document whose
     * {@code content} field is read from the sample Word file.
     *
     * <p>Failures are printed, not propagated. The writer and the directory
     * are closed on every path so a failure does not leave the index open.
     */
    public void index() {
        Directory dir = null;
        IndexWriter writer = null;
        try {
            File f = new File("D:/lucene/example2/职位JD.doc");
            dir = FSDirectory.open(new File("d:/lucene/file2"));
            writer = new IndexWriter(dir, new IndexWriterConfig(
                    Version.LUCENE_35, new MMSegAnalyzer()));
            writer.deleteAll();
            Document doc = new Document();
            // NOTE(review): FileReader decodes the raw bytes of the .doc file
            // as text; the Tika-based helpers below extract the actual text.
            doc.add(new Field("content", new FileReader(f)));
            writer.addDocument(doc);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (dir != null) {
                try {
                    dir.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Extracts the body text of a file with Tika's {@link AutoDetectParser}
     * and prints every metadata name/value pair to standard output.
     *
     * @param f the file to parse
     * @return the extracted body text, or {@code null} if opening or parsing
     *         the file failed (the exception is printed)
     */
    public String fileToTxt(File f) {
        Parser parser = new AutoDetectParser();
        InputStream is = null;
        try {
            Metadata metadata = new Metadata();
            metadata.set(Metadata.AUTHOR, "空号");
            is = new FileInputStream(f);
            ContentHandler handler = new BodyContentHandler();
            ParseContext context = new ParseContext();
            context.set(Parser.class, parser);
            parser.parse(is, handler, metadata, context);
            for (String name : metadata.names()) {
                System.out.println(name + ":" + metadata.get(name));
            }
            return handler.toString();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (is != null) {
                try {
                    is.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return null;
    }

    /**
     * Extracts the text of a file through the {@link Tika} facade.
     *
     * @param f the file to parse
     * @return the extracted text, or {@code null} if reading or parsing
     *         failed (the exception is printed)
     */
    public String tikaTool(File f) {
        Tika tika = new Tika();
        try {
            return tika.parseToString(f);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (TikaException e) {
            e.printStackTrace();
        }
        return null;
    }
}
SearcherUtil.java
package org.lucene.util;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
/**
 * Runs a sample search against the index built by {@link FileIndexUtil}.
 */
public class SearcherUtil {
    /**
     * Searches the {@code content} field for the term {@code 强} and prints
     * the {@code title} of up to 20 matching documents.
     *
     * <p>Failures are printed, not propagated. The reader and the searcher
     * are closed on every path. The reader is closed explicitly because a
     * searcher constructed from an existing reader does not close it.
     */
    public void searcher01() {
        try {
            IndexReader reader = IndexReader.open(FileIndexUtil.getDirectory());
            try {
                IndexSearcher searcher = new IndexSearcher(reader);
                try {
                    TermQuery query = new TermQuery(new Term("content", "强"));
                    TopDocs tds = searcher.search(query, 20);
                    for (ScoreDoc sd : tds.scoreDocs) {
                        Document doc = searcher.doc(sd.doc);
                        System.out.println(doc.get("title"));
                    }
                } finally {
                    searcher.close();
                }
            } finally {
                reader.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}