首先去官网下载lucene-4.7.0.zip,解压后找到这几个jar包加到项目库中(build path):
lucene-analyzers-common-4.7.0.jar
lucene-classification-4.7.0.jar
lucene-core-4.7.0.jar
lucene-demo-4.7.0.jar
lucene-queries-4.7.0.jar
lucene-queryparser-4.7.0.jar
一、索引创建如下:
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Date;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
public class testIndexer {
public static void main(String args[]){
final File docDir=new File("./myCollectionsTRAINs2");
if (!docDir.exists()||!docDir.canRead()) {
System.out.println("read error!");
System.exit(1);
}
Date startDate=new Date();
try {
Directory directory=FSDirectory.open(new File("index"));
Analyzer analyzer=new StandardAnalyzer(Version.LUCENE_47);
IndexWriterConfig iwConfig=new IndexWriterConfig(Version.LUCENE_47, analyzer);
iwConfig.setOpenMode(OpenMode.CREATE);
IndexWriter writer=new IndexWriter(directory,iwConfig);
indexDocs(writer, docDir);
writer.close();
Date end = new Date();
System.out.println(end.getTime() - startDate.getTime() + " total milliseconds");
} catch (Exception e) {
// TODO: handle exception
}
}
static void indexDocs(IndexWriter writer,File file) throws IOException{
if (file.canRead()) {
if (file.isDirectory()) {
String[] fileStrings=file.list();
if (fileStrings!=null) {
for (int i = 0; i < fileStrings.length; i++) {
indexDocs(writer, new File(file,fileStrings[i]));
}
}
}else {
FileInputStream fis;
try {
fis=new FileInputStream(file);
} catch (FileNotFoundException e) {
// TODO: handle exception
return;
}
try{
Document document=new Document();
Field pathField=new StringField("path", file.getPath(), Field.Store.YES);
document.add(pathField);
Field classField=new StringField("theclass", file.getParentFile().getName(), Field.Store.YES);
document.add(classField);
document.add(new LongField("modified", file.lastModified(), Field.Store.NO));//Field.Store.NO,default
document.add(new TextField("content", new BufferedReader(new InputStreamReader(fis,"UTF-8"))));
if (writer.getConfig().getOpenMode()==OpenMode.CREATE) {
System.out.println("adding"+file);
writer.addDocument(document);
}
}finally{
fis.close();
}
}
}
}
}
二、检索文本如下:
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import javax.print.Doc;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
public class testSearcher {
public static void main(String[] args) throws Exception{
String indexPath="index";String field="content";
IndexReader reader=DirectoryReader.open(FSDirectory.open(new File(indexPath)));
IndexSearcher searcher=new IndexSearcher(reader);
Analyzer analyzer=new StandardAnalyzer(Version.LUCENE_47);
BufferedReader in=new BufferedReader(new InputStreamReader(System.in,"UTF-8"));
QueryParser parser=new QueryParser(Version.LUCENE_47,field,analyzer);
while (true) {
System.out.println("Enter query:");
String line = in.readLine();
if (line==null||line.length()==-1) {
break;
}
line =line.trim();
if (line.length()==0) {
break;
}
Query query=parser.parse(line);
System.out.println("searching for:"+query.toString(field));
doPagingSearch(in, searcher, query, 10);
}
reader.close();
}
public static void doPagingSearch(BufferedReader in, IndexSearcher searcher, Query query, int hitsPerPage) throws IOException {
TopDocs resultsDocs=searcher.search(query, 5*hitsPerPage);
ScoreDoc[] hits=resultsDocs.scoreDocs;
int numTotalHits=resultsDocs.totalHits;
System.out.println(numTotalHits+" total matchings!");
int start=0;
int end=Math.min(numTotalHits, hitsPerPage);
while (true) {
if (end>hits.length) {
System.out
.println("Only results 1 - " + hits.length + " of "
+ numTotalHits
+ " total matching documents collected.");
System.out.println("Collect more (y/n) ?");
String line = in.readLine();
if (line.length() == 0 || line.charAt(0) == 'n') {
break;
}
hits = searcher.search(query, numTotalHits).scoreDocs;
}
end=Math.min(hits.length, start+hitsPerPage);
for (int i = 0; i < end; i++) {
System.out.println("doc="+hits[i].doc+" Score="+hits[i].score);
Document document=searcher.doc(hits[i].doc);
String path = document.get("path");
if (path != null) {
System.out.println((i + 1) + ". " + path);
String theclass = document.get("theclass");
if (theclass != null) {
System.out.println("Class: " + theclass);
}
} else {
System.out.println((i + 1) + ". "
+ "No path for this document");
}
}
if (numTotalHits>=end) {
boolean quit=false;
while (true) {
System.out.print("Press ");
if (start - hitsPerPage >= 0) {
System.out.print("(p)revious page, ");
}
if (start + hitsPerPage < numTotalHits) {
System.out.print("(n)ext page, ");
}
System.out
.println("(q)uit or enter number to jump to a page.");
String line = in.readLine();
if (line.length() == 0 || line.charAt(0) == 'q') {
quit = true;
break;
}
if (line.charAt(0) == 'p') {
start = Math.max(0, start - hitsPerPage);
break;
} else if (line.charAt(0) == 'n') {
if (start + hitsPerPage < numTotalHits) {
start += hitsPerPage;
}
break;
} else {
int page = Integer.parseInt(line);
if ((page - 1) * hitsPerPage < numTotalHits) {
start = (page - 1) * hitsPerPage;
break;
} else {
System.out.println("No such page");
}
}
}
if (quit) break;
end = Math.min(numTotalHits, start + hitsPerPage);
}
}
}
}
三、分类器(KNN和NB)训练和测试如下:
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.classification.ClassificationResult;
import org.apache.lucene.classification.Classifier;
import org.apache.lucene.classification.KNearestNeighborClassifier;
import org.apache.lucene.classification.SimpleNaiveBayesClassifier;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
public class testClassification {
public static void main(String args[]) throws IOException{
Date startDate = new Date();
ArrayList
counterArrayList=new ArrayList
();
boolean isKnearst=true;
KNearestNeighborClassifier kNearestNeighborClassifier=null;
SimpleNaiveBayesClassifier simpleNaiveBayesClassifier=null;
ClassificationResult
classificationResult=null;
if (isKnearst) {
kNearestNeighborClassifier=new KNearestNeighborClassifier(12);
}else {
simpleNaiveBayesClassifier=new SimpleNaiveBayesClassifier();
}
IndexReader reader=DirectoryReader.open(FSDirectory.open(new File("index")));
AtomicReaderContext context=reader.getContext().leaves().get(0);//very important!!--leaves=>list
AtomicReader atomicReader=context.reader();
Analyzer analyzer=new StandardAnalyzer(Version.LUCENE_47);
if (isKnearst) {
kNearestNeighborClassifier.train(atomicReader, "content", "theclass", analyzer);
}else {
simpleNaiveBayesClassifier.train(atomicReader, "content", "theclass", analyzer);
}
//
int lenSum=0,trueSum=0;
final File docDir=new File("./myCollectionsTESTs");
String[] subdoclist=docDir.list();
for (int i = 0; i < subdoclist.length; i++) {
counterArrayList.clear();
for (int ii = 0; ii < subdoclist.length; ii++) {
counterArrayList.add(0);
}
System.err.println("------------------------"+"now testing on "+classref(i)+"------------------------");
File docdir2=new File("./myCollectionsTESTs"+"/"+subdoclist[i]);
String[] fileStrings=docdir2.list();
if (fileStrings!=null) {
for (int k = 0; k < fileStrings.length; k++) {
// System.out.println(fileStrings[0]);
BufferedReader in = new BufferedReader(new FileReader(new File("./myCollectionsTESTs"+"/"+subdoclist[i]+"/"+fileStrings[k])));
String str;StringBuilder stringBuilder=new StringBuilder();
while ((str = in.readLine()) != null)
{
stringBuilder.append(str);
}
in.close();
//
if (isKnearst) {
classificationResult=kNearestNeighborClassifier.assignClass(stringBuilder.toString());
}else {
classificationResult=simpleNaiveBayesClassifier.assignClass(stringBuilder.toString());
}
BytesRef bytesRef=classificationResult.getAssignedClass();
byte[] refs=bytesRef.bytes;
StringBuilder stringBuilder2=new StringBuilder();
for (byte ref : refs) {
stringBuilder2.append((char)ref);
}
String classString=stringBuilder2.toString().trim();
counterArrayList.set(Integer.parseInt(classString.substring(1))-1, counterArrayList.get(Integer.parseInt(classString.substring(1))-1)+1);
}
}
System.out.println("-------------------------------------test results on "+classref(i)+"--------------------------------------");
for (int j = 0; j
"+counterArrayList.get(j)); } System.out.println("total number:"+fileStrings.length); lenSum+=fileStrings.length; System.out.println("true positives:"+counterArrayList.get(i)); trueSum+=counterArrayList.get(i); System.out.println("accuracy rate:"+(double)counterArrayList.get(i)/fileStrings.length); System.err.println("***************************************end for "+classref(i)+"************************************"); } Date end = new Date(); System.out.println(end.getTime() - startDate.getTime() + " total milliseconds"); System.out.println("final size:"+lenSum); System.out.println("final true positives:"+trueSum); System.out.println("final accuracy rate:"+(double)trueSum/lenSum); } public static String classref(int i){ switch (i) { case 0: return "course"; case 1: return "department"; case 2: return "faculty"; case 3: return "other"; case 4: return "project"; case 5: return "staff"; case 6: return "student"; default: return ""; } } }