/**
 * Search-engine utility: splits a large text file into small chunk files,
 * builds a Lucene index over those chunks and runs a keyword query against
 * the index.
 *
 * @author ruanzhiyong6496
 * @version 1.0
 */
public class Lucene
{
    /** Directory in which the Lucene index is stored. */
    private static final String INDEX_DIR = "D:\\index";

    /** Directory in which the small chunk files are written. */
    private static final String DATA_DIR = "D:\\small";

    /** A chunk is flushed once at least this many bytes have been buffered. */
    private static final int MAX_CHUNK_BYTES = 1024 * 10;

    /**
     * Splits a large text file into small files under {@link #DATA_DIR}.
     * <p>
     * Lines are accumulated (each terminated with {@code "\r\n"}) until the
     * buffered text reaches {@link #MAX_CHUNK_BYTES} bytes in the platform
     * default charset; the buffer is then written to
     * {@code <baseName><n>.txt}, where {@code n} starts at 0. Whatever is
     * still buffered when the input ends is written as a final chunk, so no
     * trailing lines are lost.
     * <p>
     * I/O failures are reported on the console and not propagated.
     *
     * @param filepath
     *            path of the large file to split
     */
    private static void splitToSmallFiles(String filepath)
    {
        String fileName = baseName(filepath);
        BufferedReader reader = null;
        try
        {
            File dir = new File(DATA_DIR);
            if (!dir.exists() && !dir.mkdirs() && !dir.isDirectory())
            {
                throw new IOException("cannot create directory " + dir);
            }
            reader = new BufferedReader(new FileReader(filepath));
            StringBuilder buffer = new StringBuilder();
            // Byte size is tracked incrementally instead of re-encoding the
            // whole buffer on every line.
            int bufferedBytes = 0;
            int filePointer = 0;
            String line = reader.readLine();
            while (line != null)
            {
                String terminated = line + "\r\n";
                buffer.append(terminated);
                bufferedBytes += terminated.getBytes().length;
                if (bufferedBytes >= MAX_CHUNK_BYTES)
                {
                    writeChunk(new File(dir, fileName + filePointer + ".txt"),
                            buffer.toString());
                    filePointer++;
                    buffer.setLength(0);
                    bufferedBytes = 0;
                }
                line = reader.readLine();
            }
            // Flush the tail that never reached the size threshold.
            if (buffer.length() > 0)
            {
                writeChunk(new File(dir, fileName + filePointer + ".txt"),
                        buffer.toString());
            }
            System.out.println("The file hava splited to small files !");
        }
        catch (FileNotFoundException e)
        {
            System.out.println("file not found !");
            e.printStackTrace();
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
        finally
        {
            if (reader != null)
            {
                try
                {
                    reader.close();
                }
                catch (IOException e)
                {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Returns the file name of {@code filepath} without its directory part
     * and without its extension. Both {@code '/'} and {@code '\\'} are
     * accepted as separators; a dot that belongs to a directory name is not
     * treated as an extension separator.
     *
     * @param filepath
     *            path to extract the base name from
     * @return the base name, possibly empty
     */
    private static String baseName(String filepath)
    {
        int separator = Math.max(filepath.lastIndexOf('/'),
                filepath.lastIndexOf('\\'));
        int dot = filepath.lastIndexOf('.');
        if (dot > separator)
        {
            return filepath.substring(separator + 1, dot);
        }
        return filepath.substring(separator + 1);
    }

    /**
     * Writes {@code content} to {@code file}, always closing the writer.
     *
     * @param file
     *            destination file (overwritten if it exists)
     * @param content
     *            text to write
     * @throws IOException
     *             if the file cannot be opened or written
     */
    private static void writeChunk(File file, String content)
            throws IOException
    {
        BufferedWriter writer = new BufferedWriter(new FileWriter(file));
        try
        {
            writer.write(content);
        }
        finally
        {
            writer.close();
        }
    }

    /**
     * Indexes the {@code .txt} files under {@link #DATA_DIR} into an index
     * stored in {@link #INDEX_DIR} (the writer is opened with the create
     * flag set to {@code true}).
     *
     * @return the number of documents reported by the writer after indexing
     * @throws IOException
     *             if {@link #DATA_DIR} is missing or not a directory, or if
     *             indexing fails
     */
    private static int index() throws IOException
    {
        File dataDr = new File(DATA_DIR);
        if (!dataDr.exists() || !dataDr.isDirectory())
        {
            throw new IOException(dataDr
                    + " does not exist or is not a directory");
        }
        IndexWriter writer = new IndexWriter(FSDirectory.open(new File(
                INDEX_DIR)), new StandardAnalyzer(Version.LUCENE_CURRENT),
                true, IndexWriter.MaxFieldLength.LIMITED);
        try
        {
            indexDirectory(writer, DATA_DIR);
            int numIndexed = writer.numDocs();
            writer.optimize();
            return numIndexed;
        }
        finally
        {
            // Always close so a failure does not leave the writer open.
            writer.close();
        }
    }

    /**
     * Recursively walks {@code dir} and indexes every {@code .txt} file
     * found. A directory that does not exist or cannot be listed is skipped.
     *
     * @param writer
     *            index writer receiving the documents
     * @param dir
     *            path of the directory to traverse
     * @throws IOException
     *             if indexing a file fails
     */
    private static void indexDirectory(IndexWriter writer, String dir)
            throws IOException
    {
        File dr = new File(dir);
        if (!dr.exists())
        {
            return;
        }
        File[] files = dr.listFiles();
        if (files == null)
        {
            // Not a directory, or an I/O error occurred while listing it.
            return;
        }
        for (int i = 0; i < files.length; i++)
        {
            File f = files[i];
            if (f.isDirectory())
            {
                // Use the full path: the bare name would be resolved against
                // the working directory instead of the parent directory.
                indexDirectory(writer, f.getPath());
            }
            else if (f.getName().endsWith(".txt"))
            {
                indexFile(writer, f);
            }
        }
    }

    /**
     * Indexes a single text file. The document gets a {@code contents} field
     * fed from a reader over the file and a stored, analyzed
     * {@code filename} field holding the canonical path. Hidden, missing or
     * unreadable files are skipped.
     *
     * @param writer
     *            index writer receiving the document
     * @param f
     *            file to index
     * @throws IOException
     *             if the file cannot be read or the document cannot be added
     */
    private static void indexFile(IndexWriter writer, File f)
            throws IOException
    {
        if (f.isHidden() || !f.exists() || !f.canRead())
        {
            return;
        }
        // Resolve the path before opening the reader so a failure here
        // cannot leak an open file handle.
        String canonicalPath = f.getCanonicalPath();
        FileReader contents = new FileReader(f);
        try
        {
            Document doc = new Document();
            doc.add(new Field("contents", contents));
            doc.add(new Field("filename", canonicalPath, Field.Store.YES,
                    Field.Index.ANALYZED));
            writer.addDocument(doc);
        }
        finally
        {
            // Closing an already-closed reader has no effect, so this is
            // safe whether or not the reader was closed during addDocument.
            contents.close();
        }
    }

    /**
     * Splits {@code filepath} into small files, (re)builds the index and
     * prints the {@code filename} field of the best matches for
     * {@code keyword} in the {@code contents} field, followed by the total
     * hit count and the elapsed search time.
     * <p>
     * Any exception is printed to the console and not propagated.
     *
     * @param filepath
     *            path of the large file to split and index
     * @param keyword
     *            query string, parsed with the Lucene query parser
     * @param topnum
     *            maximum number of hits to collect and print
     */
    public static void search(String filepath, String keyword, int topnum)
    {
        IndexSearcher is = null;
        try
        {
            splitToSmallFiles(filepath);
            index();
            is = new IndexSearcher(FSDirectory.open(new File(INDEX_DIR)),
                    true);// read-only
            String field = "contents";
            QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, field,
                    new StandardAnalyzer(Version.LUCENE_CURRENT));
            Query query = parser.parse(keyword);
            TopScoreDocCollector collector = TopScoreDocCollector.create(
                    topnum, false);
            long start = System.currentTimeMillis();
            is.search(query, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;
            for (int i = 0; i < hits.length; i++)
            {
                Document doc = is.doc(hits[i].doc);
                System.out.println(doc.getField("filename"));
            }
            long end = System.currentTimeMillis();
            System.out.println("Found " + collector.getTotalHits()
                    + " document(s) (in " + (end - start)
                    + " milliseconds) that matched query '" + keyword + "':");
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
        finally
        {
            if (is != null)
            {
                try
                {
                    is.close();
                }
                catch (IOException e)
                {
                    e.printStackTrace();
                }
            }
        }
    }
}