最近学习 Lucene,在别人代码的基础上做了一个小例子,分享出来以便共同学习!
import java.io.InputStream;
import lia.handlingtypes.framework.DocumentHandlerException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.textmining.text.extraction.WordExtractor;
public class DocDocumentHandler implements DocumentHandler ... {
public Document getDocument(InputStream is) throws Exception ...{
// TODO Auto-generated method stub
String bodyText = null;
try ...{
bodyText = new WordExtractor().extractText(is);
}
catch (Exception e) ...{
throw new DocumentHandlerException(
"Cannot extract text from a Word document", e);
}
if ((bodyText != null) && (bodyText.trim().length() > 0)) ...{
Document doc = new Document();
doc.add(Field.UnStored("body", bodyText));
return doc;
}
return null;
}
}
import java.io.InputStream;
import org.apache.lucene.document.Document;
public interface DocumentHandler ... {
Document getDocument(InputStream is)
throws Exception;
}
import java.io.InputStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.w3c.tidy.Tidy;
public class HtmlDocumentHandler implements DocumentHandler ... {
public Document getDocument(InputStream is) throws Exception ...{
// TODO Auto-generated method stub
Tidy tidy = new Tidy();
tidy.setQuiet(true);
tidy.setShowWarnings(false);
org.w3c.dom.Document root = tidy.parseDOM(is, null);
Element rawDoc = root.getDocumentElement();
Document doc = new Document();
String title = getTitle(rawDoc);
String body = getBody(rawDoc);
if ((title != null) && (!title.equals(""))) ...{
doc.add(Field.Text("title", title));
}
if ((body != null) && (!body.equals(""))) ...{
doc.add(Field.Text("body", body));
}
return doc;
}
private String getTitle(Element rawDoc) ...{
if (rawDoc == null) ...{
return null;
}
String title = "";
NodeList children = rawDoc.getElementsByTagName("title");
if (children.getLength() > 0) ...{
Element titleElement = ((Element) children.item(0));
Text text = (Text) titleElement.getFirstChild();
if (text != null) ...{
title = text.getData();
}
}
return title;
}
/** *//**
* Gets the body text of the HTML document.
*
* @rawDoc the DOM Element to extract body Node from
* @return the body text
*/
private String getBody(Element rawDoc) ...{
if (rawDoc == null) ...{
return null;
}
String body = "";
NodeList children = rawDoc.getElementsByTagName("body");
if (children.getLength() > 0) ...{
body = getText(children.item(0));
}
return body;
}
/** *//**
* Extracts text from the DOM node.
*
* @param node a DOM node
* @return the text value of the node
*/
private String getText(Node node) ...{
NodeList children = node.getChildNodes();
StringBuffer sb = new StringBuffer();
for (int i = 0; i < children.getLength(); i++) ...{
Node child = children.item(i);
switch (child.getNodeType()) ...{
case Node.ELEMENT_NODE:
sb.append(getText(child));
sb.append(" ");
break;
case Node.TEXT_NODE:
sb.append(((Text) child).getData());
break;
}
}
return sb.toString();
}
}
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import lia.handlingtypes.framework.DocumentHandlerException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.pdfbox.cos.COSDocument;
import org.pdfbox.encryption.DecryptDocument;
import org.pdfbox.exceptions.CryptographyException;
import org.pdfbox.exceptions.InvalidPasswordException;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;
import org.pdfbox.searchengine.lucene.LucenePDFDocument;
import org.pdfbox.util.PDFTextStripper;
public class PdfDocumentHandler implements DocumentHandler ... {
public static String password = "-password";
public Document getDocument(InputStream is) throws Exception ...{
// TODO Auto-generated method stub
COSDocument cosDoc = null;
try ...{
cosDoc = parseDocument(is);
}
catch (IOException e) ...{
closeCOSDocument(cosDoc);
throw new DocumentHandlerException(
"Cannot parse PDF document", e);
}
// decrypt the PDF document, if it is encrypted
try ...{
if (cosDoc.isEncrypted()) ...{
DecryptDocument decryptor = new DecryptDocument(cosDoc);
decryptor.decryptDocument(password);
}
}
catch (CryptographyException e) ...{
closeCOSDocument(cosDoc);
throw new DocumentHandlerException(
"Cannot decrypt PDF document", e);
}
catch (InvalidPasswordException e) ...{
closeCOSDocument(cosDoc);
throw new DocumentHandlerException(
"Cannot decrypt PDF document", e);
}
catch (IOException e) ...{
closeCOSDocument(cosDoc);
throw new DocumentHandlerException(
"Cannot decrypt PDF document", e);
}
// extract PDF document's textual content
String docText = null;
try ...{
PDFTextStripper stripper = new PDFTextStripper();
docText = stripper.getText(new PDDocument(cosDoc));
}
catch (IOException e) ...{
closeCOSDocument(cosDoc);
throw new DocumentHandlerException(
"Cannot parse PDF document", e);
// String errS = e.toString();
// if (errS.toLowerCase().indexOf("font") != -1) {
// }
}
Document doc = new Document();
if (docText != null) ...{
doc.add(Field.UnStored("body", docText));
}
// extract PDF document's meta-data
PDDocument pdDoc = null;
try ...{
pdDoc = new PDDocument(cosDoc);
PDDocumentInformation docInfo =
pdDoc.getDocumentInformation();
String author = docInfo.getAuthor();
String title = docInfo.getTitle();
String keywords = docInfo.getKeywords();
String summary = docInfo.getSubject();
if ((author != null) && (!author.equals(""))) ...{
doc.add(Field.Text("author", author));
}
if ((title != null) && (!title.equals(""))) ...{
doc.add(Field.Text("title", title));
}
if ((keywords != null) && (!keywords.equals(""))) ...{
doc.add(Field.Text("keywords", keywords));
}
if ((summary != null) && (!summary.equals(""))) ...{
doc.add(Field.Text("summary", summary));
}
}
catch (Exception e) ...{
closeCOSDocument(cosDoc);
closePDDocument(pdDoc);
System.err.println("Cannot get PDF document meta-data: "
+ e.getMessage());
}
return doc;
}
private static COSDocument parseDocument(InputStream is)
throws IOException ...{
PDFParser parser = new PDFParser(is);
parser.parse();
return parser.getDocument();
}
private void closeCOSDocument(COSDocument cosDoc) ...{
if (cosDoc != null) ...{
try ...{
cosDoc.close();
}
catch (IOException e) ...{
// eat it, what else can we do?
}
}
}
private void closePDDocument(PDDocument pdDoc) ...{
if (pdDoc != null) ...{
try ...{
pdDoc.close();
}
catch (IOException e) ...{
// eat it, what else can we do?
}
}
}
}
import java.io.IOException;
import java.io.InputStream;
import javax.swing.text.BadLocationException;
import javax.swing.text.DefaultStyledDocument;
import javax.swing.text.rtf.RTFEditorKit;
import lia.handlingtypes.framework.DocumentHandlerException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
public class RtfDocumentHandler implements DocumentHandler ... {
public Document getDocument(InputStream is) throws Exception ...{
// TODO Auto-generated method stub
String bodyText = null;
DefaultStyledDocument styledDoc = new DefaultStyledDocument();
try ...{
new RTFEditorKit().read(is, styledDoc, 0);
bodyText = styledDoc.getText(0, styledDoc.getLength());
}
catch (IOException e) ...{
throw new DocumentHandlerException(
"Cannot extract text from a RTF document", e);
}
catch (BadLocationException e) ...{
throw new DocumentHandlerException(
"Cannot extract text from a RTF document", e);
}
if (bodyText != null) ...{
Document doc = new Document();
doc.add(Field.UnStored("body", bodyText));
return doc;
}
return null;
}
}
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import lia.handlingtypes.framework.DocumentHandlerException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
public class TxtDocumentHandler implements DocumentHandler ... {
public Document getDocument(InputStream is) throws Exception ...{
// TODO Auto-generated method stub
String bodyText = "";
try ...{
BufferedReader br =
new BufferedReader(new InputStreamReader(is));
String line = null;
while ((line = br.readLine()) != null) ...{
bodyText += line;
}
br.close();
}
catch(IOException e) ...{
throw new DocumentHandlerException(
"Cannot read the text document", e);
}
if (!bodyText.equals("")) ...{
Document doc = new Document();
doc.add(Field.UnStored("body", bodyText));
return doc;
}
return null;
}
}
import java.io.File;
import java.util.Date;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.mira.lucene.analysis.IK_CAnalyzer;
/** */ /**
* This code was originally written for
* Erik's Lucene intro java.net article
*/
public class Searcher ... {
public static void main(String[] args) throws Exception ...{
if (args.length != 1) ...{
throw new Exception("Usage: java " + Searcher.class.getName()
+ " <index dir> <query>");
}
// File indexDir = new File(args[0]);
// String q = args[1];
File indexDir = new File("E:/LUCENE/index");
String q=args[0];
if (!indexDir.exists() || !indexDir.isDirectory()) ...{
throw new Exception(indexDir +
" does not exist or is not a directory.");
}
search(indexDir, q);
}
public static void search(File indexDir, String q)
throws Exception ...{
Directory fsDir = FSDirectory.getDirectory(indexDir, false);
IndexSearcher is = new IndexSearcher(fsDir);
Query query = QueryParser.parse(q, "body",
new IK_CAnalyzer());
//在“body”中查找,必须要已经在create index中已经定义好
//QueryParser .parse(String query, String field, Analyzer analyzer),例如:
//query为检索词, field为检索的字段名, analyzer为分析器
long start = new Date().getTime();
// BooleanQuery m_BooleanQuery = new BooleanQuery();
// m_BooleanQuery.add(query,true,false);
Hits hits = is.search(query); //search
long end = new Date().getTime();
System.err.println("Found " + hits.length() +
" document(s) (in " + (end - start) +
" milliseconds) that matched query '" +
q + "':");
for (int i = 0; i < hits.length(); i++) ...{
Document doc = hits.doc(i);
System.out.println(doc.get("filename"));
// System.out.println(doc.getField("contents"));
}
}
}
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Date;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
// 下面两个是网上下载的别人的中文分词器,对文中的中文进行分割
import org.mira.lucene.analysis.MIK_CAnalyzer; // (最大全切分)
import org.mira.lucene.analysis.IK_CAnalyzer; // (细粒度全切分)<------引用类
/** */ /**
* This code was originally written for
* Erik's Lucene intro java.net article
*/
public class Indexer ... {
// private static Document doc = new Document();
public static void main(String[] args) throws Exception ...{
// if (args.length != 2) {
// throw new Exception("Usage: java " + Indexer.class.getName()
// + " <index dir> <data dir>");
// }
// File indexDir = new File(args[0]);
// File dataDir = new File(args[1]);
File indexDir = new File("E:/LUCENE/index");
File dataDir = new File("E:/LUCENE/test");
long start = new Date().getTime();
int numIndexed = index(indexDir, dataDir);
long end = new Date().getTime();
System.out.println("Indexing " + numIndexed + " files took "
+ (end - start) + " milliseconds");
// test 分词功能:
// System.out.println(new IK_CAnalyzer().tokenStream("用户本地系统中必须安装有Word的应用程序"," "));
}
public static int index(File indexDir, File dataDir)
throws IOException ...{
if (!dataDir.exists() || !dataDir.isDirectory()) ...{
throw new IOException(dataDir
+ " does not exist or is not a directory");
}
IndexWriter writer = new IndexWriter(indexDir,
new IK_CAnalyzer(), true);
writer.setUseCompoundFile(false);
indexDirectory(writer, dataDir);
int numIndexed = writer.docCount();
writer.optimize();
writer.close();
return numIndexed;
}
private static void indexDirectory(IndexWriter writer, File dir)
throws IOException ...{
Document doc = new Document();
File[] files = dir.listFiles();
for (int i = 0; i < files.length; i++) ...{
File f = files[i];
if (f.isDirectory()) ...{
indexDirectory(writer, f); // recurse
} else
...{
try ...{
doc=Factory(f);
} catch (Exception e) ...{
// TODO Auto-generated catch block
e.printStackTrace();
}
writer.addDocument(doc); //一定要将生成的Document加到Writer中去。
}
}
}
private static Document Factory(File f) throws Exception ...{
Document doc = new Document();
DocumentHandler handler=null;
if (f.getName().endsWith(".txt") || f.getName().endsWith(".java")) ...{
//doc = getTxtDocument(new FileInputStream(f));
handler=new TxtDocumentHandler();
} else if (f.getName().endsWith(".doc")) ...{
// doc = getDocument(new FileInputStream(f));
handler=new DocDocumentHandler();
} else if (f.getName().endsWith(".pdf")) ...{
// doc = LucenePDFDocument.getDocument(f);
handler=new PdfDocumentHandler();
} else if (f.getName().endsWith(".rtf")) ...{
// doc = getRtfDocument(new FileInputStream(f));
handler=new RtfDocumentHandler();
} else if (f.getName().endsWith(".html")
|| f.getName().endsWith(".htm")) ...{
// doc = getHtmlDocument(new FileInputStream(f));
handler=new HtmlDocumentHandler();
}
if(handler!=null)...{
doc=handler.getDocument(new FileInputStream(f));
doc.add(Field.Keyword("filename", f.getCanonicalPath()));
System.out.println("Indexing " + f.getCanonicalPath());
}
return doc;
}
}
import java.io.InputStream;
import lia.handlingtypes.framework.DocumentHandlerException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.textmining.text.extraction.WordExtractor;
public class DocDocumentHandler implements DocumentHandler ... {
public Document getDocument(InputStream is) throws Exception ...{
// TODO Auto-generated method stub
String bodyText = null;
try ...{
bodyText = new WordExtractor().extractText(is);
}
catch (Exception e) ...{
throw new DocumentHandlerException(
"Cannot extract text from a Word document", e);
}
if ((bodyText != null) && (bodyText.trim().length() > 0)) ...{
Document doc = new Document();
doc.add(Field.UnStored("body", bodyText));
return doc;
}
return null;
}
}
import java.io.InputStream;
import org.apache.lucene.document.Document;
public interface DocumentHandler ... {
Document getDocument(InputStream is)
throws Exception;
}
import java.io.InputStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.w3c.tidy.Tidy;
public class HtmlDocumentHandler implements DocumentHandler ... {
public Document getDocument(InputStream is) throws Exception ...{
// TODO Auto-generated method stub
Tidy tidy = new Tidy();
tidy.setQuiet(true);
tidy.setShowWarnings(false);
org.w3c.dom.Document root = tidy.parseDOM(is, null);
Element rawDoc = root.getDocumentElement();
Document doc = new Document();
String title = getTitle(rawDoc);
String body = getBody(rawDoc);
if ((title != null) && (!title.equals(""))) ...{
doc.add(Field.Text("title", title));
}
if ((body != null) && (!body.equals(""))) ...{
doc.add(Field.Text("body", body));
}
return doc;
}
private String getTitle(Element rawDoc) ...{
if (rawDoc == null) ...{
return null;
}
String title = "";
NodeList children = rawDoc.getElementsByTagName("title");
if (children.getLength() > 0) ...{
Element titleElement = ((Element) children.item(0));
Text text = (Text) titleElement.getFirstChild();
if (text != null) ...{
title = text.getData();
}
}
return title;
}
/** *//**
* Gets the body text of the HTML document.
*
* @rawDoc the DOM Element to extract body Node from
* @return the body text
*/
private String getBody(Element rawDoc) ...{
if (rawDoc == null) ...{
return null;
}
String body = "";
NodeList children = rawDoc.getElementsByTagName("body");
if (children.getLength() > 0) ...{
body = getText(children.item(0));
}
return body;
}
/** *//**
* Extracts text from the DOM node.
*
* @param node a DOM node
* @return the text value of the node
*/
private String getText(Node node) ...{
NodeList children = node.getChildNodes();
StringBuffer sb = new StringBuffer();
for (int i = 0; i < children.getLength(); i++) ...{
Node child = children.item(i);
switch (child.getNodeType()) ...{
case Node.ELEMENT_NODE:
sb.append(getText(child));
sb.append(" ");
break;
case Node.TEXT_NODE:
sb.append(((Text) child).getData());
break;
}
}
return sb.toString();
}
}
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import lia.handlingtypes.framework.DocumentHandlerException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.pdfbox.cos.COSDocument;
import org.pdfbox.encryption.DecryptDocument;
import org.pdfbox.exceptions.CryptographyException;
import org.pdfbox.exceptions.InvalidPasswordException;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;
import org.pdfbox.searchengine.lucene.LucenePDFDocument;
import org.pdfbox.util.PDFTextStripper;
public class PdfDocumentHandler implements DocumentHandler ... {
public static String password = "-password";
public Document getDocument(InputStream is) throws Exception ...{
// TODO Auto-generated method stub
COSDocument cosDoc = null;
try ...{
cosDoc = parseDocument(is);
}
catch (IOException e) ...{
closeCOSDocument(cosDoc);
throw new DocumentHandlerException(
"Cannot parse PDF document", e);
}
// decrypt the PDF document, if it is encrypted
try ...{
if (cosDoc.isEncrypted()) ...{
DecryptDocument decryptor = new DecryptDocument(cosDoc);
decryptor.decryptDocument(password);
}
}
catch (CryptographyException e) ...{
closeCOSDocument(cosDoc);
throw new DocumentHandlerException(
"Cannot decrypt PDF document", e);
}
catch (InvalidPasswordException e) ...{
closeCOSDocument(cosDoc);
throw new DocumentHandlerException(
"Cannot decrypt PDF document", e);
}
catch (IOException e) ...{
closeCOSDocument(cosDoc);
throw new DocumentHandlerException(
"Cannot decrypt PDF document", e);
}
// extract PDF document's textual content
String docText = null;
try ...{
PDFTextStripper stripper = new PDFTextStripper();
docText = stripper.getText(new PDDocument(cosDoc));
}
catch (IOException e) ...{
closeCOSDocument(cosDoc);
throw new DocumentHandlerException(
"Cannot parse PDF document", e);
// String errS = e.toString();
// if (errS.toLowerCase().indexOf("font") != -1) {
// }
}
Document doc = new Document();
if (docText != null) ...{
doc.add(Field.UnStored("body", docText));
}
// extract PDF document's meta-data
PDDocument pdDoc = null;
try ...{
pdDoc = new PDDocument(cosDoc);
PDDocumentInformation docInfo =
pdDoc.getDocumentInformation();
String author = docInfo.getAuthor();
String title = docInfo.getTitle();
String keywords = docInfo.getKeywords();
String summary = docInfo.getSubject();
if ((author != null) && (!author.equals(""))) ...{
doc.add(Field.Text("author", author));
}
if ((title != null) && (!title.equals(""))) ...{
doc.add(Field.Text("title", title));
}
if ((keywords != null) && (!keywords.equals(""))) ...{
doc.add(Field.Text("keywords", keywords));
}
if ((summary != null) && (!summary.equals(""))) ...{
doc.add(Field.Text("summary", summary));
}
}
catch (Exception e) ...{
closeCOSDocument(cosDoc);
closePDDocument(pdDoc);
System.err.println("Cannot get PDF document meta-data: "
+ e.getMessage());
}
return doc;
}
private static COSDocument parseDocument(InputStream is)
throws IOException ...{
PDFParser parser = new PDFParser(is);
parser.parse();
return parser.getDocument();
}
private void closeCOSDocument(COSDocument cosDoc) ...{
if (cosDoc != null) ...{
try ...{
cosDoc.close();
}
catch (IOException e) ...{
// eat it, what else can we do?
}
}
}
private void closePDDocument(PDDocument pdDoc) ...{
if (pdDoc != null) ...{
try ...{
pdDoc.close();
}
catch (IOException e) ...{
// eat it, what else can we do?
}
}
}
}
import java.io.IOException;
import java.io.InputStream;
import javax.swing.text.BadLocationException;
import javax.swing.text.DefaultStyledDocument;
import javax.swing.text.rtf.RTFEditorKit;
import lia.handlingtypes.framework.DocumentHandlerException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
public class RtfDocumentHandler implements DocumentHandler ... {
public Document getDocument(InputStream is) throws Exception ...{
// TODO Auto-generated method stub
String bodyText = null;
DefaultStyledDocument styledDoc = new DefaultStyledDocument();
try ...{
new RTFEditorKit().read(is, styledDoc, 0);
bodyText = styledDoc.getText(0, styledDoc.getLength());
}
catch (IOException e) ...{
throw new DocumentHandlerException(
"Cannot extract text from a RTF document", e);
}
catch (BadLocationException e) ...{
throw new DocumentHandlerException(
"Cannot extract text from a RTF document", e);
}
if (bodyText != null) ...{
Document doc = new Document();
doc.add(Field.UnStored("body", bodyText));
return doc;
}
return null;
}
}
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import lia.handlingtypes.framework.DocumentHandlerException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
public class TxtDocumentHandler implements DocumentHandler ... {
public Document getDocument(InputStream is) throws Exception ...{
// TODO Auto-generated method stub
String bodyText = "";
try ...{
BufferedReader br =
new BufferedReader(new InputStreamReader(is));
String line = null;
while ((line = br.readLine()) != null) ...{
bodyText += line;
}
br.close();
}
catch(IOException e) ...{
throw new DocumentHandlerException(
"Cannot read the text document", e);
}
if (!bodyText.equals("")) ...{
Document doc = new Document();
doc.add(Field.UnStored("body", bodyText));
return doc;
}
return null;
}
}
import java.io.File;
import java.util.Date;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.mira.lucene.analysis.IK_CAnalyzer;
/** */ /**
* This code was originally written for
* Erik's Lucene intro java.net article
*/
public class Searcher ... {
public static void main(String[] args) throws Exception ...{
if (args.length != 1) ...{
throw new Exception("Usage: java " + Searcher.class.getName()
+ " <index dir> <query>");
}
// File indexDir = new File(args[0]);
// String q = args[1];
File indexDir = new File("E:/LUCENE/index");
String q=args[0];
if (!indexDir.exists() || !indexDir.isDirectory()) ...{
throw new Exception(indexDir +
" does not exist or is not a directory.");
}
search(indexDir, q);
}
public static void search(File indexDir, String q)
throws Exception ...{
Directory fsDir = FSDirectory.getDirectory(indexDir, false);
IndexSearcher is = new IndexSearcher(fsDir);
Query query = QueryParser.parse(q, "body",
new IK_CAnalyzer());
//在“body”中查找,必须要已经在create index中已经定义好
//QueryParser .parse(String query, String field, Analyzer analyzer),例如:
//query为检索词, field为检索的字段名, analyzer为分析器
long start = new Date().getTime();
// BooleanQuery m_BooleanQuery = new BooleanQuery();
// m_BooleanQuery.add(query,true,false);
Hits hits = is.search(query); //search
long end = new Date().getTime();
System.err.println("Found " + hits.length() +
" document(s) (in " + (end - start) +
" milliseconds) that matched query '" +
q + "':");
for (int i = 0; i < hits.length(); i++) ...{
Document doc = hits.doc(i);
System.out.println(doc.get("filename"));
// System.out.println(doc.getField("contents"));
}
}
}
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Date;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
// 下面两个是网上下载的别人的中文分词器,对文中的中文进行分割
import org.mira.lucene.analysis.MIK_CAnalyzer; // (最大全切分)
import org.mira.lucene.analysis.IK_CAnalyzer; // (细粒度全切分)<------引用类
/** */ /**
* This code was originally written for
* Erik's Lucene intro java.net article
*/
public class Indexer ... {
// private static Document doc = new Document();
public static void main(String[] args) throws Exception ...{
// if (args.length != 2) {
// throw new Exception("Usage: java " + Indexer.class.getName()
// + " <index dir> <data dir>");
// }
// File indexDir = new File(args[0]);
// File dataDir = new File(args[1]);
File indexDir = new File("E:/LUCENE/index");
File dataDir = new File("E:/LUCENE/test");
long start = new Date().getTime();
int numIndexed = index(indexDir, dataDir);
long end = new Date().getTime();
System.out.println("Indexing " + numIndexed + " files took "
+ (end - start) + " milliseconds");
// test 分词功能:
// System.out.println(new IK_CAnalyzer().tokenStream("用户本地系统中必须安装有Word的应用程序"," "));
}
public static int index(File indexDir, File dataDir)
throws IOException ...{
if (!dataDir.exists() || !dataDir.isDirectory()) ...{
throw new IOException(dataDir
+ " does not exist or is not a directory");
}
IndexWriter writer = new IndexWriter(indexDir,
new IK_CAnalyzer(), true);
writer.setUseCompoundFile(false);
indexDirectory(writer, dataDir);
int numIndexed = writer.docCount();
writer.optimize();
writer.close();
return numIndexed;
}
private static void indexDirectory(IndexWriter writer, File dir)
throws IOException ...{
Document doc = new Document();
File[] files = dir.listFiles();
for (int i = 0; i < files.length; i++) ...{
File f = files[i];
if (f.isDirectory()) ...{
indexDirectory(writer, f); // recurse
} else
...{
try ...{
doc=Factory(f);
} catch (Exception e) ...{
// TODO Auto-generated catch block
e.printStackTrace();
}
writer.addDocument(doc); //一定要将生成的Document加到Writer中去。
}
}
}
private static Document Factory(File f) throws Exception ...{
Document doc = new Document();
DocumentHandler handler=null;
if (f.getName().endsWith(".txt") || f.getName().endsWith(".java")) ...{
//doc = getTxtDocument(new FileInputStream(f));
handler=new TxtDocumentHandler();
} else if (f.getName().endsWith(".doc")) ...{
// doc = getDocument(new FileInputStream(f));
handler=new DocDocumentHandler();
} else if (f.getName().endsWith(".pdf")) ...{
// doc = LucenePDFDocument.getDocument(f);
handler=new PdfDocumentHandler();
} else if (f.getName().endsWith(".rtf")) ...{
// doc = getRtfDocument(new FileInputStream(f));
handler=new RtfDocumentHandler();
} else if (f.getName().endsWith(".html")
|| f.getName().endsWith(".htm")) ...{
// doc = getHtmlDocument(new FileInputStream(f));
handler=new HtmlDocumentHandler();
}
if(handler!=null)...{
doc=handler.getDocument(new FileInputStream(f));
doc.add(Field.Keyword("filename", f.getCanonicalPath()));
System.out.println("Indexing " + f.getCanonicalPath());
}
return doc;
}
}