最近一直在研究 Lucene,把入门的程序和大家分享:
对索引的操作类:
- public class IndexDao {
- public IndexDao() {
- try {
- indexWriter = new IndexWriter(Constants.INDEX_STORE_PATH,
- Constants.analyzer, MaxFieldLength.LIMITED);
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- public IndexDao(Directory dir) {
- try {
- indexWriter = new IndexWriter(dir,Constants.analyzer,MaxFieldLength.LIMITED);
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- public IndexDao( boolean isCreate) {
- try {
- indexWriter = new IndexWriter(Constants.INDEX_STORE_PATH,Constants.analyzer, isCreate,MaxFieldLength.LIMITED);
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- // 索引器
- private IndexWriter indexWriter = null ;
- /**
- * 添加/创建索引
- *
- * @param folder
- * @throws IOException
- * @throws CorruptIndexException
- */
- public void saveIndex(File folder, String[] unIndeies)
- throws CorruptIndexException, IOException {
- if (folder.isDirectory()) {
- String[] files = folder.list();
- for ( int i = 0 ; i < files.length; i++) {
- File f = new File(folder, files[i]);
- if (!f.isHidden()) {
- if (f.isDirectory()) {
- saveIndex(f, unIndeies);// ② 递归
- }
- String fileTyep = ReadFile.validateFile(f);
- for ( int j = 0 ; j < unIndeies.length; j++) {
- if (fileTyep.equalsIgnoreCase(unIndeies[j])) {
- System.out.println("正在建立索引 : " + f.getName() + "" );
- Document doc = ReadFile.indexFile(f);
- indexWriter.addDocument(doc);
- }
- }
- }
- }
- }
- }
- /**
- * Term是搜索的最小单位,代表某个 Field 中的一个关键词,如:<title, lucene> new Term( "title",
- * "lucene" ); new Term( "id", "5" ); new Term( "id", UUID );
- *
- * @param term
- */
- public void deleteIndex(Term term) {
- try {
- indexWriter.deleteDocuments(term);
- } catch (Exception e) {
- throw new RuntimeException(e);
- } finally {
- try {
- indexWriter.close();
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- }
- /**
- * 更新索引 indexWriter.deleteDocuments(term); indexWriter.addDocument(doc);
- *
- * @param term
- * @param doc
- */
- public void updateIndex(Term term, Document doc) {
- try {
- indexWriter.updateDocument(term, doc);
- } catch (Exception e) {
- throw new RuntimeException(e);
- } finally {
- try {
- indexWriter.close();
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- }
- /**
- * 查询 totalPage = recordCount / pageSize; if (recordCount % pageSize > 0)
- * totalPage++;
- *
- * @param queryString
- * @param firstResult
- * @param maxResults
- * @return
- */
- public QueryResult search(String queryString, int firstResult,
- int maxResults) {
- try {
- // 1,把要搜索的文本解析为 Query
- String[] fields = { "name" , "content" };
- Map<String, Float> boosts = new HashMap<String, Float>();
- boosts.put("name" , 2f);
- boosts.put("content" , 3f); //默认为1.0f
- QueryParser queryParser = new MultiFieldQueryParser(fields,
- Constants.analyzer, boosts);
- Query query = queryParser.parse(queryString);
- // Query query = IKQueryParser.parse("content", queryString);
- Date start = new Date();
- QueryResult result = search(query, firstResult, maxResults);
- Date end = new Date();
- System.out.println("检索完成,用时" + (end.getTime() - start.getTime())
- + "毫秒" );
- return result;
- } catch (Exception e) {
- throw new RuntimeException(e);
- }
- }
- public QueryResult search(Query query, int firstResult, int maxResults) {
- IndexSearcher indexSearcher = null ;
- try {
- // 2,进行查询
- indexSearcher = new IndexSearcher(Constants.INDEX_STORE_PATH);
- Filter filter = new RangeFilter( "size" ,
- NumberTools.longToString(0 ), NumberTools
- .longToString(1000000 ), true , true );
- // 排序
- Sort sort = new Sort();
- sort.setSort(new SortField( "size" )); // 默认为升序
- // sort.setSort(new SortField("size", true));
- TopDocs topDocs = indexSearcher.search(query, filter, 10000 , sort);
- int recordCount = topDocs.totalHits;
- List<Document> recordList = new ArrayList<Document>();
- // 准备高亮器
- Formatter formatter = new SimpleHTMLFormatter( "<font color='red'>" ,
- "</font>" );
- Scorer scorer = new QueryScorer(query);
- Highlighter highlighter = new Highlighter(formatter, scorer);
- Fragmenter fragmenter = new SimpleFragmenter( 50 );
- highlighter.setTextFragmenter(fragmenter);
- // 3,取出当前页的数据
- int end = Math.min(firstResult + maxResults, topDocs.totalHits);
- for ( int i = firstResult; i < end; i++) {
- ScoreDoc scoreDoc = topDocs.scoreDocs[i];
- int docSn = scoreDoc.doc; // 文档内部编号
- Document doc = indexSearcher.doc(docSn); // 根据编号取出相应的文档
- // 高亮 返回高亮后的结果,如果当前属性值中没有出现关键字,会返回 null
- String hc = highlighter.getBestFragment(Constants.analyzer,
- "content" , doc.get( "content" ));
- if (hc == null ) {
- String content = doc.get("content" );
- int endIndex = Math.min( 50 , content.length());
- hc = content.substring(0 , endIndex); // 最多前50个字符
- }
- doc.getField("content" ).setValue(hc);
- recordList.add(doc);
- }
- // 返回结果
- return new QueryResult(recordCount, recordList);
- } catch (Exception e) {
- throw new RuntimeException(e);
- } finally {
- try {
- indexSearcher.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- public void close() {
- // 对索引进行优化
- try {
- indexWriter.optimize();
- indexWriter.close();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- public void readIndex(String key, String value) {
- IndexReader reader;
- try {
- // Directory fsDir = FSDirectory.getDirectory(
- // Constants.INDEX_STORE_PATH, false);
- // if (IndexReader.isLocked(fsDir)) {
- // System.out.println("------unlock-----");
- // IndexReader.unlock(fsDir);
- // }
- reader = IndexReader.open(Constants.INDEX_STORE_PATH);
- for ( int i = 0 ; i < reader.numDocs(); i++)
- // System.out.println(reader.document(i));
- System.out.println("版本:" + reader.getVersion());
- System.out.println("索引内的文档数量:" + reader.numDocs());
- Term term = new Term(key, value);
- TermDocs docs = reader.termDocs(term);
- IndexSearcher indexSearcher = null ;
- indexSearcher = new IndexSearcher(Constants.INDEX_STORE_PATH);
- while (docs.next()) {
- int docSn = docs.doc(); // 文档内部编号
- Document doc = indexSearcher.doc(docSn); // 根据编号取出相应的文档
- System.out.println("文档路径 " + doc.get( "path" ));
- System.out.println("含有所查找的 " + term + "的Document的编号为: " + docs.doc());
- System.out.println("Term在文档中的出现 " + docs.freq()+ " 次" );
- }
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
/**
 * Small data-access helper around a Lucene index: adds files to the index,
 * deletes/updates documents by term, runs paged searches with highlighting and
 * dumps term information for debugging.
 *
 * <p>Write operations go through a single {@link IndexWriter} created in the
 * constructor. Searches open their own {@link IndexSearcher} on
 * {@code Constants.INDEX_STORE_PATH} and do not need the writer.
 *
 * <p>NOTE(review): {@link #deleteIndex(Term)} and
 * {@link #updateIndex(Term, Document)} close the writer when they finish, so an
 * instance cannot be used for further writes after one of those calls.
 */
public class IndexDao {
    /** Maximum number of hits requested from the searcher for one query. */
    private static final int MAX_HITS = 10000;

    /** Maximum length of the plain-text snippet used when nothing can be highlighted. */
    private static final int SNIPPET_LENGTH = 50;

    /** Index writer; stays {@code null} when a constructor failed to open it. */
    private IndexWriter indexWriter = null;

    /**
     * Opens a writer on {@code Constants.INDEX_STORE_PATH}.
     *
     * <p>A failure is only printed; the instance is still usable for searching
     * but every write method will fail.
     */
    public IndexDao() {
        try {
            indexWriter = new IndexWriter(Constants.INDEX_STORE_PATH,
                    Constants.analyzer, MaxFieldLength.LIMITED);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Opens a writer on the given directory.
     *
     * @param dir Lucene directory to write to
     */
    public IndexDao(Directory dir) {
        try {
            indexWriter = new IndexWriter(dir, Constants.analyzer, MaxFieldLength.LIMITED);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Opens a writer on {@code Constants.INDEX_STORE_PATH}.
     *
     * @param isCreate passed straight to the {@link IndexWriter} constructor as
     *                 its create flag (see the Lucene documentation for its
     *                 exact meaning)
     */
    public IndexDao(boolean isCreate) {
        try {
            indexWriter = new IndexWriter(Constants.INDEX_STORE_PATH,
                    Constants.analyzer, isCreate, MaxFieldLength.LIMITED);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Recursively walks {@code folder} and adds every non-hidden file whose
     * type (as reported by {@code ReadFile.validateFile}) matches one of the
     * given types to the index.
     *
     * <p>Sub-directories are descended into but are never indexed themselves,
     * and each file is added at most once even if {@code unIndeies} contains
     * the same type several times. Nothing happens when {@code folder} is not
     * a directory or cannot be listed.
     *
     * @param folder    root directory to scan
     * @param unIndeies file types to index, compared case-insensitively
     * @throws IOException           if the writer fails to add a document
     * @throws CorruptIndexException if the index is corrupt
     */
    public void saveIndex(File folder, String[] unIndeies)
            throws CorruptIndexException, IOException {
        if (!folder.isDirectory()) {
            return;
        }
        String[] files = folder.list();
        if (files == null) {
            // File.list() returns null on an I/O error or a vanished directory.
            return;
        }
        for (int i = 0; i < files.length; i++) {
            File f = new File(folder, files[i]);
            if (f.isHidden()) {
                continue;
            }
            if (f.isDirectory()) {
                saveIndex(f, unIndeies); // recurse, but do not index the directory itself
                continue;
            }
            String fileType = ReadFile.validateFile(f);
            for (int j = 0; j < unIndeies.length; j++) {
                if (fileType.equalsIgnoreCase(unIndeies[j])) {
                    System.out.println("正在建立索引 : " + f.getName() + "");
                    indexWriter.addDocument(ReadFile.indexFile(f));
                    break; // one document per file, even with duplicate type entries
                }
            }
        }
    }

    /**
     * Deletes every document containing the given term, then closes the writer.
     *
     * <p>A term is the smallest search unit: one keyword inside one field, for
     * example {@code new Term("title", "lucene")} or {@code new Term("id", "5")}.
     *
     * @param term term identifying the documents to delete
     * @throws RuntimeException wrapping any failure of the delete itself
     */
    public void deleteIndex(Term term) {
        try {
            indexWriter.deleteDocuments(term);
        } catch (Exception e) {
            throw new RuntimeException(e);
        } finally {
            closeWriterAfterWrite();
        }
    }

    /**
     * Replaces the documents matching {@code term} with {@code doc} (delete
     * followed by add, done by {@code IndexWriter.updateDocument}), then closes
     * the writer.
     *
     * @param term term identifying the documents to replace
     * @param doc  replacement document
     * @throws RuntimeException wrapping any failure of the update itself
     */
    public void updateIndex(Term term, Document doc) {
        try {
            indexWriter.updateDocument(term, doc);
        } catch (Exception e) {
            throw new RuntimeException(e);
        } finally {
            closeWriterAfterWrite();
        }
    }

    /** Closes the writer after a delete/update; a failure to close is only printed. */
    private void closeWriterAfterWrite() {
        try {
            indexWriter.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Parses {@code queryString} against the "name" and "content" fields and
     * returns one page of results. The elapsed search time is printed.
     *
     * <p>Callers can derive the page count as
     * {@code totalPage = recordCount / pageSize}, plus one when
     * {@code recordCount % pageSize > 0}.
     *
     * @param queryString query text in Lucene query syntax
     * @param firstResult index of the first hit to return
     * @param maxResults  maximum number of hits to return
     * @return total hit count plus the documents of the requested page
     * @throws RuntimeException if parsing or searching fails
     */
    public QueryResult search(String queryString, int firstResult,
            int maxResults) {
        try {
            // 1. Turn the query text into a Query; boosts override the 1.0f default.
            String[] fields = { "name", "content" };
            Map<String, Float> boosts = new HashMap<String, Float>();
            boosts.put("name", 2f);
            boosts.put("content", 3f);
            QueryParser queryParser = new MultiFieldQueryParser(fields,
                    Constants.analyzer, boosts);
            Query query = queryParser.parse(queryString);

            long startMillis = System.currentTimeMillis();
            QueryResult result = search(query, firstResult, maxResults);
            long elapsedMillis = System.currentTimeMillis() - startMillis;
            System.out.println("检索完成,用时" + elapsedMillis
                    + "毫秒");
            return result;
        } catch (RuntimeException e) {
            throw e; // already unchecked (e.g. from the inner search): do not wrap twice
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Runs {@code query} and returns one page of results, sorted ascending by
     * the "size" field, with the "content" field of every returned document
     * replaced by a highlighted fragment (or by its first
     * {@value #SNIPPET_LENGTH} characters when nothing could be highlighted).
     *
     * <p>Only documents whose "size" field lies in [0, 1000000] pass the
     * filter, and at most {@value #MAX_HITS} hits are retrievable; the reported
     * record count is the searcher's total hit count and may be larger.
     *
     * @param query       query to run
     * @param firstResult index of the first hit to return
     * @param maxResults  maximum number of hits to return
     * @return total hit count plus the documents of the requested page
     * @throws RuntimeException wrapping any failure while searching
     */
    public QueryResult search(Query query, int firstResult, int maxResults) {
        IndexSearcher indexSearcher = null;
        try {
            // 2. Run the query with a size filter and a size sort.
            indexSearcher = new IndexSearcher(Constants.INDEX_STORE_PATH);
            Filter filter = new RangeFilter("size",
                    NumberTools.longToString(0),
                    NumberTools.longToString(1000000), true, true);
            Sort sort = new Sort();
            sort.setSort(new SortField("size"));
            TopDocs topDocs = indexSearcher.search(query, filter, MAX_HITS, sort);
            int recordCount = topDocs.totalHits;
            List<Document> recordList = new ArrayList<Document>();

            // Highlighter wrapping matched terms in a red <font> tag.
            Formatter formatter = new SimpleHTMLFormatter("<font color='red'>",
                    "</font>");
            Scorer scorer = new QueryScorer(query);
            Highlighter highlighter = new Highlighter(formatter, scorer);
            Fragmenter fragmenter = new SimpleFragmenter(SNIPPET_LENGTH);
            highlighter.setTextFragmenter(fragmenter);

            // 3. Collect the requested page. totalHits can exceed the number of
            // hits actually fetched, so bound by the array length as well, and
            // add in long arithmetic so a huge maxResults cannot overflow.
            int available = Math.min(topDocs.totalHits, topDocs.scoreDocs.length);
            int end = (int) Math.min((long) firstResult + maxResults, (long) available);
            for (int i = firstResult; i < end; i++) {
                ScoreDoc scoreDoc = topDocs.scoreDocs[i];
                Document doc = indexSearcher.doc(scoreDoc.doc);
                String content = doc.get("content");
                if (content != null) {
                    // getBestFragment returns null when the text has no match.
                    String hc = highlighter.getBestFragment(Constants.analyzer,
                            "content", content);
                    if (hc == null) {
                        hc = content.substring(0, Math.min(SNIPPET_LENGTH, content.length()));
                    }
                    doc.getField("content").setValue(hc);
                }
                recordList.add(doc);
            }
            return new QueryResult(recordCount, recordList);
        } catch (Exception e) {
            throw new RuntimeException(e);
        } finally {
            // The searcher is null when its constructor threw; closing it
            // blindly would replace the real error with a NullPointerException.
            if (indexSearcher != null) {
                try {
                    indexSearcher.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Optimizes the index and closes the writer. The writer is closed even
     * when optimizing fails; I/O failures are only printed. Does nothing when
     * the writer was never opened.
     */
    public void close() {
        if (indexWriter == null) {
            return;
        }
        try {
            indexWriter.optimize();
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                indexWriter.close();
            } catch (CorruptIndexException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Debug helper: prints the index version and document count, then for
     * every document containing the term {@code key:value} prints its stored
     * "path", its internal document number and the term frequency.
     *
     * @param key   field name of the term
     * @param value term text
     */
    public void readIndex(String key, String value) {
        IndexReader reader = null;
        TermDocs docs = null;
        try {
            reader = IndexReader.open(Constants.INDEX_STORE_PATH);
            System.out.println("版本:" + reader.getVersion());
            System.out.println("索引内的文档数量:" + reader.numDocs());
            Term term = new Term(key, value);
            docs = reader.termDocs(term);
            while (docs.next()) {
                int docSn = docs.doc(); // internal document number
                Document doc = reader.document(docSn);
                System.out.println("文档路径 " + doc.get("path"));
                System.out.println("含有所查找的 " + term + "的Document的编号为: " + docSn);
                System.out.println("Term在文档中的出现 " + docs.freq() + " 次");
            }
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (docs != null) {
                try {
                    docs.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
读取文件工具类:
- public class ReadFile {
- public static String readWord(File f) {
- StringBuffer content = new StringBuffer( "" ); // 文档内容
- try {
- HWPFDocument doc = new HWPFDocument( new FileInputStream(f));
- Range range = doc.getRange();
- int paragraphCount = range.numParagraphs(); // 段落
- for ( int i = 0 ; i < paragraphCount; i++) { // 遍历段落读取数据
- Paragraph pp = range.getParagraph(i);
- content.append(pp.text());
- }
- // System.out.println("-------word--------"+content.toString());
- } catch (Exception e) {
- System.out.println("建立索引出错 : " + f.getAbsolutePath() + "" );
- e.printStackTrace();
- }
- return content.toString().trim();
- }
- public static String readPdf(File f){
- StringBuffer content = new StringBuffer( "" ); // 文档内容
- PDDocument pdfDocument = null ;
- try {
- if (f.length()> 10048576 ){
- DecimalFormat df = new DecimalFormat( "#.00" );
- System.out.println("---------------------文件大小------" +df.format(( double ) f.length() / 1048576 ) + "M" );
- return f.getName();
- }
- FileInputStream fis = new FileInputStream(f);
- PDFTextStripper stripper = new PDFTextStripper();
- pdfDocument = PDDocument.load(fis);
- if (pdfDocument.isEncrypted()){
- return f.getName();
- }
- StringWriter writer = new StringWriter();
- stripper.writeText(pdfDocument, writer);
- content.append(writer.getBuffer().toString());
- fis.close();
- } catch (IOException e) {
- System.out.println("建立索引出错 : " + f.getAbsolutePath() + "" );
- System.err.println("IOException=" + e);
- //System.exit(1);
- } finally {
- if (pdfDocument != null ) {
- // System.err.println("Closing document " + f + "...");
- org.pdfbox.cos.COSDocument cos = pdfDocument.getDocument();
- try {
- cos.close();
- // System.err.println("Closed " + cos);
- pdfDocument.close();
- } catch (IOException e) {
- System.out.println("建立索引出错 : " + f.getAbsolutePath() + "" );
- e.printStackTrace();
- }
- }
- }
- // System.out.println("-------pdf--------"+content.toString());
- return content.toString().trim();
- }
- public static String readHtml(File f) {
- StringBuffer content = new StringBuffer( "" );
- FileInputStream fis = null ;
- try {
- fis = new FileInputStream(f);
- // 读取页面 这里的字符编码要注意,要对上html头文件的一致,否则会出乱码
- BufferedReader reader = new BufferedReader( new InputStreamReader(fis, "gb2312" ));
- String line = null ;
- while ((line = reader.readLine()) != null ) {
- content.append(line + "\n" );
- }
- reader.close();
- } catch (Exception e) {
- System.out.println("建立索引出错 : " + f.getAbsolutePath() + "" );
- e.printStackTrace();
- }
- String contentString = content.toString();
- // System.out.println("---------htm索引----"+contentString);
- return contentString;
- }
- public static String readTxt(File f) {
- StringBuffer content = new StringBuffer( "" );
- try {
- BufferedReader reader = new BufferedReader( new InputStreamReader(
- new FileInputStream(f)));
- for (String line = null ; (line = reader.readLine()) != null ;) {
- content.append(line).append("\n" );
- }
- } catch (IOException e) {
- System.out.println("建立索引出错 : " + f.getAbsolutePath() + "" );
- e.printStackTrace();
- }
- return content.toString().trim();
- }
- public static String readExcel(File f,String fileType){
- StringBuffer content = new StringBuffer( "" );
- try {
- ExcelReader er=new ExcelReader(f,fileType);
- String line=er.readLine();
- content.append(line).append("\n" );
- while (line!= null ){
- line=er.readLine();
- content.append(line).append("\n" );
- }
- er.close();
- }catch (Exception e){
- System.out.println("建立索引出错 : " + f.getAbsolutePath() + "" );
- e.printStackTrace();
- }
- return content.toString();
- }
- public static String validateFile(File f) {
- String fileType = "otherType" ;
- String fileName = f.getName();
- if (fileName.lastIndexOf( '.' ) == - 1 ) {
- fileType = "dir" ;
- return fileType;
- }
- fileName = fileName.substring(fileName.lastIndexOf('.' ) + 1 , fileName
- .length());
- int i = 0 ;
- String [] extension=Constants.EXTENSION;
- for (i = 0 ; i < extension.length; i++) {
- if (fileName.equalsIgnoreCase(extension[i])) {
- fileType = extension[i];
- break ;
- }
- }
- return fileType;
- }
- public static Document indexFile(File f) {
- Document doc = new Document();
- try {
- doc.add(new Field( "name" , f.getName(), Store.YES, Index.ANALYZED));
- doc.add(new Field( "size" , NumberTools.longToString(f.length()),
- Store.YES, Index.NOT_ANALYZED));
- doc.add(new Field( "path" , f.getAbsolutePath(), Store.YES,
- Index.NOT_ANALYZED));
- String fileType = validateFile(f);
- if (fileType.equals( "txt" )) {
- doc.add(new Field( "content" , ReadFile.readTxt(f), Store.YES,
- Index.ANALYZED));
- } else if (fileType.equals( "pdf" )) {
- doc.add(new Field( "content" , ReadFile.readPdf(f), Store.YES,
- Index.ANALYZED));
- } else if (fileType.equals( "doc" )) {
- doc.add(new Field( "content" , ReadFile.readWord(f), Store.YES,
- Index.ANALYZED));
- } else if (fileType.equals( "htm" )) {
- doc.add(new Field( "content" , ReadFile.readHtml(f), Store.YES,
- Index.ANALYZED));
- } else if (fileType.equals( "xls" )){
- doc.add(new Field( "content" , ReadFile.readExcel(f, fileType), Store.YES,
- Index.ANALYZED));
- }else {
- doc.add(new Field( "content" , f.getName(), Store.YES, Index.ANALYZED));
- }
- } catch (Exception e) {
- System.out.println("建立索引出错 : " + f.getAbsolutePath() + "" );
- e.printStackTrace();
- }
- return doc;
- }
- }
/**
 * Static helpers that extract plain text from files (Word, PDF, HTML, text,
 * Excel) and build the Lucene {@link Document} stored in the index.
 *
 * <p>All readers are best-effort: a failure is printed and whatever text was
 * read so far (possibly none) is returned.
 */
public class ReadFile {
    /**
     * Extracts the text of a Word document by concatenating its paragraphs.
     *
     * @param f Word file to read
     * @return trimmed document text; empty when the file could not be read
     */
    public static String readWord(File f) {
        StringBuilder content = new StringBuilder();
        FileInputStream in = null;
        try {
            in = new FileInputStream(f);
            HWPFDocument doc = new HWPFDocument(in);
            Range range = doc.getRange();
            int paragraphCount = range.numParagraphs();
            for (int i = 0; i < paragraphCount; i++) {
                Paragraph pp = range.getParagraph(i);
                content.append(pp.text());
            }
        } catch (Exception e) {
            System.out.println("建立索引出错 : " + f.getAbsolutePath() + "");
            e.printStackTrace();
        } finally {
            closeAndReport(in);
        }
        return content.toString().trim();
    }

    /**
     * Extracts the text of a PDF file.
     *
     * <p>Files longer than 10048576 bytes and encrypted documents are not
     * parsed; the file name is returned instead.
     * NOTE(review): the size is reported in units of 1048576 bytes, so the
     * limit may have been meant as 10485760 (10 MiB) — confirm before changing.
     *
     * @param f PDF file to read
     * @return trimmed document text, the file name for skipped files, or an
     *         empty string when reading failed
     */
    public static String readPdf(File f) {
        if (f.length() > 10048576) {
            DecimalFormat df = new DecimalFormat("#.00");
            System.out.println("---------------------文件大小------"+df.format((double) f.length() / 1048576) + "M");
            return f.getName();
        }
        StringBuilder content = new StringBuilder();
        FileInputStream fis = null;
        PDDocument pdfDocument = null;
        try {
            fis = new FileInputStream(f);
            PDFTextStripper stripper = new PDFTextStripper();
            pdfDocument = PDDocument.load(fis);
            if (pdfDocument.isEncrypted()) {
                return f.getName(); // the finally block still releases the document and stream
            }
            StringWriter writer = new StringWriter();
            stripper.writeText(pdfDocument, writer);
            content.append(writer.getBuffer().toString());
        } catch (IOException e) {
            System.out.println("建立索引出错 : " + f.getAbsolutePath() + "");
            System.err.println("IOException=" + e);
        } finally {
            if (pdfDocument != null) {
                try {
                    pdfDocument.close();
                } catch (IOException e) {
                    System.out.println("建立索引出错 : " + f.getAbsolutePath() + "");
                    e.printStackTrace();
                }
            }
            closeAndReport(fis);
        }
        return content.toString().trim();
    }

    /**
     * Reads an HTML file as text, one {@code "\n"} after every line.
     *
     * <p>The file is decoded as gb2312; this must match the encoding declared
     * in the page, otherwise the text comes out garbled.
     *
     * @param f HTML file to read
     * @return file text (not trimmed); whatever was read before a failure
     */
    public static String readHtml(File f) {
        StringBuilder content = new StringBuilder();
        BufferedReader reader = null;
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(f);
            reader = new BufferedReader(new InputStreamReader(fis, "gb2312"));
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append("\n");
            }
        } catch (Exception e) {
            System.out.println("建立索引出错 : " + f.getAbsolutePath() + "");
            e.printStackTrace();
        } finally {
            // Closing the reader closes the stream; the stream is closed
            // directly only when the reader could not be created.
            if (reader != null) {
                closeAndReport(reader);
            } else {
                closeAndReport(fis);
            }
        }
        return content.toString();
    }

    /**
     * Reads a text file line by line using the platform default charset.
     *
     * @param f text file to read
     * @return trimmed file text with lines joined by {@code "\n"}
     */
    public static String readTxt(File f) {
        StringBuilder content = new StringBuilder();
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(
                    new FileInputStream(f)));
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append("\n");
            }
        } catch (IOException e) {
            System.out.println("建立索引出错 : " + f.getAbsolutePath() + "");
            e.printStackTrace();
        } finally {
            closeAndReport(reader);
        }
        return content.toString().trim();
    }

    /**
     * Reads an Excel file through {@code ExcelReader}, one {@code "\n"} after
     * every line, stopping at the first {@code null} line.
     *
     * @param f        Excel file to read
     * @param fileType file type handed to {@code ExcelReader}
     * @return the lines read (not trimmed); empty when nothing could be read
     */
    public static String readExcel(File f, String fileType) {
        StringBuilder content = new StringBuilder();
        ExcelReader er = null;
        try {
            er = new ExcelReader(f, fileType);
            // Test for end of input BEFORE appending, so the terminating null
            // is never written into the text as the literal "null".
            String line;
            while ((line = er.readLine()) != null) {
                content.append(line).append("\n");
            }
        } catch (Exception e) {
            System.out.println("建立索引出错 : " + f.getAbsolutePath() + "");
            e.printStackTrace();
        } finally {
            if (er != null) {
                try {
                    er.close();
                } catch (Exception e) {
                    System.out.println("建立索引出错 : " + f.getAbsolutePath() + "");
                    e.printStackTrace();
                }
            }
        }
        return content.toString();
    }

    /**
     * Classifies a file by its name.
     *
     * @param f file to classify
     * @return {@code "dir"} when the name contains no dot; the matching entry
     *         of {@code Constants.EXTENSION} (compared case-insensitively)
     *         when the extension is known; {@code "otherType"} otherwise
     */
    public static String validateFile(File f) {
        String fileName = f.getName();
        int dot = fileName.lastIndexOf('.');
        if (dot == -1) {
            return "dir";
        }
        String suffix = fileName.substring(dot + 1);
        String[] extension = Constants.EXTENSION;
        for (int i = 0; i < extension.length; i++) {
            if (suffix.equalsIgnoreCase(extension[i])) {
                return extension[i];
            }
        }
        return "otherType";
    }

    /**
     * Builds the index document for a file with the fields "name" (analyzed),
     * "size" and "path" (not analyzed) and "content" (analyzed); all fields
     * are stored.
     *
     * <p>Any exception is printed and the document built so far is returned.
     *
     * @param f file to index
     * @return the document, possibly missing fields after a failure
     */
    public static Document indexFile(File f) {
        Document doc = new Document();
        try {
            doc.add(new Field("name", f.getName(), Store.YES, Index.ANALYZED));
            doc.add(new Field("size", NumberTools.longToString(f.length()),
                    Store.YES, Index.NOT_ANALYZED));
            doc.add(new Field("path", f.getAbsolutePath(), Store.YES,
                    Index.NOT_ANALYZED));
            doc.add(new Field("content", extractContent(f, validateFile(f)),
                    Store.YES, Index.ANALYZED));
        } catch (Exception e) {
            System.out.println("建立索引出错 : " + f.getAbsolutePath() + "");
            e.printStackTrace();
        }
        return doc;
    }

    /** Picks the reader for the file type; unknown types are indexed by file name only. */
    private static String extractContent(File f, String fileType) {
        if (fileType.equals("txt")) {
            return readTxt(f);
        }
        if (fileType.equals("pdf")) {
            return readPdf(f);
        }
        if (fileType.equals("doc")) {
            return readWord(f);
        }
        if (fileType.equals("htm")) {
            return readHtml(f);
        }
        if (fileType.equals("xls")) {
            return readExcel(f, fileType);
        }
        return f.getName();
    }

    /** Closes a stream or reader if it was opened; a failure to close is only printed. */
    private static void closeAndReport(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
- public class ExcelReader {
- // 创建文件输入流
- private BufferedReader reader = null ;
- // 文件类型
- private String filetype;
- // 文件二进制输入流
- private InputStream is = null ;
- // 当前的Sheet
- private int currSheet;
- // 当前位置
- private int currPosition;
- // Sheet数量
- private int numOfSheets;
- // HSSFWorkbook
- HSSFWorkbook workbook = null ;
- // 设置Cell之间以空格分割
- private static String EXCEL_LINE_DELIMITER = " " ;
- // 设置最大列数
- // private static int MAX_EXCEL_COLUMNS = 64;
- // 构造函数创建一个ExcelReader
- public