lucene入门

最新推荐文章于 2024-07-07 21:39:43 发布

luxiangxing

最新推荐文章于 2024-07-07 21:39:43 发布

阅读量62

点赞数

分类专栏： lucene 文章标签： lucene F# Excel J#

本文链接：https://blog.csdn.net/luxiangxing/article/details/83766028

版权

lucene 专栏收录该内容

10 篇文章 0 订阅

订阅专栏

最近一直在研究 Lucene，把入门的程序和大家分享：

对索引的操作类：

Java代码

/**
 * Thin data-access helper around a Lucene index stored at
 * {@code Constants.INDEX_STORE_PATH}: adds files to the index, deletes and
 * updates documents by term, runs paged/highlighted searches and dumps
 * term statistics to standard output.
 */
public class IndexDao {

	/** Maximum number of hits requested from Lucene for one search. */
	private static final int MAX_HITS = 10000;

	/** Size of a highlight fragment and of the fallback excerpt, in characters. */
	private static final int SUMMARY_LENGTH = 50;

	// Index writer; remains null when the constructor failed to open it.
	private IndexWriter indexWriter = null;

	/**
	 * Opens a writer on {@code Constants.INDEX_STORE_PATH}. A failure is only
	 * printed; the writer then stays null.
	 */
	public IndexDao() {
		try {
			indexWriter = new IndexWriter(Constants.INDEX_STORE_PATH,
					Constants.analyzer, MaxFieldLength.LIMITED);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Opens a writer on the given directory. A failure is only printed; the
	 * writer then stays null.
	 *
	 * @param dir directory holding the index
	 */
	public IndexDao(Directory dir) {
		try {
			indexWriter = new IndexWriter(dir, Constants.analyzer,
					MaxFieldLength.LIMITED);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Opens a writer on {@code Constants.INDEX_STORE_PATH}, passing
	 * {@code isCreate} through to the writer. A failure is only printed; the
	 * writer then stays null.
	 *
	 * @param isCreate forwarded as the writer's "create" flag
	 */
	public IndexDao(boolean isCreate) {
		try {
			indexWriter = new IndexWriter(Constants.INDEX_STORE_PATH,
					Constants.analyzer, isCreate, MaxFieldLength.LIMITED);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Recursively walks {@code folder} and adds one document per non-hidden
	 * file whose type (as reported by {@code ReadFile.validateFile}) matches,
	 * ignoring case, one of the entries in {@code unIndeies}. Does nothing
	 * when {@code folder} is not a directory.
	 *
	 * @param folder    root directory to index
	 * @param unIndeies file types to index
	 * @throws IOException           if a directory cannot be listed or the
	 *                               writer fails
	 * @throws CorruptIndexException if the index is corrupt
	 */
	public void saveIndex(File folder, String[] unIndeies)
			throws CorruptIndexException, IOException {
		if (!folder.isDirectory()) {
			return;
		}
		String[] files = folder.list();
		if (files == null) {
			// File.list() returns null on an I/O error; report it instead of
			// failing with a NullPointerException.
			throw new IOException("Cannot list directory: "
					+ folder.getAbsolutePath());
		}
		for (int i = 0; i < files.length; i++) {
			File f = new File(folder, files[i]);
			if (f.isHidden()) {
				continue;
			}
			if (f.isDirectory()) {
				// Directories are only descended into, never indexed as files.
				saveIndex(f, unIndeies);
				continue;
			}
			String fileType = ReadFile.validateFile(f);
			for (int j = 0; j < unIndeies.length; j++) {
				if (fileType.equalsIgnoreCase(unIndeies[j])) {
					System.out.println("正在建立索引 : " + f.getName() + "");
					indexWriter.addDocument(ReadFile.indexFile(f));
					// Stop at the first match so duplicate entries do not
					// index the same file twice.
					break;
				}
			}
		}
	}

	/**
	 * Deletes every document containing {@code term}, e.g.
	 * {@code new Term("title", "lucene")} or {@code new Term("id", "5")}.
	 * The writer is closed afterwards in all cases.
	 * NOTE(review): later write calls on this instance (including
	 * {@link #close()}) will then hit an already-closed writer - confirm
	 * this one-shot usage is intended.
	 *
	 * @param term term identifying the documents to delete
	 * @throws RuntimeException wrapping any failure of the delete
	 */
	public void deleteIndex(Term term) {
		try {
			indexWriter.deleteDocuments(term);
		} catch (Exception e) {
			throw new RuntimeException(e);
		} finally {
			closeWriterAfterWrite();
		}
	}

	/**
	 * Replaces the documents containing {@code term} with {@code doc}
	 * (delete followed by add). The writer is closed afterwards in all cases;
	 * see the note on {@link #deleteIndex(Term)}.
	 *
	 * @param term term identifying the documents to replace
	 * @param doc  replacement document
	 * @throws RuntimeException wrapping any failure of the update
	 */
	public void updateIndex(Term term, Document doc) {
		try {
			indexWriter.updateDocument(term, doc);
		} catch (Exception e) {
			throw new RuntimeException(e);
		} finally {
			closeWriterAfterWrite();
		}
	}

	/** Closes the writer if it was opened, printing (not propagating) failures. */
	private void closeWriterAfterWrite() {
		if (indexWriter == null) {
			return;
		}
		try {
			indexWriter.close();
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Parses {@code queryString} against the "name" and "content" fields
	 * (boosts 2 and 3; the default boost is 1.0f), runs the search and prints
	 * the elapsed time. Callers can derive the page count as
	 * {@code recordCount / pageSize}, plus one when there is a remainder.
	 *
	 * @param queryString text to search for
	 * @param firstResult zero-based index of the first hit to return
	 * @param maxResults  maximum number of hits to return
	 * @return total hit count plus the documents of the requested page
	 * @throws RuntimeException wrapping any parse or search failure
	 */
	public QueryResult search(String queryString, int firstResult,
			int maxResults) {
		try {
			// 1. Turn the search text into a Query.
			String[] fields = { "name", "content" };
			Map<String, Float> boosts = new HashMap<String, Float>();
			boosts.put("name", 2f);
			boosts.put("content", 3f);
			QueryParser queryParser = new MultiFieldQueryParser(fields,
					Constants.analyzer, boosts);
			Query query = queryParser.parse(queryString);

			long startMillis = System.currentTimeMillis();
			QueryResult result = search(query, firstResult, maxResults);
			long elapsedMillis = System.currentTimeMillis() - startMillis;
			System.out.println("检索完成，用时" + elapsedMillis + "毫秒");
			return result;
		} catch (Exception e) {
			throw new RuntimeException(e);
		}
	}

	/**
	 * Runs {@code query} restricted to documents whose "size" field lies in
	 * [0, 1000000], sorted by "size", and returns one page of results. The
	 * stored "content" of each returned document is replaced by a highlighted
	 * fragment, or by its first 50 characters when nothing was highlighted.
	 * At most 10000 hits are fetched, so pages beyond that are empty even if
	 * the reported total is larger.
	 *
	 * @param query       query to execute
	 * @param firstResult zero-based index of the first hit to return;
	 *                    negative values are treated as 0
	 * @param maxResults  maximum number of hits to return
	 * @return total hit count plus the documents of the requested page
	 * @throws RuntimeException wrapping any search failure
	 */
	public QueryResult search(Query query, int firstResult, int maxResults) {
		IndexSearcher indexSearcher = null;
		try {
			// 2. Execute the query with the size filter and sort order.
			indexSearcher = new IndexSearcher(Constants.INDEX_STORE_PATH);
			Filter filter = new RangeFilter("size",
					NumberTools.longToString(0),
					NumberTools.longToString(1000000), true, true);
			Sort sort = new Sort();
			sort.setSort(new SortField("size")); // ascending by default
			TopDocs topDocs = indexSearcher.search(query, filter, MAX_HITS,
					sort);
			int recordCount = topDocs.totalHits;
			List<Document> recordList = new ArrayList<Document>();

			// Highlighter wrapping matches in a red <font> tag.
			Highlighter highlighter = new Highlighter(
					new SimpleHTMLFormatter("<font color='red'>", "</font>"),
					new QueryScorer(query));
			highlighter.setTextFragmenter(new SimpleFragmenter(SUMMARY_LENGTH));

			// 3. Collect the requested page. The upper bound is the number of
			// hits actually fetched, not totalHits, which may exceed it.
			int start = Math.max(0, firstResult);
			int end = (int) Math.min((long) start + maxResults,
					(long) topDocs.scoreDocs.length);
			for (int i = start; i < end; i++) {
				int docSn = topDocs.scoreDocs[i].doc; // internal document id
				Document doc = indexSearcher.doc(docSn);
				String content = doc.get("content");
				if (content == null) {
					content = "";
				}
				// getBestFragment returns null when the text has no match.
				String hc = highlighter.getBestFragment(Constants.analyzer,
						"content", content);
				if (hc == null) {
					hc = content.substring(0,
							Math.min(SUMMARY_LENGTH, content.length()));
				}
				if (doc.getField("content") != null) {
					doc.getField("content").setValue(hc);
				}
				recordList.add(doc);
			}
			return new QueryResult(recordCount, recordList);
		} catch (Exception e) {
			throw new RuntimeException(e);
		} finally {
			// The searcher is null when opening it failed; guard so the
			// original exception is not masked by a NullPointerException.
			if (indexSearcher != null) {
				try {
					indexSearcher.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * Optimizes the index and closes the writer. The writer is closed even
	 * when optimizing fails; I/O failures are printed, not propagated. Does
	 * nothing when the writer was never opened.
	 */
	public void close() {
		if (indexWriter == null) {
			return;
		}
		try {
			try {
				indexWriter.optimize();
			} finally {
				indexWriter.close();
			}
		} catch (IOException e) {
			// CorruptIndexException is an IOException and lands here too.
			e.printStackTrace();
		}
	}

	/**
	 * Prints the index version, the document count and, for every document
	 * containing the term {@code key:value}, its stored "path", its internal
	 * id and the term frequency. I/O failures are printed, not propagated.
	 *
	 * @param key   field name of the term
	 * @param value text of the term
	 */
	public void readIndex(String key, String value) {
		IndexReader reader = null;
		TermDocs docs = null;
		try {
			reader = IndexReader.open(Constants.INDEX_STORE_PATH);
			System.out.println("版本：" + reader.getVersion());
			System.out.println("索引内的文档数量：" + reader.numDocs());
			Term term = new Term(key, value);
			docs = reader.termDocs(term);
			while (docs.next()) {
				int docSn = docs.doc(); // internal document id
				// Load through the same reader that produced the id.
				Document doc = reader.document(docSn);
				System.out.println("文档路径 " + doc.get("path"));
				System.out.println("含有所查找的 " + term + "的Document的编号为: " + docSn);
				System.out.println("Term在文档中的出现 " + docs.freq() + " 次");
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (docs != null) {
				try {
					docs.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			if (reader != null) {
				try {
					reader.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}
}

/**
 * Thin data-access helper around a Lucene index stored at
 * {@code Constants.INDEX_STORE_PATH}: adds files to the index, deletes and
 * updates documents by term, runs paged/highlighted searches and dumps
 * term statistics to standard output.
 */
public class IndexDao {

	/** Maximum number of hits requested from Lucene for one search. */
	private static final int MAX_HITS = 10000;

	/** Size of a highlight fragment and of the fallback excerpt, in characters. */
	private static final int SUMMARY_LENGTH = 50;

	// Index writer; remains null when the constructor failed to open it.
	private IndexWriter indexWriter = null;

	/**
	 * Opens a writer on {@code Constants.INDEX_STORE_PATH}. A failure is only
	 * printed; the writer then stays null.
	 */
	public IndexDao() {
		try {
			indexWriter = new IndexWriter(Constants.INDEX_STORE_PATH,
					Constants.analyzer, MaxFieldLength.LIMITED);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Opens a writer on the given directory. A failure is only printed; the
	 * writer then stays null.
	 *
	 * @param dir directory holding the index
	 */
	public IndexDao(Directory dir) {
		try {
			indexWriter = new IndexWriter(dir, Constants.analyzer,
					MaxFieldLength.LIMITED);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Opens a writer on {@code Constants.INDEX_STORE_PATH}, passing
	 * {@code isCreate} through to the writer. A failure is only printed; the
	 * writer then stays null.
	 *
	 * @param isCreate forwarded as the writer's "create" flag
	 */
	public IndexDao(boolean isCreate) {
		try {
			indexWriter = new IndexWriter(Constants.INDEX_STORE_PATH,
					Constants.analyzer, isCreate, MaxFieldLength.LIMITED);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Recursively walks {@code folder} and adds one document per non-hidden
	 * file whose type (as reported by {@code ReadFile.validateFile}) matches,
	 * ignoring case, one of the entries in {@code unIndeies}. Does nothing
	 * when {@code folder} is not a directory.
	 *
	 * @param folder    root directory to index
	 * @param unIndeies file types to index
	 * @throws IOException           if a directory cannot be listed or the
	 *                               writer fails
	 * @throws CorruptIndexException if the index is corrupt
	 */
	public void saveIndex(File folder, String[] unIndeies)
			throws CorruptIndexException, IOException {
		if (!folder.isDirectory()) {
			return;
		}
		String[] files = folder.list();
		if (files == null) {
			// File.list() returns null on an I/O error; report it instead of
			// failing with a NullPointerException.
			throw new IOException("Cannot list directory: "
					+ folder.getAbsolutePath());
		}
		for (int i = 0; i < files.length; i++) {
			File f = new File(folder, files[i]);
			if (f.isHidden()) {
				continue;
			}
			if (f.isDirectory()) {
				// Directories are only descended into, never indexed as files.
				saveIndex(f, unIndeies);
				continue;
			}
			String fileType = ReadFile.validateFile(f);
			for (int j = 0; j < unIndeies.length; j++) {
				if (fileType.equalsIgnoreCase(unIndeies[j])) {
					System.out.println("正在建立索引 : " + f.getName() + "");
					indexWriter.addDocument(ReadFile.indexFile(f));
					// Stop at the first match so duplicate entries do not
					// index the same file twice.
					break;
				}
			}
		}
	}

	/**
	 * Deletes every document containing {@code term}, e.g.
	 * {@code new Term("title", "lucene")} or {@code new Term("id", "5")}.
	 * The writer is closed afterwards in all cases.
	 * NOTE(review): later write calls on this instance (including
	 * {@link #close()}) will then hit an already-closed writer - confirm
	 * this one-shot usage is intended.
	 *
	 * @param term term identifying the documents to delete
	 * @throws RuntimeException wrapping any failure of the delete
	 */
	public void deleteIndex(Term term) {
		try {
			indexWriter.deleteDocuments(term);
		} catch (Exception e) {
			throw new RuntimeException(e);
		} finally {
			closeWriterAfterWrite();
		}
	}

	/**
	 * Replaces the documents containing {@code term} with {@code doc}
	 * (delete followed by add). The writer is closed afterwards in all cases;
	 * see the note on {@link #deleteIndex(Term)}.
	 *
	 * @param term term identifying the documents to replace
	 * @param doc  replacement document
	 * @throws RuntimeException wrapping any failure of the update
	 */
	public void updateIndex(Term term, Document doc) {
		try {
			indexWriter.updateDocument(term, doc);
		} catch (Exception e) {
			throw new RuntimeException(e);
		} finally {
			closeWriterAfterWrite();
		}
	}

	/** Closes the writer if it was opened, printing (not propagating) failures. */
	private void closeWriterAfterWrite() {
		if (indexWriter == null) {
			return;
		}
		try {
			indexWriter.close();
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Parses {@code queryString} against the "name" and "content" fields
	 * (boosts 2 and 3; the default boost is 1.0f), runs the search and prints
	 * the elapsed time. Callers can derive the page count as
	 * {@code recordCount / pageSize}, plus one when there is a remainder.
	 *
	 * @param queryString text to search for
	 * @param firstResult zero-based index of the first hit to return
	 * @param maxResults  maximum number of hits to return
	 * @return total hit count plus the documents of the requested page
	 * @throws RuntimeException wrapping any parse or search failure
	 */
	public QueryResult search(String queryString, int firstResult,
			int maxResults) {
		try {
			// 1. Turn the search text into a Query.
			String[] fields = { "name", "content" };
			Map<String, Float> boosts = new HashMap<String, Float>();
			boosts.put("name", 2f);
			boosts.put("content", 3f);
			QueryParser queryParser = new MultiFieldQueryParser(fields,
					Constants.analyzer, boosts);
			Query query = queryParser.parse(queryString);

			long startMillis = System.currentTimeMillis();
			QueryResult result = search(query, firstResult, maxResults);
			long elapsedMillis = System.currentTimeMillis() - startMillis;
			System.out.println("检索完成，用时" + elapsedMillis + "毫秒");
			return result;
		} catch (Exception e) {
			throw new RuntimeException(e);
		}
	}

	/**
	 * Runs {@code query} restricted to documents whose "size" field lies in
	 * [0, 1000000], sorted by "size", and returns one page of results. The
	 * stored "content" of each returned document is replaced by a highlighted
	 * fragment, or by its first 50 characters when nothing was highlighted.
	 * At most 10000 hits are fetched, so pages beyond that are empty even if
	 * the reported total is larger.
	 *
	 * @param query       query to execute
	 * @param firstResult zero-based index of the first hit to return;
	 *                    negative values are treated as 0
	 * @param maxResults  maximum number of hits to return
	 * @return total hit count plus the documents of the requested page
	 * @throws RuntimeException wrapping any search failure
	 */
	public QueryResult search(Query query, int firstResult, int maxResults) {
		IndexSearcher indexSearcher = null;
		try {
			// 2. Execute the query with the size filter and sort order.
			indexSearcher = new IndexSearcher(Constants.INDEX_STORE_PATH);
			Filter filter = new RangeFilter("size",
					NumberTools.longToString(0),
					NumberTools.longToString(1000000), true, true);
			Sort sort = new Sort();
			sort.setSort(new SortField("size")); // ascending by default
			TopDocs topDocs = indexSearcher.search(query, filter, MAX_HITS,
					sort);
			int recordCount = topDocs.totalHits;
			List<Document> recordList = new ArrayList<Document>();

			// Highlighter wrapping matches in a red <font> tag.
			Highlighter highlighter = new Highlighter(
					new SimpleHTMLFormatter("<font color='red'>", "</font>"),
					new QueryScorer(query));
			highlighter.setTextFragmenter(new SimpleFragmenter(SUMMARY_LENGTH));

			// 3. Collect the requested page. The upper bound is the number of
			// hits actually fetched, not totalHits, which may exceed it.
			int start = Math.max(0, firstResult);
			int end = (int) Math.min((long) start + maxResults,
					(long) topDocs.scoreDocs.length);
			for (int i = start; i < end; i++) {
				int docSn = topDocs.scoreDocs[i].doc; // internal document id
				Document doc = indexSearcher.doc(docSn);
				String content = doc.get("content");
				if (content == null) {
					content = "";
				}
				// getBestFragment returns null when the text has no match.
				String hc = highlighter.getBestFragment(Constants.analyzer,
						"content", content);
				if (hc == null) {
					hc = content.substring(0,
							Math.min(SUMMARY_LENGTH, content.length()));
				}
				if (doc.getField("content") != null) {
					doc.getField("content").setValue(hc);
				}
				recordList.add(doc);
			}
			return new QueryResult(recordCount, recordList);
		} catch (Exception e) {
			throw new RuntimeException(e);
		} finally {
			// The searcher is null when opening it failed; guard so the
			// original exception is not masked by a NullPointerException.
			if (indexSearcher != null) {
				try {
					indexSearcher.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * Optimizes the index and closes the writer. The writer is closed even
	 * when optimizing fails; I/O failures are printed, not propagated. Does
	 * nothing when the writer was never opened.
	 */
	public void close() {
		if (indexWriter == null) {
			return;
		}
		try {
			try {
				indexWriter.optimize();
			} finally {
				indexWriter.close();
			}
		} catch (IOException e) {
			// CorruptIndexException is an IOException and lands here too.
			e.printStackTrace();
		}
	}

	/**
	 * Prints the index version, the document count and, for every document
	 * containing the term {@code key:value}, its stored "path", its internal
	 * id and the term frequency. I/O failures are printed, not propagated.
	 *
	 * @param key   field name of the term
	 * @param value text of the term
	 */
	public void readIndex(String key, String value) {
		IndexReader reader = null;
		TermDocs docs = null;
		try {
			reader = IndexReader.open(Constants.INDEX_STORE_PATH);
			System.out.println("版本：" + reader.getVersion());
			System.out.println("索引内的文档数量：" + reader.numDocs());
			Term term = new Term(key, value);
			docs = reader.termDocs(term);
			while (docs.next()) {
				int docSn = docs.doc(); // internal document id
				// Load through the same reader that produced the id.
				Document doc = reader.document(docSn);
				System.out.println("文档路径 " + doc.get("path"));
				System.out.println("含有所查找的 " + term + "的Document的编号为: " + docSn);
				System.out.println("Term在文档中的出现 " + docs.freq() + " 次");
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (docs != null) {
				try {
					docs.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			if (reader != null) {
				try {
					reader.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}
}

读取文件工具类：

Java代码

/**
 * Static helpers that extract plain text from Word, PDF, HTML, text and
 * Excel files and wrap a file into a Lucene {@code Document}. Read failures
 * are printed and result in whatever text was collected so far.
 */
public class ReadFile {

	// PDFs larger than this many bytes are not parsed.
	// NOTE(review): 10048576 is kept from the original; it may have been
	// meant as 10 * 1048576 (10 MiB) - confirm.
	private static final long MAX_PDF_BYTES = 10048576;

	/**
	 * Returns the concatenated paragraph text of a Word document, trimmed.
	 *
	 * @param f Word file to read
	 * @return extracted text; empty or partial when reading failed
	 */
	public static String readWord(File f) {
		StringBuilder content = new StringBuilder();
		FileInputStream fis = null;
		try {
			fis = new FileInputStream(f);
			HWPFDocument doc = new HWPFDocument(fis);
			Range range = doc.getRange();
			int paragraphCount = range.numParagraphs();
			for (int i = 0; i < paragraphCount; i++) {
				content.append(range.getParagraph(i).text());
			}
		} catch (Exception e) {
			System.out.println("建立索引出错 : " + f.getAbsolutePath());
			e.printStackTrace();
		} finally {
			closeAndReport(fis);
		}
		return content.toString().trim();
	}

	/**
	 * Returns the text of a PDF, trimmed. For files above the size limit and
	 * for encrypted documents the file name is returned instead.
	 *
	 * @param f PDF file to read
	 * @return extracted text, the file name (oversized/encrypted), or the
	 *         empty/partial text collected before a read failure
	 */
	public static String readPdf(File f) {
		if (f.length() > MAX_PDF_BYTES) {
			DecimalFormat df = new DecimalFormat("#.00");
			System.out.println("---------------------文件大小------"
					+ df.format((double) f.length() / 1048576) + "M");
			return f.getName();
		}
		StringBuilder content = new StringBuilder();
		FileInputStream fis = null;
		PDDocument pdfDocument = null;
		try {
			fis = new FileInputStream(f);
			pdfDocument = PDDocument.load(fis);
			if (pdfDocument.isEncrypted()) {
				// Resources are released by the finally block below.
				return f.getName();
			}
			PDFTextStripper stripper = new PDFTextStripper();
			StringWriter writer = new StringWriter();
			stripper.writeText(pdfDocument, writer);
			content.append(writer.getBuffer().toString());
		} catch (IOException e) {
			System.out.println("建立索引出错 : " + f.getAbsolutePath());
			System.err.println("IOException=" + e);
		} finally {
			if (pdfDocument != null) {
				try {
					// Closing the PDDocument also releases its COSDocument.
					pdfDocument.close();
				} catch (IOException e) {
					System.out.println("建立索引出错 : " + f.getAbsolutePath());
					e.printStackTrace();
				}
			}
			closeAndReport(fis);
		}
		return content.toString().trim();
	}

	/**
	 * Returns the raw content of an HTML file, decoded as gb2312, with every
	 * line terminated by "\n". The charset must match the one declared in the
	 * page, otherwise the text is garbled.
	 *
	 * @param f HTML file to read
	 * @return file content (not trimmed); empty or partial on failure
	 */
	public static String readHtml(File f) {
		StringBuilder content = new StringBuilder();
		FileInputStream fis = null;
		BufferedReader reader = null;
		try {
			fis = new FileInputStream(f);
			reader = new BufferedReader(new InputStreamReader(fis, "gb2312"));
			String line;
			while ((line = reader.readLine()) != null) {
				content.append(line).append("\n");
			}
		} catch (Exception e) {
			System.out.println("建立索引出错 : " + f.getAbsolutePath());
			e.printStackTrace();
		} finally {
			closeAndReport(reader);
			// Covers the case where the reader could not be created.
			closeAndReport(fis);
		}
		return content.toString();
	}

	/**
	 * Returns the content of a text file, decoded with the platform default
	 * charset, lines joined by "\n", trimmed.
	 *
	 * @param f text file to read
	 * @return file content; empty or partial on failure
	 */
	public static String readTxt(File f) {
		StringBuilder content = new StringBuilder();
		BufferedReader reader = null;
		try {
			reader = new BufferedReader(new InputStreamReader(
					new FileInputStream(f)));
			String line;
			while ((line = reader.readLine()) != null) {
				content.append(line).append("\n");
			}
		} catch (IOException e) {
			System.out.println("建立索引出错 : " + f.getAbsolutePath());
			e.printStackTrace();
		} finally {
			closeAndReport(reader);
		}
		return content.toString().trim();
	}

	/**
	 * Returns every line produced by {@code ExcelReader} for the file, each
	 * terminated by "\n". Reading stops at the first null line, which is not
	 * appended.
	 *
	 * @param f        spreadsheet file to read
	 * @param fileType file type passed through to {@code ExcelReader}
	 * @return extracted text (not trimmed); empty or partial on failure
	 */
	public static String readExcel(File f, String fileType) {
		StringBuilder content = new StringBuilder();
		ExcelReader er = null;
		try {
			er = new ExcelReader(f, fileType);
			for (String line = er.readLine(); line != null; line = er
					.readLine()) {
				content.append(line).append("\n");
			}
		} catch (Exception e) {
			System.out.println("建立索引出错 : " + f.getAbsolutePath());
			e.printStackTrace();
		} finally {
			if (er != null) {
				try {
					er.close();
				} catch (Exception e) {
					System.out.println("建立索引出错 : " + f.getAbsolutePath());
					e.printStackTrace();
				}
			}
		}
		return content.toString();
	}

	/**
	 * Classifies a file by its name suffix.
	 *
	 * @param f file to classify
	 * @return "dir" for directories and for names without a '.'; the matching
	 *         entry of {@code Constants.EXTENSION} (compared ignoring case)
	 *         when the suffix is known; "otherType" otherwise.
	 *         NOTE(review): regular files without an extension are also
	 *         reported as "dir".
	 */
	public static String validateFile(File f) {
		String fileName = f.getName();
		int dot = fileName.lastIndexOf('.');
		if (f.isDirectory() || dot == -1) {
			return "dir";
		}
		String suffix = fileName.substring(dot + 1);
		String[] extension = Constants.EXTENSION;
		for (int i = 0; i < extension.length; i++) {
			if (suffix.equalsIgnoreCase(extension[i])) {
				return extension[i];
			}
		}
		return "otherType";
	}

	/**
	 * Builds a document with the fields "name" (analyzed), "size" and "path"
	 * (not analyzed) and "content" (analyzed); all fields are stored. On a
	 * failure the error is printed and the document is returned with the
	 * fields added so far.
	 *
	 * @param f file to wrap
	 * @return the document, possibly incomplete
	 */
	public static Document indexFile(File f) {
		Document doc = new Document();
		try {
			doc.add(new Field("name", f.getName(), Store.YES, Index.ANALYZED));
			doc.add(new Field("size", NumberTools.longToString(f.length()),
					Store.YES, Index.NOT_ANALYZED));
			doc.add(new Field("path", f.getAbsolutePath(), Store.YES,
					Index.NOT_ANALYZED));
			doc.add(new Field("content", extractContent(f, validateFile(f)),
					Store.YES, Index.ANALYZED));
		} catch (Exception e) {
			System.out.println("建立索引出错 : " + f.getAbsolutePath());
			e.printStackTrace();
		}
		return doc;
	}

	/** Picks the reader for the file type; unknown types yield the file name. */
	private static String extractContent(File f, String fileType) {
		if (fileType.equals("txt")) {
			return readTxt(f);
		}
		if (fileType.equals("pdf")) {
			return readPdf(f);
		}
		if (fileType.equals("doc")) {
			return readWord(f);
		}
		if (fileType.equals("htm")) {
			return readHtml(f);
		}
		if (fileType.equals("xls")) {
			return readExcel(f, fileType);
		}
		return f.getName();
	}

	/** Closes the resource if it is non-null, printing any failure. */
	private static void closeAndReport(java.io.Closeable resource) {
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}

/**
 * Static helpers that extract plain text from Word, PDF, HTML, text and
 * Excel files and wrap a file into a Lucene {@code Document}. Read failures
 * are printed and result in whatever text was collected so far.
 */
public class ReadFile {

	// PDFs larger than this many bytes are not parsed.
	// NOTE(review): 10048576 is kept from the original; it may have been
	// meant as 10 * 1048576 (10 MiB) - confirm.
	private static final long MAX_PDF_BYTES = 10048576;

	/**
	 * Returns the concatenated paragraph text of a Word document, trimmed.
	 *
	 * @param f Word file to read
	 * @return extracted text; empty or partial when reading failed
	 */
	public static String readWord(File f) {
		StringBuilder content = new StringBuilder();
		FileInputStream fis = null;
		try {
			fis = new FileInputStream(f);
			HWPFDocument doc = new HWPFDocument(fis);
			Range range = doc.getRange();
			int paragraphCount = range.numParagraphs();
			for (int i = 0; i < paragraphCount; i++) {
				content.append(range.getParagraph(i).text());
			}
		} catch (Exception e) {
			System.out.println("建立索引出错 : " + f.getAbsolutePath());
			e.printStackTrace();
		} finally {
			closeAndReport(fis);
		}
		return content.toString().trim();
	}

	/**
	 * Returns the text of a PDF, trimmed. For files above the size limit and
	 * for encrypted documents the file name is returned instead.
	 *
	 * @param f PDF file to read
	 * @return extracted text, the file name (oversized/encrypted), or the
	 *         empty/partial text collected before a read failure
	 */
	public static String readPdf(File f) {
		if (f.length() > MAX_PDF_BYTES) {
			DecimalFormat df = new DecimalFormat("#.00");
			System.out.println("---------------------文件大小------"
					+ df.format((double) f.length() / 1048576) + "M");
			return f.getName();
		}
		StringBuilder content = new StringBuilder();
		FileInputStream fis = null;
		PDDocument pdfDocument = null;
		try {
			fis = new FileInputStream(f);
			pdfDocument = PDDocument.load(fis);
			if (pdfDocument.isEncrypted()) {
				// Resources are released by the finally block below.
				return f.getName();
			}
			PDFTextStripper stripper = new PDFTextStripper();
			StringWriter writer = new StringWriter();
			stripper.writeText(pdfDocument, writer);
			content.append(writer.getBuffer().toString());
		} catch (IOException e) {
			System.out.println("建立索引出错 : " + f.getAbsolutePath());
			System.err.println("IOException=" + e);
		} finally {
			if (pdfDocument != null) {
				try {
					// Closing the PDDocument also releases its COSDocument.
					pdfDocument.close();
				} catch (IOException e) {
					System.out.println("建立索引出错 : " + f.getAbsolutePath());
					e.printStackTrace();
				}
			}
			closeAndReport(fis);
		}
		return content.toString().trim();
	}

	/**
	 * Returns the raw content of an HTML file, decoded as gb2312, with every
	 * line terminated by "\n". The charset must match the one declared in the
	 * page, otherwise the text is garbled.
	 *
	 * @param f HTML file to read
	 * @return file content (not trimmed); empty or partial on failure
	 */
	public static String readHtml(File f) {
		StringBuilder content = new StringBuilder();
		FileInputStream fis = null;
		BufferedReader reader = null;
		try {
			fis = new FileInputStream(f);
			reader = new BufferedReader(new InputStreamReader(fis, "gb2312"));
			String line;
			while ((line = reader.readLine()) != null) {
				content.append(line).append("\n");
			}
		} catch (Exception e) {
			System.out.println("建立索引出错 : " + f.getAbsolutePath());
			e.printStackTrace();
		} finally {
			closeAndReport(reader);
			// Covers the case where the reader could not be created.
			closeAndReport(fis);
		}
		return content.toString();
	}

	/**
	 * Returns the content of a text file, decoded with the platform default
	 * charset, lines joined by "\n", trimmed.
	 *
	 * @param f text file to read
	 * @return file content; empty or partial on failure
	 */
	public static String readTxt(File f) {
		StringBuilder content = new StringBuilder();
		BufferedReader reader = null;
		try {
			reader = new BufferedReader(new InputStreamReader(
					new FileInputStream(f)));
			String line;
			while ((line = reader.readLine()) != null) {
				content.append(line).append("\n");
			}
		} catch (IOException e) {
			System.out.println("建立索引出错 : " + f.getAbsolutePath());
			e.printStackTrace();
		} finally {
			closeAndReport(reader);
		}
		return content.toString().trim();
	}

	/**
	 * Returns every line produced by {@code ExcelReader} for the file, each
	 * terminated by "\n". Reading stops at the first null line, which is not
	 * appended.
	 *
	 * @param f        spreadsheet file to read
	 * @param fileType file type passed through to {@code ExcelReader}
	 * @return extracted text (not trimmed); empty or partial on failure
	 */
	public static String readExcel(File f, String fileType) {
		StringBuilder content = new StringBuilder();
		ExcelReader er = null;
		try {
			er = new ExcelReader(f, fileType);
			for (String line = er.readLine(); line != null; line = er
					.readLine()) {
				content.append(line).append("\n");
			}
		} catch (Exception e) {
			System.out.println("建立索引出错 : " + f.getAbsolutePath());
			e.printStackTrace();
		} finally {
			if (er != null) {
				try {
					er.close();
				} catch (Exception e) {
					System.out.println("建立索引出错 : " + f.getAbsolutePath());
					e.printStackTrace();
				}
			}
		}
		return content.toString();
	}

	/**
	 * Classifies a file by its name suffix.
	 *
	 * @param f file to classify
	 * @return "dir" for directories and for names without a '.'; the matching
	 *         entry of {@code Constants.EXTENSION} (compared ignoring case)
	 *         when the suffix is known; "otherType" otherwise.
	 *         NOTE(review): regular files without an extension are also
	 *         reported as "dir".
	 */
	public static String validateFile(File f) {
		String fileName = f.getName();
		int dot = fileName.lastIndexOf('.');
		if (f.isDirectory() || dot == -1) {
			return "dir";
		}
		String suffix = fileName.substring(dot + 1);
		String[] extension = Constants.EXTENSION;
		for (int i = 0; i < extension.length; i++) {
			if (suffix.equalsIgnoreCase(extension[i])) {
				return extension[i];
			}
		}
		return "otherType";
	}

	/**
	 * Builds a document with the fields "name" (analyzed), "size" and "path"
	 * (not analyzed) and "content" (analyzed); all fields are stored. On a
	 * failure the error is printed and the document is returned with the
	 * fields added so far.
	 *
	 * @param f file to wrap
	 * @return the document, possibly incomplete
	 */
	public static Document indexFile(File f) {
		Document doc = new Document();
		try {
			doc.add(new Field("name", f.getName(), Store.YES, Index.ANALYZED));
			doc.add(new Field("size", NumberTools.longToString(f.length()),
					Store.YES, Index.NOT_ANALYZED));
			doc.add(new Field("path", f.getAbsolutePath(), Store.YES,
					Index.NOT_ANALYZED));
			doc.add(new Field("content", extractContent(f, validateFile(f)),
					Store.YES, Index.ANALYZED));
		} catch (Exception e) {
			System.out.println("建立索引出错 : " + f.getAbsolutePath());
			e.printStackTrace();
		}
		return doc;
	}

	/** Picks the reader for the file type; unknown types yield the file name. */
	private static String extractContent(File f, String fileType) {
		if (fileType.equals("txt")) {
			return readTxt(f);
		}
		if (fileType.equals("pdf")) {
			return readPdf(f);
		}
		if (fileType.equals("doc")) {
			return readWord(f);
		}
		if (fileType.equals("htm")) {
			return readHtml(f);
		}
		if (fileType.equals("xls")) {
			return readExcel(f, fileType);
		}
		return f.getName();
	}

	/** Closes the resource if it is non-null, printing any failure. */
	private static void closeAndReport(java.io.Closeable resource) {
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}

Java代码

public class ExcelReader {
// 创建文件输入流
private BufferedReader reader = null ;
// 文件类型
private String filetype;
// 文件二进制输入流
private InputStream is = null ;
// 当前的Sheet
private int currSheet;
// 当前位置
private int currPosition;
// Sheet数量
private int numOfSheets;
// HSSFWorkbook
HSSFWorkbook workbook = null ;
// 设置Cell之间以空格分割
private static String EXCEL_LINE_DELIMITER = " " ;
// 设置最大列数
// private static int MAX_EXCEL_COLUMNS = 64;
// 构造函数创建一个ExcelReader
public

luxiangxing

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
lucene入门

最近一直在研究lucene，把入门的程序和大家分享：对索引的操作类：Java代码 public class IndexDao { public IndexDao() { try { indexWriter = new IndexWri...
复制链接

扫一扫