因为lucene索引的时候是将String型的信息建立索引的,所以这里必须将word/pdf/html等文件的内容转化为字符型。
lucene的jar包自己去下载。
首先是建立索引的代码:
/**
 * Command-line example that extracts the text of one document and adds it to
 * a Lucene index as a single {@code Document} with a path field and a body
 * field.
 *
 * <p>To index a different file type, change the hard-coded path and the
 * {@code ReadFile} method used to extract its text.
 *
 * <p>NOTE(review): every string literal in this listing has a space just
 * inside each quote. For the file-system paths this looks like a formatting
 * artifact - confirm before running. The literals are kept unchanged here
 * because any code reading the index has to use exactly the same index
 * directory and field names.
 */
public class TextFileIndexer {

    /**
     * Indexes one document and prints how long it took.
     *
     * @param args unused
     * @throws Exception if the index cannot be opened or written
     */
    public static void main(String[] args) throws Exception {
        /* Folder that holds the documents to index: the "s" folder on drive D. */
        File fileDir = new File(" d://s ");
        /* Folder the index files are written to. */
        File indexDir = new File(" d://index ");
        Analyzer luceneAnalyzer = new StandardAnalyzer();
        // The third argument presumably asks Lucene to create a fresh index
        // rather than append to an existing one - confirm for the Lucene
        // version in use.
        IndexWriter indexWriter = new IndexWriter(indexDir, luceneAnalyzer, true);
        long startTime = System.currentTimeMillis();
        try {
            System.out.println(" File正在被索引. ");
            /*
             * These two lines are the ones to change: the file to read and
             * the ReadFile method matching its type, e.g.
             * ReadFile.readHtml(path) for "d://s//index.htm".
             */
            String path = " d://s//2.doc ";
            String temp = ReadFile.readWord(path);

            // The path is stored for display only; it is not searchable.
            Field pathField = new Field(" path ", path,
                    Field.Store.YES, Field.Index.NO);
            // The body is stored, tokenized for searching, and keeps term
            // vectors with positions and offsets.
            Field bodyField = new Field(" body ", temp, Field.Store.YES,
                    Field.Index.TOKENIZED,
                    Field.TermVector.WITH_POSITIONS_OFFSETS);

            Document document = new Document();
            document.add(pathField);
            document.add(bodyField);
            indexWriter.addDocument(document);
            // optimize() optimizes the index.
            indexWriter.optimize();
        } finally {
            // Close the writer even when reading or indexing fails, so the
            // index directory is not left with an open writer.
            indexWriter.close();
        }
        // Report how long indexing took.
        long endTime = System.currentTimeMillis();
        System.out.println(" 这花费了 "
                + (endTime - startTime)
                + " 毫秒来把文档增加到索引里面去! "
                + fileDir.getPath());
    }
}
/**
 * Indexes one document and prints how long it took.
 *
 * <p>To index a different file type, change the hard-coded path and the
 * {@code ReadFile} method used to extract its text.
 *
 * <p>NOTE(review): every string literal in this listing has a space just
 * inside each quote. For the file-system paths this looks like a formatting
 * artifact - confirm before running. The literals are kept unchanged here
 * because any code reading the index has to use exactly the same index
 * directory and field names.
 *
 * @param args unused
 * @throws Exception if the index cannot be opened or written
 */
public static void main(String[] args) throws Exception {
    /* Folder that holds the documents to index: the "s" folder on drive D. */
    File fileDir = new File(" d://s ");
    /* Folder the index files are written to. */
    File indexDir = new File(" d://index ");
    Analyzer luceneAnalyzer = new StandardAnalyzer();
    // The third argument presumably asks Lucene to create a fresh index
    // rather than append to an existing one - confirm for the Lucene
    // version in use.
    IndexWriter indexWriter = new IndexWriter(indexDir, luceneAnalyzer, true);
    long startTime = System.currentTimeMillis();
    try {
        System.out.println(" File正在被索引. ");
        /*
         * These two lines are the ones to change: the file to read and the
         * ReadFile method matching its type, e.g. ReadFile.readHtml(path)
         * for "d://s//index.htm".
         */
        String path = " d://s//2.doc ";
        String temp = ReadFile.readWord(path);

        // The path is stored for display only; it is not searchable.
        Field pathField = new Field(" path ", path,
                Field.Store.YES, Field.Index.NO);
        // The body is stored, tokenized for searching, and keeps term
        // vectors with positions and offsets.
        Field bodyField = new Field(" body ", temp, Field.Store.YES,
                Field.Index.TOKENIZED,
                Field.TermVector.WITH_POSITIONS_OFFSETS);

        Document document = new Document();
        document.add(pathField);
        document.add(bodyField);
        indexWriter.addDocument(document);
        // optimize() optimizes the index.
        indexWriter.optimize();
    } finally {
        // Close the writer even when reading or indexing fails, so the
        // index directory is not left with an open writer.
        indexWriter.close();
    }
    // Report how long indexing took.
    long endTime = System.currentTimeMillis();
    System.out.println(" 这花费了 "
            + (endTime - startTime)
            + " 毫秒来把文档增加到索引里面去! "
            + fileDir.getPath());
}
}
上面已经注释了要换的地方,我们要做的就是换文件的路径和读取文件的方法。
下面来具体看下读取文件的方法
1.首先来看WORD文档:
我这里用的是poi,相关jar包自己去下载,然后加到工程中(以下所要用的jar包也是,不再重复说)。
来看相关代码:
/**
 * Extracts the text of a Word document by concatenating the text of every
 * paragraph, using POI's HWPF classes.
 *
 * <p>This is best-effort: if the file cannot be opened or parsed, the failure
 * is printed and whatever text was collected so far (possibly none) is
 * returned.
 *
 * @param path location of the Word file
 * @return the concatenated paragraph text with leading and trailing
 *         whitespace removed; empty when nothing could be read
 */
public static String readWord(String path) {
    StringBuffer content = new StringBuffer(); // document text
    FileInputStream in = null;
    try {
        in = new FileInputStream(path);
        HWPFDocument doc = new HWPFDocument(in);
        Range range = doc.getRange();
        int paragraphCount = range.numParagraphs();
        // Walk the paragraphs in order and collect their text.
        for (int i = 0; i < paragraphCount; i++) {
            Paragraph paragraph = range.getParagraph(i);
            content.append(paragraph.text());
        }
    } catch (Exception e) {
        // Report the failure instead of hiding it; the caller still gets
        // the text read so far.
        e.printStackTrace();
    } finally {
        if (in != null) {
            try {
                in.close();
            } catch (Exception closeFailure) {
                // Nothing useful can be done if closing fails; log it.
                closeFailure.printStackTrace();
            }
        }
    }
    return content.toString().trim();
}
StringBuffer content = new StringBuffer( "" ); // 文档内容
try {
HWPFDocument doc = new HWPFDocument( new FileInputStream(path));
Range range = doc.getRange();
int paragraphCount = range.numParagraphs(); // 段落
for ( int i = 0 ; i < paragraphCount; i ++ ) { // 遍历段落读取数据
Paragraph pp = range.getParagraph(i);
content.append(pp.text());
}
} catch (Exception e) {
}
return content.toString().trim();
}
2.PDF文件用的是PDFbox:
/**
 * Extracts the text of a PDF file with PDFBox.
 *
 * <p>NOTE(review): the parsed PDF document obtained from the parser is not
 * closed here; PDFBox documents usually need an explicit close - confirm
 * against the PDFBox version in use.
 *
 * @param path location of the PDF file
 * @return the extracted text with leading and trailing whitespace removed
 * @throws Exception if the file cannot be opened, parsed, or converted to text
 */
public static String readPdf(String path) throws Exception {
    FileInputStream fis = new FileInputStream(path);
    try {
        PDFParser parser = new PDFParser(fis);
        parser.parse();
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(parser.getPDDocument()).trim();
    } finally {
        // Release the file handle even when parsing or extraction fails.
        fis.close();
    }
}
StringBuffer content = new StringBuffer( "" ); // 文档内容
FileInputStream fis = new FileInputStream(path);
PDFParser p = new PDFParser(fis);
p.parse();
PDFTextStripper ts = new PDFTextStripper();
content.append(ts.getText(p.getPDDocument()));
fis.close();
return content.toString().trim();
}
3.html文件:
/**
 * Reads an HTML file from disk as UTF-8 text.
 *
 * <p>Each line of the file is returned followed by a newline character. The
 * charset must match the one declared in the HTML header, otherwise the text
 * comes out garbled.
 *
 * <p>This is best-effort: if the file cannot be opened or read, the failure
 * is printed and whatever text was collected so far (possibly none) is
 * returned.
 *
 * @param urlString path of the HTML file on the local file system
 * @return the file content, one {@code '\n'} after every line; empty when
 *         nothing could be read
 */
public static String readHtml(String urlString) {
    StringBuffer content = new StringBuffer();
    FileInputStream fis = null;
    BufferedReader reader = null;
    try {
        fis = new FileInputStream(new File(urlString));
        // The charset name must not contain padding spaces: an illegal name
        // makes InputStreamReader throw UnsupportedEncodingException.
        reader = new BufferedReader(new InputStreamReader(fis, "utf-8"));
        String line;
        while ((line = reader.readLine()) != null) {
            content.append(line).append("\n");
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        try {
            if (reader != null) {
                reader.close(); // also closes the wrapped stream
            } else if (fis != null) {
                fis.close();
            }
        } catch (Exception closeFailure) {
            closeFailure.printStackTrace();
        }
    }
    return content.toString();
}
StringBuffer content = new StringBuffer( "" );
File file = new File(urlString);
FileInputStream fis = null ;
try {
fis = new FileInputStream(file);
// 读取页面
BufferedReader reader = new BufferedReader( new InputStreamReader(
fis, " utf-8 " )); // 这里的字符编码要注意,要对上html头文件的一致,否则会出乱码
String line = null ;
while ((line = reader.readLine()) != null ) {
content.append(line + " /n " );
}
reader.close();
} catch (Exception e) {
e.printStackTrace();
}
String contentString = content.toString();
return contentString;
}
4.txt文件:
/**
 * Reads a plain-text file, joining its lines with carriage returns.
 *
 * <p>The file is decoded with the platform default charset, because
 * {@code FileReader} is used without an explicit one.
 *
 * <p>This is best-effort: if the file cannot be opened or read, the failure
 * is printed and whatever text was collected so far (possibly none) is
 * returned.
 *
 * @param path location of the text file
 * @return the file content with {@code '\r'} after every line, then stripped
 *         of leading and trailing whitespace; empty when nothing could be read
 */
public static String readTxt(String path) {
    StringBuffer content = new StringBuffer(); // document text
    BufferedReader br = null;
    try {
        br = new BufferedReader(new FileReader(path));
        String line;
        while ((line = br.readLine()) != null) {
            // A real carriage return, not the literal text "/r".
            content.append(line).append("\r");
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (br != null) {
            try {
                br.close(); // also closes the wrapped FileReader
            } catch (IOException closeFailure) {
                closeFailure.printStackTrace();
            }
        }
    }
    return content.toString().trim();
}
StringBuffer content = new StringBuffer( "" ); // 文档内容
try {
FileReader reader = new FileReader(path);
BufferedReader br = new BufferedReader(reader);
String s1 = null ;
while ((s1 = br.readLine()) != null ) {
content.append(s1 + " /r " );
}
br.close();
reader.close();
} catch (IOException e) {
e.printStackTrace();
}
return content.toString().trim();
}
接下来是搜索代码:
public
class
TestQuery {
/**
 * Runs one hard-coded query against the index and prints the stored body of
 * every hit.
 *
 * <p>The query parser's default field is the same field name the body text
 * was indexed under, so the search runs against the indexed document text.
 * Use {@code document.get} with the path field name instead to print the
 * file path of a hit.
 *
 * <p>If the query text cannot be parsed, a message is printed and no search
 * is run.
 *
 * @param args unused
 * @throws IOException if the index cannot be opened or read
 * @throws ParseException declared for compatibility; parse failures are
 *         reported on standard output instead
 */
public static void main(String[] args) throws IOException, ParseException {
    // Replace with the text to search for.
    String queryString = " 根据国务院的决定 ";
    // Must point at the directory the index was written to.
    IndexSearcher searcher = new IndexSearcher(" d://index ");
    try {
        Analyzer analyzer = new StandardAnalyzer();
        QueryParser qp = new QueryParser(" body ", analyzer);
        Query query;
        try {
            query = qp.parse(queryString);
        } catch (ParseException e) {
            System.out.println(" 异常 ");
            // Without a parsed query there is nothing to search for.
            return;
        }

        Hits hits = searcher.search(query);
        int total = hits.length();
        if (total > 0) {
            System.out.println(" 找到: " + total + " 个结果! ");
            // Print the stored body text of each hit.
            for (int i = 0; i < total; i++) {
                Document document = hits.doc(i);
                System.out.println(" contents: " + document.get(" body "));
            }
        } else {
            System.out.println(" 0个结果! ");
        }
    } finally {
        // Release the index files whether or not the search succeeded.
        searcher.close();
    }
}
/**
 * Runs one hard-coded query against the index and prints the stored body of
 * every hit.
 *
 * <p>The query parser's default field is the same field name the body text
 * was indexed under, so the search runs against the indexed document text.
 * Use {@code document.get} with the path field name instead to print the
 * file path of a hit.
 *
 * <p>If the query text cannot be parsed, a message is printed and no search
 * is run.
 *
 * @param args unused
 * @throws IOException if the index cannot be opened or read
 * @throws ParseException declared for compatibility; parse failures are
 *         reported on standard output instead
 */
public static void main(String[] args) throws IOException, ParseException {
    // Replace with the text to search for.
    String queryString = " 根据国务院的决定 ";
    // Must point at the directory the index was written to.
    IndexSearcher searcher = new IndexSearcher(" d://index ");
    try {
        Analyzer analyzer = new StandardAnalyzer();
        QueryParser qp = new QueryParser(" body ", analyzer);
        Query query;
        try {
            query = qp.parse(queryString);
        } catch (ParseException e) {
            System.out.println(" 异常 ");
            // Without a parsed query there is nothing to search for.
            return;
        }

        Hits hits = searcher.search(query);
        int total = hits.length();
        if (total > 0) {
            System.out.println(" 找到: " + total + " 个结果! ");
            // Print the stored body text of each hit.
            for (int i = 0; i < total; i++) {
                Document document = hits.doc(i);
                System.out.println(" contents: " + document.get(" body "));
            }
        } else {
            System.out.println(" 0个结果! ");
        }
    } finally {
        // Release the index files whether or not the search succeeded.
        searcher.close();
    }
}