一个小小的搜索例子,实现对某个文件夹下的文件进行搜索
这里只列出主要代码,完整的 project 在附件中。导入到 MyEclipse 时,请根据自己的环境修改配置文件 paoding-dic-home.properties 中的词典路径(前提是你已经安装了庖丁解牛的词典)。在页面中搜索“项目”即可看到结果(基本每个文件中都含有“项目”这个词)。
附件中有项目T_Search,文件lucene\data,索引\lucene\index
MIndexer.java:创建索引(对文件进行创建,先把文件内容读取成String)
public class MIndexer {

    /**
     * Builds a full-text index under E:\lucene\index from the files found
     * (recursively) in E:\lucene\data, using the Paoding Chinese analyzer.
     * The index is recreated from scratch on every call (create flag = true).
     * Errors are reported to stderr; the method never throws.
     */
    public void createIndex() {
        long start = System.currentTimeMillis();
        IndexWriter writer = null;
        try {
            // Paoding Chinese word-segmenting analyzer.
            Analyzer analyzer = new PaodingAnalyzer();
            // true => wipe and rebuild the index directory on every run.
            writer = new IndexWriter("E:\\lucene\\index", analyzer, true,
                    IndexWriter.MaxFieldLength.UNLIMITED);
            // Index the data directory (.txt, .pdf, .htm, .html, .jsp, .php).
            indexDocs(writer, new File("E:\\lucene\\data"));
            writer.optimize();
            System.out.println("用时:" + (System.currentTimeMillis() - start) + " 毫秒");
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Always close the writer: a leaked writer keeps the Lucene
            // write lock and blocks every subsequent indexing run.
            if (writer != null) {
                try {
                    writer.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Recursively walks {@code file} and adds one Document per indexable file.
     * Both the canonical path ("filename") and the extracted text ("contents")
     * are stored, analyzed, and given position/offset term vectors so the
     * searcher can highlight matches.
     *
     * @param writer open IndexWriter to add documents to
     * @param file   file or directory to index
     * @throws IOException on index write failure
     */
    static void indexDocs(IndexWriter writer, File file) throws IOException {
        if (!file.canRead()) {
            return;
        }
        if (file.isDirectory()) {
            String[] children = file.list();
            if (children != null) {
                for (int i = 0; i < children.length; i++) {
                    indexDocs(writer, new File(file, children[i]));
                }
            }
            return;
        }
        String name = file.getName();
        boolean indexable = name.endsWith(".htm") || name.endsWith(".html")
                || name.endsWith(".jsp") || name.endsWith(".php")
                || name.endsWith(".txt") || name.endsWith(".pdf");
        if (!indexable) {
            return;
        }
        try {
            // One Document per file, roughly one "record" in the index.
            Document doc = new Document();
            doc.add(new Field("filename", file.getCanonicalPath(),
                    Field.Store.YES, Field.Index.ANALYZED,
                    Field.TermVector.WITH_POSITIONS_OFFSETS));
            // PDFs need text extraction; everything else is read as UTF-8 text.
            String contents = name.endsWith(".pdf") ? pdf2txt(file) : ReadFile(file);
            doc.add(new Field("contents", contents,
                    Field.Store.YES, Field.Index.ANALYZED,
                    Field.TermVector.WITH_POSITIONS_OFFSETS));
            writer.addDocument(doc);
        } catch (FileNotFoundException fnfe) {
            // Was silently swallowed before; at least report which file
            // vanished between listing and reading so failures are visible.
            System.err.println("跳过无法打开的文件: " + file + " (" + fnfe.getMessage() + ")");
        }
    }

    /**
     * Reads a text file as UTF-8 and returns its contents as a single String.
     * Lines are joined with '\n' so that the last token of one line and the
     * first token of the next are not fused into a single term.
     *
     * @param f file to read
     * @return file contents, or whatever was read before an I/O error
     */
    public static String ReadFile(File f) {
        StringBuilder temp = new StringBuilder();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(
                    new FileInputStream(f), "UTF-8"));
            String line;
            while ((line = br.readLine()) != null) {
                // Keep a separator between lines; plain append would merge
                // the adjacent words across the line break.
                temp.append(line).append('\n');
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // The reader was never closed before: file-handle leak.
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return temp.toString();
    }

    /**
     * Extracts the text of a PDF file with PDFBox.
     *
     * The previous version parsed the document twice (PDDocument.load AND a
     * separate PDFParser), leaked one of the two documents, could throw an
     * NPE in its finally block when loading failed, and created a "temp"
     * directory / deleted a file that was never actually written. All of
     * that dead and broken machinery is removed: load once, strip, close.
     *
     * @param pfile PDF file to extract
     * @return extracted text, or "" if the file is missing / unreadable
     */
    public static String pdf2txt(File pfile) {
        String content = "";
        // lastIndexOf >= 1 keeps the original guard: require a non-empty
        // base name before the ".pdf" suffix.
        if (pfile.exists() && pfile.getName().lastIndexOf(".pdf") >= 1) {
            PDDocument pdDoc = null;
            try {
                pdDoc = PDDocument.load(pfile);
                PDFTextStripper stripper = new PDFTextStripper();
                content = stripper.getText(pdDoc);
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                // Guarded close: pdDoc is null when load() itself failed.
                if (pdDoc != null) {
                    try {
                        pdDoc.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
        }
        return content;
    }
}
MSearcher.java:搜索,返回符合条件的List
public class MSearcher {

    // Location of the Lucene index built by MIndexer.
    private static final String INDEX_PATH = "E:\\lucene\\index";

    /**
     * Searches the "filename" and "contents" fields for {@code keyword}
     * (OR semantics across the two fields) and returns one page of results.
     * Matched fragments are highlighted via BoldFormatter; when no highlight
     * fragment is produced the raw stored value (truncated to
     * {@code content_length} for contents) is returned instead.
     *
     * @param keyword        query string (Paoding-analyzed)
     * @param highlight      kept for interface compatibility (highlighting is
     *                       always attempted; this flag is not consulted)
     * @param content_length max characters of contents to return per hit
     * @param start          1-based index of the first hit to return
     * @param length         number of hits per page
     * @return matching beans; empty list on bad input or error
     */
    public List<MBean> searchIndex(String keyword, boolean highlight,
            int content_length, int start, int length) {
        List<MBean> mList = new ArrayList<MBean>();
        if (!new File(INDEX_PATH).exists() || keyword == null
                || keyword.trim().equals("") || length <= 0) {
            return mList;
        }
        start = (start > 0) ? start : 1;
        String[] FIELD = { "filename", "contents" };
        // Paoding Chinese word-segmenting analyzer (must match indexing).
        Analyzer analyzer = new PaodingAnalyzer();
        FSDirectory directory = null;
        IndexReader reader = null;
        Searcher searcher = null;
        try {
            directory = FSDirectory.getDirectory(INDEX_PATH);
            reader = IndexReader.open(directory);
            // SHOULD/SHOULD => a hit in either field is enough (OR).
            BooleanClause.Occur[] flags = new BooleanClause.Occur[] {
                    BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD };
            Query query = MultiFieldQueryParser.parse(keyword, FIELD,
                    flags, analyzer);
            // Reuse the already-open reader (consistent with
            // searchIndexLength; the old code opened a second reader here).
            searcher = new IndexSearcher(reader);
            query = query.rewrite(reader);
            // Paging: collect the first start + length - 1 hits.
            TopDocCollector collector = new TopDocCollector(start + length - 1);
            searcher.search(query, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;
            BoldFormatter formatter = new BoldFormatter();
            Highlighter highlighter = new Highlighter(formatter,
                    new QueryScorer(query));
            highlighter.setTextFragmenter(new SimpleFragmenter(content_length));
            for (int i = start - 1; i < hits.length; i++) {
                Document doc = searcher.doc(hits[i].doc);
                String _filename = doc.get(FIELD[0]);
                String _contents = doc.get(FIELD[1]);
                if (_contents == null) {
                    // Defensive: stored field missing => avoid NPE below.
                    _contents = "";
                }
                int maxNumFragmentsRequired = 5;
                String fragmentSeparator = "...";
                // Term vectors were stored with positions/offsets at index
                // time, so highlighting can re-tokenize from the index.
                TermPositionVector tpv_filename = (TermPositionVector) reader
                        .getTermFreqVector(hits[i].doc, FIELD[0]);
                TermPositionVector tpv_contents = (TermPositionVector) reader
                        .getTermFreqVector(hits[i].doc, FIELD[1]);
                String high_filename = "";
                String high_contents = "";
                if (tpv_filename != null) {
                    TokenStream token_filename = TokenSources
                            .getTokenStream(tpv_filename);
                    high_filename = highlighter.getBestFragments(
                            token_filename, _filename,
                            maxNumFragmentsRequired, fragmentSeparator);
                }
                if (tpv_contents != null) {
                    TokenStream token_contents = TokenSources
                            .getTokenStream(tpv_contents);
                    high_contents = highlighter.getBestFragments(
                            token_contents, _contents,
                            maxNumFragmentsRequired, fragmentSeparator);
                }
                MBean mBean = new MBean();
                // Fall back to the raw stored value when no fragment matched.
                mBean.setFilename((high_filename != null && !high_filename
                        .equals("")) ? high_filename : _filename);
                mBean.setContents((high_contents != null && !high_contents
                        .equals("")) ? high_contents
                        : (_contents.length() > content_length
                                ? _contents.substring(0, content_length)
                                : _contents));
                mList.add(mBean);
            }
        } catch (ParseException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close in reverse-open order; previously these leaked whenever
            // an exception fired before the in-line close calls.
            closeQuietly(searcher, reader, directory);
        }
        return mList;
    }

    /**
     * Counts the hits for {@code keyword} over the same two fields, capped at
     * {@code maxLength}. Used by the servlet to size the pagination bar.
     *
     * @return number of hits (0 on bad input or error)
     */
    public Integer searchIndexLength(String keyword, boolean highlight,
            int content_length, int start, int length, int maxLength) {
        int _count = 0;
        if (!new File(INDEX_PATH).exists() || keyword == null
                || keyword.trim().equals("") || length <= 0) {
            return _count;
        }
        String[] FIELD = { "filename", "contents" };
        Analyzer analyzer = new PaodingAnalyzer();
        FSDirectory directory = null;
        IndexReader reader = null;
        Searcher searcher = null;
        try {
            directory = FSDirectory.getDirectory(INDEX_PATH);
            reader = IndexReader.open(directory);
            BooleanClause.Occur[] flags = new BooleanClause.Occur[] {
                    BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD };
            Query query = MultiFieldQueryParser.parse(keyword, FIELD,
                    flags, analyzer);
            searcher = new IndexSearcher(reader);
            query = query.rewrite(reader);
            TopDocCollector collector = new TopDocCollector(maxLength);
            searcher.search(query, collector);
            _count = collector.topDocs().scoreDocs.length;
        } catch (ParseException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(searcher, reader, directory);
        }
        return _count;
    }

    // Best-effort close of the Lucene resources; null-safe, never throws.
    private static void closeQuietly(Searcher searcher, IndexReader reader,
            FSDirectory directory) {
        if (searcher != null) {
            try {
                searcher.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (directory != null) {
            directory.close();
        }
    }
}
Search.java:处理用户请求的Servlet
public class Search extends HttpServlet {

    private static final long serialVersionUID = 1L;
    // Primitives instead of boxed constants: these are only used in
    // arithmetic/comparisons, so boxing bought nothing.
    private static final int NUMBER = 10;          // hits per page
    private static final int CONTENT_LENGTH = 50;  // snippet length
    private static final boolean HIGHLIGHT = true;
    private MSearcher mSearcher = new MSearcher();

    /**
     * Builds the Lucene index once at servlet startup. The old code rebuilt
     * the entire index inside getPageList(), i.e. on EVERY request — moved
     * here so a request only searches.
     */
    @Override
    public void init() throws ServletException {
        new MIndexer().createIndex();
    }

    /**
     * Handles a search request: reads query parameter "q" and 1-based offset
     * "start", runs the search, computes the pagination bar, and forwards to
     * index.jsp with attributes q, start, previous, next, pList, mList.
     */
    @Override
    public void doPost(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        request.setCharacterEncoding("UTF-8");
        String q = request.getParameter("q") != null
                ? request.getParameter("q").trim()
                : request.getParameter("q");
        System.out.println("----" + q);
        int start = request.getParameter("start") != null
                ? Integer.valueOf(request.getParameter("start"))
                : 0;
        // Total hit count (capped) drives the pagination bar.
        int all_count = mSearcher.searchIndexLength(q, HIGHLIGHT,
                CONTENT_LENGTH, start, NUMBER, NUMBER * 1000);
        List<MBean> mList = mSearcher.searchIndex(q, HIGHLIGHT,
                CONTENT_LENGTH, start, NUMBER);
        List<PBean> pList = getPageList(all_count, start);
        if (start > NUMBER) {
            request.setAttribute("previous", start - NUMBER);
        }
        if (start < all_count - NUMBER) {
            request.setAttribute("next", NUMBER + (start != 0 ? start : 1));
        }
        request.setAttribute("q", q);
        request.setAttribute("start", start);
        request.setAttribute("pList", pList);
        request.setAttribute("mList", mList.isEmpty() ? null : mList);
        request.getRequestDispatcher("/index.jsp").forward(request, response);
    }

    /** GET is handled identically to POST. */
    @Override
    public void doGet(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        doPost(request, response);
    }

    /**
     * Builds the list of page links: up to 10 pages before and 10 pages
     * after the current page. No longer rebuilds the index (see init()).
     *
     * @param all_count total number of hits
     * @param start     1-based offset of the current page's first hit
     */
    private static List<PBean> getPageList(int all_count, int start) {
        List<PBean> pList = new ArrayList<PBean>();
        int all_page = (all_count <= 0) ? 1
                : (all_count / NUMBER + (all_count % NUMBER > 0 ? 1 : 0));
        int now_page = (start <= 0) ? 1
                : (start / NUMBER + (start % NUMBER > 0 ? 1 : 0));
        int first = (now_page - 10 > 0) ? now_page - 10 : 1;
        int last = ((now_page + 9) <= all_page) ? (now_page + 9) : all_page;
        for (int i = first; i <= last; i++) {
            PBean pBean = new PBean();
            pBean.setPage(i);
            // Each page's "start" is the 1-based offset of its first hit.
            pBean.setStart((pBean.getPage() - 1) * NUMBER + 1);
            pList.add(pBean);
        }
        return pList;
    }
}