Dom4j解析xml并利用lucene建立索引并搜索
package jim.luceneXML;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;
import org.wltea.analyzer.lucene.IKAnalyzer;
//import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;
/**
 * Entry point: builds a Lucene index from {@code test.xml}, then runs a
 * demo search against that index.
 */
public class MyTest {
    /**
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            // "creatIndex" spelling is kept — it is MyLucene's public API
            new MyLucene().creatIndex("test.xml");
        } catch (DocumentException e) {
            // XML parse failed; report it, then still attempt the search
            // against any previously built index on disk
            e.printStackTrace();
        }
        MySearch searcher = new MySearch();
        searcher.Search();
    }
}
/**
 * Builds a Lucene (3.5) index from an XML file parsed with dom4j.
 * Each child element of the XML root becomes one Lucene Document;
 * each grandchild element becomes one Field of that Document.
 */
class MyLucene {
    Directory directory = null;     // where the index lives (filesystem-backed)
    IndexWriter writer = null;      // writes documents into the index
    Document document = null;       // Lucene document currently being assembled
    Field field = null;             // scratch reference for the field being added
    IndexWriterConfig iwc = null;   // Lucene version + analyzer configuration
    String indexPath = "index";     // directory the index is written to
    String title = "title";         // field name for a document's title
    String content = "content";     // field name for a document's content
    String[] files = null;          // unused here; reserved for folder scanning
    String XmlContent = null;       // text of the XML element being processed
    String XmlName = null;          // name of the XML element being processed
    int num = 0;                    // running id assigned to <id> fields

    /**
     * Parses {@code fileName} with dom4j and indexes each record element.
     *
     * @param fileName path of the XML file to index
     * @throws DocumentException if the XML cannot be parsed
     */
    public void creatIndex(String fileName) throws DocumentException {
        try {
            directory = FSDirectory.open(new File(indexPath));
            iwc = new IndexWriterConfig(Version.LUCENE_35, new IKAnalyzer());
            writer = new IndexWriter(directory, iwc);
        } catch (IOException e) {
            System.out.println("创建Directory时发生错误!");
            e.printStackTrace();
            return; // writer is unusable — bail out instead of NPE-ing below
        }
        SAXReader saxReader = new SAXReader();
        org.dom4j.Document doc = saxReader.read(new File(fileName));
        Element root = doc.getRootElement();
        @SuppressWarnings("rawtypes")
        Iterator iter = root.elementIterator();
        while (iter.hasNext()) {
            Element record = (Element) iter.next();
            document = new Document(); // one Lucene document per XML record
            @SuppressWarnings("rawtypes")
            Iterator childIter = record.elementIterator();
            while (childIter.hasNext()) {
                Element child = (Element) childIter.next();
                System.out.println(child.getName() + ": " + child.getText());
                XmlContent = child.getText();
                XmlName = child.getName();
                // reset so an unrecognized element cannot re-add a stale field
                field = null;
                if (XmlName.equals("id")) {
                    // ids are generated sequentially, not read from the XML text
                    field = new Field("id", String.valueOf(++num), Field.Store.YES, Index.NOT_ANALYZED);
                } else if (XmlName.equals("keywords") || XmlName.equals("describe")) {
                    // free-text fields are tokenized by the analyzer
                    field = new Field(XmlName, XmlContent, Field.Store.YES, Index.ANALYZED);
                } else if (XmlName.equals("title") || XmlName.equals("kind")
                        || XmlName.equals("date") || XmlName.equals("url")
                        || XmlName.equals("author") || XmlName.equals("publisher")) {
                    // exact-match fields are stored as single untokenized terms
                    field = new Field(XmlName, XmlContent, Field.Store.YES, Index.NOT_ANALYZED);
                }
                if (field != null) {
                    document.add(field);
                }
            }
            try {
                // FIX: addDocument used to sit inside the inner loop, adding the
                // same partially-built document once per field and duplicating
                // index entries; a record must be added exactly once, complete.
                writer.addDocument(document);
            } catch (IOException e) { // CorruptIndexException is an IOException
                e.printStackTrace();
            }
        }
        try {
            writer.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
        System.out.println("Index is Created!");
    }
}
package jim.luceneXML;
import java.io.File;
import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
/**
 * Runs a demo query ("期中" against the "keywords" field) over the index
 * built by {@code MyLucene} and prints the stored fields of each hit.
 */
public class MySearch {
    Directory directory = null;   // storage backing the index (filesystem)
    String indexPath = "index";   // directory the index is read from
    IndexReader reader = null;    // reads the index
    IndexSearcher searcher = null;// executes queries against the reader
    QueryParser parser = null;    // parses query strings (version + analyzer)
    Query query = null;           // the parsed query to execute
    TopDocs tds = null;           // raw search results
    Document document = null;     // stored document for the hit being printed
    ScoreDoc[] sds = null;        // per-hit doc ids/scores from TopDocs

    /**
     * Opens the index, runs the hard-coded query, prints the hits, and
     * releases the searcher and reader. Returns early (without searching)
     * if the index cannot be opened or the query cannot be parsed.
     */
    public void Search() {
        try {
            directory = FSDirectory.open(new File(indexPath));
        } catch (IOException e) {
            System.out.println("创建Directory时发生错误!");
            e.printStackTrace();
            return; // no index directory — nothing to search
        }
        try {
            // CorruptIndexException is an IOException; one catch covers both
            reader = IndexReader.open(directory);
        } catch (IOException e) {
            System.out.println("创建IndexReader时发生错误!");
            e.printStackTrace();
            return; // FIX: previously fell through and NPE'd on null reader
        }
        searcher = new IndexSearcher(reader);
        parser = new QueryParser(Version.LUCENE_35, "keywords", new IKAnalyzer());
        try {
            query = parser.parse("期中");
        } catch (ParseException e) {
            System.out.println("query = parser.parse(\"keyword\")时发生错误");
            e.printStackTrace();
            closeQuietly(); // FIX: release reader/searcher on the error path too
            return;
        }
        try {
            tds = searcher.search(query, 10);
            sds = tds.scoreDocs;
            System.out.println("一共搜索到: " + sds.length + " 条");
            if (sds.length != 0) {
                for (ScoreDoc sd : sds) {
                    document = searcher.doc(sd.doc);
                    String test = document.get("id");
                    System.out.println(test + document.get("keywords") + document.get("url"));
                }
            } else {
                System.out.println("The word you enter can't be found!");
            }
        } catch (IOException e) {
            System.out.println("std = searcher.search(query,5);时发生错误");
            e.printStackTrace();
        } finally {
            closeQuietly();
        }
        System.out.println("Finished");
    }

    /** Closes the searcher and reader, swallowing close-time IO errors. */
    private void closeQuietly() {
        try {
            if (searcher != null) {
                searcher.close(); // FIX: searcher was never closed before
            }
            if (reader != null) {
                reader.close();
            }
        } catch (IOException e) {
            System.out.println("关闭reader时发生错误!");
            e.printStackTrace();
        }
    }

    /**
     * Debug helper: dumps every stored document in the index to stdout.
     *
     * @throws IOException if the index cannot be opened or read
     */
    public void check() throws IOException {
        directory = FSDirectory.open(new File("index"));
        IndexReader localReader = IndexReader.open(directory);
        try {
            for (int i = 0; i < localReader.numDocs(); i++) {
                System.out.println(localReader.document(i));
            }
        } finally {
            localReader.close(); // FIX: reader was leaked before
        }
    }
}
今天的收获:
今天对Dom4j解析xml有了一定的了解,并写了一个测试程序.
今天的不足:
对Dom4j解析xml的掌握还不够完善,程序写得不够细致,希望明天能够改善这个问题.