上一篇已经搭建好了环境,接下来就该是测试及创建Lucene的索引库了。另外,使用的数据是腾讯新闻的订阅数据(http://rss.qq.com/news.htm)。
实体类
/** Plain data holder describing one news article. */
public class Article {
// Identifier of the news article
private String id;
// Article title
private String title;
// Article content
private String content;
// Link to the article
private String url;
// Time of the article, kept as a string
private String date;
// Author of the article
private String author;
// Setter and getter methods omitted...
LuceneUtils
在Lucene中,对索引的操作需要先获取相应的对象,这里我将其封装到一个工具类中
/**
 * Central access point for the Lucene objects needed to operate on the index.
 *
 * <p>The index directory, the analyzer and the version constant are created once, when the
 * class is loaded, and are shared by every writer and searcher handed out by this class.
 */
public class LuceneUtils {
    private static final Version matchVersion = Version.LUCENE_5_5_4;
    // Standard analyzer (tokenizer)
    private static final Analyzer analyzer = new StandardAnalyzer();
    private static final Directory directory;

    static {
        try {
            /*
             * Contants.INDEXURL is the location where the index is stored:
             * public static final String INDEXURL = "./index_dir/news";
             */
            directory = FSDirectory.open(new File(Contants.INDEXURL).toPath());
        } catch (Exception e) {
            // Fail fast: continuing with a null directory would only surface later as an
            // unrelated NullPointerException in getIndexWriter()/getIndexSearcher().
            throw new ExceptionInInitializerError(e);
        }
    }

    /**
     * Returns the directory that holds the index.
     *
     * @return the shared index directory
     */
    public static Directory getDirectory() {
        return directory;
    }

    /**
     * Returns a new object used to modify the index.
     *
     * <p>A fresh {@link IndexWriterConfig} is built for every writer because a config instance
     * must not be shared between writers. The caller is responsible for closing the writer.
     *
     * @return a newly opened index writer
     * @throws Exception if the writer cannot be opened
     */
    public static IndexWriter getIndexWriter() throws Exception {
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
        return new IndexWriter(directory, indexWriterConfig);
    }

    /**
     * Returns a new object used to read (search) the index.
     *
     * <p>Each call opens a new {@link IndexReader}; the caller should close it through
     * {@code indexSearcher.getIndexReader().close()} when done.
     *
     * @return a searcher over a newly opened reader
     * @throws Exception if the index cannot be opened
     */
    public static IndexSearcher getIndexSearcher() throws Exception {
        IndexReader indexReader = DirectoryReader.open(directory);
        return new IndexSearcher(indexReader);
    }

    /**
     * Returns the Lucene version in use.
     *
     * @return the version constant
     */
    public static Version getMatchVersion() {
        return matchVersion;
    }

    /**
     * Returns the analyzer currently in use.
     *
     * @return the shared analyzer
     */
    public static Analyzer getAnalyzer() {
        return analyzer;
    }
}
Lucene的持久化类
对于lucene的索引,我的理解是就像数据库一样,可以对其进行CRUD。
/** Persistence class for the Lucene index. */
public class LuceneDao {
    /**
     * Adds one article to the index.
     *
     * @param article the article to index
     * @throws Exception if the writer cannot be opened or the document cannot be written
     */
    public void addIndex(Article article) throws Exception {
        // Lucene works on Document objects, so the JavaBean is converted first. Doing this
        // before opening the writer means a conversion failure cannot leave a writer open.
        Document document = ArticleUtil.articleToDocument(article);
        // try-with-resources closes the writer (and releases the index write lock) even when
        // addDocument throws.
        try (IndexWriter indexWriter = LuceneUtils.getIndexWriter()) {
            // Write the document to the index on disk
            indexWriter.addDocument(document);
        }
    }
}
将JavaBean转化成document
上一段有一个ArticleUtil类,这个类就是将Java对象转化为Document对象的类
/** Converts {@code Article} beans into Lucene {@code Document} objects. */
public class ArticleUtil {
    /**
     * Builds a Lucene document from an article.
     *
     * <p>Every field is stored. {@code content} is added as a {@code TextField}; all other
     * fields are added as {@code StringField}s. A property whose value is {@code null} is
     * left out of the document, because Lucene field constructors reject null values.
     *
     * @param article the article to convert
     * @return a document holding the article's non-null properties
     */
    public static Document articleToDocument(Article article) {
        Document document = new Document();
        addStringField(document, "id", article.getId());
        addStringField(document, "title", article.getTitle());
        addTextField(document, "content", article.getContent());
        addStringField(document, "author", article.getAuthor());
        addStringField(document, "url", article.getUrl());
        addStringField(document, "date", article.getDate());
        return document;
    }

    /** Adds a stored StringField unless the value is null. */
    private static void addStringField(Document document, String name, String value) {
        if (value != null) {
            document.add(new StringField(name, value, Store.YES));
        }
    }

    /** Adds a stored TextField unless the value is null. */
    private static void addTextField(Document document, String name, String value) {
        if (value != null) {
            document.add(new TextField(name, value, Store.YES));
        }
    }
}
获取新闻
使用httpcomponents工具来获取网络资源,所以这里还需要导入相应的jar包
使用maven
<!-- Apache HttpComponents HttpClient, used to fetch network resources -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.3</version>
</dependency>
该工具是Apache组织的一个开源项目,所以可以直接到Apache.org下载,这里只需要导入这两个包就行
获取到资源后,因为返回的是xml格式的数据,所以我们还需要对获取的数据进行解析,这里使用dom4j。先导入相应的jar包
使用maven
<!-- dom -->
<dependency>
<groupId>dom4j</groupId>
<artifactId>dom4j</artifactId>
<version>1.6.1</version>
</dependency>
环境准备好后就是编写相应的代码了
public class HttpUtil {
public static List<Article> getNewByPath(String path) throws Exception {
List<Article> articles = new ArrayList<Article>();
// 使用httpcomponents发送请求
HttpClient httpClient = HttpClients.createDefault();
HttpGet httpGet = new HttpGet(path);
HttpResponse response = httpClient.execute(httpGet);
// 使用sax解析器解析数据
SAXReader saxReader = new SAXReader();
Document document = saxReader.read(response.getEntity().getContent());
Element root = document.getRootElement();
Element element = root.element("channel");
List<Element> elements = element.elements("item");
for (Element e : elements) {
//封装解析的数据
Article article = new Article();
article.setId(UUID.randomUUID().toString());
article.setTitle(e.elementText("title"));
article.setContent(e.elementText("description"));
article.setUrl(e.elementText("link"));
article.setAuthor(e.elementText("author"));
article.setDate(e.elementText("pubDate"));
articles.add(article);
/*System.out.println("title===" + e.elementText("title"));
System.out.println("link===" + e.elementText("link"));
System.out.println("author===" + e.elementText("author"));
System.out.println("pubDate===" + e.elementText("pubDate"));
System.out.println("description===" + e.elementText("description"));
System.out.println("=================================================");
System.out.println();
System.out.println();*/
}
return articles;
}
//测试成功之后这段代码可以注释掉
public static void main(String[] args) throws Exception {
List<Article> articles = HttpUtil.getNewByPath("http://news.qq.com/newsgn/rss_newsgn.xml");
for(Article a : articles){
System.out.println(a.getTitle());
System.out.println(a.getDate());
System.out.println(a.getContent());
System.out.println("=====================");
}
}
}
运行结果
获取数据,并将获取的数据添加到索引库
使用单元测试的方式测试
/**
 * Fetches the news feed and adds every returned article to the index.
 *
 * <p>Exceptions propagate to the test runner so that a failed download or index write makes
 * the test fail instead of being printed and ignored.
 *
 * @throws Exception if fetching the feed or writing to the index fails
 */
@Test
public void testDao() throws Exception {
    String path = "http://news.qq.com/newsgn/rss_newsgn.xml";
    List<Article> articles = HttpUtil.getNewByPath(path);
    for (Article a : articles) {
        luceneDao.addIndex(a);
    }
}
这里保存的路径是项目的根路径,索引创建成功后,在项目路径下会多出一个index_dir文件夹
到这里,索引库的创建就完成了。下一篇将介绍索引库的查询。作为一个小白,希望大家多多支持。