Lucene进行中文分词索引构建时出现的问题，谁能帮我解答！

最新推荐文章于 2021-02-28 04:24:56 发布

iloveklg

最新推荐文章于 2021-02-28 04:24:56 发布

阅读量199

点赞数

分类专栏： Lucene

本文链接：https://blog.csdn.net/iloveklg/article/details/105337081

版权

Lucene 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

package IndexService;

import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;

/**
*
* @method Lucene索引操作对象工具类
* @author Mr yi
* @time 2019年5月23日
*/
public class LuceneUtils {

private static Directory directory = null;

// private static IndexWriterConfig indexWriterConfig = null;

private static Analyzer analyzer = null;

private static Version matchVersion = null;

//static 代码块随着类的加载，只加载一次。作用是初始化类。
static{
try {
   //在 6.6 以上版本中 version 不再是必要的，并且，存在无参构造方法，可以直接使用默认的 StandardAnalyzer 分词器。
matchVersion = Version.LUCENE_8_5_0;
//索引存放的位置，设置在当前目录中（项目根路径下）
// final String INDEXURL = "E://users//Administrator//eclipse-workspace//searchEngine//indexDir";
// directory = FSDirectory.open(new File(INDEXURL).toPath());/* Paths.get(INDEXURL)*/
//analyzer = new StandardAnalyzer(); // 标准分词器，适用于英文[支持中文采用的方法为单字切分。他会将词汇单元转换成小写形式，并去除停用词和标点符号]
//analyzer = new SmartChineseAnalyzer();//中文分词
//analyzer = new ComplexAnalyzer();//中文分词
   //analyzer = new IKAnalyzer();//中文分词
analyzer = new IKAnalyzer();//中文分词
} catch (Exception e) {
e.printStackTrace();
}
}

public static Directory getDirectory() {
return directory;
}

/**
*
* @method 返回用于操作索引的对象
* @author Mr yi
* @time 2019年5月23日
* @return
* @throws Exception
*/
public static IndexWriter getIndexWriter() throws Exception{
//创建索引写入配置
// indexWriterConfig = new IndexWriterConfig(analyzer);
//创建索引写入对象
// IndexWriter indexWriter = new IndexWriter(directory, indexWriterConfig);
IndexWriter indexWriter = new IndexWriter("E://users//Administrator//eclipse-workspace//searchEngine//indexDir",analyzer, true);
return indexWriter;
}

/**
*
* @method 返回用于读取索引的对象
* @author Mr yi
* @time 2019年5月23日
* @return
* @throws Exception
*/
public static IndexSearcher getIndexSearcher() throws Exception{
IndexReader indexReader = DirectoryReader.open(directory);
IndexSearcher indexSearcher = new IndexSearcher(indexReader);
return indexSearcher;
}

/**
*
* @method 返回当前版本
* @author Mr yi
* @time 2019年5月23日
* @return
*/
public static Version getMatchVersion() {
return matchVersion;
}

/**
*
* @method 返回当前使用的分词器
* @author Mr yi
* @time 2019年5月23日
* @return
*/
public static Analyzer getAnalyzer() {
return analyzer;
}


/**
   *
   * @method 将Article数据转换为Document
   * @author Mr yi
   * @time 2019年5月23日
   * @param article 对象
   * @return
* @throws IOException
   */
   public static Document articleToDocument(News news) throws IOException{

       if(news==null)
       return null;

   Document document = new Document();
   StringField identifier = new StringField("newsId",news.getId(),Store.YES);
   StringField newsurl = new StringField("newsUrl",news.getUrl(),Store.YES);
   TextField newstitle = new TextField("newsTitle",news.getTitle(),Store.YES);
   TextField newsdate = new TextField("newsDate",news.getDate(),Store.YES);
   TextField newsbody = new TextField("newsBody",news.getBody(),Store.YES);
   long mills = System.currentTimeMillis();
   StringField indextime = new StringField("indexTime",mills+"",Store.YES);
   StringField newsdate2 = new StringField("newDate2",news.getDate().substring(0, 4)+news.getDate().substring(5, 7)+news.getDate().substring(8, 10),Store.YES);

   document.add(identifier);
   document.add(newsurl);
   document.add(newstitle);
   document.add(newsdate);
   document.add(newsbody);
   document.add(indextime);
   document.add(newsdate2);
   return document;
   }

/**
   *
   * @method 添加索引
   * @author Mr yi
   * @time 2019年5月24日
   * @param document
   * @throws Exception
   */
public static void addIndex(String path) throws Exception{
//获取indexWrite对象
IndexWriter indexWriter = LuceneUtils.getIndexWriter();

File folder = new File(path);
   if (folder.isDirectory()) {
   File[] list = folder.listFiles();
   for (File f : list) {
       File file = new File("E://users//Administrator//eclipse-workspace//searchEngine//dataDir//"+f.getName());
       Newsdao nd=new Newsdao();
    ArrayList<String> content = nd.readFileContent(file);
    String id = nd.readFileId(file);
    News news = new News();
    news.setId(id);
//    System.out.println(news.getId());
    news.setUrl(content.get(0).replace("url:", ""));
//    System.out.println(news.getUrl());
    news.setTitle(content.get(1).replace("title:", ""));
//    System.out.println(news.getTitle());
    news.setDate(content.get(2).replace("time:", ""));
//    System.out.println(news.getDate());
    news.setBody(content.get(3));
//    System.out.println(news.getBody());

Document document = LuceneUtils.articleToDocument(news);

try {
   //将document写入磁盘中
indexWriter.addDocument(document);
}finally {//定要注意关闭indexWrite. 包括异常下,用finally关闭.否则会导致下一次写索引失败.，修改程序后，直接删除write.lock文件后就可以
   indexWriter.close();
}
}}



}

/**
* 添加索引
* @param article
* @throws Exception
*/
public static void main(String[] args) throws Exception{
//    IndexWriter indexWriter = LuceneUtils.getIndexWriter();
//    News news = new News(); //这里需要给news 对象赋值
    String path = "E://users//Administrator//eclipse-workspace//searchEngine//dataDir";
    LuceneUtils.addIndex(path);

}

}

iloveklg

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Lucene进行中文分词索引构建时出现的问题，谁能帮我解答！

package IndexService;/**** @method Lucene索引操作对象工具类* @author Mr yi* @time 2019年5月23日*/public class LuceneUtils { private static Directory directory = null;// private static Ind...
复制链接

扫一扫