一个自己用lucene写的敏感词查询

最新推荐文章于 2021-05-26 23:35:12 发布

枭鹏

最新推荐文章于 2021-05-26 23:35:12 发布

阅读量2.1k

点赞数 5

分类专栏： java的点点滴滴

本文链接：https://blog.csdn.net/u014306354/article/details/50255903

版权

java的点点滴滴专栏收录该内容

12 篇文章 0 订阅

订阅专栏

lucene

package ddd.cms.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.nio.charset.StandardCharsets;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Sensitive-word checker built on Lucene.
 *
 * <p>The text to inspect is written to a temporary text file, the directory holding that
 * file is indexed, and every sensitive word is then run as a query against the index. A word
 * is reported as present when the query has at least one hit whose top score reaches
 * {@link #BASE_SCORE}.
 *
 * <p>The class works on fixed file-system locations ({@link #DOCS_PATH}, {@link #INDEX_PATH})
 * that are rewritten on every call, so concurrent calls would overwrite each other's data.
 *
 * <p>NOTE(review): every readable file below {@link #DOCS_PATH} is indexed, not only the
 * temporary content file, so unrelated files placed there can influence the results.
 *
 * <p>NOTE(review): words are analyzed with {@code StandardAnalyzer} and parsed with the
 * classic {@code QueryParser} using its default settings. For CJK text this presumably yields
 * an OR query over single characters rather than an exact phrase match, which is why a score
 * threshold is applied on top of the hit count -- confirm against the Lucene version in use.
 *
 * @author Administrator
 */
public class SensitiveWordUtil {
    /** Directory that receives the temporary content file and is indexed as a whole. */
    private static final String DOCS_PATH = "D:\\CMSDocs";
    /** Name of the temporary file the inspected text is written to. */
    private static final String FILE_NAME = "analysis.txt";
    /** Directory holding the Lucene index; recreated on every run. */
    private static final String INDEX_PATH = "D:\\CMSIndex";
    /** Minimum score the best hit must reach for a word to count as present. */
    private static final double BASE_SCORE = 0.02;
    /** Name of the indexed field that holds the analyzed file contents. */
    private static final String CONTENTS_FIELD = "contents";
    /** Page size handed to {@link #doPagingSearch}. */
    private static final int HITS_PER_PAGE = 10;

    /**
     * Small demo: checks a fixed list of words against a fixed text and prints the result map.
     *
     * @param args unused
     * @throws Exception if writing, indexing or searching fails
     */
    public static void main(String[] args) throws Exception {
        String[] allSensitiveWord = {"使命", "持续创新", "没出事", "十二号", "辈数儿", "否存在"};
        String content = "bseurhg87e略后来持续改了良好社会了十二和谁零售额好了粉色，哦然否存在后首尔玫我们的使命：让IT更简单数据是未来商业的核心，IT是企业数据存储和分析的基础，SMARTX让IT更简单，帮助企业挖掘数据价值，让商业更有效率。我们相信： 1. 持续创新 SMARTX由热爱挑战的团队组建，永远解决用户最痛、最难的问题，不断创新为用户创造价值。 2. 简胜于繁不论是产品的设计实现，还是用户体验，我们把简单原则贯彻如一。让企业用更直觉、更简单的红色还是摩尔佛爱问还是没饿坏了分数而肥瘦儿";

        Map<String, Boolean> resMap = getSearchResult(allSensitiveWord, content);
        System.out.println(resMap);
    }

    /**
     * Checks which of the given sensitive words occur in {@code content}.
     *
     * <p>The content is written to the temporary file and indexed first; the index is then
     * opened once and queried for every word.
     *
     * @param allSensitiveWord words to look for; a {@code null} array is treated as empty, and
     *        {@code null} or blank entries are reported as not found
     * @param content text to inspect
     * @return map from each word to whether it was found, or {@code null} when
     *         {@code content} is {@code null}
     * @throws IOException if the content cannot be written, the index cannot be built, or the
     *         index cannot be read
     * @throws ParseException if a word cannot be turned into a query
     */
    public static Map<String, Boolean> getSearchResult(String[] allSensitiveWord, String content) throws IOException, ParseException {
        if (content == null) {
            return null;
        }
        Map<String, Boolean> resultMap = new HashMap<String, Boolean>();
        if (allSensitiveWord == null || allSensitiveWord.length == 0) {
            return resultMap;
        }

        // Preparation failures now propagate instead of letting the search run on a stale index.
        startWork(INDEX_PATH, DOCS_PATH, FILE_NAME, content);

        // Open the index once for all words and guarantee that it is closed again.
        try (Directory dir = FSDirectory.open(new File(INDEX_PATH));
             IndexReader reader = DirectoryReader.open(dir);
             Analyzer analyzer = new StandardAnalyzer()) {
            IndexSearcher searcher = new IndexSearcher(reader);
            QueryParser parser = new QueryParser(CONTENTS_FIELD, analyzer);
            for (String sensitiveWord : allSensitiveWord) {
                Map<String, Object> resMap = judeIsExsist(sensitiveWord, searcher, parser);
                int totalHits = (Integer) resMap.get("totalHits");
                ScoreDoc scoreDoc = (ScoreDoc) resMap.get("scoreDoc");
                boolean isFind = totalHits >= 1 && scoreDoc != null && scoreDoc.score >= BASE_SCORE;
                resultMap.put(sensitiveWord, isFind);
            }
        }
        return resultMap;
    }

    /**
     * Prepares a search: writes the content to the temporary file and rebuilds the index.
     *
     * @param indexPath directory of the Lucene index
     * @param docsPath directory that holds the temporary file and is indexed
     * @param fileName name of the temporary file
     * @param content text to write
     * @throws IOException if writing the file or building the index fails
     */
    private static void startWork(String indexPath, String docsPath, String fileName, String content) throws IOException {
        writeContentTotxt(docsPath, fileName, content);
        createIndex(indexPath, docsPath, true);
    }

    /**
     * Writes the text to {@code fileName} inside {@code filePath}, replacing earlier content.
     * The directory is created (including missing parents) when it does not exist yet.
     *
     * @param filePath target directory
     * @param fileName target file name
     * @param content text to write
     * @throws IOException if the directory cannot be created or the file cannot be written
     */
    private static void writeContentTotxt(String filePath, String fileName, String content) throws IOException {
        File folder = new File(filePath);
        if (!folder.isDirectory() && !folder.mkdirs()) {
            throw new IOException("Cannot create directory '" + folder.getAbsolutePath() + "'");
        }
        // File(parent, child) joins with the platform separator instead of a hard-coded "\\".
        File contentFile = new File(folder, fileName);
        if (!printStream(contentFile.getPath(), content, false)) {
            throw new IOException("写入文件失败！");
        }
    }

    /**
     * Writes {@code content} followed by a line break to the given file as UTF-8.
     *
     * <p>UTF-8 is used because {@link #indexDocs} decodes the file as UTF-8 when indexing.
     *
     * @param filePath path of the file to write
     * @param content text to write; when {@code null} nothing is written and the file is left
     *        untouched
     * @param isTrue {@code true} to append, {@code false} to overwrite existing content
     * @return {@code true} if the content was written, {@code false} if {@code content} was
     *         {@code null}
     * @throws IOException if the file cannot be opened or the write fails
     */
    private static boolean printStream(String filePath, String content, boolean isTrue) throws IOException {
        if (content == null) {
            System.out.println("对不起你的要写入的内容为空");
            return false;
        }
        try (FileOutputStream fileOut = new FileOutputStream(filePath, isTrue);
             PrintStream printer = new PrintStream(fileOut, true, StandardCharsets.UTF_8.name())) {
            printer.println(content);
            // PrintStream never throws on write errors; it only records them.
            if (printer.checkError()) {
                throw new IOException("Failed to write '" + filePath + "'");
            }
        }
        return true;
    }

    /**
     * Builds the Lucene index for everything below {@code docsPath}.
     *
     * @param indexPath directory of the Lucene index
     * @param docsPath directory whose files are indexed
     * @param create {@code true} to replace an existing index, {@code false} to update it
     * @throws IOException if {@code docsPath} is missing or unreadable, or if indexing fails
     */
    private static void createIndex(String indexPath, String docsPath, boolean create) throws IOException {
        final File docDir = new File(docsPath);
        if (!docDir.exists() || !docDir.canRead()) {
            // Report the problem to the caller; a utility must not terminate the whole JVM.
            throw new IOException("资源文件目录 '" + docDir.getAbsolutePath() + "' 不存在或不可读，请检查！");
        }

        long startMillis = System.currentTimeMillis();
        System.out.println("建立索引文件到该目录 '" + indexPath + "'...");

        try (Analyzer analyzer = new StandardAnalyzer();
             Directory dir = FSDirectory.open(new File(indexPath))) {
            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_10_2, analyzer);
            // CREATE discards the previous index in this directory; CREATE_OR_APPEND updates it.
            iwc.setOpenMode(create ? OpenMode.CREATE : OpenMode.CREATE_OR_APPEND);
            // Closing the writer in every case releases the index write lock.
            try (IndexWriter writer = new IndexWriter(dir, iwc)) {
                indexDocs(writer, docDir);
            }
        }
        System.out.println((System.currentTimeMillis() - startMillis) + " total milliseconds");
    }

    /**
     * Adds {@code file} to the index; directories are walked recursively. Unreadable files and
     * files that cannot be opened are skipped silently.
     *
     * <p>Each file becomes one document with the stored field {@code path}, the numeric field
     * {@code modified} (last-modified time) and the analyzed field {@code contents}, which is
     * read from the file as UTF-8.
     *
     * @param writer index writer to add the documents to
     * @param file file or directory to index
     * @throws IOException if reading a file or writing to the index fails
     */
    private static void indexDocs(IndexWriter writer, File file) throws IOException {
        if (!file.canRead()) {
            return;
        }
        if (file.isDirectory()) {
            String[] children = file.list();
            if (children != null) {
                for (String child : children) {
                    indexDocs(writer, new File(file, child));
                }
            }
            return;
        }

        final FileInputStream fis;
        try {
            fis = new FileInputStream(file);
        } catch (FileNotFoundException ignored) {
            // Best effort: the file vanished or cannot be opened, so leave it out of the index.
            return;
        }
        // Closing the reader also closes the underlying file stream.
        try (BufferedReader contents = new BufferedReader(new InputStreamReader(fis, StandardCharsets.UTF_8))) {
            Document doc = new Document();
            doc.add(new StringField("path", file.getPath(), Field.Store.YES));
            doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));
            doc.add(new TextField(CONTENTS_FIELD, contents));

            if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                System.out.println("adding " + file);
                writer.addDocument(doc);
            } else {
                // An older entry for the same path is replaced.
                System.out.println("updating " + file);
                writer.updateDocument(new Term("path", file.getPath()), doc);
            }
        }
    }

    /**
     * Runs the query and reports the total hit count together with the best hit.
     *
     * @param in unused
     * @param searcher searcher to run the query on
     * @param query query to run
     * @param hitsPerPage page size; up to {@code 5 * hitsPerPage} hits are collected
     * @param raw unused
     * @param interactive unused
     * @return map with {@code "totalHits"} (an {@code Integer}) and {@code "scoreDoc"} (the top
     *         {@code ScoreDoc}, or {@code null} when there is no hit)
     * @throws IOException if the search fails
     */
    public static Map<String, Object> doPagingSearch(BufferedReader in, IndexSearcher searcher, Query query, int hitsPerPage, boolean raw, boolean interactive) throws IOException {
        Map<String, Object> resMap = new HashMap<String, Object>();
        TopDocs results = searcher.search(query, 5 * hitsPerPage);
        ScoreDoc[] hitDocs = results.scoreDocs;

        int numTotalHits = results.totalHits;
        resMap.put("totalHits", numTotalHits);
        if (hitDocs.length != 0 && numTotalHits > 0 && hitDocs[0] != null) {
            resMap.put("scoreDoc", hitDocs[0]);
            System.out.println(hitDocs[0].toString());
            System.out.println(numTotalHits + " total matching documents");
        } else {
            resMap.put("scoreDoc", null);
        }
        return resMap;
    }

    /**
     * Searches the index for one sensitive word.
     *
     * <p>The word is trimmed and escaped before parsing, so characters that have a meaning in
     * the query syntax are matched literally.
     *
     * @param sensitiveWord word to look for; {@code null} or blank yields a "no hit" result
     * @param searcher searcher over the freshly built index
     * @param parser parser bound to the contents field
     * @return map with {@code "totalHits"} and {@code "scoreDoc"}, see {@link #doPagingSearch}
     * @throws IOException if the search fails
     * @throws ParseException if the word cannot be parsed into a query
     */
    private static Map<String, Object> judeIsExsist(String sensitiveWord, IndexSearcher searcher, QueryParser parser) throws IOException, ParseException {
        String word = sensitiveWord == null ? "" : sensitiveWord.trim();
        if (word.isEmpty()) {
            // Nothing to search for; parsing an empty string would fail.
            Map<String, Object> noHit = new HashMap<String, Object>();
            noHit.put("totalHits", 0);
            noHit.put("scoreDoc", null);
            return noHit;
        }
        Query query = parser.parse(QueryParser.escape(word));
        System.out.println("Searching for: " + query.toString(CONTENTS_FIELD));
        return doPagingSearch(null, searcher, query, HITS_PER_PAGE, false, true);
    }
}