今天需要索引的主角是以下几个字:【中石化华东运费表】
1. 原生 SmartChineseAnalyzer
通过下面代码可以看到分词效果
/**
 * Demonstrates the tokenization produced by SmartChineseAnalyzer on a sample
 * Chinese phrase, printing the tokens via doToken.
 *
 * @param args unused command-line arguments
 * @throws IOException if the token stream cannot be consumed
 */
public static void main(String[] args) throws IOException {
    String chineseText = "中石化华东运费表";
    // try-with-resources closes the analyzer; the method already declares
    // IOException, so errors propagate instead of being silently swallowed
    // by an empty catch block as in the original.
    try (Analyzer ana = new SmartChineseAnalyzer()) {
        TokenStream ts = ana.tokenStream("content", chineseText);
        System.out.println("分词效果:");
        doToken(ts);
    }
}
分词效果:
中石化|华东|运费表|
如果你检索“石化”,结果是 0 条;一定要检索“中石化”才行,这不是我想要的。
2. 扩展SmartChineseAnalyzer
建立【石化】和【中石化】的同义词
builder.add(new CharsRef("石化"), new CharsRef("中石化"), true);
通过上面的代码再跑分词,效果依旧是: 中石化|华东|运费表|
如果同义词这样建立:
builder.add(new CharsRef("中石化"), new CharsRef("石化"), true);
分词效果:
石化|中石化|华东|运费表|
显然,对于已有索引库而言,同义词这2者前后顺序不一样,是有区别的;但是如果从头开始(索引库还没建立),搜石化 和 搜中石化 结果是一样。 你可以试试。
附上代码,在这里。
LuceneUtil.java
// Utility class wrapping Lucene index / search / highlight / delete operations.
public class LuceneUtil {
// Shared analyzer used for indexing, query parsing and highlighting.
// MySmartChineseAnalyzer adds synonym support on top of SmartChineseAnalyzer.
Analyzer analyzer = new MySmartChineseAnalyzer(); // was: new SmartChineseAnalyzer()
/**
 * Adds one document (title, content, id, date) to the index at {@code path}.
 * The index is created if absent, otherwise appended to.
 *
 * @param title   stored/indexed "title" field
 * @param content stored/indexed "content" field
 * @param id      document id, stored as a string
 * @param date    document date, formatted as "yyyy-MM-dd HH:mm:ss" (may be null)
 * @param path    filesystem path of the index directory
 * @throws IOException if the index cannot be opened or written
 */
public void index(String title, String content, int id, Date date, String path) throws IOException {
    IndexWriterConfig iwConfig = new IndexWriterConfig(analyzer);
    // Append to an existing index, or create it if it does not exist yet.
    iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
    // try-with-resources guarantees the directory and writer are closed even
    // if addDocument/commit throws — the original leaked both on error.
    try (Directory directory = FSDirectory.open(Paths.get(path));
            IndexWriter iwriter = new IndexWriter(directory, iwConfig)) {
        Document doc = new Document();
        doc.add(new TextField("title", title, Field.Store.YES));
        doc.add(new TextField("content", content, Field.Store.YES));
        doc.add(new TextField("id", String.valueOf(id), Field.Store.YES));
        doc.add(new TextField("date", DateTimeToString(date), Field.Store.YES));
        iwriter.addDocument(doc);
        // Make the new document durable and visible to subsequent readers.
        iwriter.commit();
    }
}
/**
 * Returns {@code fieldContent} with query matches wrapped in a red
 * {@code <font>} tag, or {@code null} when the highlighter produces no fragment.
 *
 * @param query        the parsed query whose terms should be highlighted
 * @param fieldName    name of the field the content came from
 * @param fieldContent raw stored text to highlight
 * @return highlighted HTML fragment, or null if nothing matched
 * @throws IOException                  if the token stream cannot be read
 * @throws InvalidTokenOffsetsException if token offsets are inconsistent
 */
public String displayHtmlHighlight(Query query, String fieldName, String fieldContent)
        throws IOException, InvalidTokenOffsetsException {
    // Score fragments against the query, then wrap hits in the markup below.
    QueryScorer scorer = new QueryScorer(query);
    Highlighter highlighter = new Highlighter(
            new SimpleHTMLFormatter("<font color='#CD4336'>", "</font>"), scorer);
    highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
    return highlighter.getBestFragment(analyzer, fieldName, fieldContent);
}
/**
 * Searches the "content" field of the index at {@code path} for {@code text}.
 *
 * @param text query string, parsed with this class's analyzer
 * @param path filesystem path of the index directory
 * @return up to 100 hits as maps with keys "id", "title",
 *         "content" (highlighted HTML) and "date"; hits sharing an id are
 *         merged into a single entry
 * @throws IOException                  if the index cannot be read
 * @throws ParseException               if {@code text} cannot be parsed
 * @throws InvalidTokenOffsetsException if highlighting fails
 */
public List<Map<String, Object>> search(String text, String path)
        throws IOException, ParseException, InvalidTokenOffsetsException {
    // Parse the query once, outside the hit loop — the original re-parsed the
    // same text for every hit and also built an unused "title" query.
    Query query = new QueryParser("content", analyzer).parse(text);
    List<Map<String, Object>> list = new ArrayList<Map<String, Object>>();
    List<String> ids = new ArrayList<String>();
    // try-with-resources closes the reader and directory (the original leaked them).
    try (Directory directory = FSDirectory.open(Paths.get(path));
            DirectoryReader ireader = DirectoryReader.open(directory)) {
        IndexSearcher searcher = new IndexSearcher(ireader);
        TopDocs tds = searcher.search(query, 100);
        for (ScoreDoc sd : tds.scoreDocs) {
            // Fetch the stored document once instead of once per field.
            Document doc = searcher.doc(sd.doc);
            String id = doc.get("id");
            String short_content = doc.get("content");
            // ...... content shortening omitted in the original
            String html_content = displayHtmlHighlight(query, "content", short_content);
            if (ids.contains(id)) {
                // Duplicate id: keep the existing entry, refresh its content.
                list.stream().filter(s -> id.equals(s.get("id").toString()))
                        .findFirst().get().put("content", html_content);
                continue;
            }
            String short_title = doc.get("title");
            // ...... title shortening omitted in the original
            Map<String, Object> map = new HashMap<String, Object>();
            map.put("title", short_title);
            map.put("content", html_content);
            map.put("id", id);
            map.put("date", doc.get("date"));
            list.add(map);
            // The original never added to ids, so the dedup branch above
            // could never fire — record the id so duplicates are merged.
            ids.add(id);
        }
    }
    return list;
}
/**
 * Deletes every document whose given field contains the exact term
 * {@code keyWord} from the index at {@code path}.
 *
 * @param filed   field name to match (NOTE: parameter name is a typo for
 *                "field"; kept for source compatibility)
 * @param keyWord exact term to delete by
 * @param path    filesystem path of the index directory
 * @throws IOException if the index cannot be opened or written
 */
public void delete(String filed, String keyWord, String path) throws IOException {
    IndexWriterConfig iwConfig = new IndexWriterConfig(analyzer);
    // Open existing index (or create) so deletes apply to current segments.
    iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
    // try-with-resources closes directory and writer even when
    // deleteDocuments/commit throws — the original leaked both on error.
    try (Directory directory = FSDirectory.open(Paths.get(path));
            IndexWriter iwriter = new IndexWriter(directory, iwConfig)) {
        iwriter.deleteDocuments(new Term(filed, keyWord));
        iwriter.commit();
    }
}
/**
 * Formats a date as "yyyy-MM-dd HH:mm:ss".
 *
 * @param date date to format; may be null
 * @return formatted string, or null when {@code date} is null
 */
private String DateTimeToString(Date date) {
    if (date == null) {
        return null;
    }
    return DateToString(date, "yyyy-MM-dd HH:mm:ss");
}
/**
 * Formats a date with the given pattern.
 * A fresh SimpleDateFormat is created per call because the class is not
 * thread-safe and must not be cached naively.
 *
 * @param date    date to format; may be null
 * @param pattern SimpleDateFormat pattern, e.g. "yyyy-MM-dd HH:mm:ss"
 * @return formatted string, or null when {@code date} is null
 */
private String DateToString(Date date, String pattern) {
    if (date == null) {
        // Short-circuit: the original constructed a formatter even for null input.
        return null;
    }
    return new SimpleDateFormat(pattern).format(date);
}
MySmartChineseAnalyzer.java
/**
 * SmartChineseAnalyzer variant that adds stop-word filtering plus a small
 * built-in synonym map (e.g. 石化 ⇔ 中石化) on top of the HMM Chinese tokenizer.
 */
public class MySmartChineseAnalyzer extends Analyzer {
    // Per-instance stop-word set. NOTE: this was declared "static" but assigned
    // from every constructor, so each new analyzer instance clobbered the set
    // shared by ALL instances; instance scope fixes that.
    private CharArraySet stopWords;
    // Classpath resource holding the default stop words, one per line.
    private static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
    // Lines starting with this prefix in the stop-word file are comments.
    private static final String STOPWORD_FILE_COMMENT = "//";
/**
 * Returns the default stop-word set, loaded lazily from stopwords.txt on
 * first access via the holder class.
 *
 * @return the shared, unmodifiable default stop-word set
 */
public static CharArraySet getDefaultStopSet() {
    return DefaultSetHolder.DEFAULT_STOP_SET;
}
/** Creates an analyzer using the default stop-word set. */
public MySmartChineseAnalyzer() {
    this(true);
}

/**
 * Creates an analyzer with or without the default stop words.
 *
 * @param useDefaultStopWords true to use the bundled stop-word set,
 *                            false for no stop words at all
 */
public MySmartChineseAnalyzer(boolean useDefaultStopWords) {
    if (useDefaultStopWords) {
        this.stopWords = DefaultSetHolder.DEFAULT_STOP_SET;
    } else {
        this.stopWords = CharArraySet.EMPTY_SET;
    }
}

/**
 * Creates an analyzer with a caller-supplied stop-word set.
 *
 * @param stopWords stop words to filter out; null means "no stop words"
 */
public MySmartChineseAnalyzer(CharArraySet stopWords) {
    if (stopWords == null) {
        this.stopWords = CharArraySet.EMPTY_SET;
    } else {
        this.stopWords = stopWords;
    }
}
// The synonym map is immutable and expensive to build; construct it exactly
// once instead of on every createComponents call as the original did.
private static final SynonymMap SYNONYMS = buildSynonymMap();

/** Builds the synonym map: 石化 → 中石化 and 运费 → 运费表 (keepOrig = true). */
private static SynonymMap buildSynonymMap() {
    try {
        SynonymMap.Builder builder = new SynonymMap.Builder();
        builder.add(new CharsRef("石化"), new CharsRef("中石化"), true);
        builder.add(new CharsRef("运费"), new CharsRef("运费表"), true);
        return builder.build();
    } catch (IOException e) {
        // The original swallowed this and relied on a (normally disabled)
        // assert, risking an NPE later; fail fast and preserve the cause.
        throw new java.io.UncheckedIOException("Unable to build synonym map", e);
    }
}

/**
 * Builds the analysis chain: HMM Chinese tokenizer → Porter stemming (for
 * latin tokens) → optional stop-word removal → synonym injection.
 * NOTE(review): SynonymGraphFilter graphs are flattened at index time; for
 * fully correct multi-token synonyms the filter is usually applied at query
 * time only — confirm this is intended.
 */
public TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = new HMMChineseTokenizer();
    TokenStream result = new PorterStemFilter(tokenizer);
    if (!this.stopWords.isEmpty()) {
        result = new StopFilter(result, this.stopWords);
    }
    result = new SynonymGraphFilter(result, SYNONYMS, true);
    return new TokenStreamComponents(tokenizer, result);
}
// Normalization applied to query terms (e.g. by QueryParser): lower-case so
// latin-alphabet searches match case-insensitively.
protected TokenStream normalize(String fieldName, TokenStream in) {
return new LowerCaseFilter(in);
}
/**
 * Lazy holder for the default stop-word set (initialization-on-demand idiom):
 * the file is only read when the set is first requested.
 */
private static class DefaultSetHolder {
    static final CharArraySet DEFAULT_STOP_SET;

    private DefaultSetHolder() {
    }

    /**
     * Loads the stop-word file bundled with Lucene's SmartChineseAnalyzer.
     *
     * @return an unmodifiable stop-word set
     * @throws IOException if the classpath resource cannot be read
     */
    static CharArraySet loadDefaultStopWordSet() throws IOException {
        // Use the declared constants instead of repeating the literals,
        // as the original did.
        return CharArraySet.unmodifiableSet(WordlistLoader.getWordSet(
                IOUtils.getDecodingReader(
                        org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer.class,
                        DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8),
                STOPWORD_FILE_COMMENT));
    }

    static {
        try {
            DEFAULT_STOP_SET = loadDefaultStopWordSet();
        } catch (IOException e) {
            // Preserve the cause — the original dropped the caught exception.
            throw new RuntimeException("Unable to load default stopword set", e);
        }
    }
}