场景介绍:
在处理输入的文本时,需要将 http://bit.ly/3ynriE 等短链接转换为对应的真实链接,例如 lucene.apache.org/solr
1,实现TokenFilter
package com.url.plugin;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import java.io.IOException;
import java.util.regex.Pattern;
/**
 * Token filter that replaces shortened URLs with their resolved form.
 *
 * <p>Every token produced by the wrapped stream is compared against the
 * configured pattern; a token that matches in its entirety has its term text
 * replaced by the result of {@link #resolveUrlToken(String)}. All other tokens
 * pass through unchanged.
 */
public class ResolveUrlTokenFilter extends TokenFilter {
    /** Term text of the current token; read and, on a match, rewritten in place. */
    private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);
    /** Pattern a whole token must match to be treated as a shortened URL. */
    private final Pattern patternToMatchShortenedUrls;

    /**
     * @param input the upstream token stream to filter
     * @param patternToMatchShortenedUrls pattern identifying shortened-URL tokens
     */
    public ResolveUrlTokenFilter(TokenStream input, Pattern patternToMatchShortenedUrls) {
        super(input);
        this.patternToMatchShortenedUrls = patternToMatchShortenedUrls;
    }

    /**
     * Advances to the next token and rewrites it when it is a shortened URL.
     *
     * @return {@code false} when the wrapped stream is exhausted, {@code true} otherwise
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
            return false;
        }
        // buffer() hands back the attribute's backing array, which may be longer
        // than the current term; only the first length() chars are valid term text.
        // Using the array length here would append stale characters to the token
        // and make both the pattern match and the lookup below fail.
        String token = new String(charTermAttribute.buffer(), 0, charTermAttribute.length());
        // Only tokens that match the shortened-URL pattern as a whole are rewritten.
        if (patternToMatchShortenedUrls.matcher(token).matches()) {
            charTermAttribute.setEmpty().append(resolveUrlToken(token));
        }
        return true;
    }

    /**
     * Maps a shortened URL to its target.
     *
     * @param token the shortened URL exactly as it appeared in the token stream
     * @return the resolved URL, or {@code token} itself when it is not known
     */
    private String resolveUrlToken(String token) {
        // TODO replace this hard-coded lookup with the real resolution logic
        try {
            if ("http://bit.ly/3ynriE".equals(token)) {
                return "lucene.apache.org/solr";
            } else if ("http://bit.ly/15tzw".equals(token)) {
                return "manning.com";
            }
        } catch (Exception exc) {
            // rather than failing analysis if you can't resolve the URL,
            // you should log the error and return the un-resolved value
            exc.printStackTrace();
        }
        return token;
    }
}
2,实现TokenFilterFactory
package com.url.plugin;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import java.util.Map;
import java.util.regex.Pattern;
/**
 * Factory for {@link ResolveUrlTokenFilter}.
 *
 * <p>Reads the required {@code shortenedUrlPattern} argument, compiles it once
 * in the constructor and passes the compiled pattern to every filter it creates.
 */
public class ResolveUrlTokenFilterFactory extends TokenFilterFactory {

    /** Compiled {@code shortenedUrlPattern} argument, shared by all created filters. */
    private final Pattern patternToMatchShortenedUrls;

    /**
     * @param args the factory arguments; must contain {@code shortenedUrlPattern}
     */
    public ResolveUrlTokenFilterFactory(Map<String, String> args) {
        super(args);
        assureMatchVersion();
        // The regular expression comes from the filter's configuration attributes.
        this.patternToMatchShortenedUrls = Pattern.compile(require(args, "shortenedUrlPattern"));
    }

    /**
     * Wraps the given stream in a new {@link ResolveUrlTokenFilter}.
     *
     * @param tokenStream the stream to filter
     * @return a filter using the pattern compiled at construction time
     */
    @Override
    public TokenFilter create(TokenStream tokenStream) {
        return new ResolveUrlTokenFilter(tokenStream, this.patternToMatchShortenedUrls);
    }
}
3,将其打成jar包
4,在solr的schema文件中添加如下内容
<!-- Field type whose index-time analysis chain includes the custom ResolveUrlTokenFilterFactory. -->
<fieldType name="text_plugin" class="solr.TextField" positionIncrementGap="100">
<!-- Index-time chain: tokenize, resolve shortened URLs, drop stop words, lowercase. -->
<analyzer type="index">
<!-- NOTE(review): StandardTokenizer is expected to split a URL such as http://bit.ly/3ynriE
     into several tokens, in which case the whole-token match done by the URL filter below
     would never succeed; a whitespace-based tokenizer may be required here. Confirm. -->
<tokenizer class="solr.StandardTokenizerFactory"/>
<!-- shortenedUrlPattern is the regular expression handed to the factory; the filter rewrites
     only tokens that match it in their entirety. -->
<filter class="com.url.plugin.ResolveUrlTokenFilterFactory" shortenedUrlPattern="http:\/\/bit.ly\/[\w\-]+" />
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
<!-- Query-time chain: the URL filter is not applied here. -->
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
5,在solr的根目录下创建plugins文件夹(与dist、contrib文件夹同级),并将第3步生成的jar放入其中
6,在solrconfig.xml中添加
<lib dir="../../../plugins/" regex=".*\.jar" />
7,java -jar start.jar