背景:
最近做了一个App需要使用的搜索建议功能,效果类似于我们在使用百度搜索时看到的输入提示:
可以看到,每输入一个字符就会发送一个请求;
这个功能看似很高大上,其实做一个简单的实现还是很简单的。
原理:
这个技术,主要有以下的技术点:
Elasticsearch 7.4.1(以下称为ES)的搜索建议功能(Completion Suggester)
ik中文分词插件
首先需要将数据存放到ES中。存放时需要做一些简单的处理:对需要搜索的field进行分词,把分词结果放入列表,再一并存入ES。由于数据爬取采用的是Python,所以这里贴的是Python代码;采集的数据是JD的商品数据。
def gen_suggest(index, info_tuple):
    """Build the completion-suggester entries for one document.

    Each non-empty text in ``info_tuple`` is tokenized with the ``ik_max_word``
    analyzer of the ``jd_product`` index. Tokens longer than one character
    become suggestion inputs. A token already emitted for an earlier
    ``(text, weight)`` pair is not emitted again, so when the pairs are passed
    in descending weight order each token keeps its highest weight.

    Args:
        index: Accepted for call-site compatibility but not used; the analyze
            request always targets the "jd_product" index.
        info_tuple: Iterable of ``(text, weight)`` pairs.

    Returns:
        A list of ``{"input": [token, ...], "weight": weight}`` dicts, one for
        every pair that contributed at least one new token.
    """
    used_words = set()
    suggest = []
    for text, weight in info_tuple:
        if text:
            words = es.indices.analyze(
                index="jd_product",
                body={"analyzer": "ik_max_word", "text": "{0}".format(text)},
            )
            # Single-character tokens are dropped as suggestion inputs.
            analyzed_words = {r["token"] for r in words["tokens"] if len(r["token"]) > 1}
            print(analyzed_words)
            new_words = analyzed_words - used_words
        else:
            new_words = set()
        if new_words:
            # Record what has been emitted; without this the subtraction above
            # never removes anything and lower-weight fields repeat the tokens.
            used_words.update(new_words)
            suggest.append({"input": list(new_words), "weight": weight})
    print(suggest)
    return suggest
def save_ES(result):
    """Copy one crawled product record into a ``JD_Product`` document and save it.

    Args:
        result: Mapping holding the crawled fields under their hyphenated keys
            ("by-self", "comment-cnt", "title", "pid", "image-data-lazy-img",
            "image-src", "price", "shop-name").
    """
    # (document attribute, key in the crawled record), copied in this order.
    attribute_sources = (
        ("by_self", "by-self"),
        ("comment_cnt", "comment-cnt"),
        ("title", "title"),
        ("pid", "pid"),
        ("image_data_lazy_img", "image-data-lazy-img"),
        ("image_src", "image-src"),
        ("price", "price"),
        ("shop_name", "shop-name"),
    )
    jd = JD_Product()
    for attribute, key in attribute_sources:
        setattr(jd, attribute, result[key])
    # Suggestion data: title tokens get weight 10, shop-name tokens weight 7.
    weighted_texts = ((jd.title, 10), (jd.shop_name, 7))
    jd.suggest = gen_suggest(JD_Product, weighted_texts)
    jd.save()
此时存放在ES中的数据是这样的:
{
"_index": "jd_product",
"_type": "_doc",
"_id": "opSOOXEBy66jXuB0CIVc",
"_version": 1,
"_seq_no": 15536,
"_primary_term": 1,
"found": true,
"_source": {
"by_self": "",
"comment_cnt": "2200+",
"title": "京东国际贝玲妃(Benefit)妆前乳/打底霜 毛孔遮盖脸部底霜(反恐精英/反孔) 22ml 【部分满199减100】护肤彩妆一站购全,点击进入点击进入",
"pid": "25715154185",
"image_data_lazy_img": "//img10.360buyimg.com/n7/jfs/t1/105249/25/17056/74155/5e84303eE9afd3891/fd91ef9acfdff12f.jpg",
"image_src": "//img10.360buyimg.com/n7/jfs/t1/105249/25/17056/74155/5e84303eE9afd3891/fd91ef9acfdff12f.jpg",
"price": "¥198.00",
"shop_name": "星线美妆海外专营店",
"suggest": [
{
"input": [
"22",
"点击",
"22ml",
"精英",
"100",
"ml",
"反恐",
"进入",
"199",
"京东",
"遮盖",
"反恐精英",
"打底",
"国际",
"一站",
"部分",
"毛孔",
"护肤",
"脸部",
"benefit"
],
"weight": 10
},
{
"input": [
"专营店",
"海外",
"专营"
],
"weight": 7
}
]
}
}
可以看到title已经被分词并存入了suggest字段;这样就可以实现API了。
实现:
suggest的API采用SpringBoot实现,由于ES版本比较高(7.4.1),所以使用原生的RestHighLevelClient来进行操作。
添加相关依赖:
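<dependency>
    <groupId>org.elasticsearch</groupId>
    <artifactId>elasticsearch</artifactId>
    <version>7.4.1</version>
</dependency>
<dependency>
    <groupId>org.elasticsearch.client</groupId>
    <artifactId>elasticsearch-rest-high-level-client</artifactId>
    <version>7.4.1</version>
</dependency>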
此外需要在pom.xml中添加以下配置,避免出错
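<properties>
    <java.version>1.8</java.version>
    <elasticsearch.version>7.4.1</elasticsearch.version>
</properties>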
ESClient工厂:
import org.apache.http.HttpHost;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestClientBuilder;
import org.elasticsearch.client.RestHighLevelClient;
import java.io.IOException;
/**
 * Singleton factory that builds and owns the Elasticsearch REST clients.
 *
 * <p>Usage: call one of the {@code build(...)} methods to store the target host and
 * connection settings, then {@link #init()} to create the clients, and {@link #close()}
 * to release them. The settings are kept in static fields, so every call to
 * {@code build(...)} overwrites the values used by the next {@link #init()}.
 */
public class ESClientSpringFactory {

    public static int CONNECT_TIMEOUT_MILLIS = 1000;
    public static int SOCKET_TIMEOUT_MILLIS = 30000;
    public static int CONNECTION_REQUEST_TIMEOUT_MILLIS = 500;
    public static int MAX_CONN_PER_ROUTE = 10;
    public static int MAX_CONN_TOTAL = 30;

    private static HttpHost HTTP_HOST;

    private static final ESClientSpringFactory esClientSpringFactory = new ESClientSpringFactory();

    private RestClientBuilder builder;
    private RestClient restClient;
    private RestHighLevelClient restHighLevelClient;

    private ESClientSpringFactory() {
    }

    /**
     * Stores the host and connection-pool limits and returns the shared factory instance.
     *
     * @param httpHost           Elasticsearch node to connect to
     * @param maxConnectNum      maximum total connections
     * @param maxConnectPerRoute maximum connections per route
     * @return the singleton factory; {@link #init()} must be called before the clients are used
     */
    public static ESClientSpringFactory build(HttpHost httpHost,
                                              Integer maxConnectNum, Integer maxConnectPerRoute) {
        HTTP_HOST = httpHost;
        MAX_CONN_TOTAL = maxConnectNum;
        MAX_CONN_PER_ROUTE = maxConnectPerRoute;
        return esClientSpringFactory;
    }

    /**
     * Stores the host, the timeouts and the connection-pool limits and returns the shared
     * factory instance.
     *
     * @param httpHost              Elasticsearch node to connect to
     * @param connectTimeOut        connect timeout in milliseconds
     * @param socketTimeOut         socket timeout in milliseconds
     * @param connectionRequestTime connection-request timeout in milliseconds
     * @param maxConnectNum         maximum total connections
     * @param maxConnectPerRoute    maximum connections per route
     * @return the singleton factory; {@link #init()} must be called before the clients are used
     */
    public static ESClientSpringFactory build(HttpHost httpHost, Integer connectTimeOut, Integer socketTimeOut,
                                              Integer connectionRequestTime, Integer maxConnectNum,
                                              Integer maxConnectPerRoute) {
        HTTP_HOST = httpHost;
        CONNECT_TIMEOUT_MILLIS = connectTimeOut;
        SOCKET_TIMEOUT_MILLIS = socketTimeOut;
        CONNECTION_REQUEST_TIMEOUT_MILLIS = connectionRequestTime;
        MAX_CONN_TOTAL = maxConnectNum;
        MAX_CONN_PER_ROUTE = maxConnectPerRoute;
        return esClientSpringFactory;
    }

    /**
     * Creates the clients from the currently stored settings.
     *
     * <p>Only one underlying low-level client is created: the high-level client builds it
     * and {@link #getClient()} exposes that same instance. Building a second client from
     * the builder would open another connection pool that {@link #close()} never released.
     */
    public void init() {
        builder = RestClient.builder(HTTP_HOST);
        setConnectTimeOutConfig();
        setMutiConnectConfig();
        restHighLevelClient = new RestHighLevelClient(builder);
        restClient = restHighLevelClient.getLowLevelClient();
        System.out.println("init factory");
    }

    /** Applies the connect, socket and connection-request timeouts to the builder. */
    public void setConnectTimeOutConfig() {
        builder.setRequestConfigCallback(requestConfigBuilder -> {
            requestConfigBuilder.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
            requestConfigBuilder.setSocketTimeout(SOCKET_TIMEOUT_MILLIS);
            requestConfigBuilder.setConnectionRequestTimeout(CONNECTION_REQUEST_TIMEOUT_MILLIS);
            return requestConfigBuilder;
        });
    }

    /** Applies the total and per-route connection limits to the HTTP client builder. */
    public void setMutiConnectConfig() {
        builder.setHttpClientConfigCallback(httpClientBuilder -> {
            httpClientBuilder.setMaxConnTotal(MAX_CONN_TOTAL);
            httpClientBuilder.setMaxConnPerRoute(MAX_CONN_PER_ROUTE);
            return httpClientBuilder;
        });
    }

    /**
     * @return the low-level client, or {@code null} if {@link #init()} has not been called
     */
    public RestClient getClient() {
        return restClient;
    }

    /**
     * @return the high-level client, or {@code null} if {@link #init()} has not been called
     */
    public RestHighLevelClient getRhlClient() {
        return restHighLevelClient;
    }

    /**
     * Closes the high-level client, which also closes the low-level client it wraps.
     * An {@link IOException} raised while closing is printed and not rethrown.
     */
    public void close() {
        if (restHighLevelClient != null) {
            try {
                restHighLevelClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        System.out.println("close client");
    }
}
ESConfig:
import lombok.extern.slf4j.Slf4j;
import org.apache.http.HttpHost;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.ComponentScan;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Scope;
/**
 * Spring configuration that exposes the Elasticsearch host, the client factory and both
 * REST clients as beans. Connection settings are read from the {@code elasticSearch.*}
 * properties.
 */
@Slf4j
@Configuration
@ComponentScan(basePackageClasses = ESClientSpringFactory.class)
public class ESConfig {

    /** Host name or IP address of the Elasticsearch node. */
    @Value("${elasticSearch.host}")
    private String host;

    /** HTTP port of the Elasticsearch node. */
    @Value("${elasticSearch.port}")
    private int port;

    /** Passed to the factory as the maximum total number of connections. */
    @Value("${elasticSearch.client.connectNum}")
    private Integer connectNum;

    /** Passed to the factory as the maximum number of connections per route. */
    @Value("${elasticSearch.client.connectPerRoute}")
    private Integer connectPerRoute;

    /**
     * @return the Elasticsearch endpoint, always addressed over plain HTTP
     */
    @Bean
    public HttpHost httpHost() {
        final String scheme = "http";
        return new HttpHost(host, port, scheme);
    }

    /**
     * Registers the client factory; {@code init} and {@code close} are declared as the
     * bean's init and destroy methods.
     *
     * @return the factory configured with the host and connection limits
     */
    @Bean(initMethod = "init", destroyMethod = "close")
    public ESClientSpringFactory getFactory() {
        HttpHost target = httpHost();
        return ESClientSpringFactory.build(target, connectNum, connectPerRoute);
    }

    /**
     * @return the low-level REST client held by the factory
     */
    @Bean
    @Scope("singleton")
    public RestClient getRestClient() {
        ESClientSpringFactory factory = getFactory();
        return factory.getClient();
    }

    /**
     * @return the high-level REST client held by the factory
     */
    @Bean
    @Scope("singleton")
    public RestHighLevelClient getRHLClient() {
        ESClientSpringFactory factory = getFactory();
        return factory.getRhlClient();
    }
}
配置文件中的配置数据(此处为 application.properties 格式):
elasticSearch.host=ip地址
elasticSearch.port=9200
elasticSearch.client.connectNum=10
elasticSearch.client.connectPerRoute=50
编写一个测试:
// High-level Elasticsearch client injected by Spring; the qualifier selects the bean named "getRHLClient".
@Qualifier("getRHLClient")
@Autowired
RestHighLevelClient restHighLevelClient;
/**
 * Sends a completion-suggest request for the prefix "手机" against the "jd_product"
 * index and prints the returned suggestions. Only the "shop_name" and "title" source
 * fields are requested for each suggested document.
 *
 * @throws Exception if the search request fails
 */
@Test
public void getSuggest() throws Exception {
    String data = "手机";

    // Fuzzy-matching options applied to the prefix.
    FuzzyOptions fuzzy = FuzzyOptions.builder()
            .setFuzzyPrefixLength(1)
            .setFuzziness(0)
            .setFuzzyMinLength(3)
            .build();

    // Completion suggestion on the "suggest" field, registered under the name "my-suggest".
    CompletionSuggestionBuilder completionSuggestion =
            SuggestBuilders.completionSuggestion("suggest").prefix(data, fuzzy);
    SuggestBuilder suggestBuilder = new SuggestBuilder();
    suggestBuilder.addSuggestion("my-suggest", completionSuggestion);

    SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
    searchSourceBuilder.suggest(suggestBuilder);

    // Restrict the returned _source to the listed fields.
    String[] includes = {"shop_name", "title"};
    String[] excludes = {};
    searchSourceBuilder.fetchSource(includes, excludes);

    SearchRequest searchRequest = new SearchRequest("jd_product"); // target index
    searchRequest.source(searchSourceBuilder);

    SearchResponse response = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);

    // The suggest section of the response holds the entries registered under "my-suggest".
    Suggest suggestions = response.getSuggest();
    System.out.println("suggestions = " + suggestions);
}
获取到返回的数据: