场景:新增选择题时,先判断数据库中是否已存在相同试题,不存在才新增。
判断是否有重复试题:先将库中所有试题加载进ES中,题干截取30字,各个选项各截取30字并用逗号分隔;截取是为了节省内存。
新增试题的时候,先对试题进行分词,取几个关键词进行ES查询即可(查询到的是疑似重复题,最终由人工判断是否重复)。
ES的映射如下:
GET /ques620/_mapping
PUT ques620
{
"mappings": {
"properties": {
"id": {
"type": "long",
"index": false,
"doc_values": false
},
"labelcode": {
"type": "keyword"
},
"content": {
"type": "text",
"analyzer": "ik_smart"
},
"option": {
"type": "text",
"analyzer": "ik_smart"
}
}
}
}
加载所有试题入库(题干+选项)
// Loads questions into Elasticsearch page by page (pages 0 and 1).
@Test
public void uploadQuestionES() throws IOException {
    int pageNo = 0;
    while (pageNo < 2) {
        // Fetch the records prepared for indexing for this page.
        List<QuestionMapping> pageOfQuestions = questionService.getESList(pageNo);
        int fetched = pageOfQuestions.size();
        System.out.println(fetched);
        String pageAsJson = JSONArray.toJSONString(pageOfQuestions);
        System.out.println(pageAsJson);
        // Store the page in Elasticsearch with one bulk call.
        elasticSaveService.saveProductAsIndices2(pageOfQuestions);
        pageNo++;
    }
}
/**
 * Prepares one page of questions for the duplicate-detection index.
 * For every question the sub count is cleared, the stem is passed through
 * HtmlUtils.filterStr, and the answer options (also filtered) are joined
 * with commas into the option field.
 * NOTE(review): the accompanying notes say HTML is stripped and each text is
 * cut to 30 characters; presumably HtmlUtils.filterStr does both - confirm.
 *
 * @param pageNo page number handed to the Page constructor (page size is 50)
 * @return the queried questions with content and option rewritten in place
 */
public List<QuestionMapping> getESList(int pageNo){
    Page<QuestionMapping> pageRequest = new Page<>(pageNo, 50);
    List<QuestionMapping> questions = questionDao.queryPage(pageRequest);
    for (QuestionMapping question : questions) {
        question.setSubCount(null);
        // Collect the filtered text of every answer option of this question.
        List<String> filteredOptions = new ArrayList<>();
        for (QuestionKey optionKey : questionDao.queryKeyById(question.getId())) {
            filteredOptions.add(HtmlUtils.filterStr(optionKey.getContent()));
        }
        String joinedOptions = String.join(",", filteredOptions);
        question.setContent(HtmlUtils.filterStr(question.getContent()));
        question.setOption(joinedOptions);
    }
    return questions;
}
/**
 * Bulk-indexes the given questions into the "ques620" index, using each
 * question's id as the document id and its JSON serialization as the source.
 *
 * @param questionMappings questions to index; null or empty means nothing is sent
 * @return true when no item failed (also when there was nothing to index),
 *         false when the bulk response reports at least one failure
 * @throws IOException if the bulk call to Elasticsearch fails
 */
public boolean saveProductAsIndices2(List<QuestionMapping> questionMappings) throws IOException {
    if (questionMappings == null || questionMappings.isEmpty()) {
        // A bulk request without any action is rejected by request validation,
        // so skip the round trip instead of failing on an empty page.
        return true;
    }
    BulkRequest bulkRequest = new BulkRequest();
    for (QuestionMapping qu : questionMappings) {
        IndexRequest indexRequest = new IndexRequest("ques620");
        indexRequest.id(String.valueOf(qu.getId()));
        // The document body is the JSON form of the question.
        indexRequest.source(JSON.toJSONString(qu), XContentType.JSON);
        bulkRequest.add(indexRequest);
    }
    BulkResponse bulkResponse = restHighLevelClient.bulk(bulkRequest, GulimallElasticSearchConfig.COMMON_OPTIONS);
    boolean hasFailures = bulkResponse.hasFailures();
    if (hasFailures) {
        // Report what went wrong instead of announcing a successful load.
        log.error("入库存在失败项:{}", bulkResponse.buildFailureMessage());
    } else {
        List<String> indexedIds = Arrays.stream(bulkResponse.getItems())
                .map(item -> item.getId())
                .collect(Collectors.toList());
        log.info("入库完成:{}", indexedIds);
    }
    return !hasFailures;
}
在ES中查找疑似重复题
// Searches for suspected duplicates of a sample question.
@Test
public void getQueryQuestio() throws Exception {
    final String stemText="小华家的电灯丝断了,他把灯泡晃了晃使灯丝又搭上了,再用的时候会发现:";
    final String optionText="A.灯比原来亮了 B.灯比原来暗了 C.跟原来一样";

    // Keywords taken from the question stem.
    Set<String> stemKeywords = elasticSaveService.getKeyWord(stemText,3);
    System.out.println(stemKeywords.toString());

    // Keywords taken from the answer options.
    Set<String> optionKeywords = elasticSaveService.getKeyWord(optionText,3);
    System.out.println(optionKeywords.toString());

    // Look up suspected duplicates in Elasticsearch.
    List<QuestionMapping> suspects = elasticSaveService.queryByKeyword(stemKeywords,optionKeywords);
    String suspectsJson = JSONArray.toJSONString(suspects);
    System.out.println("170---"+suspectsJson);
}
/**
 * Tokenizes the text with the "ik_smart" analyzer through the Elasticsearch
 * analyze API and selects keywords from the resulting terms.
 *
 * @param text text to tokenize; null or blank yields an empty set without calling Elasticsearch
 * @param maxKeyNum number of keywords requested, forwarded to HtmlUtils.filterNeedStr
 * @return the keywords HtmlUtils.filterNeedStr picks from the analyzer's terms
 * @throws Exception if the analyze request fails
 */
public Set<String> getKeyWord(String text,int maxKeyNum)throws Exception{
    if (text == null || text.trim().isEmpty()) {
        // Nothing to analyze, so there are no keywords.
        return new java.util.HashSet<>();
    }
    AnalyzeRequest request = AnalyzeRequest.withGlobalAnalyzer("ik_smart", text);
    // NOTE(review): other calls in this service pass GulimallElasticSearchConfig.COMMON_OPTIONS;
    // confirm whether RequestOptions.DEFAULT is intended here.
    AnalyzeResponse response = restHighLevelClient.indices().analyze(request, RequestOptions.DEFAULT);
    List<String> terms = response.getTokens().stream()
            .map(AnalyzeResponse.AnalyzeToken::getTerm)
            .collect(Collectors.toList());
    return HtmlUtils.filterNeedStr(terms, maxKeyNum);
}
/**
 * Picks at most {@code needNum} distinct search keywords from a token list,
 * preferring longer tokens: tokens longer than two characters first, then
 * two-character tokens, then single-character tokens. Within each group the
 * tokens are taken in the order they appear in the list.
 *
 * @param list tokens to choose from; null is treated as empty, null or empty tokens are skipped
 * @param needNum number of keywords wanted; zero or a negative value falls back to 3
 * @return the selected keywords, never null and never more than the wanted number
 */
public static Set<String> filterNeedStr(List<String> list, int needNum){
    int limit = needNum > 0 ? needNum : 3;
    // Insertion-ordered sets keep the selection deterministic for a given token list.
    Set<String> selected = new java.util.LinkedHashSet<>();
    if (list == null) {
        return selected;
    }
    Set<String> twoCharTokens = new java.util.LinkedHashSet<>();
    Set<String> oneCharTokens = new java.util.LinkedHashSet<>();
    for (String token : list) {
        if (token == null || token.isEmpty()) {
            continue;
        }
        if (token.length() > 2) {
            selected.add(token);
            if (selected.size() >= limit) {
                // Enough long tokens; shorter ones are not needed.
                return selected;
            }
        } else if (token.length() == 2) {
            twoCharTokens.add(token);
        } else {
            oneCharTokens.add(token);
        }
    }
    // Top up with shorter tokens only while the quota is not met.
    fillUpTo(selected, twoCharTokens, limit);
    fillUpTo(selected, oneCharTokens, limit);
    return selected;
}

/** Adds candidates to target, in iteration order, until target holds limit elements. */
private static void fillUpTo(Set<String> target, Set<String> candidates, int limit) {
    for (String candidate : candidates) {
        if (target.size() >= limit) {
            return;
        }
        target.add(candidate);
    }
}
/**
 * Searches Elasticsearch for questions whose stem and options contain the
 * given keywords (suspected duplicates).
 *
 * @param contentSet keywords taken from the question stem; may be null or empty
 * @param optionSet keywords taken from the answer options; may be null or empty
 * @return the matching questions; an empty list when no keyword was supplied;
 *         null when the search call fails with an IOException (the error is logged)
 */
public List<QuestionMapping> queryByKeyword(Set<String> contentSet,Set<String> optionSet){
    boolean noContentKeywords = contentSet == null || contentSet.isEmpty();
    boolean noOptionKeywords = optionSet == null || optionSet.isEmpty();
    if (noContentKeywords && noOptionKeywords) {
        // Without keywords the bool query has no clauses and would match every
        // document (or the request would be null), so there is nothing to search for.
        return new ArrayList<>();
    }
    SearchRequest request = bulidSearchRequest(contentSet,optionSet);
    try {
        SearchResponse searchResponse = restHighLevelClient.search(request, GulimallElasticSearchConfig.COMMON_OPTIONS);
        return bulidSearchResult(searchResponse);
    } catch (IOException e) {
        // Null is kept as the failure signal so a failed lookup is not mistaken for "no duplicates".
        log.error("查询疑似重复题失败", e);
        return null;
    }
}
// Builds the search request: a bool query with one must/match_phrase clause per
// keyword, on "content" for stem keywords and on "option" for option keywords.
// Returns null when both keyword sets are null.
private SearchRequest bulidSearchRequest(Set<String> contentSet,Set<String> optionSet){
    if (contentSet == null && optionSet == null) {
        return null;
    }
    BoolQueryBuilder mustClauses = QueryBuilders.boolQuery();
    // Every stem keyword has to appear as a phrase in the content field.
    if (contentSet != null) {
        for (String stemKeyword : contentSet) {
            mustClauses.must(QueryBuilders.matchPhraseQuery("content", stemKeyword));
        }
    }
    // Every option keyword has to appear as a phrase in the option field.
    if (optionSet != null) {
        for (String optionKeyword : optionSet) {
            mustClauses.must(QueryBuilders.matchPhraseQuery("option", optionKeyword));
        }
    }
    SearchSourceBuilder source = new SearchSourceBuilder();
    source.query(mustClauses);
    String dsl = source.toString();
    log.debug("构建的DSL语句 {}", dsl);
    System.out.println("构建的DSL语句 {}"+dsl);
    String[] indices = {"questions","quselect","ques620"};
    return new SearchRequest(indices, source);
}
// Converts the hits of a search response into QuestionMapping objects by
// parsing each hit's JSON source; returns an empty list when there are no hits.
private List<QuestionMapping> bulidSearchResult(SearchResponse searchResponse) {
    List<QuestionMapping> result = new ArrayList<>();
    SearchHit[] hitArray = searchResponse.getHits().getHits();
    if (hitArray == null) {
        return result;
    }
    for (SearchHit hit : hitArray) {
        result.add(JSON.parseObject(hit.getSourceAsString(), QuestionMapping.class));
    }
    return result;
}
// Searches for suspected duplicates of a sample question and prints the elapsed time.
@Test
public void getQueryQuestio() throws Exception {
    final String stemText="下列哪项不符合全胃肠外营养所用的营养液的要求";
    final String optionText="维生素和微量元素";
    long startedAt = System.currentTimeMillis();

    // Keywords taken from the question stem.
    Set<String> stemKeywords = elasticSaveService.getKeyWord(stemText,3);
    System.out.println("获取题干关键字---"+stemKeywords.toString());

    // Keywords taken from the answer options.
    Set<String> optionKeywords = elasticSaveService.getKeyWord(optionText,3);
    System.out.println("获取选项关键字----"+optionKeywords.toString());

    // Look up suspected duplicates in Elasticsearch.
    List<QuestionMapping> suspects = elasticSaveService.queryByKeyword(stemKeywords,optionKeywords);
    System.out.println("170---"+JSONArray.toJSONString(suspects));

    long finishedAt = System.currentTimeMillis();
    long elapsedMillis = finishedAt - startedAt;
    System.out.println("查找用时-------"+elapsedMillis);
}
获取题干关键字---[要求, 所用, 营养液, 不符合]
获取选项关键字----[微量元素, 和, 维生素]
构建的DSL语句 {}{"query":{"bool":{"must":[{"match_phrase":{"content":{"query":"要求","slop":0,"zero_terms_query":"NONE","boost":1.0}}},{"match_phrase":{"content":{"query":"所用","slop":0,"zero_terms_query":"NONE","boost":1.0}}},{"match_phrase":{"content":{"query":"营养液","slop":0,"zero_terms_query":"NONE","boost":1.0}}},{"match_phrase":{"content":{"query":"不符合","slop":0,"zero_terms_query":"NONE","boost":1.0}}},{"match_phrase":{"option":{"query":"微量元素","slop":0,"zero_terms_query":"NONE","boost":1.0}}},{"match_phrase":{"option":{"query":"和","slop":0,"zero_terms_query":"NONE","boost":1.0}}},{"match_phrase":{"option":{"query":"维生素","slop":0,"zero_terms_query":"NONE","boost":1.0}}}],"adjust_pure_negative":true,"boost":1.0}}}
170---[{"content":"下列哪项不符合全胃肠外营养所用的营养液的要求","id":19032,"labelCode":"A1","option":"{\"A\":\"每日供氮应达0.2~0.24 g/kg体重\",\"B\":\"适量补充胰岛素和脂肪乳剂\",\"C\":\"含有适量的电解质、维生素和微量元素\",\"D\":\"氮和热量之比为1:100kcal\",\"E\":\"所补充的必需氨基酸和非必需氨基酸的含量一般应为1:2\"}","parentId":0,"right":"D","rigthCode":8,"sourceOrgId":-100,"status":1,"subCount":0}]
查找用时-------382