知识库的核心流程无非就是:喂数据、训练、查询这三步。
受限于数据规模,效果还是较差——毕竟手上没有几个 GB 量级的语料可用。
原理一:制作索引
// Build the index: parse every training file under C:\data\index\<code>\train\
// into (question, answer) DocInfo entries, then add them all to the index and persist it.
long beg = System.currentTimeMillis();
System.out.println("*** 索引制作开始! ***");
String path = "C:\\data\\index\\"+code+"\\train\\";
List<DocInfo> list1 = new ArrayList<>();
File pfile = new File(path);
File[] fs = pfile.listFiles(); // returns null (not empty) when the directory is missing or unreadable
if (fs == null) {
    // Fail fast with context instead of the NPE the for-each would otherwise throw.
    throw new IllegalStateException("train directory missing or unreadable: " + path);
}
// One Q&A segment: group(1) is the numbered "N、question?" title; the match then runs
// lazily up to the next "N、" heading or end of input. Compiled once, not once per file.
Pattern pattern = Pattern.compile("(\\d+、[^?]+?)[\\s\\S]*?(?=\\d+、|\\Z)", Pattern.MULTILINE);
for (File f : fs) {
    if (f.isDirectory()) {
        continue; // only plain files carry training text
    }
    String text = fileUtil.readTxtFile(f.getAbsolutePath());
    Matcher matcher = pattern.matcher(text);
    // Per-file doc id. NOTE(review): this resets to 1 for every file, so docId and url
    // collide as soon as the directory holds more than one file — confirm that is intended.
    int b = 1;
    while (matcher.find()) {
        String title = matcher.group(1).trim();
        // Answer = whole matched segment with the title text stripped out.
        String content = matcher.group(0).trim().replace(title, "");
        System.out.println("问题: " + title);
        System.out.println("答案: " + content);
        System.out.println();
        DocInfo docInfo = new DocInfo();
        docInfo.setTitle(title);
        docInfo.setUrl("/api/" + code + "/v1/" + b);
        docInfo.setDocId(b);
        docInfo.setContent(content);
        list1.add(docInfo);
        b++;
    }
}
for (DocInfo docInfo : list1) {
    index.addDoc(docInfo.getTitle(), docInfo.getUrl(), docInfo.getContent());
}
index.save();
long end = System.currentTimeMillis();
System.out.println("**** 索引制作完成! " + (end - beg) + "ms ****");
原理二:使用 NLP 分词来查询索引,这一步算是整个流程里比较“人工智能”的部分。
// 1. [Tokenize] segment the query with HanLP, then drop stop words.
List<Term> oldTerms = HanLP.segment(query);
List<Term> terms = new ArrayList<>();
for (Term term : oldTerms) {
    if (stopWords.contains(term.word)) {
        continue; // stop word — contributes nothing to retrieval
    }
    terms.add(term);
}
// 2. [Trigger] look up the inverted index for every surviving token.
//    One token hits a List<Weight> of documents; n tokens yield List<List<Weight>>.
List<List<Weight>> termResult = new ArrayList<>();
for (Term term : terms) {
    List<Weight> invertedList = index.getInverted(term.word);
    if (invertedList == null) {
        continue; // token occurs in no indexed document
    }
    termResult.add(invertedList);
}
// 3. [Merge] documents hit by several tokens get their weights combined.
List<Weight> allTermResult = mergeResult(termResult);
// 4. [Sort] descending by weight. Integer.compare avoids the silent overflow that
//    plain subtraction (o2.getWeight() - o1.getWeight()) suffers on large values.
allTermResult.sort((o1, o2) -> Integer.compare(o2.getWeight(), o1.getWeight()));
// 5. [Package] resolve each hit against the forward index and build the result list.
List<Result> results = new ArrayList<>();
for (Weight weight : allTermResult) {
    DocInfo docInfo = index.getDocInfo(weight.getDocId());
    if (docInfo == null) {
        continue; // inverted entry without a forward record — skip instead of NPE
    }
    Result result = new Result();
    result.setTitle(docInfo.getTitle());
    result.setUrl(docInfo.getUrl());
    result.setDesc(GenDes(docInfo.getContent(), terms));
    results.add(result);
}