java 代码
- // Extract keywords from this document's "itemContent" term vector, then rank and filter them by importance.
- TermFreqVector contentFreqVector = reader.getTermFreqVector(docId, "itemContent");
- if(contentFreqVector != null){
- // Build the ArrayList of TermVector entries.
- List<termvector> contentVectorList = new ArrayList<termvector>(); </termvector></termvector> // NOTE(review): lowercase "termvector" and the stray closing tags look like HTML-extraction damage; presumably List<TermVector> was intended - confirm
- for (int i = 0; i < contentFreqVector.size(); i++) {
- String termWord=contentFreqVector.getTerms()[i];
- String regex="[a-zA-Z]{4,}|[\u4E00-\u9fa5]{2,5}"; // Keep only English words of 4 or more letters, or Chinese words of 2 to 5 characters
- boolean is = termWord.matches(regex);
- if(is==true){
- Term term = new Term("itemContent",contentFreqVector.getTerms()[i]);
- TermVector termVector = new TermVector();
- termVector.termWord = termWord;
- termVector.termFreq = contentFreqVector.getTermFrequencies()[i];
- // termVector.docFreq = searcher.docFreq(term);
- // Generally, the larger the IDF value, the higher the weight, and the less frequently the term appears across the whole corpus.
- termVector.IDF = searcher.getSimilarity().idf(term, searcher);
- Similarity sim = Similarity.getDefault(); // TF below comes from the default Similarity, whereas IDF above uses the searcher's own Similarity
- termVector.TF = sim.tf(contentFreqVector.getTermFrequencies()[i]);
- contentVectorList.add(termVector);
- }
- }
- // Sort using the chosen algorithm. The algorithm still needs refinement: TF/IDF and information-entropy aspects should be taken into account.
- Comparator<termvector> comp = new TermComparator(); </termvector>
- Collections.sort(contentVectorList,comp);
- class TermComparator implements Comparator {
- public int compare(Object o1, Object o2) {
- TermVector t1 = (TermVector) o1;
- TermVector t2 = (TermVector) o2;
- if (t1.getWeight() < t2.getWeight())
- return 1;
- else
- return 0;
- }
- }
- //Document itemDoc = reader.document(docId);