Implementing TF-IDF in Java with Solr

This post shows how to implement the TF-IDF algorithm in Java, using the Apache Solr client for text processing and index building. It walks through initializing the Solr client, counting the documents in the repository, indexing files, and computing TF-IDF values in order to extract the important keywords from a text collection.
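For reference, these are the quantities the code below computes (natural logarithm, matching the Math.log call in getwordIdf):

$$\mathrm{tf}(t,d)=\frac{n_{t,d}}{\sum_k n_{k,d}},\qquad \mathrm{idf}(t)=\ln\frac{N}{\mathrm{df}_t},\qquad \mathrm{tfidf}(t,d)=\mathrm{tf}(t,d)\cdot\mathrm{idf}(t)$$

where $n_{t,d}$ is the number of occurrences of term $t$ in document $d$, $N$ is the total number of documents, and $\mathrm{df}_t$ is the number of documents containing $t$.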

import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.AbstractUpdateRequest;
import org.apache.solr.client.solrj.request.ContentStreamUpdateRequest;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;

import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.*;

/**
 * @Author: sks
 * @Description: TF-IDF is a common weighting technique in information retrieval and data mining.
 * TF stands for Term Frequency;
 * IDF stands for Inverse Document Frequency.
 * @Date: Created at 9:30 2018/1/10
 * @Modified by:
 **/
public class tfidf {

    private static SolrClient solr;

    // total number of token occurrences in the current document
    private static int singleDocTotalCount = 0;

    // only keep keywords that occur at least this many times in a single document (default 3)
    private static int KEYWORD_MIN_COUNT_IN_SINGLE_DOC = 3;

    public static void main(String[] args) throws SolrServerException, IOException {
        List<String> excludeInfo = new ArrayList<String>();
        excludeInfo.add("WORD模版");
        excludeInfo.add("Page");

        String urlString = "http://localhost:8983/solr/test";
        String path = "D:/work/Solr/ImportData";
        Init(urlString);
        // indexFilesSolrCell(path, excludeInfo);
        // try {
        //     Thread.sleep(3000);
        // } catch (InterruptedException e) {
        //     e.printStackTrace();
        // }

        // total number of documents in the repository
        int docmentTotalCount = (int) getDocTotalCount();
        // setIdf(docmentTotalCount);
        // extract the important keywords
        getImportanceKeywords(docmentTotalCount);
    }
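    // Note (assumption, not stated in the original post): the Solr core "test" is expected
    // to declare the fields used below -- id, text, fileType, fileloadDate, wordCount, tf,
    // wordinDocCount and tfIdf -- with "text" indexed by a Chinese tokenizer, so that
    // termfreq(text, ...) and the atomic "set" updates work.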

    /**
     * @Author: sks
     * @Description: initialize the Solr client
     * @Date:
     */
    public static void Init(String urlString) {
        solr = new HttpSolrClient.Builder(urlString).build();
    }

    /**
     * @Author: sks
     * @Description: get the total number of documents in the index
     * @Date:
     */

    private static long getDocTotalCount() throws SolrServerException, IOException {
        long num = 0;
        try {
            SolrQuery params = new SolrQuery();
            params.set("q", "*:*");
            // params.setQuery("*:*");
            QueryResponse rsp = solr.query(params);
            SolrDocumentList docs = rsp.getResults();
            num = docs.getNumFound();
        } catch (SolrServerException e) {
            e.printStackTrace();
        }
        return num;
    }

    /**
     * @Author: sks
     * @Description: index all files in the folder fileDirectory (subfolders excluded)
     * @Date:
     */

    private static void indexFilesSolrCell(String fileDirectory, List<String> excludeInfo) throws IOException, SolrServerException {
        File file = new File(fileDirectory);
        File[] files = file.listFiles();
        for (File f : files) {
            singleDocTotalCount = 0;
            indexFilesSolrCell(f.getName(), f.toString(), excludeInfo);
        }
    }

    /**
     * @Author: sks
     * @Description: index a single file
     * @Date:
     * @fileName: file name
     * @path: file path (including the file name)
     */

    private static void indexFilesSolrCell(String fileName, String path, List<String> excludeInfo) throws IOException, SolrServerException {
        ContentStreamUpdateRequest up = new ContentStreamUpdateRequest("/update/extract");
        String contentType = getFileContentType(fileName);
        up.addFile(new File(path), contentType);

        String fileType = fileName.substring(fileName.lastIndexOf(".") + 1);
        up.setParam("literal.id", fileName);
        up.setParam("literal.path", path);           // file path
        up.setParam("fmap.content", "attr_content"); // file content
        up.setAction(AbstractUpdateRequest.ACTION.COMMIT, true, true);
        solr.request(up);

        String txt = getTextById(fileName, "attr_content", excludeInfo);
        if (txt.length() == 0) {
            System.out.println("indexing file " + fileName + " failed");
            return;
        }
        delIndexByID(fileName);

        Map<String, Object> temp = new HashMap<String, Object>();
        temp.put("id", fileName);                    // document id: the file name
        temp.put("text", txt);                       // extracted text
        temp.put("fileType", fileType);              // file type
        temp.put("fileloadDate", GetCurrentDate());  // upload date

        // keywords occurring at least 3 times
        String keywords = getTopKeywords(KEYWORD_MIN_COUNT_IN_SINGLE_DOC, txt);
        temp.put("wordCount", keywords);

        // term frequency of the keywords occurring at least 3 times
        String tf = getTopKeywordsFrequency(KEYWORD_MIN_COUNT_IN_SINGLE_DOC, txt);
        temp.put("tf", tf);

        updateMultiFieldData(temp);
    }
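    // Illustrative example (field values are hypothetical): after this method runs, the
    // re-added document looks roughly like
    //   { id:"report.docx", fileType:"docx", fileloadDate:"2018-01-10",
    //     wordCount:"数据:12,检索:5", tf:"数据:0.00480000,检索:0.00200000" }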

    /**
     * @Author: sks
     * @Description: fetch the data stored in the given field of the document with the given id
     * @Date:
     */
    private static String getTextById(String id, String field, List<String> excludeInfo) throws SolrServerException, IOException {
        // query for the document by id, fetching only the wanted field
        SolrQuery params = new SolrQuery();
        params.setQuery("id:" + id);
        params.setFields(field);
        QueryResponse queryResponse = solr.query(params);

        List<SolrDocument> list = queryResponse.getResults();
        String txt = "";
        for (SolrDocument solrDocument : list) {
            if (solrDocument.size() > 0) {
                txt = solrDocument.get(field).toString();
                break;
            }
        }

        // drop spaces, empty lines, and lines containing any of the exclude markers
        txt = txt.replace(" ", "");
        String[] txts = txt.split("\n");
        StringBuilder sb = new StringBuilder();
        boolean bcontinue = false;
        for (String t : txts) {
            if (t.length() == 0) {
                continue;
            }
            bcontinue = false;
            for (String m : excludeInfo) {
                if (t.indexOf(m) > -1) {
                    bcontinue = true;
                    break;
                }
            }
            if (bcontinue) {
                continue;
            }
            sb.append(t);
            sb.append("\n");
        }
        return sb.toString();
    }

    /**
     * @Author: sks
     * @Description: get the keywords that occur frequently in the text
     * @Date:
     * @keywordCount: minimum number of occurrences
     * @txt: the text
     */

    private static String getTopKeywords(int keywordCount, String txt) throws SolrServerException, IOException {
        Map<String, Integer> totalMap = getAllWordsFromText(txt);
        List<Map.Entry<String, Integer>> result = GetTopvalues(totalMap, keywordCount);
        StringBuilder sb = new StringBuilder();
        for (Map.Entry<String, Integer> lt : result) {
            sb.append(lt.getKey());
            sb.append(":");
            sb.append(lt.getValue());
            sb.append(",");
        }
        String keywords = sb.toString();
        // strip the trailing comma (the original tested size() > 1, which left the comma
        // in place for single-keyword results)
        if (result.size() > 0) {
            keywords = keywords.substring(0, keywords.length() - 1);
        }
        return keywords;
    }
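    // Example output (hypothetical words and counts): "数据:12,检索:5,媒体:3"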

    /**
     * @Author: sks
     * @Description: get the frequently occurring keywords and their term frequency
     * @Date:
     * @keywordCount: minimum number of occurrences
     * @txt: the text
     */

    private static String getTopKeywordsFrequency(int keywordCount, String txt) throws SolrServerException, IOException {
        Map<String, Integer> totalMap = getAllWordsFromText(txt);
        List<Map.Entry<String, Integer>> result = GetTopvalues(totalMap, keywordCount);
        StringBuilder sb = new StringBuilder();
        for (Map.Entry<String, Integer> lt : result) {
            sb.append(lt.getKey());
            sb.append(":");
            // term frequency = occurrences of the word / total tokens in this document
            float value = (float) lt.getValue() / singleDocTotalCount;
            sb.append(String.format("%.8f", value));
            sb.append(",");
        }
        String keywords = sb.toString();
        // strip the trailing comma
        if (result.size() > 0) {
            keywords = keywords.substring(0, keywords.length() - 1);
        }
        return keywords;
    }

    /**
     * @Author: sks
     * @Description: update several fields of one document
     * @Date:
     * @maps: field-name/value pairs
     */
    private static void updateMultiFieldData(Map<String, Object> maps) throws IOException, SolrServerException {
        Set<String> keys = maps.keySet();
        SolrInputDocument doc = new SolrInputDocument();
        for (String key : keys) {
            doc.addField(key, maps.get(key));
        }
        solr.add(doc);
        UpdateResponse rspCommit = solr.commit();
        System.out.println("commit doc to index" + " result:" + rspCommit.getStatus() + " Qtime:" + rspCommit.getQTime());
    }

    /**
     * @Author: sks
     * @Description: tokenize the text and count how often each token occurs, storing token/count pairs in a map
     * @Date:
     */

    private static Map<String, Integer> getAllWordsFromText(String txt) throws SolrServerException, IOException {
        List<String> wlist = new ArrayList<String>();
        // the text may be too long to segment in one go, so split it into chunks of
        // roughly 500 characters and segment each chunk separately
        if (txt.length() > 500) {
            String[] txts = txt.split("\n");
            String words = "";
            for (int i = 0; i < txts.length; i++) {
                if (words.length() < 500) {
                    words += txts[i] + "。";
                } else {
                    wlist.add(words);
                    words = txts[i] + "。";
                }
            }
            wlist.add(words);
        } else {
            wlist.add(txt);
        }

        int count = 0;
        Map<String, Integer> rawMap = null;
        List<String> results = null;
        Set<String> keys = null;
        Map<String, Integer> totalMap = new HashMap<String, Integer>();
        NewsSummary obj = new NewsSummary();
        for (String txtline : wlist) {
            if (txtline != null && txtline.length() > 0) {
                results = obj.IKSegment(txtline);
                // results = getAnalysis(txtline);
                rawMap = getWordsCount(results);
                keys = rawMap.keySet();
                for (String key : keys) {
                    count = rawMap.get(key);
                    // accumulate the document-wide token total; the original incremented by
                    // 1 per distinct key, but the field is documented as the sum of all
                    // occurrence counts, which is also the correct tf denominator
                    singleDocTotalCount += count;
                    if (totalMap.containsKey(key)) {
                        count += totalMap.get(key);
                    }
                    // overwrite any previous value (the original removed the old entry
                    // but forgot to put the merged count back)
                    totalMap.put(key, count);
                }
            }
        }
        return totalMap;
    }

    /**
     * @Author: sks
     * @Description: put the tokens from the list into a map, accumulating duplicate counts
     * @Date:
     */
    private static Map<String, Integer> getWordsCount(List<String> txts) throws SolrServerException, IOException {
        Map<String, Integer> resultMap = new HashMap<String, Integer>();
        int count = 1;
        for (int i = 0; i < txts.size(); i++) {
            String key = txts.get(i);
            // ignore single-character tokens
            if (key.length() > 1) {
                count = 1;
                if (resultMap.containsKey(key)) {
                    count += resultMap.get(key);
                }
                resultMap.remove(key);
                resultMap.put(key, count);
            }
        }
        return resultMap;
    }

    /**
     * @Author: sks
     * @Description: sort the map by value in descending order and return the entries whose value is at least topValue
     * @Date:
     */

    private static List<Map.Entry<String, Integer>> GetTopvalues(Map<String, Integer> hm, Integer topValue) throws SolrServerException, IOException {
        Map<String, Integer> temp = new HashMap<String, Integer>();
        Set<String> keys = hm.keySet();
        int value = 0;
        for (String key : keys) {
            value = hm.get(key);
            if (value >= topValue) {
                temp.put(key, value);
            }
        }

        // turn map.entrySet() into a list
        List<Map.Entry<String, Integer>> list = new ArrayList<Map.Entry<String, Integer>>(temp.entrySet());
        // sort with a comparator, in descending order of value
        Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() {
            public int compare(Map.Entry<String, Integer> o1,
                               Map.Entry<String, Integer> o2) {
                return o2.getValue().compareTo(o1.getValue());
            }
        });
        return list;
    }

    /**
     * @Author: sks
     * @Description: derive the file's ContentType from its file name
     * @Date:
     */

    public static String getFileContentType(String filename) {
        String contentType = "";
        String prefix = filename.substring(filename.lastIndexOf(".") + 1);
        if (prefix.equals("xlsx")) {
            contentType = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
        } else if (prefix.equals("pdf")) {
            contentType = "application/pdf";
        } else if (prefix.equals("doc")) {
            contentType = "application/msword";
        } else if (prefix.equals("txt")) {
            contentType = "text/plain";
        } else if (prefix.equals("xls")) {
            contentType = "application/vnd.ms-excel";
        } else if (prefix.equals("docx")) {
            contentType = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
        } else if (prefix.equals("ppt")) {
            contentType = "application/vnd.ms-powerpoint";
        } else if (prefix.equals("pptx")) {
            contentType = "application/vnd.openxmlformats-officedocument.presentationml.presentation";
        } else {
            contentType = "othertype";
        }
        return contentType;
    }

    /**
     * @Author: sks
     * @Description: delete the index entry with the given ID
     * @Date:
     * @id: index ID
     */

    public static void delIndexByID(String id) throws SolrServerException, IOException {
        UpdateResponse ur = solr.deleteById(id);
        System.out.println(ur);
        UpdateResponse c = solr.commit();
        System.out.println(c);
    }

    /**
     * @Author: sks
     * @Description: compute and store the idf-related fields
     * @Date:
     * @docmentTotalCount: total number of documents in the repository
     */

    private static void setIdf(int docmentTotalCount) throws SolrServerException, IOException {
        Map<String, String> map = getidKeywordTFMap(docmentTotalCount);
        Set<String> keys = map.keySet();
        String[] words = null;
        String word = "";
        double tf = 0;
        double idf = 0;
        StringBuilder sbtfidf = null;
        StringBuilder sbidf = null;
        String singleword = "";
        for (String key : keys) {
            word = map.get(key);
            // strip the leading "[" and trailing "]" (the original's length() - 2 cut one
            // character too many)
            word = word.substring(1, word.length() - 1);
            words = word.split(",");
            sbtfidf = new StringBuilder();
            sbidf = new StringBuilder();
            for (String w : words) {
                System.out.println(w);
                tf = Float.parseFloat(w.split(":")[1]);
                singleword = w.split(":")[0];
                idf = getwordIdf(singleword, docmentTotalCount);

                sbidf.append(singleword);
                sbidf.append(":");
                sbidf.append(getwordindocCount(singleword, docmentTotalCount));
                sbidf.append(",");

                sbtfidf.append(singleword);
                sbtfidf.append(";");
                sbtfidf.append(String.format("%.12f", tf * idf));
                sbtfidf.append(",");
            }
            updateSingleData(key, "wordinDocCount", sbidf.toString());
            updateSingleData(key, "tfIdf", sbtfidf.toString());
        }
    }

    /**
     * @Author: sks
     * @Description: get an id-to-keyword-tf map
     * @Date:
     */

    private static Map<String, String> getidKeywordTFMap(int docmentTotalCount) throws SolrServerException, IOException {
        // query all documents, fetching only the id and tf fields
        SolrQuery params = new SolrQuery();
        params.setQuery("*:*");
        // params.setQuery("id:5.15%2B袁纯子、王英%2B路透社报告:欧洲七大公共媒体数字化转型进路 (1).docx");
        // params.set("q", "*:*");
        params.setFields("id,tf");
        params.setStart(0);
        params.setRows(docmentTotalCount);
        QueryResponse rsp = solr.query(params);
        // SolrDocumentList docs = rsp.getResults();
        List<SolrDocument> list = rsp.getResults();
        Map<String, String> idkeywordMap = new HashMap<String, String>();
        for (SolrDocument sd : list) {
            // only keep documents that actually have a tf field
            if (sd.size() > 1) {
                idkeywordMap.put(sd.get("id").toString(), sd.get("tf").toString());
            }
        }
        return idkeywordMap;
    }

    /**
     * @Author: sks
     * @Description: get a keyword's idf (inverse document frequency) = log(total documents / documents containing the keyword)
     * @Date:
     * @word: the keyword
     * @docmentTotalCount: total number of documents in the repository
     */

    private static double getwordIdf(String word, int docmentTotalCount) throws SolrServerException, IOException {
        int count = getwordindocCount(word, docmentTotalCount);
        double idf = 0;
        if (count > 0) {
            idf = Math.log((double) docmentTotalCount / count);
        }
        return idf;
    }
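    // Worked example: with 1000 documents in total and a keyword appearing in 10 of them,
    // idf = ln(1000 / 10) ≈ 4.605 (natural log, since Math.log is used above).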

    /**
     * @Author: sks
     * @Description: get the number of documents that contain the word
     * @Date:
     * @word: the keyword
     * @docmentTotalCount: total number of documents in the repository
     */
    private static int getwordindocCount(String word, int docmentTotalCount) throws SolrServerException, IOException {
        SolrQuery params = new SolrQuery();
        // match documents containing the word and ask Solr for its per-document term frequency
        // params.setQuery("*:*");
        params.setQuery("text:" + word);
        params.setFields("freq:termfreq(text,'" + word + "')");
        // paging starts at 0 with 10 rows per page by default, so request all documents
        params.setStart(0);
        params.setRows(docmentTotalCount);
        QueryResponse queryResponse = solr.query(params);

        List<SolrDocument> list = queryResponse.getResults();
        int count = 0;
        for (SolrDocument solrDocument : list) {
            // compare string contents, not references (the original used !=)
            if (!solrDocument.get("freq").toString().equals("0")) {
                count++;
            }
        }
        return count;
    }
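    // The effective request, e.g. for the word 媒体:
    //   q=text:媒体&fl=freq:termfreq(text,'媒体')&start=0&rows=<docmentTotalCount>
    // termfreq() is a Solr function query that returns the term's frequency in that field
    // for each matching document.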

    /**
     * @Author: sks
     * @Description: update a single field of an indexed document
     * @Date:
     * @id: index ID
     * @fieldName: field name
     * @fieldValue: field value
     */
    public static void updateSingleData(String id, String fieldName, Object fieldValue) throws SolrServerException, IOException {
        // atomic update: "set" replaces the field's current value
        Map<String, Object> oper = new HashMap<String, Object>();
        oper.put("set", fieldValue);
        SolrInputDocument doc = new SolrInputDocument();
        doc.addField("id", id);
        doc.addField(fieldName, oper);
        UpdateResponse rsp = solr.add(doc);
        System.out.println("update doc id:" + id + " result:" + rsp.getStatus() + " Qtime:" + rsp.getQTime());
        UpdateResponse rspCommit = solr.commit();
        System.out.println("commit doc to index" + " result:" + rspCommit.getStatus() + " Qtime:" + rspCommit.getQTime());
    }
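    // This is equivalent to posting an atomic-update document such as
    //   { "id": "report.docx", "tfIdf": { "set": "数据;0.000123000000,..." } }
    // (the id and value here are hypothetical).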

    /**
     * @Author: sks
     * @Description: extract the repository's important keywords based on the tf-idf principle
     * @Date:
     */
    private static void getImportanceKeywords(int docmentTotalCount) throws SolrServerException, IOException {
        Map<String, String> map = getidKeywordTFMap(docmentTotalCount);
        Set<String> keys = map.keySet();
        String[] words = null;
        String word = "";
        double tf = 0;
        double idf = 0;
        double tfidf = 0;
        String singleword = "";
        Map<String, Double> keywordidfMap = new HashMap<String, Double>();
        for (String key : keys) {
            word = map.get(key);
            // strip the leading "[" and trailing "]"
            word = word.substring(1, word.length() - 1);
            words = word.split(",");
            for (String w : words) {
                tf = Float.parseFloat(w.split(":")[1]);
                singleword = w.split(":")[0];
                idf = getwordIdf(singleword, docmentTotalCount);
                tfidf = tf * idf;
                // keep the highest tf-idf seen for each keyword (the original's comparison
                // was inverted and kept the lower score)
                if (keywordidfMap.containsKey(singleword)) {
                    if (tfidf > keywordidfMap.get(singleword)) {
                        keywordidfMap.put(singleword, tfidf);
                    }
                } else {
                    keywordidfMap.put(singleword, tfidf);
                }
            }
        }

        // sort the keywords by score, highest first
        List<Map.Entry<String, Double>> sortedSentList = new ArrayList<Map.Entry<String, Double>>(keywordidfMap.entrySet());
        // System.setProperty("java.util.Arrays.useLegacyMergeSort", "true");
        Collections.sort(sortedSentList, new Comparator<Map.Entry<String, Double>>() {
            // @Override
            public int compare(Map.Entry<String, Double> o1, Map.Entry<String, Double> o2) {
                // compare the boxed values by content, not by reference
                return o2.getValue().compareTo(o1.getValue());
            }
        });
        for (Map.Entry<String, Double> entry : sortedSentList) {
            System.out.println(entry.getKey() + ":" + entry.getValue());
        }
    }

    /**
     * @Author: sks
     * @Description: get the current date as yyyy-MM-dd
     * @Date:
     */

    private static String GetCurrentDate() throws SolrServerException, IOException {
        Date dt = new Date();
        // the trailing "aa" means AM/PM; HH is the 24-hour clock, hh the 12-hour clock
        // SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss aa");
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
        String day = sdf.format(dt);
        return day;
    }
}
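The code above depends on a NewsSummary class whose IKSegment method does the Chinese word segmentation; the original post does not include it. Below is a minimal sketch of what it might look like, assuming the IK Analyzer library (org.wltea.analyzer) is on the classpath. Only the class and method names come from the usage above; the implementation itself is an assumption.

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

public class NewsSummary {
    /**
     * Tokenize Chinese text with IK Analyzer and return the tokens in order.
     * Hypothetical implementation -- the original post does not show this class.
     */
    public List<String> IKSegment(String text) {
        List<String> words = new ArrayList<String>();
        // true = "smart" mode (coarser segmentation); false gives finer-grained tokens
        IKSegmenter segmenter = new IKSegmenter(new StringReader(text), true);
        try {
            Lexeme lexeme;
            while ((lexeme = segmenter.next()) != null) {
                words.add(lexeme.getLexemeText());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return words;
    }
}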
