import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.AbstractUpdateRequest;
import org.apache.solr.client.solrj.request.ContentStreamUpdateRequest;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;

import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.*;

/**
 * @Author: sks
 * @Description: Computes tf-idf, a weighting scheme widely used in
 * information retrieval and data mining.
 * TF stands for Term Frequency,
 * IDF stands for Inverse Document Frequency.
 * @Date: Created in 9:30 2018/1/10
 * @Modified by:
 **/
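/*
 * Scoring used throughout this class (a sketch of the standard tf-idf
 * formulation; the normalization choices below are this code's, not the
 * only ones possible):
 *
 *   tf(t, d)   = count(t, d) / totalTerms(d)   -- see getTopKeywordsFrequency
 *   idf(t)     = ln(N / df(t))                 -- see getwordIdf
 *   tfidf(t,d) = tf(t, d) * idf(t)
 *
 * Worked example with illustrative numbers (not from any real corpus):
 * with N = 100 documents, df("solr") = 5, and "solr" occurring 3 times in
 * a 500-term document:
 *   tf    = 3 / 500     = 0.006
 *   idf   = ln(100 / 5) = ln(20) ≈ 2.9957
 *   tfidf ≈ 0.006 * 2.9957 ≈ 0.0180
 */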
public class tfidf {

    private static SolrClient solr;
    // Total number of tokens in the current document (across all segments)
    private static int singleDocTotalCount = 0;
    // Only keywords occurring at least this many times in a single document
    // are kept; the default is 3
    private static int KEYWORD_MIN_COUNT_IN_SINGLE_DOC = 3;

    public static void main(String[] args) throws SolrServerException, IOException {
        List<String> excludeInfo = new ArrayList<>();
        excludeInfo.add("WORD模版");
        excludeInfo.add("Page");
        String urlString = "http://localhost:8983/solr/test";
        String path = "D:/work/Solr/ImportData";
        Init(urlString);
        //indexFilesSolrCell(path, excludeInfo);
        //try {
        //    Thread.sleep(3000);
        //} catch (InterruptedException e) {
        //    e.printStackTrace();
        //}
        // Total number of documents in the repository
        int docmentTotalCount = (int) getDocTotalCount();
        //setIdf(docmentTotalCount);
        // Extract the important keywords
        getImportanceKeywords(docmentTotalCount);
    }

    /**
     * @Author: sks
     * @Description: Initialize the Solr client
     * @Date:
     */
    public static void Init(String urlString) {
        solr = new HttpSolrClient.Builder(urlString).build();
    }
    /**
     * @Author: sks
     * @Description: Get the total number of documents in the index
     * @Date:
     */
    private static long getDocTotalCount() throws SolrServerException, IOException {
        long num = 0;
        try {
            SolrQuery params = new SolrQuery();
            params.set("q", "*:*");
            //params.setQuery("*:*");
            QueryResponse rsp = solr.query(params);
            SolrDocumentList docs = rsp.getResults();
            num = docs.getNumFound();
        } catch (SolrServerException e) {
            e.printStackTrace();
        }
        return num;
    }
    /**
     * @Author: sks
     * @Description: Index every file directly under fileDirectory
     * (subdirectories are not traversed)
     * @Date:
     */
    private static void indexFilesSolrCell(String fileDirectory, List<String> excludeInfo) throws IOException, SolrServerException {
        File file = new File(fileDirectory);
        File[] files = file.listFiles();
        for (File f : files) {
            singleDocTotalCount = 0;
            indexFilesSolrCell(f.getName(), f.toString(), excludeInfo);
        }
    }
    /**
     * @Author: sks
     * @Description: Index a single file
     * @Date:
     * @fileName: file name
     * @path: file path (including the file name)
     */
    private static void indexFilesSolrCell(String fileName, String path, List<String> excludeInfo) throws IOException, SolrServerException {
        ContentStreamUpdateRequest up = new ContentStreamUpdateRequest("/update/extract");
        String contentType = getFileContentType(fileName);
        up.addFile(new File(path), contentType);
        String fileType = fileName.substring(fileName.lastIndexOf(".") + 1);
        up.setParam("literal.id", fileName);
        up.setParam("literal.path", path);         // file path
        up.setParam("fmap.content", "attr_content"); // file content
        up.setAction(AbstractUpdateRequest.ACTION.COMMIT, true, true);
        solr.request(up);
        String txt = getTextById(fileName, "attr_content", excludeInfo);
        if (txt.length() == 0) {
            System.out.println("Indexing file " + fileName + " failed");
            return;
        }
        delIndexByID(fileName);
        Map<String, Object> temp = new HashMap<>();
        temp.put("id", fileName);        // document id; the file name is used as the ID
        temp.put("text", txt);           // document text
        temp.put("fileType", fileType);  // file type
        temp.put("fileloadDate", GetCurrentDate()); // upload date
        // Keywords occurring at least KEYWORD_MIN_COUNT_IN_SINGLE_DOC (default 3) times
        String keywords = getTopKeywords(KEYWORD_MIN_COUNT_IN_SINGLE_DOC, txt);
        temp.put("wordCount", keywords);
        // Term frequencies of those keywords
        String tf = getTopKeywordsFrequency(KEYWORD_MIN_COUNT_IN_SINGLE_DOC, txt);
        temp.put("tf", tf);
        updateMultiFieldData(temp);
    }
    /**
     * @Author: sks
     * @Description: Get the data of the given field for the given id
     * @Date:
     */
    private static String getTextById(String id, String field, List<String> excludeInfo) throws SolrServerException, IOException {
        // Query object
        SolrQuery params = new SolrQuery();
        params.setQuery("id:" + id);
        params.setFields(field);
        QueryResponse queryResponse = solr.query(params);
        // Fetch the result set
        List<SolrDocument> list = queryResponse.getResults();
        String txt = "";
        for (SolrDocument solrDocument : list) {
            if (solrDocument.size() > 0) {
                txt = solrDocument.get(field).toString();
                break;
            }
        }
        txt = txt.replace(" ", "");
        String[] txts = txt.split("\n");
        StringBuilder sb = new StringBuilder();
        boolean bcontinue = false;
        for (String t : txts) {
            if (t.length() == 0) {
                continue;
            }
            // Skip lines containing any of the exclusion strings
            bcontinue = false;
            for (String m : excludeInfo) {
                if (t.indexOf(m) > -1) {
                    bcontinue = true;
                    break;
                }
            }
            if (bcontinue) {
                continue;
            }
            sb.append(t);
            sb.append("\n");
        }
        return sb.toString();
    }
    /**
     * @Author: sks
     * @Description: Get the keywords that occur frequently in the text
     * @Date:
     * @keywordCount: minimum number of occurrences
     * @txt: the text
     */
    private static String getTopKeywords(int keywordCount, String txt) throws SolrServerException, IOException {
        Map<String, Integer> totalMap = getAllWordsFromText(txt);
        List<Map.Entry<String, Integer>> result = GetTopvalues(totalMap, keywordCount);
        StringBuilder sb = new StringBuilder();
        for (Map.Entry<String, Integer> lt : result) {
            sb.append(lt.getKey());
            sb.append(":");
            sb.append(lt.getValue());
            sb.append(",");
        }
        String keywords = sb.toString();
        // Strip the trailing comma (the original tested size() > 1, which
        // left a dangling comma on single-element results)
        if (result.size() > 0) {
            keywords = keywords.substring(0, keywords.length() - 1);
        }
        return keywords;
    }
    /**
     * @Author: sks
     * @Description: Get frequently occurring keywords and their term frequency
     * @Date:
     * @keywordCount: minimum number of occurrences
     * @txt: the text
     */
    private static String getTopKeywordsFrequency(int keywordCount, String txt) throws SolrServerException, IOException {
        Map<String, Integer> totalMap = getAllWordsFromText(txt);
        List<Map.Entry<String, Integer>> result = GetTopvalues(totalMap, keywordCount);
        StringBuilder sb = new StringBuilder();
        for (Map.Entry<String, Integer> lt : result) {
            sb.append(lt.getKey());
            sb.append(":");
            // tf = occurrences of the term / total token count of this document
            float value = (float) lt.getValue() / singleDocTotalCount;
            sb.append(String.format("%.8f", value));
            sb.append(",");
        }
        String keywords = sb.toString();
        if (result.size() > 0) {
            keywords = keywords.substring(0, keywords.length() - 1);
        }
        return keywords;
    }
    /**
     * @Author: sks
     * @Description: Update several fields at once
     * @Date:
     * @maps: field-name/value pairs
     */
    private static void updateMultiFieldData(Map<String, Object> maps) throws IOException, SolrServerException {
        Set<String> keys = maps.keySet();
        SolrInputDocument doc = new SolrInputDocument();
        for (String key : keys) {
            doc.addField(key, maps.get(key));
        }
        solr.add(doc);
        UpdateResponse rspCommit = solr.commit();
        System.out.println("commit doc to index" + " result:" + rspCommit.getStatus() + " Qtime:" + rspCommit.getQTime());
    }
    /**
     * @Author: sks
     * @Description: Segment the string into words and count how often each
     * word occurs, storing word/count pairs in a map
     * @Date:
     */
    private static Map<String, Integer> getAllWordsFromText(String txt) throws SolrServerException, IOException {
        List<String> wlist = new ArrayList<>();
        // The text may be too long to segment in one go, so split it into
        // chunks of roughly 500 characters and segment each chunk separately
        if (txt.length() > 500) {
            String[] txts = txt.split("\n");
            String words = "";
            for (int i = 0; i < txts.length; i++) {
                if (words.length() < 500) {
                    words += txts[i] + "。";
                } else {
                    wlist.add(words);
                    words = txts[i] + "。";
                }
            }
            wlist.add(words);
        } else {
            wlist.add(txt);
        }
        int count = 0;
        Map<String, Integer> rawMap = null;
        List<String> results = null;
        Set<String> keys = null;
        Map<String, Integer> totalMap = new HashMap<>();
        NewsSummary obj = new NewsSummary();
        for (String txtline : wlist) {
            if (txtline != null && txtline.length() > 0) {
                results = obj.IKSegment(txtline);
                //results = getAnalysis(txtline);
                rawMap = getWordsCount(results);
                keys = rawMap.keySet();
                for (String key : keys) {
                    count = rawMap.get(key);
                    // Accumulate the document's total token count.
                    // (The original incremented by 1 per distinct term, which
                    // contradicts the field's stated meaning of "sum of all
                    // token occurrences".)
                    singleDocTotalCount += count;
                    if (totalMap.containsKey(key)) {
                        count += totalMap.get(key);
                    }
                    totalMap.put(key, count);
                }
            }
        }
        return totalMap;
    }
    /**
     * @Author: sks
     * @Description: Store the list entries in a map, accumulating the count
     * of duplicates
     * @Date:
     */
    private static Map<String, Integer> getWordsCount(List<String> txts) throws SolrServerException, IOException {
        Map<String, Integer> resultMap = new HashMap<>();
        int count = 1;
        for (int i = 0; i < txts.size(); i++) {
            String key = txts.get(i);
            // Ignore single-character tokens
            if (key.length() > 1) {
                count = 1;
                if (resultMap.containsKey(key)) {
                    count += resultMap.get(key);
                }
                resultMap.put(key, count);
            }
        }
        return resultMap;
    }
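    // For example, getWordsCount(Arrays.asList("solr", "索引", "solr", "引"))
    // returns {solr=2, 索引=1}: "solr" occurs twice, "索引" once, and the
    // single-character token "引" is dropped.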
    /**
     * @Author: sks
     * @Description: Sort the map by value in descending order and return the
     * entries whose value is at least topValue
     * @Date:
     */
    private static List<Map.Entry<String, Integer>> GetTopvalues(Map<String, Integer> hm, Integer topValue) throws SolrServerException, IOException {
        Map<String, Integer> temp = new HashMap<>();
        Set<String> keys = hm.keySet();
        int value = 0;
        for (String key : keys) {
            value = hm.get(key);
            if (value >= topValue) {
                temp.put(key, value);
            }
        }
        // Convert map.entrySet() to a list
        List<Map.Entry<String, Integer>> list = new ArrayList<>(temp.entrySet());
        // Sort with a comparator, in descending order of value
        Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() {
            public int compare(Map.Entry<String, Integer> o1,
                               Map.Entry<String, Integer> o2) {
                return o2.getValue().compareTo(o1.getValue());
            }
        });
        return list;
    }
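    // For example, with hm = {java=5, solr=7, a=2} and topValue = 3,
    // GetTopvalues returns [solr=7, java=5]: "a" falls below the threshold
    // and the remaining entries are ordered by descending count.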
    /**
     * @Author: sks
     * @Description: Get the ContentType of a file from its file name
     * @Date:
     */
    public static String getFileContentType(String filename) {
        String contentType = "";
        String prefix = filename.substring(filename.lastIndexOf(".") + 1);
        if (prefix.equals("xlsx")) {
            contentType = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
        } else if (prefix.equals("pdf")) {
            contentType = "application/pdf";
        } else if (prefix.equals("doc")) {
            contentType = "application/msword";
        } else if (prefix.equals("txt")) {
            contentType = "text/plain";
        } else if (prefix.equals("xls")) {
            contentType = "application/vnd.ms-excel";
        } else if (prefix.equals("docx")) {
            contentType = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
        } else if (prefix.equals("ppt")) {
            contentType = "application/vnd.ms-powerpoint";
        } else if (prefix.equals("pptx")) {
            contentType = "application/vnd.openxmlformats-officedocument.presentationml.presentation";
        } else {
            contentType = "othertype";
        }
        return contentType;
    }
    /**
     * @Author: sks
     * @Description: Delete the index entry with the given ID
     * @Date:
     * @id: index ID
     */
    public static void delIndexByID(String id) throws SolrServerException, IOException {
        UpdateResponse ur = solr.deleteById(id);
        System.out.println(ur);
        UpdateResponse c = solr.commit();
        System.out.println(c);
    }
    /**
     * @Author: sks
     * @Description: Set the idf values
     * @Date:
     * @docmentTotalCount: total number of documents in the repository
     */
    private static void setIdf(int docmentTotalCount) throws SolrServerException, IOException {
        Map<String, String> map = getidKeywordTFMap(docmentTotalCount);
        Set<String> keys = map.keySet();
        String[] words = null;
        String word = "";
        double tf = 0;
        double idf = 0;
        StringBuilder sbtfidf = null;
        StringBuilder sbidf = null;
        String singleword = "";
        for (String key : keys) {
            word = map.get(key);
            // Strip the leading "[" and the trailing "]"
            // (the original used word.length() - 2, which also cut off the
            // last character of the content)
            word = word.substring(1, word.length() - 1);
            words = word.split(",");
            sbtfidf = new StringBuilder();
            sbidf = new StringBuilder();
            for (String w : words) {
                System.out.println(w);
                tf = Float.parseFloat(w.split(":")[1]);
                singleword = w.split(":")[0];
                idf = getwordIdf(singleword, docmentTotalCount);
                sbidf.append(singleword);
                sbidf.append(":");
                sbidf.append(getwordindocCount(singleword, docmentTotalCount));
                sbidf.append(",");
                sbtfidf.append(singleword);
                sbtfidf.append(";");
                sbtfidf.append(String.format("%.12f", tf * idf));
                sbtfidf.append(",");
            }
            updateSingleData(key, "wordinDocCount", sbidf.toString());
            updateSingleData(key, "tfIdf", sbtfidf.toString());
        }
    }
    /**
     * @Author: sks
     * @Description: Get a map of document IDs to their keyword/tf strings
     * @Date:
     */
    private static Map<String, String> getidKeywordTFMap(int docmentTotalCount) throws SolrServerException, IOException {
        // Query object
        SolrQuery params = new SolrQuery();
        // Select all documents
        params.setQuery("*:*");
        //params.setQuery("id:5.15%2B袁纯子、王英%2B路透社报告:欧洲七大公共媒体数字化转型进路 (1).docx");
        //params.set("q", "*:*");
        params.setFields("id,tf");
        params.setStart(0);
        params.setRows(docmentTotalCount);
        QueryResponse rsp = solr.query(params);
        //SolrDocumentList docs = rsp.getResults();
        List<SolrDocument> list = rsp.getResults();
        Map<String, String> idkeywordMap = new HashMap<>();
        // Iterate over the result set
        for (SolrDocument sd : list) {
            if (sd.size() > 1) {
                idkeywordMap.put(sd.get("id").toString(), sd.get("tf").toString());
            }
        }
        return idkeywordMap;
    }
    /**
     * @Author: sks
     * @Description: Get the idf (inverse document frequency) of a keyword:
     * idf = log(total number of documents / number of documents containing the keyword)
     * @Date:
     * @word: the keyword
     * @docmentTotalCount: total number of documents in the repository
     */
    private static double getwordIdf(String word, int docmentTotalCount) throws SolrServerException, IOException {
        int count = getwordindocCount(word, docmentTotalCount);
        double idf = 0;
        if (count > 0) {
            idf = Math.log((double) docmentTotalCount / count);
        }
        return idf;
    }
    /**
     * @Author: sks
     * @Description: Get the number of documents the word appears in
     * @Date:
     * @word: the keyword
     * @docmentTotalCount: total number of documents in the repository
     */
    private static int getwordindocCount(String word, int docmentTotalCount) throws SolrServerException, IOException {
        // Query object
        SolrQuery params = new SolrQuery();
        //params.setQuery("*:*");
        params.setQuery("text:" + word);
        params.setFields("freq:termfreq(text,'" + word + "')");
        // Paging defaults to start 0 with 10 rows per page
        params.setStart(0);
        params.setRows(docmentTotalCount);
        QueryResponse queryResponse = solr.query(params);
        List<SolrDocument> list = queryResponse.getResults();
        int count = 0;
        for (SolrDocument solrDocument : list) {
            // The original compared strings with !=, which tests reference
            // identity in Java; use equals() instead
            if (!"0".equals(solrDocument.get("freq").toString())) {
                count++;
            }
        }
        return count;
    }
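    // The query above is equivalent to an HTTP request along these lines
    // (a sketch; the host and core name are assumptions taken from main()):
    //   http://localhost:8983/solr/test/select?q=text:solr&fl=freq:termfreq(text,'solr')&start=0&rows=<docmentTotalCount>
    // termfreq() is a built-in Solr function query that returns the raw term
    // frequency of the term in each matching document.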
    /**
     * @Author: sks
     * @Description: Update a single field of an indexed document
     * @Date:
     * @id: index ID
     * @fieldName: field name
     * @fieldValue: field value
     */
    public static void updateSingleData(String id, String fieldName, Object fieldValue) throws SolrServerException, IOException {
        Map<String, Object> oper = new HashMap<>();
        oper.put("set", fieldValue);
        SolrInputDocument doc = new SolrInputDocument();
        doc.addField("id", id);
        doc.addField(fieldName, oper);
        UpdateResponse rsp = solr.add(doc);
        System.out.println("update doc id:" + id + " result:" + rsp.getStatus() + " Qtime:" + rsp.getQTime());
        UpdateResponse rspCommit = solr.commit();
        System.out.println("commit doc to index" + " result:" + rspCommit.getStatus() + " Qtime:" + rspCommit.getQTime());
    }
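    // Passing a map like {"set": value} as the field value triggers Solr's
    // atomic-update path; over HTTP the same update would be posted as JSON:
    //   [{"id": "mydoc.docx", "tfIdf": {"set": "solr;0.017974400000,..."}}]
    // ("mydoc.docx" and the value are illustrative.) Atomic updates require
    // the other schema fields to be stored (or have docValues) so Solr can
    // reconstruct the rest of the document.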
    /**
     * @Author: sks
     * @Description: Get the important keywords of the repository based on tf-idf
     * @Date:
     */
    private static void getImportanceKeywords(int docmentTotalCount) throws SolrServerException, IOException {
        Map<String, String> map = getidKeywordTFMap(docmentTotalCount);
        Set<String> keys = map.keySet();
        String[] words = null;
        String word = "";
        double tf = 0;
        double idf = 0;
        double tfidf = 0;
        String singleword = "";
        Map<String, Double> keywordidfMap = new HashMap<>();
        for (String key : keys) {
            word = map.get(key);
            // Strip the leading "[" and the trailing "]"
            word = word.substring(1, word.length() - 1);
            words = word.split(",");
            for (String w : words) {
                tf = Float.parseFloat(w.split(":")[1]);
                singleword = w.split(":")[0];
                idf = getwordIdf(singleword, docmentTotalCount);
                tfidf = tf * idf;
                // If the word was already seen, keep the smaller score
                if (keywordidfMap.containsKey(singleword)) {
                    if (keywordidfMap.get(singleword) > tfidf) {
                        keywordidfMap.put(singleword, tfidf);
                    }
                } else {
                    keywordidfMap.put(singleword, tfidf);
                }
            }
        }
        // Sort the keywords by score, highest first
        List<Map.Entry<String, Double>> sortedSentList = new ArrayList<>(keywordidfMap.entrySet());
        //System.setProperty("java.util.Arrays.useLegacyMergeSort", "true");
        Collections.sort(sortedSentList, new Comparator<Map.Entry<String, Double>>() {
            // The original compared boxed Doubles with == and >, which tests
            // reference identity; compareTo() performs the value comparison
            public int compare(Map.Entry<String, Double> o1, Map.Entry<String, Double> o2) {
                return o2.getValue().compareTo(o1.getValue());
            }
        });
        for (Map.Entry<String, Double> entry : sortedSentList) {
            System.out.println(entry.getKey() + ":" + entry.getValue());
        }
    }
    /**
     * @Author: sks
     * @Description: Get the current date as yyyy-MM-dd
     * @Date:
     */
    private static String GetCurrentDate() throws SolrServerException, IOException {
        Date dt = new Date();
        // "aa" would append AM/PM; HH is the 24-hour clock, hh the 12-hour clock
        //SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss aa");
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
        String day = sdf.format(dt);
        return day;
    }
}
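/*
 * Fields this class reads and writes on the Solr core (a sketch inferred
 * from the code above, not an authoritative schema): id, path, attr_content
 * (mapped from the extracted content), text, fileType, fileloadDate,
 * wordCount, tf, wordinDocCount, tfIdf. The core URL
 * http://localhost:8983/solr/test and the NewsSummary.IKSegment tokenizer
 * are taken from main() and getAllWordsFromText() respectively.
 */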