TF-IDF的Java实现 [转]

该博客详细介绍了如何在Java中实现TF-IDF算法,包括词汇表的生成、词频计算、逆文档频率计算以及相似度计算。通过示例代码展示了从文本处理到最终得到相似度矩阵的完整过程。
摘要由CSDN通过智能技术生成

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Dictionary;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class TFIDFMeasure {

private String[] _docs;

private String[][] _ngramDoc;

private int _numDocs = 0;

private int _numTerms = 0;

private ArrayList> _terms;

private int[][] _termFreq;

private float[][] _termWeight;

private int[] _maxTermFreq;

private int[] _docFreq;

ITokeniser _tokenizer = null;

private Dictionary

Object> _wordsIndex = new

Hashtable();

public TFIDFMeasure(String[] documents, ITokeniser tokeniser)

{

System.out.println("TFIDFMeasure()");

_docs = documents;//将传进来的docs[]字符串数组的值赋给_docs

_numDocs = documents.length;// _numDocs的值为文档的个数

System.out.println("documents.length------------------>"+documents.length);

_tokenizer = tokeniser;// _tokenizer为文本处理后的结果

System.out.println("start Init()");

myInit();

}

private void GeneratNgramText() {

}

private ArrayList

GenerateTerms(String[] docs) {

ArrayList uniques = new

ArrayList();

_ngramDoc = new

String[_numDocs][];//给字符串数组_ngramDoc赋值,第一维是文档的个数

System.out.println("_ngramDoc = new

String[_numDocs][]-----------_ngramDoc的值= "+_ngramDoc);

for (int i = 0; i < docs.length; i++) {

System.out.println(i + ":start tokenizer!");

List words =

_tokenizer.partition(docs[i]);//获取稳定 i 的/处理后的字符list

System.out.println("words.size(): " + words.size());

for (int j = 0; j < words.size(); j++)//将文档 i

中重复的词去掉,并存入 ArrayList uniques;

if (!uniques.contains(words.get(j)))

uniques.add(words.get(j));

}

return uniques;

}

private static Object

AddElement(Dictionary

collection, Object key,

Object newValue) {

Object element = collection.get(key);

collection.put(key, newValue);

return element;

}

private int GetTermIndex(String term) {

Object index = _wordsIndex.get(term);

if (index == null)

return -1;

return (Integer) index;

}

private void myInit() {

System.out.println("generate terms……");

_terms = GenerateTerms(_docs);//把重复的词去掉,并将值赋给 _terms

System.out.println("after generate ,terms.size()" +

_terms.size());

_numTerms = _terms.size();// _numTerms 为去重后数组的列数,也就是词的个数

System.out.println("_terms.size()---------->"+_numTerms);

_maxTermFreq = new int[_numDocs];

_docFreq = new int[_numTerms];

_termFreq = new int[_numTerms][];

_termWeight = new float[_numTerms][];

for (int i = 0; i < _terms.size(); i++) {

_termWeight[i] = new float[_numDocs];

_termFreq[i] = new int[_numDocs];

AddElement(_wordsIndex, _terms.get(i),

i);//将词_terms.get(i),存入哈希表的第i个位置

}

GenerateTermFrequency();

GenerateTermWeight();

}

private float Log(float num) {

return (float) Math.log(num);// log2

}

private void GenerateTermFrequency() {

for (int i = 0; i < _numDocs; i++) {

String curDoc = _docs[i];//定义字符串 curDoc,值是第i个文档;

Dictionary freq =

GetWordFrequency(curDoc);

Enumeration enums =

freq.keys();

while (enums.hasMoreElements()) {

String word = (String) enums.nextElement();

int wordFreq = (Integer) freq.get(word);

int termIndex = GetTermIndex(word);

if (termIndex == -1)

continue;

_termFreq[termIndex][i] = wordFreq;

_docFreq[termIndex]++;

if (wordFreq > _maxTermFreq[i])

_maxTermFreq[i] = wordFreq;

}

_maxTermFreq[i] = Integer.MIN_VALUE;

}

}

private void GenerateTermWeight() {

for (int i = 0; i < _numTerms; i++)

{//_numTerms是词的个数

for (int j = 0; j < _numDocs; j++)

_termWeight[i][j] = ComputeTermWeight(i, j);

}

}

private float GetTermFrequency(int term, int doc) {

int freq = _termFreq[term][doc];

int maxfreq = _maxTermFreq[doc];

return ((float) freq / (float) maxfreq);

}

private float GetInverseDocumentFrequency(int term) {

int df = _docFreq[term];

return Log((float) (_numDocs) / (float) df);

}

private float ComputeTermWeight(int term, int doc) {

float tf = GetTermFrequency(term, doc);

float idf = GetInverseDocumentFrequency(term);

return tf * idf;

}

private float[] GetTermVector(int doc) {

float[] w = new float[_numTerms];

for (int i = 0; i < _numTerms; i++)

w[i] = _termWeight[i][doc];

return w;

}

public double[] GetTermVector2(int doc) {

double[] ret = new double[_numTerms];

float[] w = GetTermVector(doc);

for (int i = 0; i < ret.length; i++) {

ret[i] = w[i];

}

// for (double d : ret) {

// //System.out.println(d);

// }

return ret;

}

public double GetSimilarity(int doc_i, int doc_j) {

double[] vector1 = GetTermVector2(doc_i);

double[] vector2 = GetTermVector2(doc_j);

return TermVector.ComputeCosineSimilarity(vector1,

vector2);

}

private Dictionary

Object> GetWordFrequency(String input) {

String convertedInput = input.toLowerCase();

List temp = new

ArrayList(_tokenizer

.partition(convertedInput));

String[] words = new String[temp.size()];

temp.toArray(words);

Arrays.sort(words);

String[] distinctWords = GetDistinctWords(words);

Dictionary result

= new Hashtable

Object>();

for (int i = 0; i < distinctWords.length; i++)

{

Object tmp;

tmp = CountWords(distinctWords[i], words);

result.put(distinctWords[i], tmp);

}

return result;

}

private static String[] GetDistinctWords(String[] input) {

if (input == null)

return new String[0];

else {

List list = new

ArrayList();

for (int i = 0; i < input.length; i++)

if (!list.contains(input[i])) // N-GRAM SIMILARITY?

list.add(input[i]);

String[] v = new String[list.size()];

return list.toArray(v);

}

}

private int CountWords(String word, String[] words) {

int itemIdx = Arrays.binarySearch(words, word);

if (itemIdx > 0)

while (itemIdx > 0

&&

words[itemIdx].equals(word))

itemIdx--;

int count = 0;

while (itemIdx < words.length

&& itemIdx >= 0)

{

if (words[itemIdx].equals(word))

count++;

itemIdx++;

if (itemIdx < words.length)

if (!words[itemIdx].equals(word))

break;

}

return count;

}

public int get_numTerms() {

return _numTerms;

}

public void set_numTerms(int terms) {

_numTerms = terms;

}

}

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值