原文地址:余弦相似度-Java代码
作者:刘_学
// 大三 Java 作业,仅供参考。
// @刘_学
package cn;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
/**
 * Demonstrates cosine similarity between two short English sentences.
 *
 * <p>Each sentence is normalised with {@code PorterStemmer}, split into
 * whitespace-separated tokens, and turned into a raw term-frequency vector.
 * The similarity is the dot product of the two vectors divided by the product
 * of their Euclidean norms.
 */
public class CosSimilarity
{
    public CosSimilarity()
    {
    }

    /**
     * Stems two hard-coded sample sentences and prints their cosine similarity.
     *
     * @param args ignored
     */
    public static void main(String[] args)
    {
        String D1 = "I don't loves DataBase but java.";
        String D2 = "I don't like CaoJiba but JianXu.";

        // Normalise word forms so that inflected variants count as the same term.
        PorterStemmer s = new PorterStemmer();
        Map<String, Integer> frequencies1 = termFrequencies(s.getStemmer(D1));
        Map<String, Integer> frequencies2 = termFrequencies(s.getStemmer(D2));

        System.out.println("余弦相似度为" + cosine(frequencies1, frequencies2));
    }

    /**
     * Counts how often each whitespace-separated token occurs in {@code text}.
     * Tokens are compared case-insensitively (keys are lower-cased with
     * {@link Locale#ROOT}); empty tokens produced by repeated or leading
     * whitespace are ignored.
     *
     * @param text the text to tokenise; must not be {@code null}
     * @return a map from lower-cased token to its number of occurrences
     */
    private static Map<String, Integer> termFrequencies(String text)
    {
        Map<String, Integer> frequencies = new HashMap<String, Integer>();
        for (String token : text.split("\\s+"))
        {
            if (token.length() == 0)
            {
                continue;
            }
            String term = token.toLowerCase(Locale.ROOT);
            Integer seen = frequencies.get(term);
            frequencies.put(term, seen == null ? 1 : seen.intValue() + 1);
        }
        return frequencies;
    }

    /**
     * Computes the cosine of the angle between two term-frequency vectors.
     *
     * @param first  term frequencies of the first document
     * @param second term frequencies of the second document
     * @return a value in {@code [0, 1]}; {@code 0.0} when either vector is
     *         empty, so the result is never {@code NaN}
     */
    private static double cosine(Map<String, Integer> first, Map<String, Integer> second)
    {
        double dotProduct = 0;
        for (Map.Entry<String, Integer> entry : first.entrySet())
        {
            Integer other = second.get(entry.getKey());
            if (other != null)
            {
                dotProduct += entry.getValue().doubleValue() * other.doubleValue();
            }
        }

        double squaredNorm1 = sumOfSquares(first);
        double squaredNorm2 = sumOfSquares(second);
        if (squaredNorm1 == 0 || squaredNorm2 == 0)
        {
            return 0.0;
        }
        return dotProduct / Math.sqrt(squaredNorm1 * squaredNorm2);
    }

    /**
     * Returns the squared Euclidean norm of a term-frequency vector.
     *
     * @param frequencies term frequencies of one document
     * @return the sum of the squared counts
     */
    private static double sumOfSquares(Map<String, Integer> frequencies)
    {
        double sum = 0;
        for (Integer count : frequencies.values())
        {
            sum += count.doubleValue() * count.doubleValue();
        }
        return sum;
    }
}
PorterStemmer 相关代码:将下列代码保存为 PorterStemmer.java(文件名必须与公共类名一致),并与 CosSimilarity.java 放在同一个包 cn 下即可。
package cn;
import java.io.ByteArrayInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
/**
 * Porter stemming algorithm (M. F. Porter, 1980), following the structure of
 * the public reference Java implementation.
 *
 * <p>Low-level use: feed the letters of ONE lower-case word with
 * {@link #add(char)} or {@link #add(char[], int)}, call {@link #stem()}, then
 * read the result with {@link #toString()} or
 * {@link #getResultBuffer()}/{@link #getResultLength()}.
 *
 * <p>High-level use: {@link #getStemmer(String)} stems every word of a text
 * and takes care of lower-casing.
 *
 * <p>Instances keep mutable state and are not safe for concurrent use.
 */
public class PorterStemmer
{
    /** Unit by which the word buffer grows. */
    private static final int INC = 50;

    /** Buffer holding the word being stemmed. */
    private char[] b;

    /** Number of characters added to {@link #b} for the current word. */
    private int i;

    /** Length of the stemmed word after {@link #stem()}. */
    private int i_end;

    /** General offset into {@link #b}; marks the end of the stem while a suffix is examined. */
    private int j;

    /** Index of the last character of the word as it is being shortened. */
    private int k;

    public PorterStemmer()
    {
        b = new char[INC];
        i = 0;
        i_end = 0;
    }

    /**
     * Appends one character to the word being built.
     *
     * @param ch the (lower-case) character to append
     */
    public void add(char ch)
    {
        if (i == b.length)
        {
            char[] grown = new char[i + INC];
            System.arraycopy(b, 0, grown, 0, i);
            b = grown;
        }
        b[i++] = ch;
    }

    /**
     * Appends the first {@code wLen} characters of {@code w} to the word being built.
     *
     * @param w    source characters (lower-case)
     * @param wLen number of characters to copy from the start of {@code w}
     */
    public void add(char[] w, int wLen)
    {
        if (i + wLen >= b.length)
        {
            char[] grown = new char[i + wLen + INC];
            System.arraycopy(b, 0, grown, 0, i);
            b = grown;
        }
        System.arraycopy(w, 0, b, i, wLen);
        i += wLen;
    }

    /** Returns the result of the last {@link #stem()} call as a string. */
    @Override
    public String toString() { return new String(b, 0, i_end); }

    /** Returns the length of the stemmed word held in {@link #getResultBuffer()}. */
    public int getResultLength() { return i_end; }

    /** Returns the internal buffer; only the first {@link #getResultLength()} characters are valid. */
    public char[] getResultBuffer() { return b; }

    /** Returns true iff {@code b[pos]} is a consonant ('y' counts as one at position 0 or after a vowel). */
    private boolean cons(int pos)
    {
        switch (b[pos])
        {
            case 'a': case 'e': case 'i': case 'o': case 'u':
                return false;
            case 'y':
                return (pos == 0) ? true : !cons(pos - 1);
            default:
                return true;
        }
    }

    /**
     * Measures the number of vowel-consonant sequences in {@code b[0..j]}:
     * with c = consonant run and v = vowel run, {@code [c](vc){m}[v]} gives m.
     */
    private int m()
    {
        int n = 0;
        int pos = 0;
        // Skip the optional leading consonant run.
        while (true)
        {
            if (pos > j) { return n; }
            if (!cons(pos)) { break; }
            pos++;
        }
        pos++;
        while (true)
        {
            // Consume a vowel run ...
            while (true)
            {
                if (pos > j) { return n; }
                if (cons(pos)) { break; }
                pos++;
            }
            pos++;
            n++;
            // ... followed by a consonant run: one more "vc" pair counted above.
            while (true)
            {
                if (pos > j) { return n; }
                if (!cons(pos)) { break; }
                pos++;
            }
            pos++;
        }
    }

    /** Returns true iff {@code b[0..j]} contains a vowel. */
    private boolean vowelinstem()
    {
        for (int pos = 0; pos <= j; pos++)
        {
            if (!cons(pos)) { return true; }
        }
        return false;
    }

    /** Returns true iff {@code b[pos-1..pos]} is a double consonant. */
    private boolean doublec(int pos)
    {
        if (pos < 1) { return false; }
        if (b[pos] != b[pos - 1]) { return false; }
        return cons(pos);
    }

    /**
     * Returns true iff {@code b[pos-2..pos]} is consonant-vowel-consonant and
     * the final consonant is not w, x or y. Used to restore a trailing 'e'
     * on short words, e.g. cav(e), lov(e), hop(e).
     */
    private boolean cvc(int pos)
    {
        if (pos < 2 || !cons(pos) || cons(pos - 1) || !cons(pos - 2)) { return false; }
        char ch = b[pos];
        return !(ch == 'w' || ch == 'x' || ch == 'y');
    }

    /** Returns true iff {@code b[0..k]} ends with {@code s}; on success sets {@code j} to the end of the stem. */
    private boolean ends(String s)
    {
        int l = s.length();
        int o = k - l + 1;
        if (o < 0) { return false; }
        for (int pos = 0; pos < l; pos++)
        {
            if (b[o + pos] != s.charAt(pos)) { return false; }
        }
        j = k - l;
        return true;
    }

    /** Replaces {@code b[j+1..k]} with {@code s} and readjusts {@code k}. */
    private void setto(String s)
    {
        int l = s.length();
        int o = j + 1;
        for (int pos = 0; pos < l; pos++)
        {
            b[o + pos] = s.charAt(pos);
        }
        k = j + l;
    }

    /** Applies {@link #setto(String)} only when the stem has measure greater than 0. */
    private void r(String s) { if (m() > 0) { setto(s); } }

    /** Removes plurals and -ed / -ing, e.g. caresses -> caress, ponies -> poni, hopping -> hop. */
    private void step1()
    {
        if (b[k] == 's')
        {
            if (ends("sses")) { k -= 2; }
            else if (ends("ies")) { setto("i"); }
            else if (b[k - 1] != 's') { k--; }
        }
        if (ends("eed"))
        {
            if (m() > 0) { k--; }
        }
        else if ((ends("ed") || ends("ing")) && vowelinstem())
        {
            k = j;
            if (ends("at")) { setto("ate"); }
            else if (ends("bl")) { setto("ble"); }
            else if (ends("iz")) { setto("ize"); }
            else if (doublec(k))
            {
                k--;
                char ch = b[k];
                // Keep the double letter for -ll, -ss, -zz (fall, hiss, fizz).
                if (ch == 'l' || ch == 's' || ch == 'z') { k++; }
            }
            else if (m() == 1 && cvc(k)) { setto("e"); }
        }
    }

    /** Turns a terminal 'y' into 'i' when the stem contains another vowel. */
    private void step2() { if (ends("y") && vowelinstem()) { b[k] = 'i'; } }

    /** Maps double suffixes to single ones (e.g. -ization -> -ize) when the stem measure is positive. */
    private void step3() { if (k == 0) { return; } switch (b[k - 1])
    {
        case 'a': if (ends("ational")) { r("ate"); break; }
                  if (ends("tional")) { r("tion"); break; }
                  break;
        case 'c': if (ends("enci")) { r("ence"); break; }
                  if (ends("anci")) { r("ance"); break; }
                  break;
        case 'e': if (ends("izer")) { r("ize"); break; }
                  break;
        case 'l': if (ends("bli")) { r("ble"); break; }
                  if (ends("alli")) { r("al"); break; }
                  if (ends("entli")) { r("ent"); break; }
                  if (ends("eli")) { r("e"); break; }
                  if (ends("ousli")) { r("ous"); break; }
                  break;
        case 'o': if (ends("ization")) { r("ize"); break; }
                  if (ends("ation")) { r("ate"); break; }
                  if (ends("ator")) { r("ate"); break; }
                  break;
        case 's': if (ends("alism")) { r("al"); break; }
                  if (ends("iveness")) { r("ive"); break; }
                  if (ends("fulness")) { r("ful"); break; }
                  if (ends("ousness")) { r("ous"); break; }
                  break;
        case 't': if (ends("aliti")) { r("al"); break; }
                  if (ends("iviti")) { r("ive"); break; }
                  if (ends("biliti")) { r("ble"); break; }
                  break;
        case 'g': if (ends("logi")) { r("log"); break; }
                  break;
    } }

    /** Handles -ic-, -full, -ness etc. with the same strategy as {@link #step3()}. */
    private void step4() { switch (b[k])
    {
        case 'e': if (ends("icate")) { r("ic"); break; }
                  if (ends("ative")) { r(""); break; }
                  if (ends("alize")) { r("al"); break; }
                  break;
        case 'i': if (ends("iciti")) { r("ic"); break; }
                  break;
        case 'l': if (ends("ical")) { r("ic"); break; }
                  if (ends("ful")) { r(""); break; }
                  break;
        case 's': if (ends("ness")) { r(""); break; }
                  break;
    } }

    /** Strips -ant, -ence etc. when the remaining stem has measure greater than 1. */
    private void step5()
    {
        if (k == 0) { return; } switch (b[k - 1])
        {
            case 'a': if (ends("al")) { break; } return;
            case 'c': if (ends("ance")) { break; }
                      if (ends("ence")) { break; } return;
            case 'e': if (ends("er")) { break; } return;
            case 'i': if (ends("ic")) { break; } return;
            case 'l': if (ends("able")) { break; }
                      if (ends("ible")) { break; } return;
            case 'n': if (ends("ant")) { break; }
                      if (ends("ement")) { break; }
                      if (ends("ment")) { break; }
                      // "element" etc. are not stripped before the m.
                      if (ends("ent")) { break; } return;
            case 'o': // j >= 0 guards the b[j] access when the whole word is "ion".
                      if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) { break; }
                      if (ends("ou")) { break; } return; // takes care of -ous
            case 's': if (ends("ism")) { break; } return;
            case 't': if (ends("ate")) { break; }
                      if (ends("iti")) { break; } return;
            case 'u': if (ends("ous")) { break; } return;
            case 'v': if (ends("ive")) { break; } return;
            case 'z': if (ends("ize")) { break; } return;
            default: return;
        }
        if (m() > 1) { k = j; }
    }

    /** Removes a final -e and reduces -ll to -l when the measure is greater than 1. */
    private void step6()
    {
        j = k;
        if (b[k] == 'e')
        {
            int a = m();
            if (a > 1 || a == 1 && !cvc(k - 1)) { k--; }
        }
        if (b[k] == 'l' && doublec(k) && m() > 1) { k--; }
    }

    /**
     * Stems the word accumulated through {@code add(...)}. Words of one or
     * two characters are left unchanged. Afterwards the buffer is ready to
     * receive the next word.
     */
    public void stem()
    {
        k = i - 1;
        if (k > 1) { step1(); step2(); step3(); step4(); step5(); step6(); }
        i_end = k + 1;
        i = 0;
    }

    /**
     * Stems every word of {@code originaltext}.
     *
     * <p>A word is a maximal run of letters ({@link Character#isLetter(char)});
     * it is lower-cased and stemmed. All other characters (spaces,
     * punctuation, digits) are copied to the result unchanged, so the layout
     * of the text is preserved.
     *
     * @param originaltext the text to normalise; must not be {@code null}
     * @return the text with each word replaced by its lower-case stem
     * @throws NullPointerException if {@code originaltext} is {@code null}
     */
    public String getStemmer(String originaltext)
    {
        if (originaltext == null)
        {
            throw new NullPointerException("originaltext must not be null");
        }
        StringBuilder stemtext = new StringBuilder(originaltext.length());
        i = 0; // discard any half-built word left over from direct add() calls
        for (int pos = 0; pos < originaltext.length(); pos++)
        {
            char ch = originaltext.charAt(pos);
            if (Character.isLetter(ch))
            {
                add(Character.toLowerCase(ch));
            }
            else
            {
                flushWord(stemtext);
                stemtext.append(ch);
            }
        }
        flushWord(stemtext);
        return stemtext.toString();
    }

    /** Stems the pending word, if any, and appends the stem to {@code out}. */
    private void flushWord(StringBuilder out)
    {
        if (i > 0)
        {
            stem();
            out.append(b, 0, i_end);
        }
    }

    /**
     * Small demonstration: prints the stemmed form of a few sample phrases.
     *
     * @param args ignored
     */
    public static void main(String[] args)
    {
        PorterStemmer s = new PorterStemmer();
        System.out.println(s.getStemmer("parallel computer"));
        System.out.println(s.getStemmer("parallel computing"));
        System.out.println(s.getStemmer("pens"));
        System.out.println(s.getStemmer("pen"));
    }
}