基于字符串的相似度度量,利用NGramL和余弦相似度来实现

1.主方法(被注释掉的代码多是为了测试以及说明方法的用法,打印函数也主要是为了调试)

package com.tylg.test1;
/**
 * 主方法
 * @author 蒋承材
 */
import java.io.FileInputStream;
import java.io.FileNotFoundException;

import org.apache.jena.ontology.DatatypeProperty;
import org.apache.jena.ontology.Individual;
import org.apache.jena.ontology.ObjectProperty;
import org.apache.jena.ontology.OntClass;
import org.apache.jena.ontology.OntModel;
import org.apache.jena.ontology.OntModelSpec;
import org.apache.jena.rdf.model.ModelFactory;
import org.apache.jena.util.iterator.ExtendedIterator;


public class Test {

	/** Gram size handed to the NGramL similarity (1, 2 or 3 select 1-gram, 2-gram, 3-gram). */
	private static final int NGRAM_SIZE = 3;

	/**
	 * Parses two ontology files, then computes and prints the NGramL and
	 * cosine similarity matrices for their classes, datatype properties and
	 * object properties.
	 *
	 * <p>Each matrix has one row per entity of the first ontology and one
	 * column per entity of the second ontology.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		OntologyParsing op = new OntologyParsing();
		String file101 = "D:\\eclipse\\testing\\101\\onto.rdf";
		String file201 = "D:\\eclipse\\testing\\201\\onto.rdf";

		// Parse every entity list exactly once and reuse it for both similarity measures.
		System.out.println("我们要解析的文件是:" + file101);
		String[] classes101 = parsingOfClassAboutSpecifiedFile(op, file101);
		String[] datatypeProperties101 = parsingOfDatatypeProperityAboutSpecifiedFile(op, file101);
		String[] objectProperties101 = parsingOfObjectProperityAboutSpecifiedFile(op, file101);

		System.out.println("我们要解析的文件是:" + file201);
		String[] classes201 = parsingOfClassAboutSpecifiedFile(op, file201);
		String[] datatypeProperties201 = parsingOfDatatypeProperityAboutSpecifiedFile(op, file201);
		String[] objectProperties201 = parsingOfObjectProperityAboutSpecifiedFile(op, file201);

		// NGramL similarity matrices (class / datatype property / object property).
		double[][] classesAboutNGramLMATRIX =
				VectorSpaceMatrix.vectorSpaceMatrixOfNGramL(classes101, classes201, NGRAM_SIZE);
		// Rows must be the datatype properties of the first ontology, not its classes.
		double[][] datatypeproperityAboutNGramLMATRIX =
				VectorSpaceMatrix.vectorSpaceMatrixOfNGramL(datatypeProperties101, datatypeProperties201, NGRAM_SIZE);
		double[][] objectproperityAboutNGramLMATRIX =
				VectorSpaceMatrix.vectorSpaceMatrixOfNGramL(objectProperties101, objectProperties201, NGRAM_SIZE);

		// Debug output of the NGramL matrices.
		System.out.println("\n*****************************\n*    关于NGramL的相似度矩阵         *\n*****************************");
		printMatrix("\n关于class的NGramL相似度矩阵如下:", classesAboutNGramLMATRIX);
		printMatrix("\n关于datatypeproperity的NGramL相似度矩阵如下:", datatypeproperityAboutNGramLMATRIX);
		printMatrix("\n关于objectproperity的NGramL相似度矩阵如下:", objectproperityAboutNGramLMATRIX);

		// Cosine similarity matrices (class / datatype property / object property).
		double[][] classesAboutCosSimilarMATRIX =
				VectorSpaceMatrix.vectorSpaceMatrixOfCosSimilar(classes101, classes201);
		// Rows must be the datatype properties of the first ontology, not its classes.
		double[][] datatypeproperityAboutCosSimilarMATRIX =
				VectorSpaceMatrix.vectorSpaceMatrixOfCosSimilar(datatypeProperties101, datatypeProperties201);
		double[][] objectproperityAboutCosSimilarMATRIX =
				VectorSpaceMatrix.vectorSpaceMatrixOfCosSimilar(objectProperties101, objectProperties201);

		// Debug output of the cosine matrices.
		System.out.println("\n\n********************************\n*    关于CosSimilar的相似度矩阵         *\n********************************");
		printMatrix("\n关于class的CosSimilar相似度矩阵如下:", classesAboutCosSimilarMATRIX);
		printMatrix("\n关于datatypeproperity的CosSimilar相似度矩阵如下:", datatypeproperityAboutCosSimilarMATRIX);
		printMatrix("\n关于objectproperity的CosSimilar相似度矩阵如下:", objectproperityAboutCosSimilarMATRIX);
	}

	/**
	 * Prints a title line followed by the matrix, one row per line, with every
	 * value followed by two spaces.
	 *
	 * @param title  line printed before the matrix
	 * @param matrix values to print
	 */
	private static void printMatrix(String title, double[][] matrix) {
		System.out.println(title);
		for (double[] row : matrix) {
			for (double value : row) {
				System.out.print(value + "  ");
			}
			System.out.println();
		}
	}

	/**
	 * Extracts the object property names of the ontology stored in the given file.
	 *
	 * @param op    parser used to extract the names
	 * @param file1 path of the ontology file
	 * @return the object property names as returned by {@code op.weGetObjectProperties}
	 */
	public static String[] parsingOfObjectProperityAboutSpecifiedFile(OntologyParsing op, String file1) {
		// Build the model once; the result array is already sized by the parser.
		return op.weGetObjectProperties(OntologyParsing.buildModel(file1));
	}

	/**
	 * Extracts the datatype property names of the ontology stored in the given file.
	 *
	 * @param op    parser used to extract the names
	 * @param file1 path of the ontology file
	 * @return the datatype property names as returned by {@code op.weGetDatatypeProperities}
	 */
	public static String[] parsingOfDatatypeProperityAboutSpecifiedFile(OntologyParsing op, String file1) {
		return op.weGetDatatypeProperities(OntologyParsing.buildModel(file1));
	}

	/**
	 * Extracts the class names of the ontology stored in the given file.
	 *
	 * @param op    parser used to extract the names
	 * @param file1 path of the ontology file
	 * @return the class names as returned by {@code op.weGetClasses}
	 */
	public static String[] parsingOfClassAboutSpecifiedFile(OntologyParsing op, String file1) {
		return op.weGetClasses(OntologyParsing.buildModel(file1));
	}

}

2.该类是为了提供关于余弦相似度和NGramL的计算,输入都为两个字符串数组,输出为二维数组(即相似度矩阵)

package com.tylg.test1;
/**
 * 计算相似度矩阵,传入的为两个字符串数组,返回值为相似度的double类型的值。
 * @author 蒋承材
 */
public class VectorSpaceMatrix {
	/**
	 * Builds the NGramL similarity matrix of two string arrays.
	 *
	 * @param s1 strings that index the rows of the result
	 * @param s2 strings that index the columns of the result
	 * @param n  gram size forwarded to {@code WaysOfSimilary.NGramL}
	 * @return a {@code s1.length x s2.length} matrix whose cell {@code [i][j]}
	 *         holds the NGramL similarity of {@code s1[i]} and {@code s2[j]}
	 */
	public static double [][] vectorSpaceMatrixOfNGramL(String []s1, String []s2, int n) {
		final int rows = s1.length;
		final int cols = s2.length;
		double[][] matrix = new double[rows][cols];
		for (int r = 0; r < rows; r++) {
			double[] row = matrix[r];
			for (int c = 0; c < cols; c++) {
				row[c] = WaysOfSimilary.NGramL(s1[r], s2[c], n);
			}
		}
		return matrix;
	}

	/**
	 * Builds the cosine similarity matrix of two string arrays.
	 *
	 * @param s1 strings that index the rows of the result
	 * @param s2 strings that index the columns of the result
	 * @return a {@code s1.length x s2.length} matrix whose cell {@code [i][j]}
	 *         holds the cosine similarity of {@code s1[i]} and {@code s2[j]}
	 */
	public static double[][] vectorSpaceMatrixOfCosSimilar(String []s1, String []s2) {
		final int rows = s1.length;
		final int cols = s2.length;
		double[][] matrix = new double[rows][cols];
		for (int r = 0; r < rows; r++) {
			double[] row = matrix[r];
			for (int c = 0; c < cols; c++) {
				row[c] = WaysOfSimilary.cosSimilar(s1[r], s2[c]);
			}
		}
		return matrix;
	}
}

3.对指定文件中的本体进行解析:第一个方法用于构建本体模型,后面的三个方法为解析方法,分别解析class、DatatypeProperty以及ObjectProperty。

package com.tylg.test1;
/**
 * 本体解析出来的东西放入各自数组当中
 */
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.List;

import org.apache.jena.ontology.DatatypeProperty;
import org.apache.jena.ontology.ObjectProperty;
import org.apache.jena.ontology.OntClass;
import org.apache.jena.ontology.OntModel;
import org.apache.jena.ontology.OntModelSpec;
import org.apache.jena.rdf.model.ModelFactory;
import org.apache.jena.util.iterator.ExtendedIterator;

public class OntologyParsing {
	/** Only entities whose URI starts with this prefix are extracted. */
	private static final String URI_PREFIX = "http://oaei";

	/**
	 * Reads the ontology file at the given path into a new in-memory OWL model.
	 *
	 * <p>If the file cannot be opened or closed, the stack trace is printed and
	 * the model is returned with whatever content was read so far (empty when
	 * the file does not exist).
	 *
	 * @param s path of the ontology file
	 * @return the ontology model, never {@code null}
	 */
	public static OntModel buildModel(String s) {
		OntModel ontModel = ModelFactory.createOntologyModel(OntModelSpec.OWL_MEM);
		// try-with-resources releases the file handle even when reading fails.
		try (FileInputStream in = new FileInputStream(s)) {
			ontModel.read(in, "");
		} catch (java.io.IOException e) {
			// Covers FileNotFoundException on open as well as failures on close.
			e.printStackTrace();
		}
		return ontModel;
	}

	/**
	 * Collects the names of the classes declared in the model.
	 *
	 * @param ontModel ontology model to inspect
	 * @return the part after the first {@code '#'} of every class URI that
	 *         starts with {@code "http://oaei"}, in iteration order
	 */
	public String[] weGetClasses(OntModel ontModel) {
		List<String> names = new ArrayList<>();
		ExtendedIterator<OntClass> it = ontModel.listClasses();
		while (it.hasNext()) {
			addLocalName(names, it.next().getURI());
		}
		return names.toArray(new String[0]);
	}

	/**
	 * Collects the names of the datatype properties declared in the model.
	 *
	 * @param ontModel ontology model to inspect
	 * @return the part after the first {@code '#'} of every datatype property
	 *         URI that starts with {@code "http://oaei"}, in iteration order
	 */
	public String[] weGetDatatypeProperities(OntModel ontModel) {
		List<String> names = new ArrayList<>();
		ExtendedIterator<DatatypeProperty> it = ontModel.listDatatypeProperties();
		while (it.hasNext()) {
			addLocalName(names, it.next().getURI());
		}
		return names.toArray(new String[0]);
	}

	/**
	 * Collects the names of the object properties declared in the model.
	 *
	 * @param ontModel ontology model to inspect
	 * @return the part after the first {@code '#'} of every object property
	 *         URI that starts with {@code "http://oaei"}, in iteration order
	 */
	public String[] weGetObjectProperties(OntModel ontModel) {
		List<String> names = new ArrayList<>();
		ExtendedIterator<ObjectProperty> it = ontModel.listObjectProperties();
		while (it.hasNext()) {
			addLocalName(names, it.next().getURI());
		}
		return names.toArray(new String[0]);
	}

	/**
	 * Appends the fragment of {@code uri} to {@code names} when the URI is
	 * non-null and starts with {@link #URI_PREFIX}; other URIs are ignored.
	 *
	 * @param names list that receives the extracted name
	 * @param uri   entity URI, may be {@code null} for anonymous entities
	 */
	private static void addLocalName(List<String> names, String uri) {
		if (uri == null || !uri.startsWith(URI_PREFIX)) {
			return;
		}
		int hash = uri.indexOf('#');
		// A URI without '#' has no fragment to cut out; keep the entity under its
		// full URI instead of failing with StringIndexOutOfBoundsException.
		names.add(hash < 0 ? uri : uri.substring(hash + 1));
	}

}

4. 以下的两个方法为计算两个字符串的NGramL和余弦相似度,返回的为【0, 1】区间上的一个小数(需要调用自定义的数学工具类中小数位数保留方法)

package com.tylg.test1;
/**
 * 相关的相似度计算方法
 * @author 蒋承材
 */
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import org.apache.lucene.search.spell.NGramDistance;

import com.tylg.math.MathTool;

public class WaysOfSimilary {

	/**
	 * Computes the n-gram similarity of two strings with Lucene's
	 * {@code NGramDistance} and rounds it with {@code MathTool.decimalType}.
	 *
	 * @param a first string
	 * @param b second string
	 * @param n gram size; 1, 2 and 3 select 1-gram, 2-gram and 3-gram
	 * @return the rounded similarity
	 */
	public static double NGramL(String a, String b, int n) {
		NGramDistance ng = new NGramDistance(n);
		return MathTool.decimalType(ng.getDistance(a, b));
	}

	/**
	 * Computes the unweighted cosine similarity of two strings, using the
	 * occurrence count of every character as the vector components, and rounds
	 * it with {@code MathTool.decimalType}.
	 *
	 * <p>When at least one string is empty the cosine is undefined (division by
	 * zero); in that case two empty strings are treated as fully similar
	 * ({@code 1.0}) and an empty string against a non-empty one as dissimilar
	 * ({@code 0.0}).
	 *
	 * @param str1 first string
	 * @param str2 second string
	 * @return the rounded similarity
	 */
	public static double cosSimilar(String str1, String str2) {
		// Guard the zero-length vector: 0/0 would yield NaN, which the rounding
		// helper cannot convert to a BigDecimal.
		if (str1.isEmpty() || str2.isEmpty()) {
			return str1.equals(str2) ? 1.0 : 0.0;
		}

		// Vector space: character -> {count in str1, count in str2}.
		Map<Character, int[]> vectorSpace = new HashMap<>();
		countCharacters(str1, 0, vectorSpace);
		countCharacters(str2, 1, vectorSpace);

		double vector1Modulo = 0.00; // squared length of vector 1
		double vector2Modulo = 0.00; // squared length of vector 2
		double vectorProduct = 0.00; // dot product
		for (int[] counts : vectorSpace.values()) {
			// Multiply as double so large counts cannot overflow int.
			vector1Modulo += (double) counts[0] * counts[0];
			vector2Modulo += (double) counts[1] * counts[1];
			vectorProduct += (double) counts[0] * counts[1];
		}

		vector1Modulo = Math.sqrt(vector1Modulo);
		vector2Modulo = Math.sqrt(vector2Modulo);

		return MathTool.decimalType(vectorProduct / (vector1Modulo * vector2Modulo));
	}

	/**
	 * Adds the character occurrences of {@code text} to component {@code slot}
	 * (0 or 1) of the per-character count arrays in {@code vectorSpace}.
	 */
	private static void countCharacters(String text, int slot, Map<Character, int[]> vectorSpace) {
		for (int i = 0; i < text.length(); i++) {
			Character key = text.charAt(i);
			int[] counts = vectorSpace.get(key);
			if (counts == null) {
				counts = new int[2];
				vectorSpace.put(key, counts);
			}
			counts[slot]++;
		}
	}
}

5. 该类是为了提供一些自定义的数学方法,decimalType就是保留小数位数的方法。

package com.tylg.math;

import java.math.BigDecimal;
import java.math.RoundingMode;

public class MathTool {
	/**
	 * Rounds a value to two decimal places, rounding ties away from zero
	 * (half-up).
	 *
	 * <p>The rounding is applied to the decimal representation of {@code x}
	 * (as produced by {@link BigDecimal#valueOf(double)}), so {@code 1.005}
	 * becomes {@code 1.01} instead of being rounded down because of its binary
	 * expansion. {@code NaN} and infinite values have no decimal
	 * representation and are returned unchanged.
	 *
	 * @param x value to round
	 * @return {@code x} rounded to two decimal places, or {@code x} itself when
	 *         it is {@code NaN} or infinite
	 */
	public static double decimalType(double x) {
		if (Double.isNaN(x) || Double.isInfinite(x)) {
			// BigDecimal cannot represent these values and would throw.
			return x;
		}
		return BigDecimal.valueOf(x).setScale(2, RoundingMode.HALF_UP).doubleValue();
	}
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值