以前课程作业写的一个小程序,复习一下知识点……
VSM(vector space model,向量空间模型)http://en.wikipedia.org/wiki/Vector_space_model
本例中使用的TF-IDF权重模型公式为 $sim(q,d)=\sum_{w\in q\cap d}\frac{1+\ln\left(1+\ln c(w,d)\right)}{1-s+s\cdot\frac{|d|}{avdl}}\cdot c(w,q)\cdot\ln\frac{N+1}{df(w)}$,其中 $s$ 为长度归一化阈值(本程序取 0.2),$N$ 为文档总数,$avdl$ 为文档平均长度。
TFdoc表示文档中词项出现的次数,TFq表示查询中词项出现的次数,df(w)表示出现词项的文档数,df(w)越小,则对应词项的IDF区分度越高。
程序主要涉及到预处理语料,设计统计词频文档数的数据结构,计算相似度。
一段语料的例子:
19980101-01-001-007/m 1997年/t ,/w 是/v 中国/ns 发展/vn 历史/n 上/f 非常/d 重要/a 的/u 很/d 不/d 平凡/a 的/u 一/m 年/q 。/w 中国/ns 人民/n 决心/d 继承/v 邓/nr 小平/nr 同志/n 的/u 遗志/n ,/w 继续/v 把/p 建设/v 有/v 中国/ns 特色/n 社会主义/n 事业/n 推向/v 前进/v 。/w [中国/ns 政府/n]nt 顺利/ad 恢复/v 对/p 香港/ns 行使/v 主权/n ,/w 并/c 按照/p “/w 一国两制/j ”/w 、/w “/w 港人治港/l ”/w 、/w 高度/d 自治/v 的/u 方针/n 保持/v 香港/ns 的/u 繁荣/an 稳定/an 。/w [中国/ns 共产党/n]nt 成功/a 地/u 召开/v 了/u 第十五/m 次/q 全国/n 代表大会/n ,/w 高举/v 邓小平理论/n 伟大/a 旗帜/n ,/w 总结/v 百年/m 历史/n ,/w 展望/v 新/a 的/u 世纪/n ,/w 制定/v 了/u 中国/ns 跨/v 世纪/n 发展/v 的/u 行动/vn 纲领/n 。/w
最初我是用 Java 字符串的正则表达式处理的,主要的一行代码是
str = str.replaceAll("(/|]|\\[)[A-Za-z]{0,2}", "") + '\n';
整个java程序15行。后来学习了一点sed,该预处理可以只用一条命令完成sed -r 's/(\/|\[|\])[A-Za-z]{0,2}//g' 原始语料.txt > 原始语料.edited.txt
准备一个中文停用词表。
import java.util.*;
/**
 * Per-term corpus statistics: the term's total occurrence count over the whole
 * corpus plus a per-document occurrence count keyed by document key.
 */
public class TermInfo {
    /** Total occurrences of the term across all documents. */
    int totalCount = 0;
    /** Document key -> number of occurrences of the term in that document. */
    Map<String, Integer> inDocInfo = new HashMap<String, Integer>();

    /** Creates an empty record (zero total count, no per-document entries). */
    public TermInfo() {
    }
}
import java.util.*;
public class ArticleInfo {
int length;
Vector<String> termVec;
public ArticleInfo() {
length = 0;
termVec = new Vector<String>();
}
}
import java.util.*;
/**
 * Similarity scores relative to one query document: a map from document key to
 * similarity value, kept sorted by key.
 */
public class ArticleSim {
    /** Document key -> similarity score, sorted by key. */
    TreeMap<String, Double> dKeySim = new TreeMap<String, Double>();

    /** Creates an empty score map. */
    public ArticleSim() {
    }
}
计算相似度的主程序:
import java.io.*;
import java.util.*;
/**
 * Computes pairwise document similarity over a word-segmented Chinese corpus
 * using a TF-IDF vector-space model with pivoted document-length normalization:
 *
 *   sim(q,d) = sum over w in q∩d of
 *       (1 + ln(1 + ln c(w,d))) / (1 - s + s*|d|/avdl) * c(w,q) * ln((N+1)/df(w))
 *
 * Expects two files in the working directory: "ChineseStopword.txt" (one
 * stopword per line) and "Edited.txt" (one pre-processed document line per
 * paragraph, prefixed with a 15-character document key).
 */
public class Similarity extends ArticleInfo {
    // NOTE(review): extending ArticleInfo is not an is-a relation; kept only to
    // preserve the public type hierarchy for any external callers.

    private static final int ARTICLE_KEY_LENGTH = 15; // length of the document-key prefix
    private static final double THRESHOLD = 0.2; // pivoted length-normalization slope (s)
    private static double averageArticleLen; // average document length (avdl)
    private static Set<String> stopwordSet; // Chinese stopword set
    private static TreeMap<String, String> articleMap; // doc key -> full text
    private static TreeMap<String, TermInfo> termInfoMap; // term -> corpus statistics
    private static TreeMap<String, ArticleInfo> articleInfoMap; // doc key -> per-doc statistics
    private static TreeSet<String> termIntersection; // term intersection of the last (q, d) pair

    public static void main(String args[]) {
        setStopwordSet();
        Similarity s = new Similarity();
        articleMap = s.setArticleMap();
        termInfoMap = s.setTermInfoSet();
        articleInfoMap = s.setArticleInfoMap();

        long sTime = System.currentTimeMillis();
        int queryCount = 0;
        String qKey = articleMap.firstKey();
        // BUGFIX: guard against a corpus with fewer than 10 documents
        // (higherKey returns null past the last key).
        for (int i = 0; i < 10 && qKey != null; i++) {
            outputAnArticleSimArray(qKey);
            qKey = articleMap.higherKey(qKey);
            queryCount++;
        }
        outputAnArticleSimArray("19980101-03-012");
        queryCount++;
        long eTime = System.currentTimeMillis();
        // BUGFIX: the message used to claim "first 100 Articles" while only
        // 10 + 1 query documents are actually processed.
        System.out.println("Computing " + queryCount + " Articles' Similarity used "
                + (eTime - sTime) + "ms" + '\n');
    }

    /**
     * Loads "ChineseStopword.txt" (one stopword per line) into {@link #stopwordSet}.
     */
    public static void setStopwordSet() {
        long sReadTime = System.currentTimeMillis();
        stopwordSet = new HashSet<String>();
        // BUGFIX: try-with-resources so the reader is always closed (it leaked before).
        // NOTE(review): FileReader uses the platform default charset; the corpus is
        // presumably GBK or UTF-8 — confirm and pass an explicit charset if needed.
        try (BufferedReader bufferedIn = new BufferedReader(new FileReader(
                "ChineseStopword.txt"))) {
            String str;
            while ((str = bufferedIn.readLine()) != null) {
                stopwordSet.add(str);
            }
        } catch (IOException e) { // covers FileNotFoundException as well
            e.printStackTrace();
        }
        long eReadTime = System.currentTimeMillis();
        System.out.println("stopword count is " + stopwordSet.size());
        System.out
                .println("Reading the Chinese stopwords into the Set container used "
                        + (eReadTime - sReadTime) + "ms" + '\n');
    }

    /**
     * Reads "Edited.txt" and builds a sorted map from the 15-character document
     * key to the document's text; lines sharing a key are concatenated.
     *
     * @return sorted map of document key -> document text
     */
    public TreeMap<String, String> setArticleMap() {
        long sReadTime = System.currentTimeMillis();
        TreeMap<String, String> articles = new TreeMap<String, String>();
        // BUGFIX: try-with-resources so the reader is always closed (it leaked before).
        try (BufferedReader bufferedIn = new BufferedReader(new FileReader("Edited.txt"))) {
            String str;
            while ((str = bufferedIn.readLine()) != null) {
                // BUGFIX: also skip lines too short to hold key + paragraph id,
                // which previously threw StringIndexOutOfBoundsException.
                if (str.length() < ARTICLE_KEY_LENGTH + 4)
                    continue;
                String articleKey = str.substring(0, ARTICLE_KEY_LENGTH);
                // +4 skips the "-NNN" paragraph-number suffix after the key
                String articleContent = str.substring(ARTICLE_KEY_LENGTH + 4);
                String previous = articles.get(articleKey);
                articles.put(articleKey,
                        previous == null ? articleContent : previous + articleContent);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        long eReadTime = System.currentTimeMillis();
        System.out.println("Article count is " + articles.size());
        System.out.println("Building the articleMap used "
                + (eReadTime - sReadTime) + "ms" + '\n');
        return articles;
    }

    /**
     * Splits a document's text on runs of spaces into tokens.
     * Empty tokens (from leading/repeated spaces) must be skipped by the caller;
     * stopwords are NOT filtered here.
     *
     * BUGFIX: replaces the hand-rolled scanner that threw
     * StringIndexOutOfBoundsException when the text did not end with a space.
     */
    private static String[] tokenize(String content) {
        return content.split(" +");
    }

    /**
     * Scans every document and builds per-term statistics: total occurrence
     * count and per-document occurrence counts. Stopwords are skipped.
     *
     * @return sorted map of term -> {@link TermInfo}
     */
    public TreeMap<String, TermInfo> setTermInfoSet() {
        long sTime = System.currentTimeMillis();
        TreeMap<String, TermInfo> terms = new TreeMap<String, TermInfo>();
        for (Map.Entry<String, String> entry : articleMap.entrySet()) {
            String articleKey = entry.getKey();
            for (String term : tokenize(entry.getValue())) {
                if (term.isEmpty() || stopwordSet.contains(term))
                    continue; // skip separators and stopwords
                TermInfo info = terms.get(term);
                if (info == null) {
                    info = new TermInfo();
                    terms.put(term, info);
                }
                info.totalCount++;
                Integer inDoc = info.inDocInfo.get(articleKey);
                info.inDocInfo.put(articleKey, inDoc == null ? 1 : inDoc + 1);
            }
        }
        long eTime = System.currentTimeMillis();
        System.out.println("Term total count is " + terms.size());
        System.out.println("Building the termInfoMap used "
                + (eTime - sTime) + "ms" + '\n');
        return terms;
    }

    /**
     * Scans every document and builds per-document statistics: length (count of
     * non-space characters, stopwords included, mirroring the original measure)
     * and the stopword-filtered term list. Also computes {@link #averageArticleLen}.
     *
     * @return sorted map of document key -> {@link ArticleInfo}
     */
    public TreeMap<String, ArticleInfo> setArticleInfoMap() {
        long sTime = System.currentTimeMillis();
        TreeMap<String, ArticleInfo> articleInfos = new TreeMap<String, ArticleInfo>();
        long allArticleLength = 0;
        for (Map.Entry<String, String> entry : articleMap.entrySet()) {
            String articleKey = entry.getKey();
            ArticleInfo info = new ArticleInfo();
            for (String term : tokenize(entry.getValue())) {
                if (term.isEmpty())
                    continue;
                info.length += term.length(); // every non-space char counts, stopwords too
                if (!stopwordSet.contains(term))
                    info.termVec.add(term);
            }
            // BUGFIX: always register the document, even if it is all stopwords;
            // the original skipped such documents and then crashed with an NPE.
            articleInfos.put(articleKey, info);
            allArticleLength += info.length;
        }
        long eTime = System.currentTimeMillis();
        // BUGFIX: the original used integer division, truncating the average;
        // also guard against an empty corpus (division by zero).
        averageArticleLen = articleMap.isEmpty() ? 0
                : (double) allArticleLength / articleMap.size();
        System.out.println("Building the articleInfoMap used "
                + (eTime - sTime) + "ms" + '\n');
        return articleInfos;
    }

    /**
     * Computes the intersection of the term sets of documents q and d and
     * stores it in {@link #termIntersection}. Exits the program on an unknown key.
     */
    public static void intersectTerm(String q, String d) {
        termIntersection = new TreeSet<String>();
        ArticleInfo qInfo = articleInfoMap.get(q);
        ArticleInfo dInfo = articleInfoMap.get(d);
        if (qInfo == null || dInfo == null) {
            System.out.println("Invalid article key.");
            System.exit(0); // kept: original behavior is a hard stop on bad keys
        }
        TreeSet<String> qTerms = new TreeSet<String>(qInfo.termVec);
        for (String term : dInfo.termVec) {
            if (qTerms.contains(term))
                termIntersection.add(term);
        }
    }

    /**
     * Computes sim(q,d) per the class formula, summing over the term
     * intersection of the two documents.
     *
     * @param qKey query-document key
     * @param dKey target-document key
     * @return the (unbounded, non-negative) similarity score
     */
    public static double computeSimilarity(String qKey, String dKey) {
        double sim = 0;
        intersectTerm(qKey, dKey);
        // pivoted document-length normalization factor, loop-invariant so hoisted
        double docNorm = 1 - THRESHOLD + THRESHOLD
                * articleInfoMap.get(dKey).length / averageArticleLen;
        int articleCount = articleMap.size();
        for (String commonTerm : termIntersection) {
            TermInfo info = termInfoMap.get(commonTerm);
            int cwd = info.inDocInfo.get(dKey); // occurrences of term in d
            int cwq = info.inDocInfo.get(qKey); // occurrences of term in q
            int dfw = info.inDocInfo.size(); // document frequency of term
            // BUGFIX: (N + 1) / dfw was integer division, truncating the IDF factor.
            sim += (1 + Math.log(1 + Math.log(cwd))) / docNorm * cwq
                    * Math.log((articleCount + 1.0) / dfw);
        }
        return sim;
    }

    /**
     * Prints the similarity of the given document against every document in the
     * corpus, one "q->d = sim" line each (0.0 against itself).
     *
     * Simplified: the original built a throwaway TreeMap of Vectors of
     * ArticleSim objects only to re-read it immediately for printing.
     */
    public static void outputAnArticleSimArray(String articleKey) {
        for (String dKey : articleMap.keySet()) {
            double sim = dKey.equals(articleKey) ? 0
                    : computeSimilarity(articleKey, dKey);
            System.out.println(articleKey + "->" + dKey + " = " + sim);
        }
    }
}