以前课程作业写的一个小程序,复习一下知识点……
VSM(vector space model,向量空间模型)http://en.wikipedia.org/wiki/Vector_space_model
本例中使用的TF-IDF权重模型公式为 $sim(q,d)=\sum_{w\in q\cap d}\frac{1+\ln\left(1+\ln c(w,d)\right)}{1-s+s\cdot\frac{|d|}{avdl}}\cdot c(w,q)\cdot\ln\frac{N+1}{df(w)}$,其中 $s$ 为长度归一化阈值(本程序取 0.2),$N$ 为文档总数,$avdl$ 为文档平均长度。
TFdoc表示文档中词项出现的次数,TFq表示查询中词项出现的次数,df(w)表示出现词项的文档数,df(w)越小,则对应词项的IDF区分度越高。
程序主要涉及到预处理语料,设计统计词频文档数的数据结构,计算相似度。
一段语料的例子:
19980101-01-001-007/m 1997年/t ,/w 是/v 中国/ns 发展/vn 历史/n 上/f 非常/d 重要/a 的/u 很/d 不/d 平凡/a 的/u 一/m 年/q 。/w 中国/ns 人民/n 决心/d 继承/v 邓/nr 小平/nr 同志/n 的/u 遗志/n ,/w 继续/v 把/p 建设/v 有/v 中国/ns 特色/n 社会主义/n 事业/n 推向/v 前进/v 。/w [中国/ns 政府/n]nt 顺利/ad 恢复/v 对/p 香港/ns 行使/v 主权/n ,/w 并/c 按照/p “/w 一国两制/j ”/w 、/w “/w 港人治港/l ”/w 、/w 高度/d 自治/v 的/u 方针/n 保持/v 香港/ns 的/u 繁荣/an 稳定/an 。/w [中国/ns 共产党/n]nt 成功/a 地/u 召开/v 了/u 第十五/m 次/q 全国/n 代表大会/n ,/w 高举/v 邓小平理论/n 伟大/a 旗帜/n ,/w 总结/v 百年/m 历史/n ,/w 展望/v 新/a 的/u 世纪/n ,/w 制定/v 了/u 中国/ns 跨/v 世纪/n 发展/v 的/u 行动/vn 纲领/n 。/w
最初我是用 Java 字符串的正则表达式处理的,主要的一行代码是
str = str.replaceAll("(/|]|\\[)[A-Za-z]{0,2}", "") + '\n';
整个java程序15行。后来学习了一点sed,该预处理可以只用一条命令完成sed -r 's/(\/|\[|\])[A-Za-z]{0,2}//g' 原始语料.txt > 原始语料.edited.txt
准备一个中文停用词表。
import java.util.*;
/**
 * Per-term corpus statistics: the term's total occurrence count over the whole
 * corpus plus a per-document occurrence count keyed by document key.
 */
public class TermInfo {
    /** Total occurrences of the term across all documents. */
    int totalCount = 0;
    /** Document key -> number of occurrences of the term in that document. */
    Map<String, Integer> inDocInfo = new HashMap<String, Integer>();

    /** Creates an empty record (zero total count, no per-document entries). */
    public TermInfo() {
    }
}
import java.util.*;
public class ArticleInfo {
int length;
Vector<String> termVec;
public ArticleInfo() {
length = 0;
termVec = new Vector<String>();
}
}
import java.util.*;
/**
 * Similarity scores relative to one query document: a map from document key to
 * similarity value, kept sorted by key.
 */
public class ArticleSim {
    /** Document key -> similarity score, sorted by key. */
    TreeMap<String, Double> dKeySim = new TreeMap<String, Double>();

    /** Creates an empty score map. */
    public ArticleSim() {
    }
}
计算相似度的主程序:
import java.io.*;
import java.util.*;
/**
 * Computes pairwise document similarity over a word-segmented Chinese corpus
 * using a TF-IDF vector-space model with pivoted document-length normalization:
 *
 *   sim(q,d) = sum over w in q∩d of
 *       (1 + ln(1 + ln c(w,d))) / (1 - s + s*|d|/avdl) * c(w,q) * ln((N+1)/df(w))
 *
 * Expects two files in the working directory: "ChineseStopword.txt" (one
 * stopword per line) and "Edited.txt" (one pre-processed document line per
 * paragraph, prefixed with a 15-character document key).
 */
public class Similarity extends ArticleInfo {
    // NOTE(review): extending ArticleInfo is not an is-a relation; kept only to
    // preserve the public type hierarchy for any external callers.

    private static final int ARTICLE_KEY_LENGTH = 15; // length of the document-key prefix
    private static final double THRESHOLD = 0.2; // pivoted length-normalization slope (s)
    private static double averageArticleLen; // average document length (avdl)
    private static Set<String> stopwordSet; // Chinese stopword set
    private static TreeMap<String, String> articleMap; // doc key -> full text
    private static TreeMap<String, TermInfo> termInfoMap; // term -> corpus statistics
    private static TreeMap<String, ArticleInfo> articleInfoMap; // doc key -> per-doc statistics
    private static TreeSet<String> termIntersection; // term intersection of the last (q, d) pair

    public static void main(String args[]) {
        setStopwordSet();
        Similarity s = new Similarity();
        articleMap = s.setArticleMap();
        termInfoMap = s.setTermInfoSet();
        articleInfoMap = s.setArticleInfoMap();

        long sTime = System.currentTimeMillis();
        int queryCount = 0;
        String qKey = articleMap.firstKey();
        // BUGFIX: guard against a corpus with fewer than 10 documents
        // (higherKey returns null past the last key).
        for (int i = 0; i < 10 && qKey != null; i++) {
            outputAnArticleSimArray(qKey);
            qKey = articleMap.higherKey(qKey);
            queryCount++;
        }
        outputAnArticleSimArray("19980101-03-012");
        queryCount++;
        long eTime = System.currentTimeMillis();
        // BUGFIX: the message used to claim "first 100 Articles" while only
        // 10 + 1 query documents are actually processed.
        System.out.println("Computing " + queryCount + " Articles' Similarity used "
                + (eTime - sTime) + "ms" + '\n');
    }

    /**
     * Loads "ChineseStopword.txt" (one stopword per line) into {@link #stopwordSet}.
     */
    public static void setStopwordSet() {
        long sReadTime = System.currentTimeMillis();
        stopwordSet = new HashSet<String>();
        // BUGFIX: try-with-resources so the reader is always closed (it leaked before).
        // NOTE(review): FileReader uses the platform default charset; the corpus is
        // presumably GBK or UTF-8 — confirm and pass an explicit charset if needed.
        try (BufferedReader bufferedIn = new BufferedReader(new FileReader(
                "ChineseStopword.txt"))) {
            String str;
            while ((str = bufferedIn.readLine()) != null) {
                stopwordSet.add(str);
            }
        } catch (IOException e) { // covers FileNotFoundException as well
            e.printStackTrace();
        }
        long eReadTime = System.currentTimeMillis();
        System.out.println("stopword count is " + stopwordSet.size());
        System.out
                .println("Reading the Chinese stopwords into the Set container used "
                        + (eReadTime - sReadTime) + "ms" + '\n');
    }

    /**
     * Reads "Edited.txt" and builds a sorted map from the 15-character document
     * key to the document's text; lines sharing a key are concatenated.
     *
     * @return sorted map of document key -> document text
     */
    public TreeMap<String, String> setArticleMap() {
        long sReadTime = System.currentTimeMillis();
        TreeMap<String, String> articles = new TreeMap<String, String>();
        // BUGFIX: try-with-resources so the reader is always closed (it leaked before).
        try (BufferedReader bufferedIn = new BufferedReader(new FileReader("Edited.txt"))) {
            String str;
            while ((str = bufferedIn.readLine()) != null) {
                // BUGFIX: also skip lines too short to hold key + paragraph id,
                // which previously threw StringIndexOutOfBoundsException.
                if (str.length() < ARTICLE_KEY_LENGTH + 4)
                    continue;
                String articleKey = str.substring(0, ARTICLE_KEY_LENGTH);
                // +4 skips the "-NNN" paragraph-number suffix after the key
                String articleContent = str.substring(ARTICLE_KEY_LENGTH + 4);
                String previous = articles.get(articleKey);
                articles.put(articleKey,
                        previous == null ? articleContent : previous + articleContent);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        long eReadTime = System.currentTimeMillis();
        System.out.println("Article count is " + articles.size());
        System.out.println("Building the articleMap used "
                + (eReadTime - sReadTime) + "ms" + '\n');
        return articles;
    }

    /**
     * Splits a document's text on runs of spaces into tokens.
     * Empty tokens (from leading/repeated spaces) must be skipped by the caller;
     * stopwords are NOT filtered here.
     *
     * BUGFIX: replaces the hand-rolled scanner that threw
     * StringIndexOutOfBoundsException when the text did not end with a space.
     */
    private static String[] tokenize(String content) {
        return content.split(" +");
    }

    /**
     * Scans every document and builds per-term statistics: total occurrence
     * count and per-document occurrence counts. Stopwords are skipped.
     *
     * @return sorted map of term -> {@link TermInfo}
     */
    public TreeMap<String, TermInfo> setTermInfoSet() {
        long sTime = System.currentTimeMillis();
        TreeMap<String, TermInfo> terms = new TreeMap<String, TermInfo>();
        for (Map.Entry<String, String> entry : articleMap.entrySet()) {
            String articleKey = entry.getKey();
            for (String term : tokenize(entry.getValue())) {
                if (term.isEmpty() || stopwordSet.contains(term))
                    continue; // skip separators and stopwords
                TermInfo info = terms.get(term);
                if (info == null) {
                    info = new TermInfo();
                    terms.put(term, info);
                }
                info.totalCount++;
                Integer inDoc = info.inDocInfo.get(articleKey);
                info.inDocInfo.put(articleKey, inDoc == null ? 1 : inDoc + 1);
            }
        }
        long eTime = System.currentTimeMillis();
        System.out.println("Term total count is " + terms.size());
        System.out.println("Building the termInfoMap used "
                + (eTime - sTime) + "ms" + '\n');
        return terms;
    }

    /**
     * Scans every document and builds per-document statistics: length (count of
     * non-space characters, stopwords included, mirroring the original measure)
     * and the stopword-filtered term list. Also computes {@link #averageArticleLen}.
     *
     * @return sorted map of document key -> {@link ArticleInfo}
     */
    public TreeMap<String, ArticleInfo> setArticleInfoMap() {
        long sTime = System.currentTimeMillis();
        TreeMap<String, ArticleInfo> articleInfos = new TreeMap<String, ArticleInfo>();
        long allArticleLength = 0;
        for (Map.Entry<String, String> entry : articleMap.entrySet()) {
            String articleKey = entry.getKey();
            ArticleInfo info = new ArticleInfo();
            for (String term : tokenize(entry.getValue())) {
                if (term.isEmpty())
                    continue;
                info.length += term.length(); // every non-space char counts, stopwords too
                if (!stopwordSet.contains(term))
                    info.termVec.add(term);
            }
            // BUGFIX: always register the document, even if it is all stopwords;
            // the original skipped such documents and then crashed with an NPE.
            articleInfos.put(articleKey, info);
            allArticleLength += info.length;
        }
        long eTime = System.currentTimeMillis();
        // BUGFIX: the original used integer division, truncating the average;
        // also guard against an empty corpus (division by zero).
        averageArticleLen = articleMap.isEmpty() ? 0
                : (double) allArticleLength / articleMap.size();
        System.out.println("Building the articleInfoMap used "
                + (eTime - sTime) + "ms" + '\n');
        return articleInfos;
    }

    /**
     * Computes the intersection of the term sets of documents q and d and
     * stores it in {@link #termIntersection}. Exits the program on an unknown key.
     */
    public static void intersectTerm(String q, String d) {
        termIntersection = new TreeSet<String>();
        ArticleInfo qInfo = articleInfoMap.get(q);
        ArticleInfo dInfo = articleInfoMap.get(d);
        if (qInfo == null || dInfo == null) {
            System.out.println("Invalid article key.");
            System.exit(0); // kept: original behavior is a hard stop on bad keys
        }
        TreeSet<String> qTerms = new TreeSet<String>(qInfo.termVec);
        for (String term : dInfo.termVec) {
            if (qTerms.contains(term))
                termIntersection.add(term);
        }
    }

    /**
     * Computes sim(q,d) per the class formula, summing over the term
     * intersection of the two documents.
     *
     * @param qKey query-document key
     * @param dKey target-document key
     * @return the (unbounded, non-negative) similarity score
     */
    public static double computeSimilarity(String qKey, String dKey) {
        double sim = 0;
        intersectTerm(qKey, dKey);
        // pivoted document-length normalization factor, loop-invariant so hoisted
        double docNorm = 1 - THRESHOLD + THRESHOLD
                * articleInfoMap.get(dKey).length / averageArticleLen;
        int articleCount = articleMap.size();
        for (String commonTerm : termIntersection) {
            TermInfo info = termInfoMap.get(commonTerm);
            int cwd = info.inDocInfo.get(dKey); // occurrences of term in d
            int cwq = info.inDocInfo.get(qKey); // occurrences of term in q
            int dfw = info.inDocInfo.size(); // document frequency of term
            // BUGFIX: (N + 1) / dfw was integer division, truncating the IDF factor.
            sim += (1 + Math.log(1 + Math.log(cwd))) / docNorm * cwq
                    * Math.log((articleCount + 1.0) / dfw);
        }
        return sim;
    }

    /**
     * Prints the similarity of the given document against every document in the
     * corpus, one "q->d = sim" line each (0.0 against itself).
     *
     * Simplified: the original built a throwaway TreeMap of Vectors of
     * ArticleSim objects only to re-read it immediately for printing.
     */
    public static void outputAnArticleSimArray(String articleKey) {
        for (String dKey : articleMap.keySet()) {
            double sim = dKey.equals(articleKey) ? 0
                    : computeSimilarity(articleKey, dKey);
            System.out.println(articleKey + "->" + dKey + " = " + sim);
        }
    }
}