1、项向量是一组由项-频率对组成的集合。
1)设文档中只包含cat和dog两个项,一共有2个文档。向量可以表示为图上一条有方向的线段,一个向量对应一个文档。因为只有2个项,所以是一个二维空间:cat为y轴,dog为x轴。向量从(0,0)出发,到(x,y)结束,其中x表示dog在该向量所代表的文档中出现的频率,y表示cat在该文档中出现的频率。
如果是3个文档,则有三个向量,这三个向量表示为3条从原点出发的直线,在第一象限。如果有3个项,5个文档,则表示为一个三维空间,空间内有5条向量,分别表示5个文档。
2)两个向量之间的夹角越小,这2个向量的特征就越相似,对应的2个文档也就越相似。
2、查找相似书
1)
public class BooksLikeThis{
public static void main()throws IOException{
String indexDir=System.getProperty("index.dir");
FSDirectory directory=FSDirectory.getDirectory(indexDir,false);
IndexReader reader=IndexReader.open(directory);
int numDocs=reader.maxDoc();
BooksLikeThis blt=new BooksLinkThis(reader);
for(int i=0;i<numDocs;i++){
System.out.println();
Document doc=reader.document(i);
System.out.println(doc.get("title"));
//查找与这本书类似的书,遍历每一本书
Document[] docs=blt.docsLike(i,10);
if (docs.length==0){
System.out.println(" None likethis");
}
for(int j=0;j<docs.length;j++){
Document likeThisDoc=docs[j];
System.out.println("->"+likeThisDoc.get("title"));
}
}
}
private IndexReaderreader;
private IndexSearchersearcher;
publicBooksLinkeThis(IndexReader reader){
this.reader=reader;
searcher=newIndexSearcher(reader);
}
public Document[]docsLike() throws IOException{
Document doc=reader.document(id);
//对作者相同的书进行因子增强,一本书可以有多个作者
String[] authors=doc.getValues("author");
BooleanQuery authorquery=new BooleanQuery();
for (int i=0;i<authors.length;i++){
String author=authors[i];
authorQuery.add(newTermQuery(new Term("author",author)),false,false);
}
authorQuery.setBoost(2.0f);
//使用项向量,项为subject,getTermFreqVector得到项的频率
TermFreqVector vector=reader.getTermFreqVector(id,"subject");
BooleanQuerysubjectQuery=new BooleanQuery();
for (int j=0;j<vector.size();j++){
TermQuerytq=new TermQuery(new Term("subject",vector.getTerms()[j]));
subjectQuery.add(tq,false,false);
}
//创造最终查询对象
BooleanQuery likeThisQuery=new BooleanQuery();
likeThisQuery.add(authorQuery,false,false);
likeThisQuery.add(subjectQuery,false,false);
likeThisQuery.add(newTermQuery(newTerm("isbn",dco.get("isbn"))),false,true);
Hitshits=searcher.search(likeThisQuery);
int size=max;
if (max>hits.length()) size=hits.length();
Document[]docs=new Document(size);
for(int i=0;i<size;i++){
docs[i]=hits.doc[i];
}
return docs;
}
}
2)按向量夹角计算分类:如果一本书的主题中包括extreme、agile、methodology,则这本书属于/technology/computers/programming/methodology分类。
/**
 * Checks that a subject made of the words "extreme", "agile" and
 * "methodology" is assigned to the programming-methodology category.
 *
 * @throws Exception if the category lookup fails
 */
public void testCategorization() throws Exception {
  // getCategory splits its argument on single spaces, so the three
  // subject words must be space-separated to be looked up individually.
  assertEquals("/technology/computers/programming/methodology",
      getCategory("extreme agile methodology"));
}
为每个类别建立向量
/**
 * Test fixture that builds one term-frequency vector per category before
 * each test runs.
 */
public class CategorizerTest extends TestCase {
  /**
   * Category name mapped to that category's vector; each vector is itself
   * a Map from subject term to its accumulated Integer frequency.
   */
  Map categoryMap;

  /**
   * Creates an empty category map and fills it from the index.
   *
   * @throws Exception if the base set-up or the index scan fails
   */
  protected void setUp() throws Exception {
    super.setUp();
    categoryMap = new TreeMap();
    buildCategoryVectors();
  }
}
/**
 * Scans every live document in the index and adds the term frequencies of
 * its "subject" field to the vector of the document's "category".
 *
 * @throws IOException if the index cannot be opened or read
 */
private void buildCategoryVectors() throws IOException {
  IndexReader reader = IndexReader.open(directory);
  try {
    int maxDoc = reader.maxDoc();
    for (int i = 0; i < maxDoc; i++) {
      if (reader.isDeleted(i)) {
        continue;
      }
      Document doc = reader.document(i);
      String category = doc.get("category");

      // Create the category's vector the first time the category is seen.
      Map vectorMap = (Map) categoryMap.get(category);
      if (vectorMap == null) {
        vectorMap = new TreeMap();
        categoryMap.put(category, vectorMap);
      }

      TermFreqVector termFreqVector = reader.getTermFreqVector(i, "subject");
      // No vector is available for this document/field; nothing to add.
      if (termFreqVector != null) {
        // Fold this document's term frequencies into the category vector.
        addTermFreqToMap(vectorMap, termFreqVector);
      }
    }
  } finally {
    // Release the reader even when reading a document fails.
    reader.close();
  }
}
private void addTermFreqToMap(Map vectorMap,TermFreqVectortermFreq){
String[] terms=termFreqVector.getTerms();
int[] freqs=termFreqVector.getTermFrequencies();
for (int i=0;i<term.length;i++){
Stringterm=terms[i];
if (vectorMap.contiansKey(term)){
Integer value=(Integer) vectorMap.get(term);
vectorMap.put(term,newInteger(value.intValue()+freqs[i]));
}
else {
vectorMap.put(term,new Integer(freq[i]));
}
}
}
得到新书与每个类别向量之间的夹角,找到最匹配的类别
/**
 * Picks the category whose vector forms the smallest angle with the given
 * subject.
 *
 * @param subject subject words separated by single spaces
 * @return the best matching category name, or {@code null} when no
 *         category yields an angle below {@code Double.MAX_VALUE}
 */
private String getCategory(String subject) {
  String[] words = subject.split(" ");
  Iterator categoryIterator = categoryMap.keySet().iterator();
  double bestAngle = Double.MAX_VALUE;
  String bestCategory = null;
  while (categoryIterator.hasNext()) {
    String category = (String) categoryIterator.next();
    double angle = computeAngle(words, category);
    // Strict comparison: on a tie the category met first is kept.
    if (angle < bestAngle) {
      bestAngle = angle;
      bestCategory = category;
    }
  }
  return bestCategory;
}
计算向量夹角
private double computAngle(String[] words,String category){
MapvectorMap=(Map) categoryMap.get(category);
intdtProduct=0;
intsumOfSquares=0;
for (inti=0;i<words.length;i++){
String word=words[i];
int categoryWrodFreq=0;
if (vectorMap.containsKey(word)){
categoryWordFreq=((Integer)vectorMap.get(word)).intValue();
}
doProduct+=categoryWordFreq;
sumOfSquares+=categoryWrodFreq*categoryWordFreq;
}
doubledenominator;
if(sumOfSquares==words.length){
denominator=sumOfSquares;
}else{
denominator=Math.sqrt(sumOfSquares)+Math.sqrt(words.length);
}
double ratio=dotProduct/denomiator;
returnMath.acos(ratio);
}