This is my first program for computing article similarity; it will be continuously optimized.
Steps:
1. Collect enough articles and split them into words; at the same time you can build a word_dict (ideally organized by category);
2. Merge these articles into a single document, one article per paragraph;
3. Compute the IDF of all words to get the idf_map;
4. Split the articles to be analyzed into words;
5. Compute the TF over the words of word_dict, not over the words of the articles to be analyzed: if a word in word_dict does not appear in the article, its TF value is marked 0; otherwise it is the word's actual count in the article. This gives the tf_map;
6. From the idf_map and the tf_map we can then get the tfidf_map;
7. Remove the entries whose values equal 0 in both vectors to reduce the amount of computation;
8. Use the cosine similarity formula to compute the similarity between the two articles.
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Attached test code
1. convert
#!/usr/bin/python
"""Merge every file of a directory into one stream, one line per file,
formatted as "<doc_index>\t<file contents joined by spaces>"."""
import os
import sys
import gzip  # NOTE(review): unused here; kept in case a later step needs it


def convert(src_dir):
    """Print each file of *src_dir* as a single tab-separated record.

    The doc index is the position of the file in os.listdir() order
    (OS-dependent, as in the original).
    """
    for index, name in enumerate(os.listdir(src_dir)):
        # 'with' guarantees the handle is closed (the original leaked
        # one open handle per file).
        with open(os.path.join(src_dir, name), 'r') as fh:
            parts = [line.strip() for line in fh]
        # Python 3 print() — the original used the Python 2 print
        # statement, inconsistent with the other scripts in this file.
        print('\t'.join([str(index), ' '.join(parts)]))


if __name__ == '__main__':
    convert(sys.argv[1])
2.idf score
#!/usr/bin/python
"""Reducer-style IDF scorer: reads "<word>\t<doc_count>" lines grouped by
word and prints "<word>\t<idf>" for every distinct word."""
import sys
import math

# Total number of documents in the corpus; numerator of the IDF formula.
DOCS_CNT = 524
INPUT_PATH = 'd:/share/testresult2'


def _idf_score(doc_freq, docs_cnt):
    """Smoothed inverse document frequency: log(N / (df + 1))."""
    return math.log(float(docs_cnt) / (float(doc_freq) + 1))


def compute_idf(lines, docs_cnt=DOCS_CNT):
    """Yield (word, idf) pairs from an iterable of "<word>\t<count>" lines.

    Lines must be grouped by word (as after a sort); malformed lines are
    skipped. Fixes over the original: no longer shadows the builtin `sum`,
    uses `is None` instead of `== None`, keeps a running total instead of
    a re-summed pool, and does not emit a bogus "None" record when the
    input is empty.
    """
    current_word = None
    total = 0
    for line in lines:
        ss = line.strip().split('\t')
        if len(ss) != 2:
            continue
        word, val = ss
        if current_word is None:
            current_word = word
        if current_word != word:
            # Word changed: flush the accumulated document frequency.
            yield current_word, _idf_score(total, docs_cnt)
            current_word = word
            total = 0
        if val != "":
            total += int(val)
    if current_word is not None:
        yield current_word, _idf_score(total, docs_cnt)


if __name__ == '__main__':
    # 'with' closes the input file; the original never closed it.
    with open(INPUT_PATH, 'r', encoding='UTF-8') as fh:
        for word, score in compute_idf(fh):
            print("%s\t%s" % (word, score))
3.word_dict
#!/usr/local/bin/python
"""Deduplicate the first column of "<word>\t<count>" records read from
stdin and print every distinct word once (unordered)."""
import sys

unique_words = set()
for raw in sys.stdin:
    fields = raw.strip().split('\t')
    # Skip anything that is not exactly "<word>\t<value>".
    if len(fields) != 2:
        continue
    unique_words.add(fields[0].strip())

for entry in unique_words:
    print(entry)
4.split
#!/usr/bin/python
"""Mapper: read "<doc_index>\t<text>" records from stdin and emit
"<word>\t1" once for every distinct word of the text."""
import sys

for raw in sys.stdin:
    parts = raw.strip().split('\t', 1)
    if len(parts) != 2:
        continue
    body = parts[1].strip()
    # A set collapses duplicates so each word of the doc is emitted once.
    for token in set(body.split(' ')):
        print('\t'.join([token, "1"]))
5.tfidf_cos
#!/usr/bin/python
import os
import sys
import gzip
import math
# Input locations (hard-coded for now).
file_input_fd = 'd:/share/testresult5'   # articles to be analyzed
idf_dict_fd = 'd:/share/testresult3'     # idf corpus
word_dict = 'd:/share/word_dict'         # bag of words (categorized from the vocabulary)


def get_file_handler(f):
    """Open *f* for reading as UTF-8 text and return the file object."""
    return open(f, 'r', encoding='UTF-8')


# Module-global maps shared with get_tfidf_map below.
idf_map = {}
whitelist_tf_map = {}
tfidf_map = {}
def get_tfidf_map(tf_map):
    """Build tfidf_map for one document from the shared module globals.

    Every word of the word_dict file gets a TF entry in whitelist_tf_map:
    the document's count when the word appears in *tf_map*, else 0. Words
    that also have a known IDF then get tf * idf stored in the
    (module-global, reused across calls) tfidf_map, which is returned.

    Fixes over the original: the inner scan over all of tf_map.items() to
    locate each word was O(len(word_dict) * len(tf_map)) — a dict lookup
    does it in O(1); the word_dict file handle was never closed.
    """
    with get_file_handler(word_dict) as wd_file:
        for wd in wd_file:
            yy = wd.strip()
            # Direct lookup replaces the original linear scan.
            whitelist_tf_map[yy] = tf_map.get(yy, 0)
    for w, tf in whitelist_tf_map.items():
        # Words without an IDF score are skipped entirely, as before.
        if w not in idf_map:
            continue
        tfidf_map[w] = tf * idf_map[w]
    return tfidf_map
# Load the idf corpus into idf_map: one "<word>\t<idf>" record per line;
# malformed lines are ignored.
for record in get_file_handler(idf_dict_fd):
    pieces = record.strip().split('\t')
    if len(pieces) == 2:
        idf_map[pieces[0].strip()] = float(pieces[1].strip())
# Build "<docid>:<word>:<tfidf>" strings for every analyzed document.
word_score_list = []
for doc_line in get_file_handler(file_input_fd):
    fields = doc_line.strip().split('\t')
    if len(fields) != 2:
        continue
    docid = fields[0].strip()
    context = fields[1].strip()
    # Term frequencies of the document's own words.
    tf_map = {}
    for t_word in context.strip().split(' '):
        tf_map[t_word] = tf_map.get(t_word, 0) + 1
    tfidf_result = get_tfidf_map(tf_map)
    # Keep at most the first 20000 entries, in dict iteration order.
    for wkey, score in list(tfidf_result.items())[:20000]:
        word_score_list.append(':'.join([docid, wkey, str(score)]))
# Step 7: drop word entries whose tfidf is 0.0 in BOTH documents.
# NOTE(review): this removes items from word_score_list while iterating
# it, which silently skips elements and can raise ValueError when the
# same record matches twice — it should build a new filtered list
# instead. Flagged here, not changed.
for a in word_score_list:
    ss = a.strip().split(':')
    c1 = int(ss[0].strip())  # doc id of record a
    c2 = ss[2].strip()       # tfidf score, still in string form
    c3 = ss[1].strip()       # word
    # if c1 == 0 and c2 == '0.0':
    for b in word_score_list:
        xx = b.strip().split(':')
        d1 = int(xx[0].strip())
        d2 = xx[2].strip()
        d3 = xx[1].strip()
        # Same word, one record per doc, and both scores exactly '0.0'.
        if c1 == 0 and d1 == 1 and c2 == '0.0' and d2 == '0.0' and c3 == d3:
            word_score_list.remove(a)
            word_score_list.remove(b)
# Print the two document vectors separately.
vector1 = []   # doc 0: "word:score" strings
vector11 = []  # doc 0: single-element [score-string] lists used below
vector2 = []   # doc 1: "word:score" strings
vector22 = []  # doc 1: single-element [score-string] lists used below
for t in word_score_list:
    ww = t.strip().split(':')
    w1 = int(ww[0].strip())
    if w1 == 0:
        vector1.append(':'.join([str(ww[1]), str(ww[2])]))
        vector11.append([str(ww[2])])
    else:
        vector2.append(':'.join([str(ww[1]), str(ww[2])]))
        vector22.append([str(ww[2])])
print('0' + '\t' + ','.join(vector1))
print(vector11)
print('1' + '\t' + ','.join(vector2))
print(vector22)
# Compute the cosine similarity of the two document vectors.
# NOTE(review): several problems flagged for a later fix:
#  - `sum` shadows the builtin;
#  - the nested loop pairs EVERY coordinate of one vector with EVERY
#    coordinate of the other instead of aligning coordinates by word, so
#    the dot product and the norms are not the cosine quantities;
#  - `eval` on file-derived strings is unsafe — float() would do;
#  - `int()` truncates the fractional tfidf scores (most become 0);
#  - the `+1` in the denominator biases the result — the plain cosine
#    formula has no +1.
sum = 0
s1 = 0
s2 = 0
for x in vector11:
    for y in vector22:
        xi = list(map(int, map(eval, x)))[0]
        yi = list(map(int, map(eval, y)))[0]
        sum += xi * yi
        s1 += xi * xi
        s2 += yi * yi
similarity = abs( sum / ( math.sqrt(s1) * math.sqrt(s2) +1))
print('this is the last result', str(similarity))