Similarity Computing based on TF-IDF and Cosine theorem

This is my first program for similarity computing; it will be continuously optimized.

Steps:

1. Gather enough articles and split them into words; at the same time you can get a word_dict (I think it's better with different categories);

2. Combine these articles into one article, with one paragraph per original article;

3.Compute the idf of all words, get the idf_map;

4.Split words of articles to be analyzed;

5. Compute the TF over word_dict, not over the words of the articles to be analyzed: if a word in word_dict does not appear in the article, its TF value is marked 0; otherwise it is the actual count in the article. Get the tf_map;

6.Through idf_map and tf_map, then we can get the tfidf_map;

7. Remove the entries whose values are 0 in both vectors to reduce the amount of computation;

8.Use Cosine theorem to compute the similarity between two articles.

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Attach test code

1. convert

#!/usr/bin/python

import os
import sys
import gzip


def get_file_handler(f):
    """Open *f* for reading and return the file object."""
    return open(f, 'r')


def doc_line(index, path):
    """Return 'index<TAB>body' for the file at *path*.

    The body is every line of the file, stripped and joined with single
    spaces, so each input file becomes one tab-separated record.
    """
    # "with" closes the handle even on error (the original leaked it).
    with get_file_handler(path) as file_fd:
        txt_list = [line.strip() for line in file_fd]
    return '\t'.join([str(index), ' '.join(txt_list)])


def main():
    """Emit one 'docid<TAB>text' line per file in the directory passed as argv[1]."""
    test_dir = sys.argv[1]
    # enumerate replaces the hand-maintained index counter.
    for index, name in enumerate(os.listdir(test_dir)):
        print(doc_line(index, os.path.join(test_dir, name)))


if __name__ == "__main__":
    main()

 

2.idf score

#!/usr/bin/python

import sys
import math

# Total number of documents in the corpus; IDF is log(N / (df + 1)).
DOCS_CNT = 524


def idf_score_for(doc_freq, docs_cnt=DOCS_CNT):
    """Return the IDF score for a word seen in *doc_freq* documents.

    The +1 in the denominator avoids division by zero for unseen words.
    """
    return math.log(float(docs_cnt) / (float(doc_freq) + 1))


def main():
    """Reduce sorted 'word<TAB>count' lines to one 'word<TAB>idf' line per word.

    Input must be grouped by word (map/reduce style); counts for the same
    word are summed, then converted to an IDF score.
    """
    current_word = None
    count_pool = []

    input_path = 'd:/share/testresult2'
    # "with" guarantees the file is closed (the original never closed it).
    with open(input_path, 'r', encoding='UTF-8') as fh:
        for line in fh:
            ss = line.strip().split('\t')
            if len(ss) != 2:
                continue

            word, val = ss

            if current_word is None:
                current_word = word

            if current_word != word:
                # Word changed: flush the accumulated counts.  sum() replaces
                # the manual loop that shadowed the builtin of the same name.
                print("%s\t%s" % (current_word, idf_score_for(sum(count_pool))))
                current_word = word
                count_pool = []

            if val != "":
                count_pool.append(int(val))

    # Flush the final group; skip it entirely on empty input (the original
    # printed a bogus "None" record in that case).
    if current_word is not None:
        print("%s\t%s" % (current_word, idf_score_for(sum(count_pool))))


if __name__ == "__main__":
    main()


3.word_dict

#!/usr/local/bin/python
import sys

def extract_word(line):
    """Return the first tab-separated field of *line*, stripped.

    Returns None when the line does not have exactly two fields, so
    malformed input is skipped by the caller.
    """
    ss = line.strip().split('\t')
    if len(ss) != 2:
        return None
    return ss[0].strip()


def main():
    """Print the set of unique words from 'word<TAB>value' lines on stdin."""
    word_set = set()
    for line in sys.stdin:
        word = extract_word(line)
        if word is not None:
            word_set.add(word)
    for word in word_set:
        print(word)


if __name__ == "__main__":
    main()


4.split

#!/usr/bin/python

import sys

def unique_words(line):
    """Return the set of unique words in the body of a 'docid<TAB>text' line.

    Returns None for malformed lines (not exactly two tab-separated fields).
    """
    # maxsplit=1 keeps tabs inside the document body intact.
    ss = line.strip().split('\t', 1)
    if len(ss) != 2:
        return None
    # set() replaces the manual add-loop; the doc id itself is unused here.
    return set(ss[1].strip().split(' '))


def main():
    """Emit 'word<TAB>1' once per unique word per document read from stdin."""
    for line in sys.stdin:
        words = unique_words(line)
        if words is None:
            continue
        for word in words:
            print('\t'.join([word, "1"]))


if __name__ == "__main__":
    main()


5.tfidf_cos

#!/usr/bin/python

import os
import sys
import gzip
import math
# -- input files (hard-coded, as in the original pipeline) -------------------
file_input_fd = 'd:/share/testresult5'  # articles to be analysed
idf_dict_fd = 'd:/share/testresult3'    # IDF dictionary
word_dict = 'd:/share/word_dict'        # bag of words (built per category from the vocabulary)


def get_file_handler(f):
    """Open *f* as UTF-8 text for reading and return the file object."""
    return open(f, 'r', encoding='UTF-8')


def load_idf_map(path):
    """Read 'word<TAB>idf' lines from *path* into a {word: float} dict."""
    idf_map = {}
    with get_file_handler(path) as fh:
        for line in fh:
            ss = line.strip().split('\t')
            if len(ss) != 2:
                continue
            idf_map[ss[0].strip()] = float(ss[1].strip())
    return idf_map


def load_word_set(path):
    """Read one word per line from *path* into a set (the bag of words)."""
    with get_file_handler(path) as fh:
        return {line.strip() for line in fh}


def term_frequencies(context):
    """Count how often each space-separated word occurs in *context*."""
    tf_map = {}
    for word in context.strip().split(' '):
        tf_map[word] = tf_map.get(word, 0) + 1
    return tf_map


def get_tfidf_map(tf_map, idf_map, vocab):
    """Build {word: tf * idf} over the bag of words *vocab*.

    Words from the bag that are absent from the document get TF 0 (step 5
    of the algorithm); words with no IDF score are skipped.  A fresh dict
    is returned on every call, so documents no longer contaminate each
    other through shared module-level state (a bug in the original), and
    the lookup is O(1) via dict.get instead of a linear scan of tf_map.
    """
    tfidf_map = {}
    for word in vocab:
        if word not in idf_map:
            continue
        tfidf_map[word] = tf_map.get(word, 0) * idf_map[word]
    return tfidf_map


def cosine_similarity(vec1, vec2):
    """Cosine similarity of two sparse vectors given as {word: weight} dicts.

    Only keys present in both vectors contribute to the dot product, which
    also realises step 7 (entries that are 0 in either vector add nothing).
    The original multiplied every pair of components — (sum x)(sum y) is not
    a dot product — and truncated weights to int via eval(); both fixed.
    Returns 0.0 when either vector has zero norm.
    """
    dot = 0.0
    for word, w1 in vec1.items():
        w2 = vec2.get(word)
        if w2 is not None:
            dot += w1 * w2
    norm1 = math.sqrt(sum(w * w for w in vec1.values()))
    norm2 = math.sqrt(sum(w * w for w in vec2.values()))
    if norm1 == 0.0 or norm2 == 0.0:
        return 0.0
    return dot / (norm1 * norm2)


def main():
    """Print both document vectors and their cosine similarity."""
    idf_map = load_idf_map(idf_dict_fd)
    vocab = load_word_set(word_dict)

    # One tf-idf vector per document id read from the input file.
    doc_vectors = {}
    with get_file_handler(file_input_fd) as fh:
        for line in fh:
            ss = line.strip().split('\t')
            if len(ss) != 2:
                continue
            docid = ss[0].strip()
            context = ss[1].strip()
            doc_vectors[docid] = get_tfidf_map(term_frequencies(context), idf_map, vocab)

    # Print the vectors of the two articles, then their similarity.
    vector1 = doc_vectors.get('0', {})
    vector2 = doc_vectors.get('1', {})
    print('0' + '\t' + ','.join('%s:%s' % (w, s) for w, s in vector1.items()))
    print('1' + '\t' + ','.join('%s:%s' % (w, s) for w, s in vector2.items()))
    print('this is the last result', str(cosine_similarity(vector1, vector2)))


if __name__ == "__main__":
    main()

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

SeasonRun

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值