This is my first program for computing article similarity; it will be continuously optimized.
Steps:
1. Collect enough articles and split them into words; at the same time you can build a word_dict (ideally organized by category);
2. Merge these articles into a single document, one article per paragraph;
3. Compute the IDF of all words to get the idf_map;
4. Split the articles to be analyzed into words;
5. Compute the TF over the words of word_dict, not over the words of the articles to be analyzed: if a word in word_dict does not appear in the article, its TF value is marked 0; otherwise it is the word's actual count in the article. This gives the tf_map;
6. From the idf_map and the tf_map we can then get the tfidf_map;
7. Remove the entries whose values equal 0 in both vectors to reduce the amount of computation;
8. Use the cosine similarity formula to compute the similarity between the two articles.
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Attached test code
1. convert
#!/usr/bin/python
"""Merge every file of a directory into one stream, one line per file,
formatted as "<doc_index>\t<file contents joined by spaces>"."""
import os
import sys
import gzip  # NOTE(review): unused here; kept in case a later step needs it


def convert(src_dir):
    """Print each file of *src_dir* as a single tab-separated record.

    The doc index is the position of the file in os.listdir() order
    (OS-dependent, as in the original).
    """
    for index, name in enumerate(os.listdir(src_dir)):
        # 'with' guarantees the handle is closed (the original leaked
        # one open handle per file).
        with open(os.path.join(src_dir, name), 'r') as fh:
            parts = [line.strip() for line in fh]
        # Python 3 print() — the original used the Python 2 print
        # statement, inconsistent with the other scripts in this file.
        print('\t'.join([str(index), ' '.join(parts)]))


if __name__ == '__main__':
    convert(sys.argv[1])
2.idf score
#!/usr/bin/python
"""Reducer-style IDF scorer: reads "<word>\t<doc_count>" lines grouped by
word and prints "<word>\t<idf>" for every distinct word."""
import sys
import math

# Total number of documents in the corpus; numerator of the IDF formula.
DOCS_CNT = 524
INPUT_PATH = 'd:/share/testresult2'


def _idf_score(doc_freq, docs_cnt):
    """Smoothed inverse document frequency: log(N / (df + 1))."""
    return math.log(float(docs_cnt) / (float(doc_freq) + 1))


def compute_idf(lines, docs_cnt=DOCS_CNT):
    """Yield (word, idf) pairs from an iterable of "<word>\t<count>" lines.

    Lines must be grouped by word (as after a sort); malformed lines are
    skipped. Fixes over the original: no longer shadows the builtin `sum`,
    uses `is None` instead of `== None`, keeps a running total instead of
    a re-summed pool, and does not emit a bogus "None" record when the
    input is empty.
    """
    current_word = None
    total = 0
    for line in lines:
        ss = line.strip().split('\t')
        if len(ss) != 2:
            continue
        word, val = ss
        if current_word is None:
            current_word = word
        if current_word != word:
            # Word changed: flush the accumulated document frequency.
            yield current_word, _idf_score(total, docs_cnt)
            current_word = word
            total = 0
        if val != "":
            total += int(val)
    if current_word is not None:
        yield current_word, _idf_score(total, docs_cnt)


if __name__ == '__main__':
    # 'with' closes the input file; the original never closed it.
    with open(INPUT_PATH, 'r', encoding='UTF-8') as fh:
        for word, score in compute_idf(fh):
            print("%s\t%s" % (word, score))
3.word_dict
#!/usr/local/bin/python
"""Deduplicate the first column of "<word>\t<count>" records read from
stdin and print every distinct word once (unordered)."""
import sys

unique_words = set()
for raw in sys.stdin:
    fields = raw.strip().split('\t')
    # Skip anything that is not exactly "<word>\t<value>".
    if len(fields) != 2:
        continue
    unique_words.add(fields[0].strip())

for entry in unique_words:
    print(entry)
4.split
#!/usr/bin/python
"""Mapper: read "<doc_index>\t<text>" records from stdin and emit
"<word>\t1" once for every distinct word of the text."""
import sys

for raw in sys.stdin:
    parts = raw.strip().split('\t', 1)
    if len(parts) != 2:
        continue
    body = parts[1].strip()
    # A set collapses duplicates so each word of the doc is emitted once.
    for token in set(body.split(' ')):
        print('\t'.join([token, "1"]))
5.tfidf_cos
#!/usr/bin/python
import os
import sys
import gzip
import math
# Input locations (hard-coded for now).
file_input_fd = 'd:/share/testresult5'   # articles to be analyzed
idf_dict_fd = 'd:/share/testresult3'     # idf corpus
word_dict = 'd:/share/word_dict'         # bag of words (categorized from the vocabulary)


def get_file_handler(f):
    """Open *f* for reading as UTF-8 text and return the file object."""
    return open(f, 'r', encoding='UTF-8')


# Module-global maps shared with get_tfidf_map below.
idf_map = {}
whitelist_tf_map = {}
tfidf_map = {}
def get_tfidf_map(tf_map):
    """Build tfidf_map for one document from the shared module globals.

    Every word of the word_dict file gets a TF entry in whitelist_tf_map:
    the document's count when the word appears in *tf_map*, else 0. Words
    that also have a known IDF then get tf * idf stored in the
    (module-global, reused across calls) tfidf_map, which is returned.

    Fixes over the original: the inner scan over all of tf_map.items() to
    locate each word was O(len(word_dict) * len(tf_map)) — a dict lookup
    does it in O(1); the word_dict file handle was never closed.
    """
    with get_file_handler(word_dict) as wd_file:
        for wd in wd_file:
            yy = wd.strip()
            # Direct lookup replaces the original linear scan.
            whitelist_tf_map[yy] = tf_map.get(yy, 0)
    for w, tf in whitelist_tf_map.items():
        # Words without an IDF score are skipped entirely, as before.
        if w not in idf_map:
            continue
        tfidf_map[w] = tf * idf_map[w]
    return tfidf_map
# Load the idf corpus into idf_map: one "<word>\t<idf>" record per line;
# malformed lines are ignored.
for record in get_file_handler(idf_dict_fd):
    pieces = record.strip().split('\t')
    if len(pieces) == 2:
        idf_map[pieces[0].strip()] = float(pieces[1].strip())
# Build "<docid>:<word>:<tfidf>" strings for every analyzed document.
word_score_list = []
for doc_line in get_file_handler(file_input_fd):
    fields = doc_line.strip().split('\t')
    if len(fields) != 2:
        continue
    docid = fields[0].strip()
    context = fields[1].strip()
    # Term frequencies of the document's own words.
    tf_map = {}
    for t_word in context.strip().split(' '):
        tf_map[t_word] = tf_map.get(t_word, 0) + 1
    tfidf_result = get_tfidf_map(tf_map)
    # Keep at most the first 20000 entries, in dict iteration order.
    for wkey, score in list(tfidf_result.items())[:20000]:
        word_score_list.append(':'.join([docid, wkey, str(score)]))
# Step 7: drop word entries whose tfidf is 0.0 in BOTH documents.
# NOTE(review): this removes items from word_score_list while iterating
# it, which silently skips elements and can raise ValueError when the
# same record matches twice — it should build a new filtered list
# instead. Flagged here, not changed.
for a in word_score_list:
    ss = a.strip().split(':')
    c1 = int(ss[0].strip())  # doc id of record a
    c2 = ss[2].strip()       # tfidf score, still in string form
    c3 = ss[1].strip()       # word
    # if c1 == 0 and c2 == '0.0':
    for b in word_score_list:
        xx = b.strip().split(':')
        d1 = int(xx[0].strip())
        d2 = xx[2].strip()
        d3 = xx[1].strip()
        # Same word, one record per doc, and both scores exactly '0.0'.
        if c1 == 0 and d1 == 1 and c2 == '0.0' and d2 == '0.0' and c3 == d3:
            word_score_list.remove(a)
            word_score_list.remove(b)
# Print the two document vectors separately.
vector1 = []   # doc 0: "word:score" strings
vector11 = []  # doc 0: single-element [score-string] lists used below
vector2 = []   # doc 1: "word:score" strings
vector22 = []  # doc 1: single-element [score-string] lists used below
for t in word_score_list:
    ww = t.strip().split(':')
    w1 = int(ww[0].strip())
    if w1 == 0:
        vector1.append(':'.join([str(ww[1]), str(ww[2])]))
        vector11.append([str(ww[2])])
    else:
        vector2.append(':'.join([str(ww[1]), str(ww[2])]))
        vector22.append([str(ww[2])])
print('0' + '\t' + ','.join(vector1))
print(vector11)
print('1' + '\t' + ','.join(vector2))
print(vector22)
# Compute the cosine similarity of the two document vectors.
# NOTE(review): several problems flagged for a later fix:
#  - `sum` shadows the builtin;
#  - the nested loop pairs EVERY coordinate of one vector with EVERY
#    coordinate of the other instead of aligning coordinates by word, so
#    the dot product and the norms are not the cosine quantities;
#  - `eval` on file-derived strings is unsafe — float() would do;
#  - `int()` truncates the fractional tfidf scores (most become 0);
#  - the `+1` in the denominator biases the result — the plain cosine
#    formula has no +1.
sum = 0
s1 = 0
s2 = 0
for x in vector11:
    for y in vector22:
        xi = list(map(int, map(eval, x)))[0]
        yi = list(map(int, map(eval, y)))[0]
        sum += xi * yi
        s1 += xi * xi
        s2 += yi * yi
similarity = abs( sum / ( math.sqrt(s1) * math.sqrt(s2) +1))
print('this is the last result', str(similarity))