2021.3.25项目阶段报告

最新推荐文章于 2022-07-19 17:23:27 发布

梦平

最新推荐文章于 2022-07-19 17:23:27 发布

阅读量140

点赞数

分类专栏：基于文本分析的批量代码语法检测和实验报告查重系统

本文链接：https://blog.csdn.net/qq_43533083/article/details/115226853

版权

TF-IDF 余弦相似度文本分析算法实现相似度计算

关键词由CSDN通过智能技术生成

基于文本分析的批量代码语法检测和实验报告查重系统专栏收录该内容

22 篇文章 3 订阅

订阅专栏

提要

最大的收获就是实现了自己的TFIDF，这样的情况下，进行文本相似度的分析就更加好了，本篇博客主要内容会包括
- 自己的TFIDF算法的实现
- 一个使用自己TFIDF的余弦相似度算法的实现

本周总结

1、TFIDF算法的实现，主要思路就是根据tf-idf算法的内容进行程序的设计，然后提供必要的接口方便别的程序的调用。
这部分代码借鉴了知乎中的一篇文章，给出地址：
https://zhuanlan.zhihu.com/p/26766008
详细的实现方式可以借鉴上述文章，不过他的是英文版，我的是中文版。

import math
from collections import Counter

import jieba
#标点符合去掉
excludes = {'，','。','/','《','》','？','；','‘','：','“','【','】','{','}',
            '、','|','！','@','#','￥','%','……','&','*','（','）','-','=',
            '——','+','·','~','”',
            ',','.','/','<','>','?',';','\'',':','"','[',']','{','}','\\','|',
            '~','`','!','@','#','$','%','^','&','*','(',')','_','-','+','=',
            ' ','\n'}

# tfidf 算法
def tf(word,count):
    return count[word]/sum(count.values())

def n_containing(word,count_list):
    return sum(1 for count in count_list if word in count)

def idf(word,count_list):
    return math.log10(len(count_list)/(1+n_containing(word,count_list)))

def tfidf(word,count,count_list):
    return tf(word,count)*idf(word,count_list)

def getThetxt(str):
    d1 = open(str,encoding='utf-8').read()

    for word in excludes:
        d1 = d1.replace(word,'')
    #print(d1)
    return d1

#数据处理，将字符串分词
def dataprocess(data):
    sentence_list = [word for word in jieba.cut(data)]
    count = Counter(sentence_list)
    return count

#计算数据库中第location+1的TFIDF
# location 是int database 是list [获得字符串,去掉标点，尚未分词]
def getTheTFIDF(location,database):

    if location>=len(database):
        print("location is wrong!")
        return ;

    wordlist = []
    for data in database :
        wordlist.append(dataprocess(data))

    for i,count in enumerate(wordlist):

        if i==location:
            print("Top words in document{}:".format(i + 1))
            # 计算每个词的tfidf
            #   出现负值，原因 idf是 负的，所有文档中都有，所以应该挑选所有大于0的

            # scores = {word: tfidf(word, count, wordlist) for word in count}
            scores = {}
            for word in count:
                width = tfidf(word, count, wordlist)
                if width > 0:
                    scores[word] = width
                else:
                    scores[word] = 0

            sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
            for word, score in sorted_words[:10]:
                print('\tWord:{},TF-IDF:{}'.format(word, round(score, 5)))
            return sorted_words;

2、通过上述算法实现自己的TFIDF-cosine,但是运行结果不尽人意，因为自己的库太小了，所以导致idf的计算不合适。基本的思路和之前的余弦相似度类似。

import math

from tfidfOfM import getTheTFIDF

excludes = {'，','。','/','《','》','？','；','‘','：','“','【','】','{','}',
            '、','|','！','@','#','￥','%','……','&','*','（','）','-','=',
            '——','+','·','~','”',
            ',','.','/','<','>','?',';','\'',':','"','[',']','{','}','\\','|',
            '~','`','!','@','#','$','%','^','&','*','(',')','_','-','+','=',
            ' ','\n'}

def getThetxt(str):
    d1 = open(str,encoding='utf-8').read()

    for word in excludes:
        d1 = d1.replace(word,'')
    
    return d1

def compute_cosine(dic_f, dic_s):

#   print("dic_f:{}".format(dic_f))
#   print("dic_s:{}".format(dic_s))

    first_dict = {}
    for word in dic_f:
            first_dict[word[0]] = word[1]

    second_dict = {}
    for word in dic_s:
            second_dict[word[0]] = word[1]
    #得到词向量（两者的所有的）
    keyWords = []

    for i in range(len(dic_f)):
        keyWords.append(dic_f[i][0])    #添加数组（都是字）

    for i in range(len(dic_s)):
        if dic_s[i][0] in keyWords: #有的话啥也不干
            pass
        else:                       #没有就加入
            keyWords.append(dic_s[i][0])
    vect_f = []
    vect_s = []

    for word in keyWords:
        if word in first_dict:
            vect_f.append(first_dict[word])
        else:
            vect_f.append(0)
        if word in second_dict:
            vect_s.append(second_dict[word])
        else:
            vect_s.append(0)
    print("vect_f:{}".format(vect_f))
    print("vect_s:{}".format(vect_s))

    #开始计算余弦相似度
    sum = 0
    sq1 = 0
    sq2 = 0
    for i in range(len(vect_f)):
        sum += vect_f[i] * vect_s[i]
        sq1 += pow(vect_f[i],2)
        sq2 += pow(vect_s[i],2)
    try:    #round() 四舍五入,结果保留两位小数
        result = round(float(sum)/(math.sqrt(sq1)*math.sqrt(sq2)),2)
    except ZeroDivisionError:
        result = 0.0
    return result

if __name__ == '__main__':

    first = 'C:/Users/60917/Desktop/a.txt'
    second = 'C:/Users/60917/Desktop/b.txt'
    third = 'C:/Users/60917/Desktop/c.txt'
    forth = 'C:/Users/60917/Desktop/d.txt'
    # 获得字符串,去掉标点，尚未分词
    f = getThetxt(first)
    s = getThetxt(second)
    t = getThetxt(third)
    fo = getThetxt(forth)
    database = [f,s,t,fo ]
    Tfidf_list = []
    for i in range(len(database)):
        Tfidf_list.append(getTheTFIDF(i,database))

    print(compute_cosine(Tfidf_list[0],Tfidf_list[1]))
    print(compute_cosine(Tfidf_list[1],Tfidf_list[2]))