Data Mining Notes - Finding Similar Articles - Python

The underlying theory was already covered in the companion article Data Mining Notes - Finding Similar Articles - Java; this post only records the Python implementation. Word segmentation is done with the jieba tokenizer package for Python.

Code repository: https://github.com/fighting-one-piece/repository-datamining.git
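For reference, the jieba segmentation API that the WordUtils class further down wraps can be tried on its own; this snippet is only an illustration and is not part of the original code:

import jieba

# precise mode (cut_all=False), the same mode used by WordUtils.split below
print('/'.join(jieba.cut('我来到北京清华大学', cut_all=False)))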

class Doc:
    
    def __init__(self, name):
        self._name = name
     
    def setName(self, name):
        self._name = name
    
    def getName(self):
        return self._name
    
    def setCategory(self, category):
        self._category = category
        
    def getCategory(self):
        return self._category
        
    def setWords(self, words):
        self._words = words
        
    def getWords(self):
        return self._words
    
    def setTfidfWords(self, tfidfWords):
        self._tfidfWords = tfidfWords
        
    def getTfidfWords(self):
        return self._tfidfWords
    
    def getSortedTfidfWords(self):
        return sorted(self._tfidfWords.items(), key=lambda i: i[1], reverse=True)
    
    def setCHIWords(self, chiWords):
        self._chiWords = chiWords
        
    def getCHIWords(self):
        return self._chiWords

    def setSimilarities(self, similarities):
        self._similarities = similarities
        
    def getSimilarities(self):
        return self._similarities
    
class DocSimilarity:
    
    def getName1(self):
        return self._name1

    def setName1(self, name1):
        self._name1 = name1
        
    def getName2(self):
        return self._name2

    def setName2(self, name2):
        self._name2 = name2
    
    def getVector1(self):
        return self._vector1
    
    def setVector1(self, vector1):
        self._vector1 = vector1
        
    def getVector2(self):
        return self._vector2
    
    def setVector2(self, vector2):
        self._vector2 = vector2
        
    def getCosine(self):
        return self._cosine
        
    def setCosine(self, cosine):
        self._cosine = cosine
    
        
import os
import math


class DocHelper:
    
    @staticmethod
    def genDocs(path):
        docs = []
        DocHelper.genDocsIterator(path, docs)
        return docs
    
    @staticmethod
    def genDocsIterator(path, docs):
        if os.path.isdir(path):
            for subPathName in os.listdir(path):
                subPath = os.path.join(path, subPathName)
                DocHelper.genDocsIterator(subPath, docs)
        else:
            # file name (without extension) becomes the document name,
            # the parent directory name becomes the category label
            name = os.path.splitext(os.path.basename(path))[0]
            doc = Doc(name)
            doc.setCategory(os.path.basename(os.path.dirname(path)))
            doc.setWords(WordUtils.splitFile(path))
            docs.append(doc)
    
    @staticmethod
    def docHasWord(doc, word):
        # membership test over the document's word list
        return word in doc.getWords()
    
    @staticmethod
    def docWordsStatistics(doc):
        # count occurrences of each word in the document
        counts = {}
        for word in doc.getWords():
            counts[word] = counts.get(word, 0) + 1
        return counts
    
    @staticmethod
    def docCategorySplit(docs):
        docSplits = {}
        for doc in docs:
            category = doc.getCategory()
            if category in docSplits:
                cDocs = docSplits.get(category)
                cDocs.append(doc)
            else :
                cDocs = [doc]
                docSplits[category] = cDocs
        return docSplits
    
    @staticmethod
    def docTopNWords(doc, n):
        # take the n words with the highest TF-IDF weight
        sortedWords = DocHelper.sortWordValueMap(doc.getTfidfWords())
        return [word for word, _ in sortedWords[:n]]
                    
    
    @staticmethod
    def docWordsVector(doc, words):
        # term-frequency vector of the document over the given word list
        docWords = DocHelper.docWordsStatistics(doc)
        return [docWords.get(word, 0) for word in words]
    
    @staticmethod
    def wordCategorySplit(category, docs):
        belongDocs = []
        nobelongDocs = []
        for doc in docs:
            if category == doc.getCategory():
                belongDocs.append(doc)
            else:
                nobelongDocs.append(doc)
        return belongDocs, nobelongDocs
    
    @staticmethod
    def wordInDocsStatistics(word, docs):
        # number of documents that contain the word
        return sum(1 for doc in docs if DocHelper.docHasWord(doc, word))

    @staticmethod
    def wordNotInDocsStatistics(word, docs):
        # number of documents that do not contain the word
        return sum(1 for doc in docs if not DocHelper.docHasWord(doc, word))
    
    @staticmethod
    def calculateTFIDF(docs):
        docTotalCount = float(len(docs))
        for doc in docs:
            wordTotalCount = len(doc.getWords())
            tfidfWords = {}
            docWords = DocHelper.docWordsStatistics(doc)
            for word in docWords.keys():
                wordCount = docWords.get(word)
                tf = float(wordCount) / wordTotalCount
                docCount = DocHelper.wordInDocsStatistics(word, docs) + 1
                if docCount > docTotalCount:
                    docCount = docTotalCount
                idf = math.log(docTotalCount / docCount)
                tfidf = tf * idf
                tfidfWords[word] = tfidf
            doc.setTfidfWords(tfidfWords)
        
    @staticmethod
    def calculateSimilar(docs):
        for doc in docs:
            topWords = DocHelper.docTopNWords(doc, 20)
            similarities = []
            for odoc in docs:
                otopWords = DocHelper.docTopNWords(odoc, 20)
                words = WordUtils.mergeAndRemoveRepeat(topWords, otopWords)
                v1 = DocHelper.docWordsVector(doc, words)
                v2 = DocHelper.docWordsVector(odoc, words)
                cosine = DistanceUtils.cosine(v1,v2)
                similarity = DocSimilarity()
                similarity.setName1(doc.getName())
                similarity.setName2(odoc.getName())
                similarity.setVector1(v1)
                similarity.setVector2(v2)
                similarity.setCosine(cosine)
                similarities.append(similarity)
            doc.setSimilarities(similarities)
                
    @staticmethod
    def sortWordValueMap(wordValueMap):
        # sort (word, value) pairs by value in descending order
        return sorted(wordValueMap.items(), key=lambda i: i[1], reverse=True)
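The calculateSimilar method above relies on DistanceUtils.cosine, which is not shown in this post (it lives elsewhere in the repository). A minimal sketch of a cosine similarity helper it could correspond to, assuming the vectors are plain Python lists of term counts:

import math

class DistanceUtils:

    @staticmethod
    def cosine(v1, v2):
        # cosine similarity of two equal-length numeric vectors
        dot = sum(a * b for a, b in zip(v1, v2))
        norm1 = math.sqrt(sum(a * a for a in v1))
        norm2 = math.sqrt(sum(b * b for b in v2))
        if norm1 == 0 or norm2 == 0:
            return 0.0
        return dot / (norm1 * norm2)
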
import jieba as ws

class WordUtils:

    @staticmethod
    def split(text):
        # jieba precise mode (cut_all=False)
        return list(ws.cut(text, cut_all=False))
    
    @staticmethod
    def splitFile(path):
        # segment every non-empty line of the file, then strip stop words
        words = []
        with open(path) as f:
            for line in f:
                line = line.strip()
                if len(line) > 0:
                    words.extend(WordUtils.split(line))
        return WordUtils.removeStopWords(words)
    
    @staticmethod
    def removeStopWords(words):
        file = open("stopwords.dic")
        stopwords = []
        for line in file.readlines():
            line = line.strip();
            if len(line) > 0:
                stopwords.append(line)
        file.close()
        rwords = []
        for word in words:
            flag = True
            for stopword in stopwords:
                #if word.encode('utf-8') == stopword.encode('utf-8'):
                if word == stopword:
                    flag = False
                    break
            if flag and len(word.strip()) > 0:
                rwords.append(word)
        return rwords
    
    @staticmethod
    def mergeAndRemoveRepeat(w1, w2):
        # union of the two word lists with duplicates removed
        return list(set(w1) | set(w2))


def testSimilarity():
    path = r'D:\resources\chinese'
    docs = DocHelper.genDocs(path)
    DocHelper.calculateTFIDF(docs)
    DocHelper.calculateSimilar(docs)
    for doc in docs:
        print('----------')
        for similarity in doc.getSimilarities():
            print('%s-%s-%s' % (similarity.getName1(),
                    similarity.getName2(), similarity.getCosine()))
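To run the test, assuming everything above is saved in a single module, a stopwords.dic file sits in the working directory, and the corpus is laid out as <path>/<category>/<file>:

if __name__ == '__main__':
    testSimilarity()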



