For background on feature selection, see the link below.
The project source includes both Java and Python implementations; only the Python implementation is listed here. It covers TF-IDF weighting plus chi-square (CHI), information gain (IG), and expected cross entropy (KL) feature selection, along with cosine-based document similarity:
Code repository: https://github.com/fighting-one-piece/repository-datamining.git
import os
import math

class Doc:
    def __init__(self, name):
        self._name = name
    def setName(self, name):
        self._name = name
    def getName(self):
        return self._name
    def setCategory(self, category):
        self._category = category
    def getCategory(self):
        return self._category
    def setWords(self, words):
        self._words = words
    def getWords(self):
        return self._words
    def setTfidfWords(self, tfidfWords):
        self._tfidfWords = tfidfWords
    def getTfidfWords(self):
        return self._tfidfWords
    def getSortedTfidfWords(self):
        # (word, tfidf) pairs sorted by weight, descending
        return sorted(self._tfidfWords.items(), key=lambda i: i[1], reverse=True)
    def setCHIWords(self, chiWords):
        self._chiWords = chiWords
    def getCHIWords(self):
        return self._chiWords
    def setSimilarities(self, similarities):
        self._similarities = similarities
    def getSimilarities(self):
        return self._similarities
# Document helper utilities
class DocHelper:
    # Collect all documents under a directory
    @staticmethod
    def genDocs(path):
        docs = []
        DocHelper.genDocsIterator(path, docs)
        return docs
    # Walk the directory tree recursively; each file becomes a Doc whose
    # category is its parent directory name
    @staticmethod
    def genDocsIterator(path, docs):
        if os.path.isdir(path):
            for subPathName in os.listdir(path):
                subPath = os.path.join(path, subPathName)
                DocHelper.genDocsIterator(subPath, docs)
        else:
            name = os.path.splitext(os.path.basename(path))[0]
            doc = Doc(name)
            doc.setCategory(os.path.basename(os.path.dirname(path)))
            doc.setWords(WordUtils.splitFile(path))
            docs.append(doc)
    # Whether a document contains the given word
    @staticmethod
    def docHasWord(doc, word):
        for dword in doc.getWords():
            if dword == word:
                return True
        return False
    # Count word frequencies within a document
    @staticmethod
    def docWordsStatistics(doc):
        counts = {}
        for word in doc.getWords():
            counts[word] = counts.get(word, 0) + 1
        return counts
    # Split the document set by category
    @staticmethod
    def docCategorySplit(docs):
        docSplits = {}
        for doc in docs:
            category = doc.getCategory()
            if category in docSplits:
                docSplits[category].append(doc)
            else:
                docSplits[category] = [doc]
        return docSplits
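    # Note: collections.defaultdict(list) would express the same grouping
    # more compactly.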
    # Top-N words of a document ranked by TF-IDF
    @staticmethod
    def docTopNWords(doc, n):
        sortedWords = DocHelper.sortWordValueMap(doc.getTfidfWords())
        words = []
        for item in sortedWords[0:n]:
            words.append(item[0])
        return words
    # Turn a document into a count vector over the given word list
    @staticmethod
    def docWordsVector(doc, words):
        vector = []
        docWords = DocHelper.docWordsStatistics(doc)
        for word in words:
            vector.append(docWords.get(word, 0))
        return vector
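    # Hypothetical example: if the document's word counts are {'a': 2, 'b': 1}
    # and words is ['a', 'b', 'c'], docWordsVector returns [2, 1, 0].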
    # Split the document set into documents of the given category and the rest
    @staticmethod
    def categorySplit(category, docs):
        belongDocs = []
        nobelongDocs = []
        for doc in docs:
            if category == doc.getCategory():
                belongDocs.append(doc)
            else:
                nobelongDocs.append(doc)
        return belongDocs, nobelongDocs
    # Number of documents belonging to the given category
    @staticmethod
    def categoryStatistics(category, docs):
        count = 0
        for doc in docs:
            if category == doc.getCategory():
                count += 1
        return count
    # Number of documents of the given category that contain the word
    @staticmethod
    def categoryWordStatistics(category, word, docs):
        count = 0
        for doc in docs:
            if category == doc.getCategory() and \
               DocHelper.docHasWord(doc, word):
                count += 1
        return count
    # Split the document set into documents containing the word and the rest
    @staticmethod
    def wordDocsSplit(word, docs):
        belongDocs = []
        nobelongDocs = []
        for doc in docs:
            if DocHelper.docHasWord(doc, word):
                belongDocs.append(doc)
            else:
                nobelongDocs.append(doc)
        return belongDocs, nobelongDocs
    # Number of documents in the set that contain the word
    @staticmethod
    def wordInDocsStatistics(word, docs):
        count = 0
        for doc in docs:
            if DocHelper.docHasWord(doc, word):
                count += 1
        return count
    # Number of documents in the set that do not contain the word
    @staticmethod
    def wordNotInDocsStatistics(word, docs):
        count = 0
        for doc in docs:
            if not DocHelper.docHasWord(doc, word):
                count += 1
        return count
    # P(category | word): among documents containing the word, the fraction
    # that belong to the given category
    @staticmethod
    def wordCategoryInDocsPercent(word, category, docs):
        sumWord = 0
        sumCategory = 0
        for doc in docs:
            if DocHelper.docHasWord(doc, word):
                sumWord += 1
                if category == doc.getCategory():
                    sumCategory += 1
        return float(sumCategory) / sumWord
    # Raw term frequencies for a single document
    @staticmethod
    def calculateTF(doc):
        tf = {}
        for word in doc.getWords():
            tf[word] = tf.get(word, 0) + 1
        return tf
    # Compute TF-IDF weights for every word of every document
    @staticmethod
    def calculateTFIDF(docs):
        docTotalCount = float(len(docs))
        for doc in docs:
            wordTotalCount = len(doc.getWords())
            tfidfWords = {}
            docWords = DocHelper.docWordsStatistics(doc)
            for word in docWords.keys():
                wordCount = docWords.get(word)
                tf = float(wordCount) / wordTotalCount
                # Smooth the document frequency by 1, capped at N,
                # so idf never goes negative or divides by zero
                docCount = DocHelper.wordInDocsStatistics(word, docs) + 1
                if docCount > docTotalCount:
                    docCount = docTotalCount
                idf = math.log(docTotalCount / docCount)
                tfidfWords[word] = tf * idf
            doc.setTfidfWords(tfidfWords)
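    # The weight computed above is standard smoothed TF-IDF:
    #   tfidf(t, d) = (count(t, d) / len(d)) * ln(N / (df(t) + 1))
    # With made-up numbers: a word occurring 3 times in a 100-word document
    # and appearing in 10 of 1000 documents gets
    #   tf = 0.03, idf = ln(1000 / 11) ~= 4.51, tfidf ~= 0.135.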
    # Chi-square (CHI) feature selection: score each word of each document
    # against the document's own category
    @staticmethod
    def calculateCHI(docs):
        for doc in docs:
            chiWords = {}
            words = doc.getWords()
            belongDocs, nobelongDocs = DocHelper.categorySplit(
                doc.getCategory(), docs)
            for word in words:
                a = DocHelper.wordInDocsStatistics(word, belongDocs)
                b = DocHelper.wordInDocsStatistics(word, nobelongDocs)
                c = DocHelper.wordNotInDocsStatistics(word, belongDocs)
                d = DocHelper.wordNotInDocsStatistics(word, nobelongDocs)
                # Guard against division by zero when every document
                # contains the word (c + d == 0)
                denom = (a + b) * (c + d)
                chiWords[word] = float((a * d - b * c) ** 2) / denom if denom else 0.0
            doc.setCHIWords(chiWords)
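    # The full chi-square statistic for word t and category c over N documents is
    #   chi2(t, c) = N * (a*d - b*c)**2 / ((a+b) * (c+d) * (a+c) * (b+d))
    # Here a+c (documents in the category) and b+d (documents outside it) are
    # identical for every word scored against a fixed category, so the code
    # drops N, (a+c) and (b+d): the simplified score preserves the ranking of
    # words within one category.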
    # Information gain (IG) feature selection over the whole document set
    @staticmethod
    def calculateInformationGain(docs):
        docTotalCount = len(docs)
        splits = DocHelper.docCategorySplit(docs)
        categories = []
        # pcSum accumulates sum_c P(c) * log2 P(c), i.e. -H(C)
        pcSum = 0
        for item in splits.items():
            categories.append(item[0])
            categoryCount = float(len(item[1]))
            pc = categoryCount / docTotalCount
            pcSum += pc * (math.log(pc) / math.log(2))
        words = []
        for doc in docs:
            words += [i for i in doc.getWords()]
        wordDict = {}
        for word in set(words):
            belongDocs, nobelongDocs = DocHelper.wordDocsSplit(word, docs)
            wordInDocsCount = len(belongDocs)
            wordNotInDocsCount = len(nobelongDocs)
            pctSum = 0
            pcntSum = 0
            for category in categories:
                # P(c | t): category distribution among documents containing t
                ctCount = len(DocHelper.categorySplit(category, belongDocs)[0])
                pct = float(ctCount) / wordInDocsCount
                if pct != 0:
                    pctSum += pct * (math.log(pct) / math.log(2))
                # P(c | ~t): category distribution among the remaining documents
                cntCount = len(DocHelper.categorySplit(category, nobelongDocs)[0])
                if cntCount != 0:
                    pcnt = float(cntCount) / wordNotInDocsCount
                    pcntSum += pcnt * (math.log(pcnt) / math.log(2))
            pt = float(wordInDocsCount) / docTotalCount
            pnt = float(wordNotInDocsCount) / docTotalCount
            ig = -pcSum + pt * pctSum + pnt * pcntSum
            wordDict[word] = ig
        return DocHelper.sortWordValueMap(wordDict)
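    # In entropy terms the loop above computes
    #   IG(t) = -sum_c P(c) log2 P(c)
    #           + P(t)  * sum_c P(c|t)  log2 P(c|t)
    #           + P(~t) * sum_c P(c|~t) log2 P(c|~t)
    # i.e. H(C) - H(C|T), the reduction in category entropy from observing
    # whether t occurs.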
    # Expected cross entropy ("KL") feature selection over the document set
    @staticmethod
    def calculateKL(docs):
        docTotalCount = len(docs)
        allWords = []
        categories = []
        cateToCount = {}
        wordToCount = {}
        for doc in docs:
            cate = doc.getCategory()
            categories.append(cate)
            cateToCount[cate] = cateToCount.get(cate, 0) + 1
            for word in doc.getWords():
                allWords.append(word)
                wordToCount[word] = wordToCount.get(word, 0) + 1
        allWords = set(allWords)
        categories = set(categories)
        wordDict = {}
        word_len = len(allWords)
        for word in allWords:
            # P(t), here estimated against the vocabulary size
            pt = float(wordToCount.get(word)) / word_len
            klSum = 0
            cd = 0
            dd = 0
            nt = DocHelper.wordInDocsStatistics(word, docs)
            for category in categories:
                cateCount = cateToCount.get(category)
                pc = float(cateCount) / docTotalCount
                pct = DocHelper.wordCategoryInDocsPercent(word, category, docs)
                # Skip categories with no documents containing the word,
                # where log(0) would be undefined
                if pct > 0:
                    klSum += pct * math.log(pct / pc)
                nct = DocHelper.categoryWordStatistics(category, word, docs)
                cd += float(nct) / nt
                dd += float(nct) / cateCount
            wordDict[word] = cd * dd * pt * klSum
        return DocHelper.sortWordValueMap(wordDict)
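    # The core of the score above is expected cross entropy,
    #   ECE(t) = P(t) * sum_c P(c|t) * log(P(c|t) / P(c)),
    # scaled by two ratio factors built from per-category document frequencies.
    # Note that cd sums n_ct / n_t over all categories and, since every
    # document has exactly one category, always equals 1 here; dd rewards
    # words covering large fractions of their categories.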
    # Pairwise similarity between documents, using cosine over count vectors
    # built from the merged top-20 TF-IDF words. DocSimilarity and
    # DistanceUtils come from the repository's full source; a minimal sketch
    # of both is given after WordUtils below.
    @staticmethod
    def calculateSimilar(docs):
        for doc in docs:
            topWords = DocHelper.docTopNWords(doc, 20)
            similarities = []
            for odoc in docs:
                otopWords = DocHelper.docTopNWords(odoc, 20)
                words = WordUtils.mergeAndRemoveRepeat(topWords, otopWords)
                v1 = DocHelper.docWordsVector(doc, words)
                v2 = DocHelper.docWordsVector(odoc, words)
                cosine = DistanceUtils.cosine(v1, v2)
                similarity = DocSimilarity()
                similarity.setName1(doc.getName())
                similarity.setName2(odoc.getName())
                similarity.setVector1(v1)
                similarity.setVector2(v2)
                similarity.setCosine(cosine)
                similarities.append(similarity)
            doc.setSimilarities(similarities)
    # Sort a {word: value} dict by value, descending
    @staticmethod
    def sortWordValueMap(wordValueMap):
        return sorted(wordValueMap.items(), key=lambda i: i[1], reverse=True)
import jieba as ws

# Word utilities
class WordUtils:
    # Segment a piece of text into words
    @staticmethod
    def split(text):
        return [word for word in ws.cut(text, cut_all=False)]
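    # Hypothetical usage (the actual segmentation depends on jieba's
    # dictionary and version):
    #   WordUtils.split('我来到北京清华大学') -> ['我', '来到', '北京', '清华大学']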
    # Segment the contents of a file (assumes UTF-8 text)
    @staticmethod
    def splitFile(path):
        words = []
        with open(path, encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if len(line) > 0:
                    words += WordUtils.split(line)
        return WordUtils.removeStopWords(words)
    # Remove stop words, as listed in the stopwords.dic file
    @staticmethod
    def removeStopWords(words):
        with open("stopwords.dic", encoding='utf-8') as f:
            stopwords = set(line.strip() for line in f if len(line.strip()) > 0)
        rwords = []
        for word in words:
            if word not in stopwords and len(word.strip()) > 0:
                rwords.append(word)
        return rwords
    # Merge two word lists and drop duplicates (order is not preserved)
    @staticmethod
    def mergeAndRemoveRepeat(w1, w2):
        return list(set(w1) | set(w2))
Test cases:
def testTFIDF():
    path = r'D:\resources\test'
    docs = DocHelper.genDocs(path)
    DocHelper.calculateTFIDF(docs)
    for doc in docs:
        print('----------')
        tf = DocHelper.calculateTF(doc)
        tfidf = doc.getTfidfWords()
        for item in DocHelper.sortWordValueMap(tf)[0:20]:
            print('%s-%s-%s' % (item[0], item[1], tfidf.get(item[0])))

def testSimilarity():
    path = r'D:\resources\test'
    docs = DocHelper.genDocs(path)
    DocHelper.calculateTFIDF(docs)
    DocHelper.calculateSimilar(docs)
    for doc in docs:
        print('----------')
        for similarity in doc.getSimilarities():
            print('%s-%s-%s' % (similarity.getName1(),
                                similarity.getName2(), similarity.getCosine()))

def testCHI():
    path = r'D:\resources\test'
    docs = DocHelper.genDocs(path)
    DocHelper.calculateCHI(docs)
    for doc in docs:
        print('----------')
        for item in DocHelper.sortWordValueMap(doc.getCHIWords())[0:10]:
            print('%s-%s' % (item[0], item[1]))

def testInformationGain():
    path = r'D:\resources\test'
    docs = DocHelper.genDocs(path)
    wordDict = DocHelper.calculateInformationGain(docs)
    for item in wordDict[0:30]:
        print('%s-%s' % (item[0], item[1]))

def testKL():
    path = r'D:\resources\test'
    docs = DocHelper.genDocs(path)
    wordDict = DocHelper.calculateKL(docs)
    for item in wordDict[0:30]:
        print('%s-%s' % (item[0], item[1]))
if __name__ == '__main__':
    print('-----TFIDF-----')
    testTFIDF()
    print('-----CHI-----')
    testCHI()
    print('-----IG-----')
    testInformationGain()
    print('-----KL-----')
    testKL()
    print('----------')