关于特征选择的相关知识可以参考以下链接
项目源码里面包含Java和Python的实现,这里只列出Python实现:
代码托管:https://github.com/fighting-one-piece/repository-datamining.git
class Doc:
    """A single document plus the values attached to it through setters.

    Besides its name, a document carries a category, a collection of words,
    a TF-IDF mapping, CHI words and similarities. All of them are plain
    stored values; this class performs no computation except the sorting
    done in ``getSortedTfidfWords``.
    """

    def __init__(self, name):
        """Create a document called *name* with every other field empty."""
        self._name = name
        # Initialise every attribute here so that calling a getter before the
        # matching setter yields an empty value instead of AttributeError.
        self._category = None
        # Iterated by callers, so an empty list is the neutral default.
        self._words = []
        # ``getSortedTfidfWords`` calls ``.items()`` on it, so default to a dict.
        self._tfidfWords = {}
        self._chiWords = None
        self._similarities = None

    def setName(self, name):
        """Replace the document name."""
        self._name = name

    def getName(self):
        """Return the document name."""
        return self._name

    def setCategory(self, category):
        """Store the document category."""
        self._category = category

    def getCategory(self):
        """Return the stored category (``None`` until one is set)."""
        return self._category

    def setWords(self, words):
        """Store the document's words."""
        self._words = words

    def getWords(self):
        """Return the stored words (an empty list until some are set)."""
        return self._words

    def setTfidfWords(self, tfidfWords):
        """Store the TF-IDF mapping; it must provide ``.items()``."""
        self._tfidfWords = tfidfWords

    def getTfidfWords(self):
        """Return the stored TF-IDF mapping (an empty dict until one is set)."""
        return self._tfidfWords

    def getSortedTfidfWords(self):
        """Return the TF-IDF entries ordered by descending value.

        The result is a one-element list whose only item is the list of
        ``(key, value)`` pairs sorted by value, largest first. The outer
        list is kept as-is for backward compatibility with existing callers.
        """
        ranked = sorted(
            self._tfidfWords.items(),
            key=lambda pair: pair[1],
            reverse=True,
        )
        return [ranked]

    def setCHIWords(self, chiWords):
        """Store the CHI words."""
        self._chiWords = chiWords

    def getCHIWords(self):
        """Return the stored CHI words (``None`` until some are set)."""
        return self._chiWords

    def setSimilarities(self, similarities):
        """Store the similarities."""
        self._similarities = similarities

    def getSimilarities(self):
        """Return the stored similarities (``None`` until some are set)."""
        return self._similarities
#文档操作工具类
class DocHelper:
#获取目录下所有的文档
@staticmethod
def genDocs(path):
    """Return a list with the documents found under *path*.

    The list is filled by ``DocHelper.genDocsIterator``, which receives
    *path* and the (initially empty) result list.
    """
    collected = []
    DocHelper.genDocsIterator(path, collected)
    return collected
#遍历目录获取目录下所有的文档
@staticmethod
def genDocsIterator(path, docs):
    """Recursively append a Doc for every file under *path* to *docs*.

    A directory is descended into entry by entry. Anything that is not a
    directory is treated as a document file: its Doc is named after the
    file name without the extension, its category is the name of the
    directory that directly contains the file, and its words are whatever
    ``WordUtils.splitFile(path)`` returns.

    Args:
        path: File or directory path to scan.
        docs: List that receives the created Doc objects (mutated in place).
    """
    if os.path.isdir(path):
        for subPathName in os.listdir(path):
            DocHelper.genDocsIterator(os.path.join(path, subPathName), docs)
        return

    # Derive name and category with os.path helpers instead of splitting on
    # a literal backslash: the path is built with os.path.join, so the
    # separator is platform dependent and a hard-coded '\\' yields a wrong
    # name and an IndexError for the category wherever it is not used.
    parentDir, fileName = os.path.split(path)
    doc = Doc(os.path.splitext(fileName)[0])
    doc.setCategory(os.path.basename(parentDir))
    doc.setWords(WordUtils.splitFile(path))
    docs.append(doc)
#文档中是否包含指定词
@staticmethod
def docHasWord(doc, word):
    """Return True when *word* equals one of the words of *doc*.

    Args:
        doc: Object whose ``getWords()`` returns an iterable of words.
        word: The word to look for.

    Returns:
        True if any item of ``doc.getWords()`` compares equal to *word*,
        otherwise False (also for an empty word collection).
    """
    # any() stops at the first match, exactly like the explicit loop did.
    return any(candidate == word for candidate in doc.getWords())
#文档中词频统计
@staticmethod
def docWordsStatistics(doc):
map = {}
for word in doc.getWords():
count = map.get(word)
if count is None:
count = 0
map[word] = co