# Two approaches to extracting TF-IDF feature values; results are saved in a .mat file:
# 1. Manual word segmentation, taking 3000 documents per class and merging them
# Manual word segmentation, 3000 documents merged per class
def load_files(directory, prefix=None, postfix=None):
    """Recursively collect files under *directory* and build per-file word counts.

    Parameters
    ----------
    directory : str
        Root directory to walk recursively.
    prefix, postfix : str, optional
        If *postfix* is given, only file names ending with it are kept;
        otherwise if *prefix* is given, only names starting with it are
        kept; otherwise every file is kept (postfix takes precedence).

    Returns
    -------
    tuple (articallist, classlen, filelen)
        articallist : list of dict, one per file, mapping the lowercased
                      word to its occurrence count (purely numeric tokens
                      are skipped).
        classlen    : list of file counts per visited directory, in
                      os.walk order; kept at a minimum length of 11 for
                      backward compatibility with callers indexing 0..10.
        filelen     : list of non-numeric token counts, one per file.
    """
    files_list = []
    classlen = [0] * 11  # original fixed size preserved; grown on demand
    i = 0
    for root, sub_dirs, files in os.walk(directory):
        # fix: the original raised IndexError past 11 directories
        if i >= len(classlen):
            classlen.append(0)
        classlen[i] = len(files)
        i += 1
        for special_file in files:
            if postfix:
                if special_file.endswith(postfix):
                    files_list.append(os.path.join(root, special_file))
            elif prefix:
                if special_file.startswith(prefix):
                    files_list.append(os.path.join(root, special_file))
            else:
                files_list.append(os.path.join(root, special_file))
    # Build one word-frequency dictionary per collected file.
    articallist = [dict() for _ in range(len(files_list))]
    filelen = [0] * len(files_list)
    for i, eachfile in enumerate(files_list):
        t = 0
        # fix: context manager guarantees the handle is closed even on
        # error (the original left handles open / closed only the last one)
        with open(eachfile, 'r') as file_object:
            for line in file_object:
                for word in line.split():
                    # skip purely numeric tokens
                    if not word.isdigit():
                        t += 1
                        word = word.lower()  # case-fold before counting
                        # fix: dict.has_key() was removed in Python 3
                        articallist[i][word] = articallist[i].get(word, 0) + 1
        filelen[i] = t
    # fix: removed the debug print — it used py2-only print syntax and
    # raised IndexError whenever fewer than two files were found
    return articallist, classlen, filelen
# Load the English stopword list
def load_stop_en(filename):
    """Read an English stopword file (one word per line) into a list.

    Each line is stripped of surrounding whitespace; file order is
    preserved.  Returns the list of stripped lines.
    """
    # fix: the original never closed the file handle
    with open(filename, 'r') as file_object:
        return [line.strip() for line in file_object]
# Remove stopwords
def delet_stopword_en(stop_en_set, en_dict):
    """Delete every stopword in *stop_en_set* from *en_dict*, in place.

    Stopwords not present in the dictionary are silently skipped.
    Returns None (mutates *en_dict*).
    """
    for key in stop_en_set:
        # fix: dict.has_key() was removed in Python 3 — use the `in` operator
        if key in en_dict:
            del en_dict[key]
# Compute the TF-IDF of a given word across all documents
def get_TFIDF(artic