由于大作业需要,本来想找现成的卡方检验程序,但是没找到,尴尬,所以一怒之下花了一个晚上自己编了出来(编程水平太渣,据说大神只用一小时.....)
这里还是提一下卡方检验的处理步骤吧,虽然我在实验报告里写到了......
老师在上课时提供的ppt里这部分的例子很好,放上来:
这段程序的用处是对每一类文章中出现的词进行卡方检验统计,找到每个类别中CHI值较大的一些词作为本类的特征词,然后对本类的每篇文章只保留其中出现的这些特征词,以便后续处理。我觉得我没有说明白,之后会放上整个实验的代码和文档,里面应该对整体流程说的比较清楚.....
#!/usr/bin/python
# coding: utf8
from __future__ import division
import os
def CHI_control_text(text, ca_num, textname, categoryName):
    """Keep only the CHI-selected feature words of one segmented article.

    The feature words of class ``ca_num`` are read from
    ``CHIorder\\class<ca_num>_CHIorder_select.txt`` (one word per line).
    Every line of ``text`` whose first tab-separated field is one of those
    words is copied, word only, to ``text_remain\\<textname>``.  Articles
    containing fewer than two such words are considered unrelated to the
    category and get no output file.

    Args:
        text: open file-like object (or any iterable of lines); only the
            first tab-separated field of each line is used.  The caller
            owns it, it is not closed here.
        ca_num: 1-based class number selecting the feature-word list.
        textname: output path relative to ``text_remain``.
        categoryName: category sub-directory, created under
            ``text_remain\\sougou_all`` when missing.

    Returns:
        None.
    """
    select_path = ("CHIorder" + '\\' + 'class' + str(ca_num)
                   + "_CHIorder_select.txt")
    # Load the feature words into a set and release the handle right away.
    with open(select_path, 'r') as select_file:
        feature_words = set(entry.strip() for entry in select_file)
    # A blank entry would make every blank article line count as a keyword.
    feature_words.discard('')

    category_dir = "text_remain\\sougou_all" + '\\' + categoryName
    if not os.path.exists(category_dir):
        os.makedirs(category_dir)

    # Collect the article's words that are feature words, in article order
    # (a word repeated on several lines is kept once per line).
    kept_words = []
    for line in text:
        word = line.strip().split('\t')[0].strip()
        if word in feature_words:
            kept_words.append(word)

    remain_path = "text_remain" + '\\' + textname
    if len(kept_words) < 2:
        # Too few feature words: drop the article, including any stale
        # output left behind by a previous run.
        if os.path.exists(remain_path):
            os.remove(remain_path)
        return

    with open(remain_path, 'w') as text_remain:
        for word in kept_words:
            text_remain.write(word + '\n')
# ---------------------------------------------------------------------------
# Chi-square (CHI) feature selection over the ten Sogou news categories.
#
# Phase 1: for every category, score each word of that category with a
#          chi-square statistic, write the words ranked by score to
#          CHIorder\classN_CHIorder.txt and the top 1.5% of them to
#          CHIorder\classN_CHIorder_select.txt.
# Phase 2: rewrite every segmented article so that only the selected feature
#          words of its category remain (done by CHI_control_text).
#
# NOTE: this is Python 2 code (byte strings read from files are decoded and
# encoded by hand) and all paths are Windows-style.  The script still runs
# at import time; no __main__ guard was added.
# ---------------------------------------------------------------------------

# Category directory names.  Position + 1 is the class number used in the
# CHIorder file names (class1 = aoyun, ..., class10 = yule).
_CATEGORY_NAMES = ["aoyun", "fangchan", "jiankang", "jiaoyu", "lvyou",
                   "qiche", "shangye", "shishang", "tiyu", "yule"]


def _load_doc_counts(category_name):
    """Return {word: number of articles of the category containing it}."""
    counts = {}
    path = "wordtimes\\sougou_all" + '\\' + category_name + "_classtimes.txt"
    with open(path, 'r') as source:
        for row in source:
            fields = row.strip().split('\t')
            if len(fields) < 2:
                continue  # blank or malformed line
            counts[fields[0].decode('utf-8')] = int(fields[1])
    return counts


def _chi_score(a, b, c, d):
    """Return the ranking statistic (a*d - b*c)^2 / ((a+b) * (c+d)).

    a / c: articles inside the class with / without the word;
    b / d: articles outside the class with / without the word.
    The factors of the full chi-square formula that are identical for every
    word of a class are left out, so the ranking is unaffected.  A zero
    denominator (word present in every article) scores 0.
    """
    denominator = (a + b) * (c + d)
    if denominator == 0:
        return 0.0
    return ((a * d - b * c) ** 2) / denominator


def _rank_words(class_index, doc_counts, class_sizes):
    """Return the words of one class ordered by decreasing CHI score."""
    in_class = doc_counts[class_index]
    class_size = class_sizes[class_index]
    outside_size = sum(class_sizes) - class_size
    others = [counts for position, counts in enumerate(doc_counts)
              if position != class_index]
    scores = {}
    for word, a in in_class.items():
        b = sum(counts.get(word, 0) for counts in others)
        c = class_size - a
        # Outside articles WITHOUT the word: every outside article minus the
        # outside articles that contain it.
        d = outside_size - b
        scores[word] = _chi_score(a, b, c, d)
    return sorted(scores, key=scores.get, reverse=True)


def _select_features():
    """Phase 1: write the ranked and the selected word list of every class."""
    if not os.path.exists("CHIorder"):
        os.makedirs("CHIorder")
    # Number of articles per category.
    class_sizes = [len(os.listdir("wordtimes\\sougou_all" + '\\' + name))
                   for name in _CATEGORY_NAMES]
    # Per category: in how many articles each word occurs.
    doc_counts = [_load_doc_counts(name) for name in _CATEGORY_NAMES]

    for class_index in range(len(_CATEGORY_NAMES)):
        class_num = class_index + 1
        ranked = _rank_words(class_index, doc_counts, class_sizes)

        order_path = ("CHIorder" + '\\' + 'class' + str(class_num)
                      + "_CHIorder.txt")
        with open(order_path, 'w') as order_file:
            order_file.write('\n'.join(ranked).encode('utf-8'))

        # Keep the 1.5% of the class vocabulary with the highest CHI score.
        N = int(0.015 * len(doc_counts[class_index]))
        print("从第%d类中选出%d个关键词" % (class_num, N))

        select_path = ("CHIorder" + '\\' + 'class' + str(class_num)
                       + "_CHIorder_select.txt")
        # Taken from the in-memory ranking, and closed before phase 2 reads
        # the file back.
        with open(select_path, 'w') as select_file:
            for word in ranked[:N]:
                select_file.write(word.encode('utf-8').strip() + '\n')


def _filter_articles():
    """Phase 2: reduce every segmented article to its class feature words."""
    rootpath = "..\\seg and anno" + "\\" + "results" + "\\" + "sougou_all"
    for categoryName in os.listdir(rootpath):
        if categoryName == '.DS_Store':
            continue
        if categoryName not in _CATEGORY_NAMES:
            print("skip unknown category directory: %s" % categoryName)
            continue
        # os.listdir order is arbitrary, so the class number comes from the
        # category name rather than from a running counter.
        ca_num = _CATEGORY_NAMES.index(categoryName) + 1
        categoryPath = os.path.join(rootpath, categoryName)
        for filename in os.listdir(categoryPath):
            if filename == '.DS_Store':
                continue
            # Path below text_remain, e.g. sougou_all\yule\1.txt
            textname = "sougou_all" + "\\" + categoryName + "\\" + filename
            with open(os.path.join(categoryPath, filename)) as contents:
                CHI_control_text(contents, ca_num, textname, categoryName)


_select_features()
_filter_articles()
print("CHI_run is finished!")