一、实验目的
通过实验,掌握利用Rocchio算法进行文本分类的基本原理与实现方法。
二、实验任务与要求
利用Rocchio算法实现测试文档分类:完成tf-idf的数值计算、词项权重存储、权重归一化以及各类别质心向量的计算。
三、实验代码
1.源代码
import math
def getdata():
    """Interactively read labelled training documents from standard input.

    For every document the user is prompted for its text (split on
    whitespace into tokens) and its class label, then asked whether to
    continue. Document ids are assigned globally, starting at 1, in the
    order the documents are entered.

    Returns:
        A 4-tuple ``(data, class_counts, n_docs, vocabulary)`` where
        ``data`` maps class label -> {document id -> list of tokens},
        ``class_counts`` maps class label -> number of documents entered
        for that class, ``n_docs`` is the total number of documents and
        ``vocabulary`` is the set of all distinct tokens seen.
    """
    data = {}
    class_counts = {}
    vocabulary = set()
    doc_id = 0
    while True:
        tokens = input("请输入文本:").split()
        label = input("请输入该文本的类别:")
        doc_id += 1
        class_counts[label] = class_counts.get(label, 0) + 1
        vocabulary.update(tokens)
        data.setdefault(label, {})[doc_id] = tokens
        # Any answer other than "n" continues; the answer is normalised so
        # that "N" or " n " also stop the loop, as the [y/n] prompt suggests.
        if input('要继续输入吗[y/n]:').strip().lower() == 'n':
            break
    return data, class_counts, doc_id, vocabulary
def df_to_list(data, words):
    """Compute the document frequency (df) of every vocabulary term.

    Args:
        data: Mapping class label -> {document id -> list of tokens}.
        words: Iterable of vocabulary terms to count.

    Returns:
        Dict mapping each term in ``words`` to the number of documents
        (across all classes) that contain it at least once.
    """
    # Build one set per document up front so each membership test is O(1)
    # instead of rescanning the token list for every vocabulary term.
    doc_term_sets = [
        set(tokens)
        for docs in data.values()
        for tokens in docs.values()
    ]
    return {
        word: sum(1 for terms in doc_term_sets if word in terms)
        for word in words
    }
def train(data, df_list, N, words):
    """Build the tf-idf weight vector of every training document.

    The weight of a term in a document is
    ``(1 + log10(tf)) * log10(N / df)``; it is 0 when the term does not
    occur in the document, or when the term has no (or a zero) document
    frequency in ``df_list``.

    Args:
        data: Mapping class label -> {document id -> list of tokens}.
        df_list: Mapping term -> document frequency.
        N: Total number of training documents.
        words: Iterable of vocabulary terms; defines the vector dimensions.

    Returns:
        Mapping class label -> {position -> {term -> weight}}, where
        ``position`` numbers the documents of each class from 1 in
        iteration order (it is not the original document id).
    """
    vec_fin = {}
    for label, docs in data.items():
        class_vectors = {}
        for position, tokens in enumerate(docs.values(), start=1):
            # Count every token once instead of calling list.count per term.
            term_counts = {}
            for token in tokens:
                term_counts[token] = term_counts.get(token, 0) + 1

            weights = {}
            for word in words:
                tf = term_counts.get(word, 0)
                df = df_list.get(word, 0)
                # A missing or zero df would raise KeyError/ZeroDivisionError;
                # such terms carry no information, so they get weight 0.
                if tf == 0 or df == 0:
                    weights[word] = 0
                else:
                    weights[word] = (1 + math.log10(tf)) * math.log10(N / df)
            class_vectors[position] = weights
        vec_fin[label] = class_vectors
    return vec_fin
def normalize(vec):
    """Scale every training document vector to unit Euclidean length.

    The vectors are modified in place.

    Args:
        vec: Mapping class label -> {position -> {term -> weight}}.

    Returns:
        The same ``vec`` object, with each non-zero document vector
        divided by its Euclidean norm. All-zero vectors are left
        unchanged, because they have no direction to normalise.
    """
    for class_vectors in vec.values():
        for weights in class_vectors.values():
            length = math.sqrt(sum(weight ** 2 for weight in weights.values()))
            # A zero norm occurs when every term has idf 0 (e.g. a single
            # training document); dividing would raise ZeroDivisionError.
            if length == 0:
                continue
            for word in weights:
                weights[word] /= length
    return vec
def test(df_list, N, test, words):
    """Build the tf-idf weight vector of the test document.

    The vector has one entry for every term in ``words`` and every token
    of the test document. The weight is
    ``(1 + log10(tf)) * log10(N / df)``; it is 0 when the term does not
    occur in the test document, or when the term has no (or a zero)
    document frequency in ``df_list``.

    Args:
        df_list: Mapping term -> document frequency from the training set.
        N: Total number of training documents.
        test: Iterable of tokens of the test document.
        words: Iterable of training vocabulary terms.

    Returns:
        Dict mapping term -> weight.
    """
    term_counts = {}
    for token in test:
        term_counts[token] = term_counts.get(token, 0) + 1

    d_test = {}
    for word in set(term_counts).union(words):
        tf = term_counts.get(word, 0)
        df = df_list.get(word, 0)
        # Terms missing from the training set (or with df 0, which would
        # divide by zero) contribute nothing to the vector.
        if tf == 0 or df == 0:
            d_test[word] = 0
        else:
            d_test[word] = (1 + math.log10(tf)) * math.log10(N / df)
    return d_test


# The name matches pytest's "test*" discovery pattern; mark the function so
# test runners do not try to collect it as a test case.
test.__test__ = False
def normalize_test(d_test):
    """Scale the test document vector to unit Euclidean length.

    The vector is modified in place.

    Args:
        d_test: Dict mapping term -> weight.

    Returns:
        The same ``d_test`` object, divided by its Euclidean norm. An
        all-zero (or empty) vector is returned unchanged.
    """
    length = math.sqrt(sum(weight ** 2 for weight in d_test.values()))
    # The norm is zero when the test document shares no weighted term with
    # the training set; dividing would raise ZeroDivisionError.
    if length != 0:
        for word in d_test:
            d_test[word] /= length
    return d_test
def average(vec_nor, words):
    """Compute the centroid (mean vector) of every class.

    Args:
        vec_nor: Mapping class label -> {position -> {term -> weight}}.
        words: Iterable of vocabulary terms; defines the centroid's
            dimensions. Every document vector must contain each term.

    Returns:
        Mapping class label -> {term -> mean weight of that term over
        the documents of the class}.
    """
    average_dis = {}
    for label, class_vectors in vec_nor.items():
        doc_count = len(class_vectors)
        average_dis[label] = {
            word: sum(weights[word] for weights in class_vectors.values()) / doc_count
            for word in words
        }
    return average_dis
def distance(average_dis, d_test_nor, words):
    """Compute the Euclidean distance from the test vector to each centroid.

    Args:
        average_dis: Mapping class label -> {term -> centroid weight}.
        d_test_nor: Dict mapping term -> weight of the test document.
        words: Iterable of vocabulary terms over which the distance is
            taken; each must be a key of ``d_test_nor`` and of every
            centroid.

    Returns:
        Dict mapping class label -> Euclidean distance between the test
        vector and that class's centroid.
    """
    dis_list = {}
    for label, centroid in average_dis.items():
        squared = 0
        for word in words:
            squared += (d_test_nor[word] - centroid[word]) ** 2
        dis_list[label] = math.sqrt(squared)
    return dis_list
# Driver: read the training set, build and normalise the tf-idf vectors,
# then classify one test document by its nearest class centroid.
# `type_counts` / `label` replace the original name `type`, which shadowed
# the builtin at module level.
data, type_counts, N, words = getdata()
print("数据为:", data, "\n词项:", words)

df_list = df_to_list(data, words)
print("df:", df_list)

vec = train(data, df_list, N, words)
print("词项权重:", vec)

# normalize() works in place, so `vec_nor` and `vec` are the same object.
vec_nor = normalize(vec)
print("归一化词项权重:", vec_nor)

txt = input("请输入测试文档:")
inputt = txt.split()
d_test = test(df_list, N, inputt, words)
print("测试集词项权重", d_test)

d_test_nor = normalize_test(d_test)
print("归一化测试集词项权重:", d_test_nor)

average_dis = average(vec_nor, words)
print("平均向量:", average_dis)

distan = distance(average_dis, d_test_nor, words)
print("最终距离是:", distan)

temp = list(distan.items())
print("\n")
for label, value in temp:
    print("测试文档属于{}类的距离为{}".format(label, value))

# The predicted class is the one with the smallest distance; the stable
# sort keeps the first-entered class on ties.
temp.sort(key=lambda x: x[1], reverse=False)
print("\nRocchio分类器判定测试文档属于{}类".format(temp[0][0]))
2.测试样例
Chinese Beijing Chinese
Yes
Chinese Shanghai
Yes
Macao Shandong
Yes
Tokyo Japan Chinese
Yes
Russia Germany Qingdao
Yes
Via China To Korea
No
Overseas Chinese
No
Chinese Contains Shanghai
?