利用Rocchio算法实现测试文档分类


一、实验目的

  通过实验,掌握利用Rocchio算法对文档进行分类的基本原理与实现方法。


二、实验任务与要求

  利用Rocchio算法实现测试文档分类:完成tf-idf的数值计算、词项权重存储、权重归一化以及各类别质心向量的计算,并根据测试文档到各类别质心的距离判定其所属类别。
(此处原有一张示意图,图片未能保留。)


三、code

1.code

import math
# Read the labelled training documents interactively.
def getdata():
    """Prompt for training documents and their class labels until told to stop.

    Each round asks for a whitespace-separated document, its class label and
    whether to continue. Documents are numbered 1, 2, 3, ... in input order.

    Returns:
        A 4-tuple ``(data, class_counts, doc_count, vocabulary)`` where
        ``data`` maps class label -> {document number: list of tokens},
        ``class_counts`` maps class label -> number of documents,
        ``doc_count`` is the total number of documents entered, and
        ``vocabulary`` is the set of all distinct tokens.
    """
    data = {}
    class_counts = {}
    vocabulary = set()
    doc_count = 0

    while True:
        tokens = input("请输入文本:").split()
        # Strip the label so that "Yes" and "Yes " are not treated as two classes.
        label = input("请输入该文本的类别:").strip()

        doc_count += 1
        class_counts[label] = class_counts.get(label, 0) + 1
        vocabulary.update(tokens)
        data.setdefault(label, {})[doc_count] = tokens

        answer = input('要继续输入吗[y/n]:')
        # Accept "n", "N" and padded variants; anything else continues.
        if answer.strip().lower() == 'n':
            break

    return data, class_counts, doc_count, vocabulary
# Compute the document frequency (df) of every term.
def df_to_list(data,words):
    """Count, for each term in ``words``, how many documents contain it.

    Args:
        data: mapping class label -> {document key: token collection}.
        words: iterable of terms to count.

    Returns:
        dict mapping each term to the number of documents (across all
        classes) whose token collection contains that term.
    """
    return {
        term: sum(
            1
            for class_docs in data.values()
            for tokens in class_docs.values()
            if term in tokens
        )
        for term in words
    }


# Compute the tf-idf weight vectors of the training documents.
def train(data,df_list,N,words):
    """Build a tf-idf weight vector for every training document.

    A term occurring ``tf`` > 0 times in a document gets the weight
    ``(1 + log10(tf)) * log10(N / df)``; a term that does not occur gets 0.

    Args:
        data: mapping class label -> {document key: list of tokens}.
        df_list: mapping term -> document frequency.
        N: value used as the total document count in the idf factor.
        words: iterable of vocabulary terms.

    Returns:
        dict mapping class label -> {position: {term: weight}}, where
        positions restart at 1 within each class (the original document
        keys are not kept).
    """
    weights_by_class = {}
    for label, class_docs in data.items():
        class_weights = {}
        for position, tokens in enumerate(class_docs.values(), start=1):
            doc_weights = {}
            for term in words:
                occurrences = tokens.count(term)
                if occurrences:
                    doc_weights[term] = (1 + math.log10(occurrences)) * math.log10(N / df_list[term])
                else:
                    doc_weights[term] = 0
            class_weights[position] = doc_weights
        weights_by_class[label] = class_weights
    return weights_by_class
# Normalise the training vectors to unit length.
def normalize(vec):
    """Scale every training document vector to Euclidean length 1, in place.

    Args:
        vec: mapping class label -> {document key: {term: weight}}.

    Returns:
        The same ``vec`` object, with each document vector divided by its
        Euclidean norm. A vector whose norm is 0 (all weights zero) is left
        unchanged instead of raising ZeroDivisionError.
    """
    for class_vectors in vec.values():
        for doc_vector in class_vectors.values():
            norm = math.sqrt(sum(weight ** 2 for weight in doc_vector.values()))
            if norm == 0:
                # An all-zero vector has no direction; dividing would crash.
                continue
            for word in doc_vector:
                doc_vector[word] /= norm
    return vec

# Compute the tf-idf weight vector of the test document.
def test(df_list,N,test,words):
    """Build the tf-idf weight vector of the test document.

    The vector covers the union of the test document's tokens and ``words``.
    A term gets ``(1 + log10(tf)) * log10(N / df)`` when it occurs in the
    test document and has an entry in ``df_list``; otherwise it gets 0.

    Args:
        df_list: mapping term -> document frequency.
        N: value used as the total document count in the idf factor.
        test: list of tokens of the test document.
        words: iterable of training vocabulary terms.

    Returns:
        dict mapping each term of the combined vocabulary to its weight.
    """
    vocabulary = set(test + list(words))
    weights = {}
    for term in vocabulary:
        occurrences = test.count(term)
        if occurrences != 0 and term in df_list.keys():
            weights[term] = (1 + math.log10(occurrences)) * math.log10(N / df_list[term])
        else:
            # Absent from the test document, or unseen during training.
            weights[term] = 0
    return weights
# Normalise the test vector to unit length.
def normalize_test(d_test):
    """Scale the test document vector to Euclidean length 1, in place.

    Args:
        d_test: mapping term -> weight.

    Returns:
        The same ``d_test`` object with every weight divided by the vector's
        Euclidean norm. If the norm is 0 (for example, the test document has
        no term with a non-zero weight) the vector is returned unchanged
        instead of raising ZeroDivisionError.
    """
    norm = math.sqrt(sum(weight ** 2 for weight in d_test.values()))
    if norm == 0:
        # An all-zero vector cannot be scaled to unit length.
        return d_test

    for word in d_test:
        d_test[word] /= norm

    return d_test
# Average the document vectors of each class (class centroids).
def average(vec_nor,words):
    """Compute the per-class mean weight of every term.

    Args:
        vec_nor: mapping class label -> {document key: {term: weight}}.
        words: iterable of terms to average over.

    Returns:
        dict mapping class label -> {term: mean weight of that term over
        the class's document vectors}.
    """
    centroids = {}
    for label, doc_vectors in vec_nor.items():
        centroid = {}
        for term in words:
            # Accumulate in document order so float rounding is unchanged.
            running_total = 0
            for doc_vector in doc_vectors.values():
                running_total += doc_vector[term]
            centroid[term] = running_total / len(doc_vectors)
        centroids[label] = centroid
    return centroids
# Distance from the test vector to each class centroid.
def distance(average_dis,d_test_nor,words):
    """Compute the Euclidean distance between the test vector and each centroid.

    Only the terms listed in ``words`` take part in the distance.

    Args:
        average_dis: mapping class label -> {term: centroid weight}.
        d_test_nor: mapping term -> weight of the test document.
        words: iterable of terms to compare on.

    Returns:
        dict mapping class label -> Euclidean distance to that centroid.
    """
    distances = {}
    for label, centroid in average_dis.items():
        squared_total = 0
        for term in words:
            gap = d_test_nor[term] - centroid[term]
            squared_total += gap ** 2
        distances[label] = math.sqrt(squared_total)
    return distances


# Interactive driver: read the training set, then print every intermediate result.
# NOTE: `type` (the per-class document counts) shadows the builtin from here on.
data, type,N,words=getdata()
print("数据为:",data,"\n词项:",words)
# Document frequency of every vocabulary term.
df_list=df_to_list(data,words)
print("df:",df_list)
# tf-idf weight vectors of the training documents.
vec=train(data,df_list, N,words)
print("词项权重:",vec)

# normalize() works in place, so vec_nor and vec are the same object.
vec_nor=normalize(vec)
print("归一化词项权重:",vec_nor)

# Read the document to classify and build its weight vector.
txt=input("请输入测试文档:")
inputt=txt.split()
d_test=test(df_list, N,inputt,words)
print("测试集词项权重",d_test)
d_test_nor=normalize_test(d_test)
print("归一化测试集词项权重:",d_test_nor)

# Per-class mean vectors (centroids).
average_dis=average(vec_nor,words)
print("平均向量:",average_dis)

# Distance from the test document to each centroid.
distan=distance(average_dis,d_test_nor,words)
print("最终距离是:",distan)

temp=list(distan.items())
print("\n")
# The loop variable rebinds the module-level name `type` to a class label.
for type, value in temp:
    print("测试文档属于{}类的距离为{}".format(type, value))

# Sort ascending by distance; the first entry is the nearest centroid.
temp.sort(key=lambda x:x[1],reverse=False)
print("\nRocchio分类器判定测试文档属于{}类".format(temp[0][0]))

2.测试样例

Chinese Beijing Chinese
Yes
Chinese Shanghai
Yes
Macao Shandong
Yes
Tokyo Japan Chinese
Yes
Russia Germany Qingdao
Yes
Via China To Korea
No
Overseas Chinese
No

Chinese Contains Shanghai
?

3.实验结果

  • 2
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

幻兒

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值