利用Rocchio算法实现测试文档分类


一、实验目的

  通过实验,掌握利用Rocchio算法对测试文档进行分类的基本原理与实现方法。


二、实验任务与要求

  利用Rocchio算法实现测试文档分类。完成tf-idf的数值计算、词项权重存储、权重归一化以及各类别质心向量的计算。
在这里插入图片描述


三、code

1.code

import math
# Read the labelled training documents interactively
def getdata():
    """Prompt for training documents and their class labels until told to stop.

    Each round asks for a whitespace-separated document, then its class
    label, then whether to continue. Only the exact answer ``n`` ends the
    loop; any other answer starts another round.

    Returns:
        A 4-tuple ``(data, type_counts, count, vocabulary)`` where
        ``data`` maps class label -> {document number -> token list}
        (documents are numbered from 1 in entry order, across all classes),
        ``type_counts`` maps class label -> number of documents entered,
        ``count`` is the total number of documents entered, and
        ``vocabulary`` is the set of distinct tokens over all documents.
    """
    docs_by_label = {}
    label_counts = {}
    vocabulary = set()
    doc_id = 0
    keep_going = True

    while keep_going:
        doc_id += 1
        tokens = input("请输入文本:").split()
        label = input("请输入该文本的类别:")

        label_counts[label] = label_counts.get(label, 0) + 1
        vocabulary.update(tokens)
        # First document of a class creates its sub-dict; later ones join it.
        docs_by_label.setdefault(label, {})[doc_id] = tokens

        keep_going = input('要继续输入吗[y/n]:') != 'n'

    return docs_by_label, label_counts, doc_id, vocabulary
# Document frequency (df) of every vocabulary term
def df_to_list(data, words):
    """Count, for each term, how many training documents contain it.

    Args:
        data: mapping of class label -> {document number -> token list}.
        words: iterable of terms to count.

    Returns:
        Dict mapping each term in ``words`` to the number of documents
        (over all classes) whose token list contains the term at least once.
    """
    # Class membership is irrelevant for df, so flatten to one document list.
    documents = [
        tokens
        for per_class in data.values()
        for tokens in per_class.values()
    ]
    return {
        term: sum(1 for tokens in documents if term in tokens)
        for term in words
    }


# tf-idf weight vectors for the training set
def train(data, df_list, N, words):
    """Compute a tf-idf weight vector for every training document.

    For a term occurring ``tf`` > 0 times in a document the weight is
    ``(1 + log10(tf)) * log10(N / df)``; absent terms get weight 0.

    Args:
        data: mapping of class label -> {document number -> token list}.
        df_list: mapping of term -> document frequency.
        N: value used as the numerator of the idf ratio (the caller passes
            the total number of training documents).
        words: the vocabulary; every vector has one entry per term in it.

    Returns:
        Dict mapping class label -> {position -> {term -> weight}}. Positions
        restart at 1 within each class, in the order the class's documents
        appear in ``data``; the original document numbers are not kept.
    """
    weights_by_label = {}
    for label, documents in data.items():
        per_document = {}
        for position, tokens in enumerate(documents.values(), start=1):
            weights = {}
            for term in words:
                occurrences = tokens.count(term)
                weights[term] = (
                    (1 + math.log10(occurrences)) * math.log10(N / df_list[term])
                    if occurrences
                    else 0
                )
            per_document[position] = weights
        weights_by_label[label] = per_document

    return weights_by_label
# L2-normalise the training vectors
def normalize(vec):
    """Scale every training document vector to unit Euclidean length, in place.

    Args:
        vec: mapping of class label -> {position -> {term -> weight}}.

    Returns:
        The same ``vec`` object, with each innermost weight dict divided by
        its Euclidean norm. A vector whose norm is 0 (all weights zero, e.g.
        when every term occurs in every document so each idf is 0) is left
        unchanged instead of raising ``ZeroDivisionError``.
    """
    for documents in vec.values():
        for weights in documents.values():
            length = math.sqrt(sum(value ** 2 for value in weights.values()))
            if length == 0:
                # An all-zero vector has no direction to preserve; dividing
                # by its norm would crash, so keep it as it is.
                continue
            for term in weights:
                weights[term] /= length
    return vec

# tf-idf weight vector for the test document
def test(df_list, N, test, words):
    """Compute the tf-idf weight vector of the test document.

    The vector has one entry for every term in the union of the test
    document's tokens and ``words``. A term gets the weight
    ``(1 + log10(tf)) * log10(N / df)`` when it occurs in the test document
    and has an entry in ``df_list``; every other term gets weight 0.

    Args:
        df_list: mapping of term -> document frequency.
        N: value used as the numerator of the idf ratio.
        test: the test document as a list of tokens.
        words: the training vocabulary.

    Returns:
        Dict mapping term -> weight.
    """
    vocabulary = set(test + list(words))
    weights = {}
    for term in vocabulary:
        occurrences = test.count(term)
        if occurrences and term in df_list:
            weights[term] = (1 + math.log10(occurrences)) * math.log10(N / df_list[term])
        else:
            # Term is absent from the test document or has no df entry.
            weights[term] = 0

    return weights
# L2-normalise the test vector
def normalize_test(d_test):
    """Scale the test document vector to unit Euclidean length, in place.

    Args:
        d_test: mapping of term -> weight for the test document.

    Returns:
        The same ``d_test`` object with every weight divided by the vector's
        Euclidean norm. When the norm is 0 (for example, no term of the test
        document carries a non-zero weight) the vector is returned unchanged
        instead of raising ``ZeroDivisionError``.
    """
    length = math.sqrt(sum(value ** 2 for value in d_test.values()))
    if length == 0:
        # Nothing to scale: an all-zero (or empty) vector stays as it is.
        return d_test

    for term in d_test:
        d_test[term] /= length

    return d_test
# Per-class centroid (mean weight vector)
def average(vec_nor, words):
    """Average the document vectors of each class term by term.

    Args:
        vec_nor: mapping of class label -> {position -> {term -> weight}}.
        words: the terms to average over; each document vector must have an
            entry for every one of them.

    Returns:
        Dict mapping class label -> {term -> mean weight over that class's
        documents}.
    """
    centroids = {}
    for label, documents in vec_nor.items():
        # Running per-term totals, filled one document at a time.
        totals = dict.fromkeys(words, 0)
        for weights in documents.values():
            for term in totals:
                totals[term] += weights[term]
        size = len(documents)
        centroids[label] = {term: total / size for term, total in totals.items()}

    return centroids
# Distance from the test vector to each class centroid
def distance(average_dis, d_test_nor, words):
    """Compute the Euclidean distance between the test vector and each centroid.

    Args:
        average_dis: mapping of class label -> {term -> centroid weight}.
        d_test_nor: mapping of term -> weight for the test document.
        words: the terms over which the distance is taken; both vectors must
            have an entry for each of them.

    Returns:
        Dict mapping class label -> distance to that class's centroid.
    """
    def euclidean(centroid):
        # Accumulate squared per-term gaps, then take the root.
        squared = 0
        for term in words:
            squared += (d_test_nor[term] - centroid[term]) ** 2
        return math.sqrt(squared)

    return {label: euclidean(centroid) for label, centroid in average_dis.items()}


def main():
    """Run the interactive Rocchio demo end to end.

    Reads the labelled training documents, prints each intermediate result
    (document frequencies, raw and normalised tf-idf weights, class
    centroids), reads one test document, and reports the class whose
    centroid is nearest to it.
    """
    # The per-class document counts returned by getdata() are not needed here.
    data, _class_counts, N, words = getdata()
    print("数据为:", data, "\n词项:", words)
    df_list = df_to_list(data, words)
    print("df:", df_list)
    vec = train(data, df_list, N, words)
    print("词项权重:", vec)

    vec_nor = normalize(vec)
    print("归一化词项权重:", vec_nor)

    txt = input("请输入测试文档:")
    inputt = txt.split()
    d_test = test(df_list, N, inputt, words)
    print("测试集词项权重", d_test)
    d_test_nor = normalize_test(d_test)
    print("归一化测试集词项权重:", d_test_nor)

    average_dis = average(vec_nor, words)
    print("平均向量:", average_dis)

    distan = distance(average_dis, d_test_nor, words)
    print("最终距离是:", distan)

    print("\n")
    for label, value in distan.items():
        print("测试文档属于{}类的距离为{}".format(label, value))

    # The nearest centroid wins; on a tie min() keeps the class seen first,
    # the same choice a stable ascending sort would put in front.
    nearest_label, _ = min(distan.items(), key=lambda item: item[1])
    print("\nRocchio分类器判定测试文档属于{}类".format(nearest_label))


# Only prompt for input when executed as a script, not when imported.
if __name__ == "__main__":
    main()

2.测试样例

Chinese Beijing Chinese
Yes
Chinese Shanghai
Yes
Macao Shandong
Yes
Tokyo Japan Chinese
Yes
Russia Germany Qingdao
Yes
Via China To Korea
No
Overseas Chinese
No

Chinese Contains Shanghai
?

3.实验结果

  • 2
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
Rocchio算法是一种基于向量空间模型的文本分类算法,其思想是为每个类别计算训练文档向量的质心(平均向量),再将测试文档的向量表示与各类别的质心进行比较,把测试文档归入最相似的质心所在的类别。以下是一个基于Rocchio算法进行测试文档分类的Python代码示例:

```python
import numpy as np

class RocchioClassifier:
    def __init__(self, alpha=1, beta=0.75, threshold=0):
        self.alpha = alpha  # 加权因子
        self.beta = beta  # 减权因子
        self.threshold = threshold  # 判断阈值

    def fit(self, X, y):
        # 计算各个类别的文档向量的平均值
        self.class_means = {}
        for label in np.unique(y):
            self.class_means[label] = np.mean(X[y == label], axis=0)

    def predict(self, X):
        y_pred = []
        for x in X:
            # 计算测试文档向量与各个类别的文档向量的余弦相似度
            similarities = {}
            for label, mean in self.class_means.items():
                similarities[label] = np.dot(x, mean) / (np.linalg.norm(x) * np.linalg.norm(mean))
            # 根据余弦相似度最大的类别来预测测试文档的类别
            max_label = max(similarities, key=similarities.get)
            if similarities[max_label] >= self.threshold:
                y_pred.append(max_label)
            else:
                y_pred.append(None)
        return y_pred

    def fit_predict(self, X_train, y_train, X_test):
        self.fit(X_train, y_train)
        return self.predict(X_test)
```

使用示例:

```python
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# 加载数据集
newsgroups = fetch_20newsgroups(subset='all')

# 特征提取
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(newsgroups.data)
y = newsgroups.target

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 训练并预测
clf = RocchioClassifier()
y_pred = clf.fit_predict(X_train, y_train, X_test)

# 评估分类器性能
print(classification_report(y_test, y_pred, target_names=newsgroups.target_names))
```
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

幻兒

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值