【知识图谱】基础——命名实体识别、TFIDF、Kmeans代码实现

'''
已知观测序列为O,隐含状态序列为I,提取实体结果{'张三':'per','北京大学':'school'}
O = ['张','三','在','北','京','大','学','读','书']
I = ['per_b','per_i','o','school_b','school_i','school_i','school_i','o','o']
'''

def extract_entities(observations, states):
    """Collect named entities from a character sequence and its tag sequence.

    Tags are expected to look like ``<type>_b`` (first character of an
    entity), ``<type>_i`` (continuation) or ``o`` (outside any entity).
    Whitespace around the type or the position marker is ignored, so a
    sloppy tag such as ``' school _i'`` is still read as ``school`` / ``i``.

    Args:
        observations: Sequence of characters (the observed sequence).
        states: Sequence of tag strings, aligned with ``observations``.

    Returns:
        Dict mapping each entity string to the type taken from its ``_b``
        tag. Empty when no ``_b`` tag is present. If the same entity text
        occurs more than once, the last occurrence wins.
    """
    entities = {}
    chars = []    # characters of the entity currently being built
    kind = None   # type of that entity, taken from its "_b" tag

    def flush():
        if chars:
            entities[''.join(chars)] = kind

    for char, tag in zip(observations, states):
        prefix, _, position = tag.rpartition('_')
        prefix, position = prefix.strip(), position.strip()
        if prefix and position == 'b':
            # A new entity starts; close whatever was open.
            flush()
            chars, kind = [char], prefix
        elif prefix and position == 'i' and chars:
            chars.append(char)
        else:
            # "o", or a continuation tag with no open entity: the entity
            # (if any) ends here, so a later stray "_i" cannot be glued on.
            flush()
            chars, kind = [], None
    flush()
    return entities


O = ['张','三','在','北','京','大','学','读','书']
I =  ['per_b','per_i','o','school_b','school_i',' school_i',' school _i','o','o']
# Always defined, even when the tag sequence contains no entity at all.
word_dict = extract_entities(O, I)

TF-IDF

import numpy as np
import jieba

class TfIdf:
    """Compute TF-IDF weights for a corpus tokenised with jieba.

    Attributes are filled in stages:
        doc:   after construction, a list of token lists (one per document).
        dic:   sorted list of the distinct tokens found in ``doc``.
        tf:    set by ``cal_tf``; one row per document, one column per
               entry of ``dic``.
        idf:   set by ``cal_idf``; 1-D array aligned with ``dic``.
        tfidf: set by ``cal_tfidf``; element-wise ``tf * idf``.
    """

    def __init__(self, doc):
        """Tokenise ``doc`` (an iterable of strings) and build the vocabulary."""
        self.doc = doc
        self.get_dic()

    def get_dic(self):
        """Replace ``self.doc`` with token lists and build ``self.dic``."""
        self.doc = [list(jieba.cut(sent)) for sent in self.doc]
        self.dic = sorted({word for sent in self.doc for word in sent})

    def cal_tf(self):
        """Compute term frequencies, rounded to 4 decimals, into ``self.tf``.

        A document with no tokens gets an all-zero row instead of raising
        ``ZeroDivisionError``.
        """
        from collections import Counter

        rows = []
        for sent in self.doc:
            total = len(sent)
            # Count once per document instead of list.count() per word.
            counts = Counter(sent)
            rows.append([
                round(counts[word] / total, 4) if total else 0.0
                for word in self.dic
            ])
        self.tf = np.array(rows)

    def cal_idf(self):
        """Compute ``log(N / df)``, rounded to 4 decimals, into ``self.idf``.

        ``N`` is the number of documents and ``df`` the number of documents
        containing the word. Every word in ``self.dic`` is expected to occur
        in at least one document (true when ``dic`` comes from ``get_dic``).
        """
        n_docs = len(self.doc)
        # Sets make the per-word membership test O(1) per document.
        doc_sets = [set(sent) for sent in self.doc]
        self.idf = np.array([
            round(np.log(n_docs / sum(word in tokens for tokens in doc_sets)), 4)
            for word in self.dic
        ])

    def cal_tfidf(self):
        """Compute ``tf``, ``idf`` and their product ``self.tfidf``."""
        self.cal_tf()
        self.cal_idf()
        self.tfidf = self.tf * self.idf

if __name__ == "__main__":
    # Three short sample headlines used as a demo corpus.
    doc = [
        '女排北京奥运会夺冠',
        '北京奥运会的羽毛球男单决赛',
        '中国队女排夺北京奥运会金牌重返巅峰观众欢呼女排女排女排',
    ]
    # Tokenise the corpus, then compute tf, idf and tfidf on the instance.
    tf_idf = TfIdf(doc)
    tf_idf.cal_tfidf()

Kmeans

import numpy as np
import os
import random
from tfidf_model import TfIdf

#a = np.array([0, 1, 2, 3, 1, 2, 2])
#print(a[[2,4,2,2,2,2]])
## 设置随机种子
#random.seed(4)

class Kmeans:
    """K-means style clustering of documents over their TF-IDF vectors.

    Documents are compared with cluster centres by cosine similarity, so a
    document is assigned to the centre with the *largest* score.
    """

    def __init__(self, doc, k, max_iter):
        """Store the settings and build TF-IDF features.

        Args:
            doc: The documents; forwarded unchanged to ``TfIdf``.
            k: Number of clusters. The first ``k`` TF-IDF rows seed the
                centres, so ``k`` must not exceed the number of documents.
            max_iter: Upper bound on the number of assign/update rounds
                (at least one round is always performed).
        """
        self.doc = doc
        self.k = k
        self.max_iter = max_iter
        self.tf_idf = TfIdf(doc)
        self.tf_idf.cal_tfidf()

    def train(self):
        """Run the clustering loop and return the final assignment.

        Prints the round number and the current clusters after every
        assignment step.

        Returns:
            Dict mapping each cluster index ``0..k-1`` to a 1-D array of the
            row indices (into ``self.tf_idf.tfidf``) assigned to it.
        """
        # Assumes tfidf is a 2-D array with one row per document.
        vectors = self.tf_idf.tfidf
        # 1. Seed the centres with the first k document vectors
        #    (deterministic, not random).
        cluster_center = {i: vectors[i] for i in range(self.k)}
        cluster_set = {}
        previous = None
        for kmean_iter in range(1, max(self.max_iter, 1) + 1):
            # 2. Similarity of every document to each of the k centres.
            doc_sim = np.array([
                [self.cal_dist(cluster_center[i], sent) for i in range(self.k)]
                for sent in vectors
            ])
            # Highest cosine similarity == closest centre.
            nearest = np.argmax(doc_sim, axis=1)
            # 3. Group the document indices by assigned cluster.
            cluster_set = {
                i: np.argwhere(nearest == i).reshape(-1) for i in range(self.k)
            }
            print(kmean_iter)
            print(cluster_set)
            # 5. Stop once assignments no longer change: the centres would
            #    be recomputed to the same values from here on.
            if previous is not None and np.array_equal(nearest, previous):
                break
            previous = nearest
            # 4. Recompute the centres. An empty cluster keeps its previous
            #    centre; the mean of an empty selection would be NaN and
            #    would poison every later similarity.
            cluster_center = {
                i: (np.mean(vectors[members], axis=0)
                    if members.size else cluster_center[i])
                for i, members in cluster_set.items()
            }
        return cluster_set

    @staticmethod
    def cal_dist(vec1, vec2):
        """Return the cosine similarity of two vectors, rounded to 4 decimals.

        Despite the name this is a similarity (larger means closer).
        Returns 0.0 when either vector has zero norm, instead of NaN.
        """
        norm = np.sqrt(np.dot(vec1, vec1) * np.dot(vec2, vec2))
        if norm == 0:
            return 0.0
        return round(np.dot(vec1, vec2) / norm, 4)
    
if __name__ == "__main__":
    # Read every entry of the corpus directory as one UTF-8 document.
    doc_dir = 'test_text'
    doc = []
    for file_name in os.listdir(doc_dir):
        with open(os.path.join(doc_dir, file_name), encoding="utf-8") as f:
            text = f.read()
        doc.append(text)

    # Cluster the documents with k=3 and max_iter=100.
    kmeans = Kmeans(doc, 3, 100)
    kmeans.train()
    
  • 1
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值