已知观测序列为 O，隐含状态序列为 I，提取实体结果 {'张三': 'per', '北京大学': 'school'}
O = ['张','三','在','北','京','大','学','读','书']
I = ['per_b','per_i','o','school_b','school_i',' school_i',' school _i','o','o']
‘’’
def extract_entities(observations, states):
    """Extract entities from a BIO-tagged sequence.

    Pairs each observation with its hidden-state tag, drops 'o' (outside)
    tags, then groups the remaining pairs into entities: each group starts
    at a '*_b' tag and runs until the next '*_b' tag (or the end).

    Args:
        observations: sequence of tokens (e.g. single characters).
        states: sequence of BIO tags aligned with ``observations``
            ('per_b', 'per_i', 'school_b', ..., 'o').

    Returns:
        dict mapping entity text (joined tokens) to its type, taken from
        the prefix of the group's first tag before '_'.
    """
    # Keep only in-entity (token, tag) pairs.
    pairs = [(tok, tag) for tok, tag in zip(observations, states) if tag != 'o']
    # Positions where a new entity begins.
    starts = [i for i, (_, tag) in enumerate(pairs) if '_b' in tag]
    entities = {}
    if starts:
        # Sentinel end index so the last entity is closed too.
        boundaries = starts + [len(pairs)]
        for begin, end in zip(boundaries, boundaries[1:]):
            chunk = pairs[begin:end]
            text = ''.join(tok for tok, _ in chunk)
            # Entity type comes from the first tag's prefix, e.g. 'per_b' -> 'per'.
            entities[text] = chunk[0][1].split('_')[0]
    return entities


O = ['张','三','在','北','京','大','学','读','书']
I = ['per_b','per_i','o','school_b','school_i',' school_i',' school _i','o','o']
word_dict = extract_entities(O, I)
#print(word_dict)
# TF-IDF
import numpy as np
import jieba
class TfIdf:
    """Bag-of-words TF-IDF over a list of raw sentences, tokenized with jieba."""

    def __init__(self, doc):
        # Raw sentences; get_dic() replaces this with the tokenized form.
        self.doc = doc
        self.get_dic()

    def get_dic(self):
        """Tokenize every sentence and build the sorted vocabulary."""
        self.doc = [list(jieba.cut(sent)) for sent in self.doc]
        vocab = set()
        for sent in self.doc:
            vocab.update(sent)
        self.dic = sorted(vocab)

    def cal_tf(self):
        """Term frequency of each vocabulary word per document, rounded to 4 dp."""
        rows = []
        for sent in self.doc:
            total = len(sent)
            rows.append([round(sent.count(word) / total, 4) for word in self.dic])
        self.tf = np.array(rows)

    def cal_idf(self):
        """Inverse document frequency log(N / df), rounded to 4 dp."""
        n_docs = len(self.doc)
        values = []
        for word in self.dic:
            df = sum(1 for sent in self.doc if word in sent)
            values.append(round(np.log(n_docs / df), 4))
        self.idf = np.array(values)

    def cal_tfidf(self):
        """Element-wise TF * IDF matrix, stored on ``self.tfidf``."""
        self.cal_tf()
        self.cal_idf()
        self.tfidf = self.tf * self.idf
if __name__=="__main__":
    # Small demo corpus: compute the TF-IDF matrix for three headlines.
    corpus = [
        '女排北京奥运会夺冠',
        '北京奥运会的羽毛球男单决赛',
        '中国队女排夺北京奥运会金牌重返巅峰观众欢呼女排女排女排',
    ]
    model = TfIdf(corpus)
    model.cal_tfidf()
# K-means
import numpy as np
import os
import random
from tfidf_model import TfIdf
#a = np.array([0, 1, 2, 3, 1, 2, 2])
#print(a[[2,4,2,2,2,2]])
## 设置随机种子
#random.seed(4)
class Kmeans:
    """K-means clustering of documents over TF-IDF vectors, using cosine
    similarity as the closeness measure.

    Fixes over the naive version:
    - ``cal_dist`` returns 0.0 for a zero-norm vector instead of dividing
      by zero and producing NaN.
    - empty clusters keep their previous center instead of taking
      ``np.mean`` of an empty slice (which yields NaN centers).
    - ``train`` stops early once assignments stop changing, and returns
      the final cluster partition.
    """

    def __init__(self, doc, k, max_iter):
        """doc: list of raw documents; k: number of clusters (requires
        len(doc) >= k); max_iter: iteration budget."""
        self.doc = doc
        self.k = k
        self.max_iter = max_iter
        self.tf_idf = TfIdf(doc)
        self.tf_idf.cal_tfidf()

    def train(self):
        """Run Lloyd's iterations; return {cluster_index: array of doc indices}."""
        # 1. Initialize the k cluster centers with the first k document
        #    vectors (deterministic; assumes len(doc) >= k).
        cluster_center = {i: self.tf_idf.tfidf[i] for i in range(self.k)}
        prev_assign = None
        kmean_iter = 1
        while True:
            # 2. Cosine similarity of every document to each center.
            doc_dist = np.array(
                [[self.cal_dist(cluster_center[i], sent) for i in range(self.k)]
                 for sent in self.tf_idf.tfidf]
            )
            # argmax, not argmin: cal_dist is a similarity, larger = closer.
            doc_dist_argsort = np.argmax(doc_dist, axis=1)
            # 3. Partition document indices by their nearest center.
            cluster_set = {i: np.argwhere(doc_dist_argsort == i).reshape(-1)
                           for i in range(self.k)}
            # 4. Recompute centers; an empty cluster keeps its old center
            #    so np.mean never runs on an empty slice.
            cluster_center = {
                i: (np.mean(self.tf_idf.tfidf[cluster_set[i]], axis=0)
                    if len(cluster_set[i]) else cluster_center[i])
                for i in range(self.k)
            }
            # 5. Stop when assignments converge or the budget is exhausted.
            if prev_assign is not None and np.array_equal(prev_assign, doc_dist_argsort):
                break
            prev_assign = doc_dist_argsort
            if kmean_iter > self.max_iter:
                break
            kmean_iter += 1
        print(kmean_iter)
        print(cluster_set)
        return cluster_set

    @staticmethod
    def cal_dist(vec1, vec2):
        """Cosine similarity rounded to 4 dp; 0.0 if either vector is all-zero."""
        denom = np.dot(vec1, vec1) * np.dot(vec2, vec2)
        if denom == 0:
            return 0.0
        return round(np.dot(vec1, vec2) / np.sqrt(denom), 4)
if __name__ == "__main__":
    # Read every file under the corpus directory and cluster the documents.
    corpus_dir = 'test_text'
    texts = []
    for fname in os.listdir(corpus_dir):
        path = os.path.join(corpus_dir, fname)
        with open(path, encoding="utf-8") as fh:
            texts.append(fh.read())
    clusterer = Kmeans(texts, 3, 100)
    clusterer.train()