"""Cluster BERT representations of two Chinese word lists (animals vs. place
names) with KMeans and visualize them in 2-D via PCA."""
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
model = AutoModelForMaskedLM.from_pretrained("bert-base-chinese")
model.eval()  # inference mode (disables dropout)

km = KMeans(n_clusters=2)
pca = PCA(n_components=2)


def load_words(path, limit=30):
    """Read the first `limit` words from a THUOCL word-frequency file.

    Each THUOCL line is "word<whitespace>frequency"; keep only the word.
    The original fed whole raw lines (trailing newline and frequency column
    included) straight into the tokenizer, polluting the embeddings.
    """
    # `with` closes the handle; the original leaked it.
    with open(path, encoding='utf-8') as f:
        return [line.split()[0] for line in f.readlines()[:limit] if line.strip()]


def embed(words):
    """Return the position-0 ([CLS]) vectors for a batch of words.

    NOTE(review): AutoModelForMaskedLM's first output is vocabulary logits,
    so [:, 0, :] is a vocab-sized logit vector, not a hidden-state embedding.
    Kept for parity with the original script; use AutoModel if true sentence
    embeddings are intended — TODO confirm with the author.
    """
    # truncation=True is required for max_length to actually take effect.
    enc = tokenizer(words, return_tensors='pt', padding=True,
                    truncation=True, max_length=6)
    with torch.no_grad():  # no autograd graph needed for inference
        return model(**enc)[0][:, 0, :]


w1 = load_words('THUOCL_animal.txt')  # first 30 animal words, for convenience
w2 = load_words('THUOCL_diming.txt')  # first 30 place-name words
# Bug fix: the original left `corpus` empty, so the annotation loop below
# never drew a single label.
corpus = w1 + w2

vectors = torch.cat((embed(w1), embed(w2)), dim=0)
vectors_ = pca.fit_transform(vectors.numpy())  # reduce to 2-D for plotting
y_ = km.fit_predict(vectors_)                  # cluster the 2-D points
print(y_)

plt.rcParams['font.sans-serif'] = ['FangSong']  # a CJK-capable font so labels render
plt.scatter(vectors_[:, 0], vectors_[:, 1], c=y_)  # one dot per word, colored by cluster
for i, word in enumerate(corpus):  # label every point with its word
    # matplotlib >= 3.3 removed annotate's `s=` keyword — pass the text positionally.
    plt.annotate(word, xy=(vectors_[i, 0], vectors_[i, 1]),
                 xytext=(vectors_[i, 0] + 0.1, vectors_[i, 1] + 0.1))
plt.show()
# bert_文本聚类_1 (BERT text clustering, part 1)
# Scraped blog metadata, not code: "latest recommended article published 2024-06-24 22:35:37"