# bert2vec + kmeans: cluster text lines by their BERT vectors using k-means

from bert_serving.client import BertClient
from sklearn.cluster import KMeans

#ivy_nie
# Module-level BERT client shared by wordsCluster; it is constructed once,
# at import time, with the library's default arguments.
# NOTE(review): presumably needs a reachable bert-serving server - confirm.
bc = BertClient()
def wordsCluster(text, vectorSize, classCount):
    """Cluster the lines of a text file with k-means over their BERT vectors.

    Every distinct, non-blank line of ``text`` is encoded with the
    module-level ``bc`` client, the vectors are grouped into ``classCount``
    clusters, and one ``label_<i>:<members>`` row per cluster is printed and
    written to ``result.txt`` in the current working directory.

    Args:
        text: Local path of the UTF-8 input file, one item per line.
        vectorSize: Word-vector size. Not used by this function; kept so
            that existing callers keep working.
        classCount: Number of clusters (the k of k-means).

    Raises:
        ValueError: If ``text`` contains no non-blank line.
    """
    # dict.fromkeys de-duplicates in O(n) while keeping first-seen order.
    # Blank lines are skipped: there is nothing to embed for them.
    with open(text, 'r', encoding='utf-8') as data:
        stripped = (line.replace('\n', '') for line in data)
        name = list(dict.fromkeys(line for line in stripped if line.strip()))

    if not name:
        raise ValueError('no non-blank lines found in {!r}'.format(text))

    # Encode exactly the entries that are reported below, so that row j of
    # the matrix always belongs to name[j]. (The earlier version encoded the
    # lines of a hard-coded 'doc.txt', which need not line up with `name`.)
    wordvector = bc.encode(name)

    # Cluster the vectors; s[j] is the cluster index assigned to name[j].
    clf = KMeans(n_clusters=classCount)
    s = clf.fit_predict(wordvector)

    # Bucket the names by cluster in a single pass.
    clusters = [[] for _ in range(classCount)]
    for item, cluster_id in zip(name, s):
        clusters[cluster_id].append(item)

    # Open the output only after clustering succeeded, so a failure above
    # does not truncate a previous result file.
    with open('result.txt', 'w', encoding='utf-8') as out:
        for i, members in enumerate(clusters):
            row = 'label_{}:{}'.format(i, members)
            print(row)
            out.write(row + '\n')

# Script entry: cluster the lines of ./wan.txt into 500 groups.
wordsCluster('./wan.txt', 300, 500)

  • 2
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值