# [学习笔记] 神经网络之三：BERT和相似度（1）

于是该作者做了若干个实验，并在论文中给出了实验结果。同时也有python库的实现，那就是sentence-transformers，安装：

pip install sentence-transformers

# 1.application_clustering.py

"""
This is a simple application for sentence embeddings: clustering
Sentences are mapped to sentence embeddings and then k-mean clustering is applied.
"""
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from utils import show_cluster_image

# Demo: embed a small corpus with Sentence-BERT, group the sentences with
# k-means, then print and plot the clusters.
# FIX: the pasted snippet had lost all indentation (SyntaxError); restored here.
if __name__ == '__main__':
    embedder = SentenceTransformer('bert-base-nli-mean-tokens')
    # Corpus with example sentences
    corpus = ['A man is eating food.',
              'A man is eating a piece of bread.',
              'A man is eating pasta.',
              'The girl is carrying a baby.',
              'The baby is carried by the woman',
              'A man is riding a horse.',
              'A man is riding a white horse on an enclosed ground.',
              'A monkey is playing drums.',
              'Someone in a gorilla costume is playing a set of drums.',
              'A cheetah is running behind its prey.',
              'A cheetah chases prey on across a field.'
              ]
    # One embedding vector per sentence.
    corpus_embeddings = embedder.encode(corpus)
    # Perform k-means clustering on the sentence embeddings.
    num_clusters = 5
    clustering_model = KMeans(n_clusters=num_clusters)
    clustering_model.fit(corpus_embeddings)
    cluster_assignment = clustering_model.labels_

    # Bucket the original sentences by their assigned cluster id.
    clustered_sentences = [[] for _ in range(num_clusters)]
    for sentence_id, cluster_id in enumerate(cluster_assignment):
        clustered_sentences[cluster_id].append(corpus[sentence_id])

    for i, cluster in enumerate(clustered_sentences):
        print("Cluster ", i)
        print(cluster)
    # Show the clustering result as a 2-D scatter plot.
    show_cluster_image(corpus_embeddings, cluster_assignment)

import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

def show_cluster_image(vectors, labels, excepted_labels=None):
    """Draw a 2-D scatter plot of clustered points (at most 8 clusters).

    :param vectors: embedding vectors, one per sample
    :param labels: cluster label of each sample (parallel to ``vectors``)
    :param excepted_labels: labels to exclude from the plot, or None
    :return: None (shows the plot)
    """
    # Reduce the embeddings to 2 dimensions so they can be drawn.
    estimator = PCA(n_components=2)
    data_set = estimator.fit_transform(vectors)
    # Group the projected points by their cluster label.
    clusters = {}
    for datum, label in zip(data_set, labels):
        # Excluded labels (e.g. outliers) are currently not displayed.
        if excepted_labels and label in excepted_labels:
            continue
        clusters.setdefault(label, []).append(datum)
    # Draw one scatter series per cluster.
    for label, array in clusters.items():
        matrix = np.array(array)
        # FIX: legend label was misspelled 'cluter%d' in the original.
        plt.scatter(matrix[:, 0], matrix[:, 1], label='cluster%d' % label)

    plt.legend(loc='upper right')
    plt.show()

show_cluster_image先对向量使用了PCA降到了2维，然后以散点图的形式展现出来。

Cluster  1
['A man is eating food.', 'A man is eating a piece of bread.', 'A man is eating pasta.']
Cluster  0
['The girl is carrying a baby.', 'The baby is carried by the woman']
Cluster  3
['A man is riding a horse.', 'A man is riding a white horse on an enclosed ground.']
Cluster  4
['A monkey is playing drums.', 'Someone in a gorilla costume is playing a set of drums.']
Cluster  2
['A cheetah is running behind its prey.', 'A cheetah chases prey on across a field.']

# 3.源代码部分

## 3.1 得到句向量

    model_name = 'bert-base-uncased'
    # Use BERT to map each token to its contextual embedding
    word_embedding_model = models.BERT(model_name)

    # Use mean pooling over token embeddings (CLS and max pooling disabled)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

创建了一个SentenceTransformer类，该类继承自nn.Sequential：

    # Chain BERT + pooling into a SentenceTransformer (subclass of nn.Sequential)
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

SentenceTransformer的功能之一在于输入文本，返回这个文本的embedding。详细流程如下：

(batch_size, texts)=>BERT=>Pooling=>(batch, 768)

## 3.2 整合

    # Softmax classification loss over the (combined) sentence embeddings
    train_loss = losses.SoftmaxLoss(model=model,sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels)


SoftmaxLoss类继承自nn.Module，它内部使用到了softmax和交叉熵损失函数。

    def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor):
        """Compute the softmax classification loss for a sentence pair.

        :param sentence_features: features for exactly two sentences; each is
            passed through ``self.model`` to obtain its sentence embedding
        :param labels: gold class ids, or None
        :return: cross-entropy loss if ``labels`` is given, otherwise the
            tuple ``(reps, output)`` of embeddings and classifier logits
        """
        # Encode both sentences to pooled sentence embeddings u and v.
        reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features]
        rep_a, rep_b = reps

        # Build the classifier input from the configured combination of
        # (u, v), |u - v| and u * v.
        vectors_concat = []
        if self.concatenation_sent_rep:
            vectors_concat.append(rep_a)
            vectors_concat.append(rep_b)

        if self.concatenation_sent_difference:
            vectors_concat.append(torch.abs(rep_a - rep_b))

        if self.concatenation_sent_multiplication:
            vectors_concat.append(rep_a * rep_b)

        features = torch.cat(vectors_concat, 1)

        # Linear classifier over the concatenated features, then cross-entropy.
        output = self.classifier(features)
        loss_fct = nn.CrossEntropyLoss()

        if labels is not None:
            loss = loss_fct(output, labels.view(-1))
            return loss
        else:
            return reps, output

## 3.3 评估

"""

"""
from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging

# Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
# /print debug information to stdout

if __name__ == '__main__':
    batch_size = 16
    model = SentenceTransformer('bert-base-nli-mean-tokens')
    # The dev set is used for evaluation.
    # NOTE(review): `evaluator` is never defined in this excerpt — the code
    # that builds an EmbeddingSimilarityEvaluator from the dev set was
    # omitted from the article; this call fails with a NameError as-is.
    model.evaluate(evaluator)

EmbeddingSimilarityEvaluator类使用了余弦相似度、曼哈顿距离、欧几里得距离和点积相似度进行了评估，结果如下：

2020-03-31 11:06:33 - Cosine-Similarity :	Pearson: 0.7415	Spearman: 0.7698
2020-03-31 11:06:33 - Manhattan-Distance:	Pearson: 0.7730	Spearman: 0.7712
2020-03-31 11:06:33 - Euclidean-Distance:	Pearson: 0.7713	Spearman: 0.7707
2020-03-31 11:06:33 - Dot-Product-Similarity:	Pearson: 0.7273	Spearman: 0.7270


## 3.4 相似度

from sentence_transformers import SentenceTransformer
import scipy.spatial

# Demo: semantic search — embed a corpus and a few queries, then rank the
# corpus sentences by cosine similarity to each query.
# FIX: the pasted snippet had lost all indentation (SyntaxError); restored here.
if __name__ == '__main__':
    embedder = SentenceTransformer('bert-base-nli-mean-tokens')
    # Example corpus
    corpus = ['A man is eating food.',
              'A man is eating a piece of bread.',
              'The girl is carrying a baby.',
              'A man is riding a horse.',
              'A woman is playing violin.',
              'Two men pushed carts through the woods.',
              'A man is riding a white horse on an enclosed ground.',
              'A monkey is playing drums.',
              'A cheetah is running behind its prey.'
              ]
    corpus_embeddings = embedder.encode(corpus)
    # Sentences to query with
    queries = ['A man is eating pasta.', 'Someone in a gorilla costume is playing a set of drums.', 'A cheetah chases prey on across a field.']
    query_embeddings = embedder.encode(queries)
    # For each query, retrieve the 5 closest corpus sentences by cosine distance.
    closest_n = 5
    for query, query_embedding in zip(queries, query_embeddings):
        distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, "cosine")[0]
        # Sort ascending by cosine distance (smaller distance = more similar).
        results = sorted(zip(range(len(distances)), distances), key=lambda x: x[1])
        print("======================")
        print("Query:", query)
        print("Result:Top 5 most similar sentences in corpus:")
        for idx, distance in results[0:closest_n]:
            # Report similarity as 1 - cosine distance.
            print(corpus[idx].strip(), "(Score: %.4f)" % (1 - distance))

该示例与聚类类似，它同样是生成了句向量，然后使用余弦相似度来获取最相近的若干个句子，结果如下：

======================
Query: A man is eating pasta.
Top 5 most similar sentences in corpus:
A man is eating a piece of bread. (Score: 0.8480)
A man is eating food. (Score: 0.7759)
Two men pushed carts through the woods. (Score: 0.2095)
A monkey is playing drums. (Score: 0.1945)
A man is riding a white horse on an enclosed ground. (Score: 0.1586)
======================
Query: Someone in a gorilla costume is playing a set of drums.
Top 5 most similar sentences in corpus:
A monkey is playing drums. (Score: 0.7985)
A cheetah is running behind its prey. (Score: 0.2860)
The girl is carrying a baby. (Score: 0.2351)
A man is riding a horse. (Score: 0.2023)
A man is riding a white horse on an enclosed ground. (Score: 0.1963)
======================
Query: A cheetah chases prey on across a field.
Top 5 most similar sentences in corpus:
A cheetah is running behind its prey. (Score: 0.9007)
Two men pushed carts through the woods. (Score: 0.3662)
A monkey is playing drums. (Score: 0.3061)
A man is riding a horse. (Score: 0.2930)
A man is riding a white horse on an enclosed ground. (Score: 0.2718)

# 参考：

sentence-transformers

Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks

11-29 3万+

12-07 5万+

05-12 4931

05-25 253

04-05 2万+

04-08 617

06-14 885

02-10 7万+

#### 《MySQL 性能优化》之理解 MySQL 体系结构

©️2020 CSDN 皮肤主题: 编程工作室 设计师: CSDN官方博客

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、C币套餐、付费专栏及课程。