Requirement
Compute the similarity between multiple texts and sort them by similarity.
Task breakdown
Computing the similarity between sentences:
Import the required libraries
Define 10 Chinese sentences
Tokenize with jieba
Build a Word2Vec model
Extract keywords
Compute the similarity between sentences
Find the 3 sentences most similar to sentence 2
Preparation
pip install jieba gensim
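For reference, these are the imports the snippets below assume (a minimal sketch; the MySQL option additionally needs the mysql-connector-python package, installed with pip install mysql-connector-python):
import jieba
import numpy as np
from gensim.models import Word2Vec
import mysql.connector  # only needed for the MySQL option below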
Workflow
Data loading
Option 1: use the test data below
sentences = [
"这部电影真的很好看。",
"我喜欢这部电影。",
"那部电影也挺不错的。",
"我觉得这部影片非常精彩。",
"这个电影太棒了。",
"那部电影我看了两遍。",
"这部电影的导演很有才华。",
"这部影片的剧情很吸引人。",
"我觉得这部电影一般般。",
"那部电影我不是很喜欢。"
]
Option 2: read the data from MySQL
db_config = {
'host': 'xxxxx',
'user': 'xxxx',
'password': 'xxxx',
'database': 'xxxx',
'port':xxxx
}
def read_sentences_from_mysql(host, user, password, database, port):
    # Connect to the MySQL database
    connection = mysql.connector.connect(
        host=host,
        user=user,
        password=password,
        database=database,
        port=port
    )
    # Create a cursor
    cursor = connection.cursor()
    # Query the data: one concatenated string per row (id, NAME, plus whatever other columns are needed)
    query = "SELECT CONCAT_WS(', ', id, NAME /*, other needed columns */) AS info FROM testtable"
    cursor.execute(query)
    # Collect the query results
    sentences = [row[0] for row in cursor.fetchall()]
    # Close the connection
    cursor.close()
    connection.close()
    return sentences
sentences = read_sentences_from_mysql(**db_config)
print(sentences)
Tokenization
Use a list comprehension to walk over every row and tokenize it; the result is the token list for each row.
def tokenize(text):
    return list(jieba.cut(text))
tokenized_sentences = [tokenize(sentence) for sentence in sentences]
print(tokenized_sentences)
Tokenization result, tokenized_sentences:
for tokens in tokenized_sentences:
    print(tokens)
['这部', '电影', '真的', '很', '好看', '。'],
['我', '喜欢', '这部', '电影', '。'],
['那', '部', '电影', '也', '挺不错', '的', '。'],
['我', '觉得', '这部', '影片', '非常', '精彩', '。'],
['这个', '电影', '太棒了', '。'],
['那', '部', '电影', '我', '看', '了', '两遍', '。'],
['这部', '电影', '的', '导演', '很', '有', '才华', '。'],
['这部', '影片', '的', '剧情', '很', '吸引', '人', '。'],
['我', '觉得', '这部', '电影', '一般般', '。'],
['那', '部', '电影', '我', '不是', '很', '喜欢', '。']
Text vectors
model = Word2Vec(tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)
# Compute a sentence vector as the average of its word vectors
def sentence_vector(sentence_tokens):
    vectors = [model.wv[word] for word in sentence_tokens if word in model.wv]
    if not vectors:
        return None
    return np.sum(vectors, axis=0) / len(vectors)
# Compute the vector for every tokenized sentence
sentence_vectors = [sentence_vector(tokens) for tokens in tokenized_sentences]
Similarity computation
from numpy.linalg import norm

def cosine_similarity(vec1, vec2):
    # If either sentence produced no in-vocabulary words, treat similarity as 0
    if vec1 is None or vec2 is None:
        return 0
    dot_product = sum(vec1[i] * vec2[i] for i in range(len(vec1)))
    vec1_norm = norm(vec1)
    vec2_norm = norm(vec2)
    return dot_product / (vec1_norm * vec2_norm)
model here is a Word2Vec object, e.g. Word2Vec<vocab=30, vector_size=100, alpha=0.025>.
The Word2Vec parameters: vector_size is the dimensionality of the word vectors, window is the context window size, min_count is the minimum number of occurrences a word needs to be included in the model, and workers is the number of training threads.
A sentence vector is the average of the vectors of the words it contains, as implemented in sentence_vector above.
Sentence similarity is measured with the cosine similarity of the sentence vectors.
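As a quick sanity check of the average-vector plus cosine-similarity pipeline, the two functions above can be run directly on the 10 test sentences (a sketch; the exact value varies between runs because Word2Vec training here is not deterministic):
# Similarity between test sentence 1 and test sentence 2, using the vectors computed above
sim = cosine_similarity(sentence_vectors[0], sentence_vectors[1])
print(f"similarity(sentence 1, sentence 2) = {sim:.4f}")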
Batched computation
In actual testing the data volume was too large, so the computation was changed to batch processing.
def find_similar_sentences(target_sentence, sentences, sentence_vectors, k=3, batch_size=1000, overlap=100):
    target_tokens = tokenize(target_sentence)
    target_vector = sentence_vector(target_tokens)
    if target_vector is None:
        return []
    num_sentences = len(sentences)
    similar_sentences = []
    # Walk over all sentences in overlapping batches
    for i in range(0, num_sentences, batch_size - overlap):
        start = i
        end = min(i + batch_size, num_sentences)
        current_batch = sentences[start:end]
        current_vectors = sentence_vectors[start:end]
        # Similarity between the target sentence and every sentence in the current batch
        for j, sentence in enumerate(current_batch):
            similarity = cosine_similarity(target_vector, current_vectors[j])
            if similarity > 0:
                similar_sentences.append((sentence, similarity))
    # Sort by similarity, highest first
    similar_sentences.sort(key=lambda x: x[1], reverse=True)
    # Return the k most similar sentences
    return [sentence for sentence, _ in similar_sentences[:k]]
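One caveat about this design: because consecutive batches overlap by overlap sentences, every sentence in an overlap region is scored twice and can therefore appear twice in the top-k result. If that matters, a possible fix (a sketch, not part of the original code) is to keep only the best score per sentence just before the sort inside find_similar_sentences:
    # Hypothetical de-duplication step: keep the highest similarity per sentence
    best_scores = {}
    for sentence, similarity in similar_sentences:
        best_scores[sentence] = max(best_scores.get(sentence, 0), similarity)
    similar_sentences = sorted(best_scores.items(), key=lambda x: x[1], reverse=True)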
Testing
# Set the batch size and the overlap size
batch_size = 7000
overlap = 100
target_sentence = sentences[100]  # picks a target from the full dataset; with the 10-sentence test data use a smaller index, e.g. sentences[1] for the 2nd sentence
top_3_similar_sentences = find_similar_sentences(target_sentence, sentences, sentence_vectors, k=3, batch_size=batch_size, overlap=overlap)
print("Top 3 similar sentences to:", target_sentence)
for sentence in top_3_similar_sentences:
    print(sentence)