#-*- coding:utf-8 -*-
'''
Step 1: split a sentence into individual words.

Requires jieba: pip install jieba
'''
import jieba
import numpy as np  # used to convert Python lists into arrays
from sklearn.metrics.pairwise import cosine_similarity
s1 = "我来学习python!"

# Mode 1 - accurate mode (the default); the one best suited to text analysis.
s1_result_list = jieba.lcut(s1)
print(s1_result_list)

# Mode 2 - full mode.
s1_result_list = jieba.lcut(s1, cut_all=True)
print(s1_result_list)

# Mode 3 - search-engine mode.
s1_result_list = jieba.lcut_for_search(s1)
print(s1_result_list)
# Candidate sentences and the question they are compared against further down.
s2 = "我们来学习python"
s3 = "我来学习python"
question = "python学习多久"
# List that collects the vector built by the inline sketch commented out below.
s_vector = []
# Vocabulary: each entry is one dimension of the sentence vectors.
word_vetor_list = ["我们","来","贪心","学院","学习"]
#
# Inline sketch of the vectorisation for s1:
# for i in word_vetor_list:
#     if i in list(jieba.cut(s1)):
#         s_vector.append(1)
#     else:
#         s_vector.append(0)
#
# print(s_vector)
def get_vetor(data):
    """Encode a sentence as a binary bag-of-words row vector.

    The sentence is segmented with ``jieba.cut`` and checked against the
    module-level vocabulary ``word_vetor_list``: element ``i`` of the result
    is 1 when the i-th vocabulary word appears among the tokens, otherwise 0.
    The sentence and its vector are printed as a side effect.

    Args:
        data: The sentence (str) to encode.

    Returns:
        A NumPy array of shape ``(1, len(word_vetor_list))``.
    """
    # Segment once, outside the loop: tokenizing is the expensive step and
    # its result does not depend on which vocabulary word is being checked.
    tokens = set(jieba.cut(data))
    vetor_list = [1 if word in tokens else 0 for word in word_vetor_list]
    print(data)
    print(vetor_list)
    # reshape(1, -1) turns the flat list into a single-row 2-D array.
    return np.array(vetor_list).reshape(1, -1)
# Vectorise the question and every candidate sentence over the same vocabulary.
question_vetor_list = get_vetor(question)
s1_result_list = get_vetor(s1)
s2_result_list = get_vetor(s2)
s3_result_list = get_vetor(s3)

# Similarity: cosine similarity between the question and each sentence.
# cosine_similarity needs 2-D arrays; passing a flat list raises an error,
# which is why get_vetor reshapes its result into a single row.
print(cosine_similarity(question_vetor_list, s1_result_list))
print(cosine_similarity(question_vetor_list, s2_result_list))
print(cosine_similarity(question_vetor_list, s3_result_list))
# Notes on learning jieba
# (Web-page footer: latest recommended article published 2022-09-14 08:26:13)