# -*- coding: utf-8 -*-
"""
Created on Fri Aug 23 10:33:41 2019
#分词后取同义词的搜索
#使用jieba分词、Synonyms同义词库
#分词后同义词搜索结果比较相似度的方法
@author: 崔子腾
"""
import pymysql
import jieba
import synonyms
import numpy
# --- MySQL connection (pymysql) ---
# NOTE(review): credentials are hard-coded in source; consider moving to config/env.
db_sqls = pymysql.connect(host='127.0.0.1',port=3306,user='root',password='qqq111',database='test2',charset = 'utf8')
cur_sqls = db_sqls.cursor()
search_word=input('检索词:').strip() # read the search term from stdin (prompt: "search term")
#============== jieba word segmentation =================
#seg_list = jieba.cut(search_word, cut_all=False)# accurate segmentation mode (disabled)
seg_list = jieba.cut_for_search(search_word) # search-engine segmentation mode
#============== build the similarity array =============
# Similarity scores are stored per path row, indexed by (_id - 1).
# This assumes _id values are dense and start at 1 — TODO confirm against the table.
cur_sqls.execute('SELECT MAX(_id) FROM ciem_pathtable')
max_path=cur_sqls.fetchone()[0]
factorarray=numpy.zeros(max_path)# one similarity accumulator per path row
factor=factorarray.tolist()
# Template fragments for the LIKE query; a synonym is concatenated between them below.
tempsql1="SELECT _id FROM ciem_pathtable WHERE cnpath LIKE '%"
tempsql2="%'"
#sql="""SELECT enpath FROM ciem_pathtable WHERE enpath LIKE '%search_word%'"""
#sql="""SELECT enpath FROM ciem_pathtable WHERE enpath LIKE '%search_word%'"""
#============== score candidate paths by synonym similarity =============
for seg in seg_list:  # for each segmented token of the search term
    # Look up synonyms and their similarity scores for this token.
    (near_words, near_factor) = synonyms.nearby(seg)
    if len(near_words) < 1:
        # Token not in the synonym lexicon: treat it as its own
        # one-element synonym list with full similarity.
        near_words = [seg]
        near_factor = (1,)
    print('分词:', seg, '同义词:', near_words, '相似度:', near_factor)
    for word, weight in zip(near_words, near_factor):
        # Parameterized LIKE query: the wildcard pattern is passed as a
        # bound parameter rather than concatenated into the SQL string.
        # This prevents SQL injection and also fixes breakage when a
        # synonym contains a quote character.
        cur_sqls.execute(
            "SELECT _id FROM ciem_pathtable WHERE cnpath LIKE %s",
            ('%' + word + '%',))
        for (path_id,) in cur_sqls.fetchall():
            # Accumulate this synonym's similarity onto the matched path.
            factor[path_id - 1] += weight
#============== print the top-10 results by accumulated similarity =============
print('搜索结果:')
count = 0
while count < 10:
    best = max(factor)
    if best <= 0:
        # Fewer than 10 paths matched anything — stop instead of
        # printing zero-similarity rows.
        break
    path_id = factor.index(best) + 1
    # path_id is derived from our own index arithmetic, but bind it as a
    # parameter anyway so every query in the script is parameterized.
    cur_sqls.execute('SELECT cnpath FROM ciem_pathtable WHERE _id=%s', (path_id,))
    path_result = cur_sqls.fetchone()
    print('XPath', count + 1, ':', path_result, '相似度', factor[path_id - 1], '\n')
    factor[path_id - 1] = 0  # clear the winner so the next pass finds the runner-up
    count = count + 1
cur_sqls.close()  # close the cursor
db_sqls.close()  # close the database connection
# Non-code residue from the original page capture (kept as comments so the file parses):
# 工作1
# 最新推荐文章于 2022-03-14 15:05:26 发布