from gensim.models import word2vec
import jieba
import numpy as np
import re
from zhon.hanzi import punctuation
import math
import time
from scipy import spatial
# In[1]:
from gensim import corpora, models, similarities
# Input data files. class_classpath and intention_classpath are read by
# get_answer_intention_id; q_a_path is read by get_rawdata.
# NOTE(review): class_trainpath and intention_trainpath are not referenced
# anywhere else in the visible part of this file.
class_classpath = './classification_classification.txt'
class_trainpath = './classification_trainData.txt'
intention_classpath = './intention_classification.txt'
intention_trainpath = './intention_trainData.txt'
q_a_path = './user_quse_ans_scrawl.txt'
# Disabled output paths used by an earlier version of get_user_data.
# ques_ansid_path = './faq_0.9/ques_ansid_v10.txt'
# ques_intid_path = './faq_0.9/ques_intid_v10.txt'
# Output file that get_user_data appends its matches to.
ques_ans_path = './ques_ans_scrawl_from_matrix_v2.txt'
#qq_path = './faq_0.7-0.8/qq_dui_v1.txt'
# Register the custom jieba dictionary before any jieba.cut call below.
jieba.load_userdict('/export/user/shizhengxin/tf-idf/new_dict_pro.txt')
# Trained word2vec model; new_sent2vec looks tokens up in w2v.wv.
w2v=word2vec.Word2Vec.load('/export/user/shizhengxin/word2vec/word2vec_test_v7.model')
def new_sent2vec(s, model=None):
    """Return the mean word2vec embedding of the tokens in ``s``.

    Args:
        s: Iterable of tokens (for example the output of ``jieba.cut``).
        model: Object exposing a ``wv`` mapping from token to vector.
            Defaults to the module-level ``w2v`` model.

    Returns:
        A 1-D ``numpy.ndarray`` holding the element-wise mean of the vectors
        of all tokens found in ``model.wv``.  When no token is found, a zero
        vector is returned whose length is ``model.wv.vector_size`` (300 if
        that attribute is missing).  The result is always an ndarray, so
        callers can do arithmetic on it without special-casing.
    """
    if model is None:
        model = w2v
    keyed_vectors = model.wv

    found = []
    for token in s:
        try:
            found.append(keyed_vectors[token])
        except KeyError:
            # Out-of-vocabulary tokens are skipped; any other error is a
            # real problem and must not be hidden.
            continue

    if not found:
        return np.zeros(getattr(keyed_vectors, "vector_size", 300))

    stacked = np.array(found)
    return stacked.sum(axis=0) / stacked.shape[0]
# ASCII letters, digits and punctuation removed by throw_dirty.  Written as a
# raw string so every backslash escape reaches the regex engine unchanged
# (the non-raw form relied on invalid string escapes such as "\[").
_ASCII_NOISE_RE = re.compile(r"[A-Za-z0-9\[\]`~!@#$^&*()=|{}':;,.<>/?\\%\-_]")


def throw_dirty(sentence):
    """Strip punctuation, ASCII letters/digits and spaces from ``sentence``.

    Removal happens in three passes:
      1. every character listed in ``zhon.hanzi.punctuation``;
      2. ASCII letters, digits and the ASCII symbols in ``_ASCII_NOISE_RE``
         (the plus sign and the double quote are not in that set and are
         kept);
      3. plain space characters (other whitespace is left untouched).

    Args:
        sentence: Input text.

    Returns:
        The cleaned string.
    """
    without_hanzi_punct = re.sub("[%s]+" % punctuation, "", sentence)
    without_ascii = _ASCII_NOISE_RE.sub("", without_hanzi_punct)
    return without_ascii.replace(" ", "")
def get_answer_intention_id(class_classpath, intention_classpath):
    """Load the answer catalogue and the intention label list.

    Every line of ``class_classpath`` is stripped and split on ``'----'``;
    lines with fewer than four fields are skipped.  Field 3 is used as the
    answer key and field 1 as its intention label.  Every stripped line of
    ``intention_classpath`` is treated as one intention label.

    Args:
        class_classpath: Path of the UTF-8 answer catalogue file.
        intention_classpath: Path of the UTF-8 intention label file.

    Returns:
        A tuple ``(ans_id, ans_int, int_id, ans_jieba)`` of dicts:
          * ``ans_id``: answer -> running index among the accepted lines
            (a repeated answer keeps the index of its last occurrence);
          * ``ans_int``: answer -> field 1 of its line;
          * ``int_id``: stripped label line -> zero-based line index;
          * ``ans_jieba``: answer -> sentence vector of the cleaned,
            jieba-tokenised answer (see ``new_sent2vec``).
    """
    ans_id = {}
    ans_int = {}
    ans_jieba = {}

    # Context managers guarantee the files are closed even if parsing or
    # vectorisation raises.
    with open(class_classpath, 'r', encoding='utf-8') as class_file:
        accepted = 0
        for line in class_file:
            fields = line.strip().split('----')
            if len(fields) < 4:
                continue
            answer = fields[3]
            ans_id[answer] = accepted
            tokens = list(jieba.cut(throw_dirty(answer)))
            ans_jieba[answer] = new_sent2vec(tokens)
            ans_int[answer] = fields[1]
            accepted += 1

    with open(intention_classpath, 'r', encoding='utf-8') as intention_file:
        int_id = {
            line.strip(): index
            for index, line in enumerate(intention_file)
        }

    return ans_id, ans_int, int_id, ans_jieba
# Build the answer-side lookup tables once, at module execution time.
ans_id , ans_int , int_id , ans_jieba = get_answer_intention_id(class_classpath,intention_classpath)
# Debug output: dump the intention-label -> index mapping.
print(int_id)
def cos_dist(a, b):
    """Cosine similarity of two numeric sequences.

    Elements are paired with ``zip``, so trailing elements of the longer
    sequence are ignored.  Returns ``0`` when the product of the squared
    norms is zero (which includes empty input); otherwise returns the dot
    product divided by the product of the norms.
    """
    dot = 0.0
    norm_a_sq = 0.0
    norm_b_sq = 0.0
    for x, y in zip(a, b):
        dot += x * y
        norm_a_sq += x ** 2
        norm_b_sq += y ** 2

    denominator = math.sqrt(norm_a_sq * norm_b_sq)
    # Guard against division by zero for all-zero (or empty) vectors.
    return dot / denominator if denominator != 0.0 else 0
def get_rawdata(q_a_path):
    """Load two-field records from ``q_a_path`` and vectorise the second field.

    Every line is stripped and split on ``'----'``; lines that do not yield
    exactly two fields are skipped.  The second field is the dictionary key
    (presumably the question text -- confirm against the data file); it is
    cleaned with ``throw_dirty``, tokenised with jieba and embedded with
    ``new_sent2vec``.

    Args:
        q_a_path: Path of the UTF-8 input file.

    Returns:
        A tuple ``(q_jieba, q_a_dui)``:
          * ``q_jieba``: second field -> sentence vector;
          * ``q_a_dui``: second field -> set of all first fields seen
            together with it.
    """
    q_jieba = {}
    q_a_dui = {}
    # The context manager closes the file even if vectorisation raises.
    with open(q_a_path, 'r', encoding='utf-8') as pairs_file:
        for line in pairs_file:
            fields = line.strip().split('----')
            if len(fields) != 2:
                continue
            left, question = fields
            if question not in q_jieba:
                # The vector depends only on the text, so compute it once
                # per distinct key.
                tokens = list(jieba.cut(throw_dirty(question)))
                q_jieba[question] = new_sent2vec(tokens)
            q_a_dui.setdefault(question, set()).add(left)
    return q_jieba, q_a_dui
# Load and vectorise the records from q_a_path at module execution time.
q_jieba , q_a_dui = get_rawdata(q_a_path)
# Progress marker printed once the input file has been processed.
print(1)
def _unit_rows(vectors):
    """Stack the L2-normalised values of ``vectors`` into a matrix.

    Returns ``(matrix, index_to_key)`` where row ``i`` of ``matrix`` is the
    normalised vector of ``index_to_key[i]``, in dict iteration order.
    """
    rows = []
    index_to_key = {}
    for row, (key, vec) in enumerate(vectors.items()):
        vec = np.asarray(vec)
        norm = np.linalg.norm(vec)
        # A zero vector has no direction: keep it as zeros instead of
        # dividing by zero, which would fill the row with NaN.
        rows.append(vec / norm if norm else np.zeros(vec.shape))
        index_to_key[row] = key
    # np.mat was removed in NumPy 2.0; np.asmatrix builds the same type.
    return np.asmatrix(rows), index_to_key


def get_matrix(q_jieba, ans_jieba):
    """Build row-normalised matrices for the question and answer vectors.

    Args:
        q_jieba: Dict mapping a question key to its sentence vector.
        ans_jieba: Dict mapping an answer key to its sentence vector.

    Returns:
        A tuple ``(q_mat, ans_mat, q_dict, ans_dict)``:
          * ``q_mat`` / ``ans_mat``: ``numpy.matrix`` objects with one
            unit-length row per key (all-zero input vectors stay zero);
          * ``q_dict`` / ``ans_dict``: row index -> key for each matrix.
    """
    q_mat, q_dict = _unit_rows(q_jieba)
    ans_mat, ans_dict = _unit_rows(ans_jieba)
    return q_mat, ans_mat, q_dict, ans_dict
def get_user_data(q_jieba, q_a_dui, ans_id, ans_jieba, int_id, ans_int, ques_ans_path):
    """Append the best-matching answer for every question to ``ques_ans_path``.

    The question and answer vectors are row-normalised by ``get_matrix``;
    their product gives an (answers x questions) similarity matrix.  For
    each question column the answer row with the highest score is chosen,
    and one line ``<item>----<answer>----<score>`` followed by a newline is
    appended for every item in ``q_a_dui[question]``.

    Args:
        q_jieba: Dict question -> sentence vector.
        q_a_dui: Dict question -> set of strings written in the first
            output column.
        ans_id: Unused; kept for interface compatibility.
        ans_jieba: Dict answer -> sentence vector.
        int_id: Unused; kept for interface compatibility.
        ans_int: Unused; kept for interface compatibility.
        ques_ans_path: Output file, opened in append mode (UTF-8).

    Returns:
        None.  The similarity matrix and the arg-max indices are also
        printed to stdout.
    """
    # An earlier, now removed, variant wrote answer ids and intention ids
    # for pairs whose cosine similarity was >= 0.9.
    if not q_jieba or not ans_jieba:
        # Nothing to match; the matrix product and argmax below would raise
        # on empty input.
        return

    q_mat, ans_mat, q_dict, ans_dict = get_matrix(q_jieba, ans_jieba)
    # Rows are answers, columns are questions.  nan_to_num maps NaN scores
    # (produced by zero-norm vectors) to 0.
    sta_user_mat = np.nan_to_num(np.dot(ans_mat, q_mat.T))
    print(sta_user_mat)
    # get_matrix returns numpy.matrix objects, so the result keeps a
    # 1 x n_questions shape and is indexed as [0, column].
    mat_index = np.argmax(sta_user_mat, axis=0)
    print(mat_index.shape)
    print(mat_index)

    # The context manager closes the file even if a lookup or write fails.
    with open(ques_ans_path, 'a', encoding='utf-8') as w_qans:
        for column in range(sta_user_mat.shape[1]):
            best_row = mat_index[0, column]
            answer = ans_dict[best_row]
            score = str(sta_user_mat[best_row, column])
            for k in q_a_dui[q_dict[column]]:
                w_qans.write('----'.join((k, answer, score)) + '\n')
# Older call with the extra ques_ansid_path / ques_intid_path arguments, kept for reference:
# get_user_data(q_jieba,q_a_dui,ques_ansid_path,ques_intid_path,ques_ans_path,ans_id,ans_jieba,int_id,ans_int)
# Run the matching and append the results to ques_ans_path.
get_user_data(q_jieba,q_a_dui,ans_id,ans_jieba,int_id,ans_int,ques_ans_path)
# Incremental matrix clustering (基于增量的矩阵聚类)
# Latest recommended article published on 2023-04-14 13:23:01 (最新推荐文章于 2023-04-14 13:23:01 发布)