基于增量的矩阵聚类

from gensim.models import word2vec
import jieba
import numpy as np
import re
from zhon.hanzi import punctuation
import math
import time
from scipy import spatial
# In[1]:
from gensim import corpora, models, similarities

# Input files. class_classpath and intention_classpath are read by
# get_answer_intention_id; the two *_trainpath values are not used by the
# code visible in this file.
class_classpath = './classification_classification.txt'
class_trainpath = './classification_trainData.txt'
intention_classpath = './intention_classification.txt'
intention_trainpath = './intention_trainData.txt'
# Read by get_rawdata; each line is expected to hold two '----'-separated fields.
q_a_path = './user_quse_ans_scrawl.txt'
# ques_ansid_path = './faq_0.9/ques_ansid_v10.txt'
# ques_intid_path = './faq_0.9/ques_intid_v10.txt'
# Output file; get_user_data opens it in append mode.
ques_ans_path = './ques_ans_scrawl_from_matrix_v2.txt'
#qq_path = './faq_0.7-0.8/qq_dui_v1.txt'
# Import-time side effects: register the custom jieba dictionary and load the
# word2vec model that new_sent2vec looks tokens up in.
jieba.load_userdict('/export/user/shizhengxin/tf-idf/new_dict_pro.txt')
w2v=word2vec.Word2Vec.load('/export/user/shizhengxin/word2vec/word2vec_test_v7.model')
def new_sent2vec(s):
    """Return the mean word2vec embedding of a tokenised sentence.

    Args:
        s: Iterable of tokens (callers in this file pass the list produced
            by ``jieba.cut``).

    Returns:
        The element-wise mean of the vectors of every token found in the
        module-level ``w2v`` model. When no token is in the vocabulary a
        list of 300 zeros is returned instead (300 is presumably the
        model's vector size -- confirm against the trained model).
    """
    vectors = []
    for token in s:
        try:
            vectors.append(w2v.wv[token])
        except KeyError:
            # Out-of-vocabulary token: leave it out of the average. Only
            # KeyError is swallowed so real failures (and Ctrl-C) propagate.
            continue
    if not vectors:
        return [0] * 300
    stacked = np.array(vectors)
    return stacked.sum(axis=0) / stacked.shape[0]

def throw_dirty(sentence):
    """Strip punctuation, ASCII letters/digits/symbols and spaces from text.

    Args:
        sentence: The raw string to clean.

    Returns:
        ``sentence`` with every character removed that is in
        ``zhon.hanzi.punctuation``, is an ASCII letter or digit, is one of
        the ASCII symbols listed in the second pattern, or is a space.
    """
    # Pass 1: punctuation characters supplied by zhon.hanzi.
    content = re.sub("[%s]+" % punctuation, "", sentence)
    # Pass 2: ASCII letters, digits and symbols. A raw string is used so the
    # backslashes reach the regex engine without relying on Python keeping
    # invalid string escapes (a SyntaxWarning on recent Python versions).
    content = re.sub(
        r"[A-Za-z0-9\[\`\~\!\@\#\$\^\&\*\(\)\=\|\{\}\'\:\;\'\,\[\]\.\<\>\/\?\~\!\@\#\\\&\*\%\-\_]",
        "",
        content,
    )
    # Pass 3: plain spaces (other whitespace is left untouched).
    return content.replace(' ', '')

def get_answer_intention_id(class_classpath,intention_classpath):
    """Load the lookup tables from the classification and intention files.

    Each line of ``class_classpath`` is split on ``'----'``; lines with fewer
    than four fields are skipped. The fourth field is used as the key of the
    per-line tables, the second field as its intention label.

    Args:
        class_classpath: Path of the UTF-8 classification file.
        intention_classpath: Path of the UTF-8 file with one intention per
            line.

    Returns:
        A 4-tuple ``(ans_id, ans_int, int_id, ans_jieba)``:
        ``ans_id`` maps the fourth field to a running index over accepted
        lines, ``ans_int`` maps it to the second field, ``int_id`` maps each
        stripped intention line to its line index, and ``ans_jieba`` maps the
        fourth field to its sentence vector from ``new_sent2vec``.
    """
    ans_id = {}
    ans_int = {}
    ans_jieba = {}
    int_id = {}
    # Both files are opened up front (as before) and are now guaranteed to be
    # closed even if parsing or vectorisation raises.
    with open(class_classpath, 'r', encoding='utf-8') as class_file, \
            open(intention_classpath, 'r', encoding='utf-8') as intention_file:
        next_id = 0
        for line in class_file:
            fields = line.strip().split('----')
            if len(fields) < 4:
                continue
            key = fields[3]
            # A repeated key overwrites the earlier entry with the newer index.
            ans_id[key] = next_id
            next_id += 1
            ans_jieba[key] = new_sent2vec(list(jieba.cut(throw_dirty(key))))
            ans_int[key] = fields[1]

        for index, line in enumerate(intention_file):
            int_id[line.strip()] = index
    return ans_id, ans_int, int_id, ans_jieba


# Build the lookup tables once, at import time, from the configured files.
ans_id , ans_int , int_id , ans_jieba = get_answer_intention_id(class_classpath,intention_classpath)

# Debug output: the intention -> index mapping that was just loaded.
print(int_id)



def cos_dist(a, b):
    """Compute the cosine similarity of two numeric sequences.

    The sequences are paired with ``zip``, so any surplus elements of the
    longer one are ignored.

    Returns:
        ``dot(a, b) / sqrt(|a|^2 * |b|^2)``, or the integer ``0`` when that
        denominator is zero (e.g. an all-zero or empty input).
    """
    dot = 0.0
    sq_norm_a = 0.0
    sq_norm_b = 0.0
    for x, y in zip(a, b):
        dot += x * y
        sq_norm_a += x ** 2
        sq_norm_b += y ** 2

    denominator = math.sqrt(sq_norm_a * sq_norm_b)
    if denominator != 0.0:
        return dot / denominator
    return 0

def get_rawdata(q_a_path):
    """Load the two-field records of ``q_a_path`` and vectorise them.

    Each line is stripped and split on ``'----'``; lines that do not yield
    exactly two fields are skipped. The second field is the text that gets
    cleaned, tokenised with jieba and embedded.

    Args:
        q_a_path: Path of the UTF-8 input file.

    Returns:
        A pair ``(q_jieba, q_a_dui)``: ``q_jieba`` maps the second field to
        its sentence vector from ``new_sent2vec``, and ``q_a_dui`` maps the
        second field to the set of first fields it appeared with.
    """
    q_jieba = {}
    q_a_dui = {}
    with open(q_a_path, 'r', encoding='utf-8') as source:
        for line in source:
            fields = line.strip().split('----')
            if len(fields) != 2:
                continue
            first_field, text = fields
            # The embedding depends only on the text, so a repeated text
            # does not need to be tokenised and embedded again.
            if text not in q_jieba:
                tokens = list(jieba.cut(throw_dirty(text)))
                q_jieba[text] = new_sent2vec(tokens)
            q_a_dui.setdefault(text, set()).add(first_field)
    return q_jieba, q_a_dui



# Load and vectorise the raw records at import time.
q_jieba , q_a_dui = get_rawdata(q_a_path)

# Progress marker printed once the raw data has been loaded.
print(1)
def _unit_row(vec):
    """Scale *vec* to unit L2 length; a zero-norm vector becomes all zeros."""
    arr = np.asarray(vec)
    norm = np.linalg.norm(arr)
    if norm == 0:
        # Avoid 0/0 -> NaN (and the RuntimeWarning that comes with it).
        return arr * 0.0
    return arr / norm


def get_matrix(q_jieba,ans_jieba):
    """Stack the L2-normalised vectors of both dictionaries into matrices.

    Args:
        q_jieba: Mapping of text -> vector (array or list of numbers).
        ans_jieba: Mapping of text -> vector (array or list of numbers).

    Returns:
        A 4-tuple ``(q_mat, ans_mat, q_dict, ans_dict)``. ``q_mat`` and
        ``ans_mat`` are ``numpy.matrix`` objects holding one unit-length row
        per dictionary entry, in dictionary order (rows for zero-norm
        vectors are all zeros). ``q_dict`` and ``ans_dict`` map each row
        index back to the corresponding key.
    """
    q_dict = dict(enumerate(q_jieba))
    ans_dict = dict(enumerate(ans_jieba))
    # np.asmatrix is what np.mat aliased; np.mat itself no longer exists in
    # NumPy 2.x. The matrix type is kept because callers index the results
    # with 2-D (row, column) subscripts.
    q_mat = np.asmatrix([_unit_row(vec) for vec in q_jieba.values()])
    ans_mat = np.asmatrix([_unit_row(vec) for vec in ans_jieba.values()])
    return q_mat, ans_mat, q_dict, ans_dict




def get_user_data(q_jieba,q_a_dui,ans_id,ans_jieba,int_id,ans_int,ques_ans_path):
    """Match every entry of ``q_jieba`` to its most similar ``ans_jieba`` entry.

    The normalised vectors from ``get_matrix`` are multiplied to obtain a
    similarity matrix with one row per ``ans_jieba`` entry and one column per
    ``q_jieba`` entry. For each column the row with the highest score is
    selected and one line per associated value in ``q_a_dui`` is appended to
    the output file in the form ``<value>----<best key>----<score>``.

    Args:
        q_jieba: Mapping of text -> vector for the items to be matched.
        q_a_dui: Mapping of the same texts -> set of strings to write out.
        ans_id: Unused; kept for call compatibility.
        ans_jieba: Mapping of candidate text -> vector.
        int_id: Unused; kept for call compatibility.
        ans_int: Unused; kept for call compatibility.
        ques_ans_path: Path of the UTF-8 output file (opened in append mode).

    Returns:
        None. Results are appended to ``ques_ans_path``; the similarity
        matrix and the arg-max indices are printed for inspection.
    """
    if not q_jieba or not ans_jieba:
        # Nothing to match; the matrix product below is undefined for an
        # empty side.
        return

    q_mat, ans_mat, q_dict, ans_dict = get_matrix(q_jieba,ans_jieba)
    # NaN scores (e.g. from zero vectors) are treated as similarity 0.
    sta_user_mat = np.nan_to_num(np.dot(ans_mat,q_mat.T))
    print(sta_user_mat)
    # 1 x N matrix: index of the best-scoring row for every column.
    mat_index = np.argmax(sta_user_mat,axis=0)

    print(mat_index.shape)
    print(mat_index)
    with open(ques_ans_path,'a',encoding='utf-8') as w_qans:
        for column in range(sta_user_mat.shape[1]):
            best_row = mat_index[0,column]
            suffix = '----' + ans_dict[best_row] + '----' + str(sta_user_mat[best_row,column]) + '\n'
            for k in q_a_dui[q_dict[column]]:
                w_qans.write(k + suffix)


   #
   #      if 1 - spatial.distance.cosine(a,b) >= 0.9 :
   #          w_qid.writelines(q_a_dui[line]+'----'+str(ans_id[key])+'\n')
   #          w_qans.writelines(q_a_dui[line]+'----'+key+'\n')
   #          w_qint.writelines(q_a_dui[line]+'----'+str(int_id[ans_int[key]])+'\n')
   #          # w_qq.writelines(line+'----'+key+'\n')
   #          break
   #      times += 1
   #      if  times % 1000 == 0:
   #          clock2 = time.time()
   #          print(times)
   #          print('time consum '+str(clock2-clock1))
   #  w_qint.close()
   #  w_qid.close()
   #  w_qans.close()
   # # w_qq.close()

# get_user_data(q_jieba,q_a_dui,ques_ansid_path,ques_intid_path,ques_ans_path,ans_id,ans_jieba,int_id,ans_int)
# Script entry point: run the matching and append the results to ques_ans_path.
get_user_data(q_jieba,q_a_dui,ans_id,ans_jieba,int_id,ans_int,ques_ans_path)
  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值