计算mover distance的问题,速度太慢了

任务

要用1000篇文章来跟数据集里面的9722篇文章做一个mover distance的计算,总共需要计算9722000个结果,但是现在每个结果平均都要两秒以上,有可能更慢。用了多线程,开了10个线程也快不起来。电脑是128G内存,CPU是i7-6855。请大神指教,谢谢!

源码

#!/usr/bin/python
# -*- encoding:utf-8 -*-

"""
@author : kelvin
@file : wmd_demo
@time : 2017/2/27 13:40
@description :

"""
from __future__ import division

import itertools
import logging
import threading

import numpy as np
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


# 将9722篇文档读入,并以列表方式存储
def read_file():
    doc = []
    f = open("G:\yang\MMSED-Text\Metadata_Text_100.txt")
    for line in f.readlines():
        f_split = line.split(':::')
        path = f_split[6]
        docpath = path.replace('C:\Users\zhengyang5\Workspaces\MyEclipse MMSED\EmptyWikiProcessing\Text20161007','G:\yang\MMSED-Text\Text20161007')
        #print docpath
        docpath = docpath.replace('\n','') #去除路径最后的换行符
        doc_file = open(docpath)
        doc.append(doc_file.read())   # 每篇文档是列表中的一个元素
        doc_file.close()
    f.close()
    return doc


def word_mover_distance(d1,d2):
    """
    用了word2vec的方法,计算文档相似度,以及mover distance
    :param d1:文档一
    :param d2:文档二
    :return:cosine相似度以及mover distance
    """
    # d1 = "Government speaks to the media in Illinois"
    # d2 = "The president addresses the press"

    # 去除不在word2vec中的词
    vocabulary = [w for w in set(d1.lower().split() + d2.lower().split()) if w in model.wv.vocab]
    vect = CountVectorizer(vocabulary=vocabulary).fit([d1, d2])
    v_1, v_2 = vect.transform([d1, d2])
    v_1 = v_1.toarray().ravel()
    v_2 = v_2.toarray().ravel()

    from sklearn.metrics import euclidean_distances
    W_ = np.array([model[w] for w in vect.get_feature_names() if w in model])
    D_ = euclidean_distances(W_)
    D_ = D_.astype(np.double)
    D_ /= D_.max()  # just for comparison purposes

    from pyemd import emd  
    
    def emd_d(v_1, v_2):
        # pyemd needs double precision input        
        v_1 = v_1.toarray().ravel()
        v_2 = v_2.toarray().ravel()
        v_1 = v_1.astype(np.double)
        v_2 = v_2.astype(np.double)
        v_1 /= v_1.sum()
        v_2 /= v_2.sum()
        mover_dis = float(emd(v_1, v_2, D_))
        return mover_dis
    v_1, v_2 = vect.transform([d1, d2])
    mover_dis = emd_d(v_1, v_2)
    return mover_dis


def get_text_num(num):
    text_num = []
    num_file = open("G:\yang\MMSED-Text\Text_Index1_10.txt")
    for line in num_file.readlines()[num:num+10]:      # 10篇为间隔, num为开始的数
        line = line.strip('\n')
        text_num.append(int(line)-1)
    num_file.close()
    return text_num


def loop_file_thread(n):    # 从第几篇开始,取10篇来算
    text = get_text_num(n)
    mover_thread = []
    for num in text:
        doc1 = doc[num]
        mover_doc1 = []    # 所有9722文档与doc1比较的mover distance存到列表中
        for doc2 in doc:
            mover = word_mover_distance(str(doc1), str(doc2))  # 调用函数计算两个值
            mover_doc1.append(mover)            
        mover_thread.append(mover_doc1)
        print 'Finish one 9722 compare'
    save_txt(mover_thread, 'mov_'+str(n)+'_'+str(n+10)+'.txt')


def save_txt(mover_distance, fname):    # 存文件
    mover_dis_matrix = np.array(mover_distance)
    print mover_dis_matrix.shape
    np.savetxt(fname, mover_dis_matrix, delimiter=',', fmt='%10.8f')


if __name__ == '__main__':
    # 载入googleNews的word2vec模型
    model = Word2Vec.load_word2vec_format("G:\yang\MMSED-Text\GoogleNews-vectors-negative300.bin", binary=True)
    # 读入所有文档
    doc = read_file() 
    print "Please wait.I'm caculating......"
    # 定义任意个线程
    for i in range(0,10):
        t = threading.Thread(target=loop_file_thread, args={10*i})
        t.setDaemon(True)
        t.start()
    t.join()           # 子进程完成后才能进入父进程
    print 'all finish!!'

怎样才能快点啊,这个问题困扰了我两天啊!

评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值