Python 多进程(multiprocessing)计算特征

#! /usr/bin/env python
# -*- coding:utf-8 -*-




import sys
import os
from multiprocessing import Process, Lock, Queue, Manager
from multiprocessing.managers import BaseManager
import argparse
import math
import numpy as np


# Command-line options as (flag, value type, default, help text), registered
# in this order so the generated --help output keeps the same layout.
_OPTION_SPECS = (
    ('--output', str, "out.syn.news", 'output'),
    ('--threads', int, 30, 'thread num'),
    ('--glsa_dict', str, "glsa.txt", 'glsa dict'),
    ('--lsi_dict', str, "lsi.txt", 'lsi dict'),
    ('--word2vec_dict', str, "word2vec.bin", 'word2vec dict'),
)

parser = argparse.ArgumentParser(description='filter_word_pair')
for _flag, _kind, _default, _help in _OPTION_SPECS:
    parser.add_argument(_flag, type=_kind, default=_default, help=_help)

# Parsed once at import time; the rest of the script reads its settings from `args`.
args = parser.parse_args()


def cosin(vec1,vec2):
    """Return the cosine similarity of two equal-length numeric sequences.

    Returns 0 when the lengths differ or when either Euclidean norm is
    below 1e-7 (treated as a zero vector).
    """
    if len(vec1) != len(vec2):
        return 0

    # Squared norms are accumulated with explicit loops so the order of the
    # floating-point additions is exactly left-to-right.
    squares_a = 0.0
    for a in vec1:
        squares_a += a*a
    length_a = math.sqrt(squares_a)

    squares_b = 0.0
    for b in vec2:
        squares_b += b*b
    length_b = math.sqrt(squares_b)

    tiny = 0.0000001
    if length_a < tiny or length_b < tiny:
        return 0

    dot = 0.0
    for a, b in zip(vec1, vec2):
        dot += a*b

    return dot*1.0/(length_a*length_b)




def weighted_cosin(vec1,vec2,sigma):
    """Return the cosine similarity of two vectors after per-dimension scaling.

    Every component i contributes with weight sigma[i] squared, both in the
    norms and in the dot product. Returns 0 when the vectors differ in
    length or when either weighted norm is below 1e-7. `sigma` is indexed
    by position, so it must provide at least len(vec1) entries.
    """
    if len(vec1) != len(vec2):
        return 0

    weighted_sq_a = 0.0
    for k, a in enumerate(vec1):
        weighted_sq_a += a*a*sigma[k]*sigma[k]
    length_a = math.sqrt(weighted_sq_a)

    weighted_sq_b = 0.0
    for k, b in enumerate(vec2):
        weighted_sq_b += b*b*sigma[k]*sigma[k]
    length_b = math.sqrt(weighted_sq_b)

    tiny = 0.0000001
    if length_a < tiny or length_b < tiny:
        return 0

    dot = 0.0
    for k, (a, b) in enumerate(zip(vec1, vec2)):
        dot += a*b*sigma[k]*sigma[k]
    return dot*1.0/(length_a*length_b)


class LSIDict(object):
    """Word -> vector table with per-dimension weights, used for similarity lookups.

    The code below is written in the Python 2/3 common subset.
    """

    def __init__(self):
        # word -> list of float vector components
        self.data = {}
        # per-dimension weights (second line of the dictionary file)
        self.sigma = []
        # dimension count declared on the first line of the dictionary file
        self.dim = 0

    def load_lsi_from_binfile(self,word_vec_file):
        """Replace the table contents with the dictionary stored in `word_vec_file`.

        Layout as parsed here: line 1 holds the integer dimension, line 2 the
        whitespace-separated sigma weights, and every following line is
        ``word<TAB>v1 v2 ...``. Lines that do not contain exactly one tab are
        skipped. The declared dimension is stored but not checked against the
        vector lengths.

        Raises ValueError when the dimension or any number cannot be parsed;
        in that case the previously loaded contents are left untouched.
        """
        loaded = {}
        with open(word_vec_file, "rb") as f:
            dim = int(f.readline())
            # Build real lists: a lazy map() object cannot be indexed on Python 3.
            sigma = [float(x) for x in f.readline().split()]

            for line in f:
                fields = line.split(b"\t")
                if len(fields) != 2:
                    continue
                word = fields[0]
                if not isinstance(word, str):
                    # Python 3 reads bytes; decode so lookups with text keys work.
                    # NOTE(review): assumes the dictionary is UTF-8, matching
                    # this script's declared source encoding - confirm.
                    word = word.decode("utf-8")
                # split() without arguments tolerates repeated/trailing blanks
                # and the line terminator.
                loaded[word] = [float(x) for x in fields[1].split()]

        self.dim = dim
        self.sigma = sigma
        self.data = loaded

    def calc_sim(self,term1,term2):
        """Return 1 for identical terms, 0 if either term is unknown, else the weighted cosine."""
        if term1 == term2:
            return 1
        if term1 not in self.data or term2 not in self.data:
            return 0
        return weighted_cosin(self.data[term1],self.data[term2],self.sigma)

    def test_suite(self):
        """Print the similarity of a few fixed term pairs as a smoke check."""
        for term1, term2 in (("北京", "上海"), ("北大", "清华"), ("北大", "aa")):
            print("%s:%s %s" % (term1, term2, self.calc_sim(term1, term2)))


class Word2vecDict(object):
    """Word -> float32 vector table loaded from a binary word-vector file.

    The code below is written in the Python 2/3 common subset.
    """

    def __init__(self):
        # word -> numpy float32 array
        self.data = {}

    def load_wordvecs_from_binfile(self,word_vec_file):
        """Add every word vector stored in `word_vec_file` to the table.

        Layout as parsed here: a text header line ``vocab_size layer1_size``,
        then `vocab_size` records, each made of the word bytes terminated by a
        single space followed by `layer1_size` raw float32 values. Newline
        bytes encountered while reading a word are skipped.

        Raises ValueError for a malformed header and EOFError when the file
        ends before all announced records were read.
        """
        with open(word_vec_file, "rb") as f:
            header = f.readline()
            vocab_size, layer1_size = map(int, header.split())

            binary_len = np.dtype('float32').itemsize * layer1_size
            for _ in range(vocab_size):
                chars = []
                while True:
                    ch = f.read(1)
                    if not ch:
                        # read() returns an empty string at EOF; without this
                        # check a truncated file would loop forever.
                        raise EOFError(
                            "unexpected end of file while reading a word in %s"
                            % word_vec_file)
                    if ch == b' ':
                        break
                    if ch != b'\n':
                        chars.append(ch)
                word = b''.join(chars)
                if not isinstance(word, str):
                    # Python 3 reads bytes; decode so lookups with text keys work.
                    # NOTE(review): assumes UTF-8 vocabulary; undecodable bytes
                    # are replaced instead of aborting the whole load - confirm.
                    word = word.decode("utf-8", "replace")

                raw = f.read(binary_len)
                if len(raw) != binary_len:
                    raise EOFError(
                        "unexpected end of file while reading the vector of %r in %s"
                        % (word, word_vec_file))
                # frombuffer() yields a read-only view of `raw`; copy() gives an
                # owned, writable array like the deprecated fromstring() did.
                self.data[word] = np.frombuffer(raw, dtype='float32').copy()

    def calc_sim(self,term1,term2):
        """Return 1 for identical terms, 0 if either term is unknown, else the cosine."""
        if term1 == term2:
            return 1
        if term1 not in self.data or term2 not in self.data:
            return 0
        return cosin(self.data[term1],self.data[term2])








# (terma, termb) candidate pairs to score.
glist = []

# Number of pairs accepted so far (only used for progress reporting).
count = 0
# The `with` block closes the input file once all lines are consumed.
with open("./out.syn.top5") as pair_file:
    for line in pair_file:
        line = line.strip()
        if not line:
            continue

        # Only lines with exactly five tab-separated fields are used; the
        # first two fields are the term pair, the other three are ignored here.
        tlist = line.split("\t")
        if len(tlist) != 5:
            continue
        terma = tlist[0]
        termb = tlist[1]

        # Identical terms carry no information for similarity scoring.
        if terma == termb:
            continue

        count += 1
        if count % 10000 == 0:
            sys.stderr.write("handled %d lines\n" % count)

        glist.append((terma, termb))




# The three similarity tables are loaded once at module level, before any
# worker process is started; calc_sim() reads them as globals.
sys.stderr.write("start load Word2vecDict\n")
word2vec_dict = Word2vecDict()
word2vec_dict.load_wordvecs_from_binfile(args.word2vec_dict)


glsa_dict = LSIDict()
print("start load GLSADict")
glsa_dict.load_lsi_from_binfile(args.glsa_dict)
glsa_dict.test_suite()
print("end load GLSADict")


lsi_dict = LSIDict()
print("start load LSIDict")
lsi_dict.load_lsi_from_binfile(args.lsi_dict)
lsi_dict.test_suite()
print("end load LSIDict")
           
# Held by a worker while it pushes one result record onto the output queue.
lock = Lock()
def calc_sim(tasks,begin,end,out):
    """Score the pairs tasks[begin] .. tasks[end-1] and queue the results.

    For every pair, one record (term1, term2, glsa_sim, lsi_sim, word2vec_sim)
    is put on `out`. The similarity tables are the module-level
    glsa_dict, lsi_dict and word2vec_dict objects.
    """
    for index in range(begin, end):
        pair = tasks[index]
        first, second = pair[0], pair[1]
        glsa_sim = glsa_dict.calc_sim(first, second)
        lsi_sim = lsi_dict.calc_sim(first, second)
        w2v_sim = word2vec_dict.calc_sim(first, second)
        record = (first, second, glsa_sim, lsi_sim, w2v_sim)
        lock.acquire()
        try:
            out.put(record)
        finally:
            lock.release()


# Original note listed the columns w1,w2,prob,ratio,glsa,lsi,word2vec; the
# writer at the end of this section emits w1, w2 and the three similarities.

# The cosine computations are spread over several worker processes.

Kthread = int(args.threads)
if Kthread < 1:
    # Avoids a bare ZeroDivisionError (0) or a silent no-op (negative values).
    sys.exit("--threads must be >= 1")
load = len(glist)
# Floor division keeps the slice bounds integral on Python 3 as well.
quota = load // Kthread
remain = load - quota*Kthread
threads = []


manager = Manager()
out = manager.Queue()


for i in range(Kthread):
    begin = i*quota
    if i != Kthread-1:
        end = (i+1)*quota
    else:
        # The last worker also takes the leftover pairs, so end == load.
        end = (i+1)*quota + remain

    sys.stderr.write("load=%d thread %d begin=%d end=%d\n" % (load,i,begin,end))

    th = Process(target=calc_sim,args=(glist,begin,end,out))
    th.daemon = True
    th.start()
    threads.append(th)


# Wait for every worker; afterwards all results sit in the manager's queue.
for i, th in enumerate(threads):
    th.join()
    if th.exitcode != 0:
        # A crashed worker would otherwise just produce fewer output lines.
        sys.stderr.write("worker %d exited with code %s\n" % (i, th.exitcode))


# The `with` block flushes and closes the output file deterministically.
with open(args.output,"w") as fout:
    while not out.empty():
        info = out.get()
        fout.write("%s\t%s\t%f\t%f\t%f\n" %(info[0],info[1],info[2],info[3],info[4]))
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值