创新实训 英文句子相似度的简单实现

首先需要得到词向量,这里使用Google新闻的语料来进行转化

python word2vec2text.py GoogleNews-vectors-negative300.bin GoogleNews-vectors-300d.txt
python text2numpy.py GoogleNews-vectors-300d.txt 300  

word2vec2text.py如下:

import argparse
import mmap
import os
import struct
import sys
from array import array
from contextlib import contextmanager

# ensures Python 3.x
assert sys.version_info >= (3, 0)


@contextmanager
def memorymap(filename):
    """Memory-map *filename* read-only and yield the mmap object.

    Fixes two resource bugs in the original: the file descriptor was
    never closed (mmap.close() does not close the underlying fd), and
    if os.open/getsize raised, the ``finally`` clause hit a NameError
    because ``mapped_file`` was never bound.
    """
    fd = os.open(filename, os.O_RDONLY)
    try:
        size = os.path.getsize(filename)
        mapped_file = mmap.mmap(fd, size, access=mmap.ACCESS_READ)
        try:
            yield mapped_file
        finally:
            mapped_file.close()
    finally:
        os.close(fd)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
                description='Export embeddings from binary word2vec files '
                'to plain text')
    parser.add_argument('source', help='word2vec binary file')
    parser.add_argument('dest', help='destination file')
    parser.add_argument('--vocabulary', help='text file containing words to '
                        'export (one word per line)')
    args = parser.parse_args()

    if args.vocabulary:
        print('loading vocabulary')
        # Explicit encoding: the original relied on the platform default.
        with open(args.vocabulary, encoding='utf-8') as fin:
            vocabulary = {word.strip() for word in fin}
    else:
        vocabulary = None

    print('exporting vectors')
    with memorymap(args.source) as mvec, \
            open(args.dest, 'w', encoding='utf-8') as fout:
        # Header line: "<vocab_size> <vector_size>\n" in the first bytes.
        end = mvec.find(b'\n', 0, 100)
        if end == -1:
            raise Exception("Invalid file format")
        _, token2 = mvec[0:end].split()
        vector_size = int(token2)
        # Each stored vector is vector_size packed 32-bit floats.
        byte_offset = vector_size * struct.calcsize('f')
        while True:
            # Read the next word; it may start with the trailing '\n'
            # of the previous record, which strip() removes below.
            pos = end
            if pos >= mvec.size():
                break
            end = mvec.find(b' ', pos)
            if end == -1:
                break  # truncated record at end of file
            word = mvec[pos:end].decode('utf-8', errors='replace').strip()
            # Advance past the vector bytes regardless of filtering.
            pos = end + 1
            end = pos + byte_offset
            # Skip BEFORE materializing the vector (the original decoded
            # every vector and then threw the filtered ones away).
            if vocabulary is not None and word not in vocabulary:
                continue
            vector = array('f', mvec[pos:end])
            print(word, ' '.join(map(str, vector)), file=fout)
        print('finished')

text2numpy.py如下:

import sys
import argparse
import re
import os
from pathlib import Path
from multiprocessing import Process

import numpy as np


# ensures Python 3.x
assert sys.version_info >= (3, 0)


RE_COORD = re.compile(r'-?\d+\.\d+')


def process_batch(data_file, dimension, start, batch_size):
    """Convert ``batch_size`` lines of *data_file*, beginning at line
    ``start``, into a partial vocabulary file and a partial matrix.

    Writes ``vocabulary-<start>.voc`` and ``matrix-<start>.npy`` into
    the current working directory; the parent process concatenates
    these partial files afterwards.
    """
    # Also match scientific notation ('6.1e-05'): str(float) emits it
    # for small magnitudes, and the original pattern silently dropped
    # the exponent, corrupting those coordinates.
    coord_re = re.compile(r'-?\d+\.\d+(?:[eE][+-]?\d+)?')
    vocab_file = 'vocabulary-%05d.voc' % start
    matrix_file = 'matrix-%05d.npy' % start
    # np.float was removed in NumPy 1.24; use the builtin float.
    matrix = np.zeros((batch_size, dimension), dtype=float)
    done = 0
    with open(data_file, encoding='utf-8') as fin, \
            open(vocab_file, 'w', encoding='utf-8') as fout:
        for i, line in enumerate(fin):
            if i < start:
                continue  # line belongs to an earlier batch
            tokens = coord_re.findall(line)
            # The last `dimension` numeric tokens are the coordinates;
            # digits appearing inside the word itself are thereby ignored.
            coords = tokens[-dimension:]
            word = line[:line.find(coords[0])]
            print(word, file=fout)
            matrix[i - start, :] = np.array(
                [float(x) for x in coords], dtype=float)
            done += 1
            if done == batch_size:  # finished this batch
                break
    np.save(matrix_file, matrix)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
                description='Export embeddings from text to binary '
                'NumPy arrays')
    parser.add_argument('source', help='embeddings text file')
    parser.add_argument('dimension', help='embedding dimension', type=int)
    args = parser.parse_args()
    source = Path(args.source)
    # Final output files: source name with .voc / .npy suffixes.
    vocab_file = source.with_suffix('.voc').name
    matrix_file = source.with_suffix('.npy').name
    dimension = args.dimension

    print('computing matrix dimensions')
    with source.open(encoding='utf-8') as fin:
        # Plain count; the original's next(fin)-then-count crashed with
        # StopIteration on an empty input file.
        n_lines = sum(1 for _ in fin)
    # Guard against batch_size == 0 (fewer lines than CPUs), which made
    # the spawn loop below never advance (start += 0 forever).
    batch_size = max(1, n_lines // os.cpu_count())

    print('starting workers...')
    start = 0
    workers = []
    batches = []
    while start < n_lines:
        this_batch = min(batch_size, n_lines - start)
        p = Process(target=process_batch,
                    args=(args.source, dimension, start, this_batch))
        batches.append(start)
        p.start()
        workers.append(p)
        start += batch_size

    print('waiting...')
    for p in workers:
        p.join()

    print('concatenating vocabulary...')
    with open(vocab_file, 'w', encoding='utf-8') as fout:
        for batch in batches:
            batch_file = Path('vocabulary-%05d.voc' % batch)
            with batch_file.open(encoding='utf-8') as fin:
                for line in fin:
                    print(line.strip(), file=fout)
            batch_file.unlink()  # remove the partial file

    print('concatenating partial matrices...')
    # np.float was removed in NumPy 1.24; use the builtin float.
    matrix = np.zeros((n_lines, dimension), dtype=float)
    row = 0
    for batch in batches:
        batch_file = Path('matrix-%05d.npy' % batch)
        partial = np.load(batch_file.as_posix())
        matrix[row: row + len(partial), :] = partial
        row += len(partial)
        batch_file.unlink()
    print('saving matrix...')
    np.save(matrix_file, matrix)
    print('finished')

然后通过计算词向量的平均值作为句向量,并使用夹角的余弦值来进行判断

import argparse
from pathlib import Path
import numpy as np
import math
import string
import sys

PUNCT = set(string.punctuation) - set('$%#')


class EmbeddingMatrix:
    """Word-embedding lookup table backed by a NumPy matrix.

    Each vocabulary word maps to the matrix row holding its vector;
    lookup, membership testing, and ``len`` are delegated to that
    word-to-row index.
    """

    @classmethod
    def load(cls, array_file, vocabulary_file):
        """Build an instance from an ``.npy`` matrix file and a text
        vocabulary file containing one word per line."""
        with open(vocabulary_file, 'r', encoding='utf-8') as fin:
            words = [line.strip() for line in fin]
        # mmap_mode='r' keeps the (potentially huge) matrix on disk.
        vectors = np.load(array_file, mmap_mode='r')
        return cls(words, vectors)

    def __init__(self, vocabulary, matrix):
        # Row number of each word in the matrix.
        self.index = {word: row for row, word in enumerate(vocabulary)}
        self.matrix = matrix
        self.dimension = matrix.shape[1]

    def __getitem__(self, word):
        """Return the embedding vector for *word* (KeyError if absent)."""
        return self.matrix[self.index[word]]

    def __contains__(self, item):
        return item in self.index

    def __len__(self):
        return len(self.index)

class Unit:
    """A (word, similarity) pair with Java-style accessors.

    Kept API-compatible with the original: ``init`` copies state from
    another Unit, ``SetValue`` overwrites both fields, and ``str()``
    yields the word.
    """

    def __init__(self, word='', similarity=0):
        self.SetValue(word, similarity)

    def init(self, _new):
        """Copy word and similarity from another Unit."""
        self.SetValue(_new.word, _new.similarity)

    def GetWord(self):
        return self.word

    def GetSimilarity(self):
        return self.similarity

    def SetValue(self, word, similarity):
        """Overwrite both fields at once."""
        self.word = word
        self.similarity = similarity

    def __str__(self):
        return self.word

def load_embeddings(filename):
    """Load an EmbeddingMatrix from ``<filename>.npy`` and ``<filename>.voc``.

    Raises FileNotFoundError when either companion file is missing.
    The original validated with ``assert``, which is silently stripped
    under ``python -O`` and would let the failure surface later as an
    obscure np.load/open error.
    """
    base = Path(filename)
    vectors_file = base.with_suffix('.npy')
    vocabulary_file = base.with_suffix('.voc')

    for path in (vectors_file, vocabulary_file):
        if not path.is_file():
            raise FileNotFoundError('%s is not a file' % path)

    return EmbeddingMatrix.load(
        vectors_file.as_posix(), vocabulary_file.as_posix())

def vectorize(sentence, embeddings, ignore_case=False):
    """Average the word vectors of *sentence* into one sentence vector.

    Tokens are whitespace-split; standalone punctuation tokens are
    dropped. Out-of-vocabulary words are replaced with a random vector
    drawn from [-0.25, 0.25), so results are NOT deterministic for
    sentences containing OOV words.

    Raises ValueError when the sentence yields no usable tokens — the
    original crashed with an opaque ZeroDivisionError in that case.
    """
    if ignore_case:
        sentence = sentence.lower()
    tokens = [w for w in sentence.strip().split() if w not in PUNCT]
    if not tokens:
        raise ValueError('sentence contains no usable tokens: %r' % sentence)

    vectors = []
    for token in tokens:
        if token in embeddings:
            vectors.append(embeddings[token])
        else:
            # OOV word: substitute a random vector of the right size.
            vectors.append(np.random.uniform(-0.25, 0.25,
                                             size=embeddings.dimension))
    # Element-wise mean over tokens replaces the manual sum/divide loop.
    # (.T on the original was a no-op: transposing a 1-D array.)
    return np.mean(vectors, axis=0)

def cos(vec1, vec2):
    """Return the cosine similarity of two equal-length vectors.

    Generalized from the original's hard-coded 300 dimensions to any
    length, and the three O(n) Python loops are replaced with np.dot.
    Like the original, raises ZeroDivisionError if either vector is
    all zeros.
    """
    v1 = np.asarray(vec1, dtype=float)
    v2 = np.asarray(vec2, dtype=float)
    numerator = float(np.dot(v1, v2))
    denominator = math.sqrt(float(np.dot(v1, v1)) * float(np.dot(v2, v2)))
    return numerator / denominator

if __name__ == '__main__':
    # Usage: python SS.py "I have a pen" "I have an apple"
    # Fail with a usage message instead of a bare IndexError when the
    # two sentence arguments are missing.
    if len(sys.argv) < 3:
        print('usage: %s <first sentence> <second sentence>' % sys.argv[0],
              file=sys.stderr)
        sys.exit(1)
    first_sentence = sys.argv[1]
    second_sentence = sys.argv[2]

    # Embedding files produced by the conversion scripts above.
    embeddings = load_embeddings('../GoogleNews-vectors-300d')

    fs = vectorize(first_sentence, embeddings)
    ss = vectorize(second_sentence, embeddings)

    # Map cosine similarity from [-1, 1] to [0, 1] for display.
    ans = cos(fs, ss)
    print((ans + 1) / 2)