首先需要得到词向量:这里使用在 Google 新闻语料上预训练的 word2vec 词向量,先把二进制文件转换为文本,再把文本转换为 NumPy 矩阵(.npy)和词表(.voc)
python word2vec2text.py GoogleNews-vectors-negative300.bin GoogleNews-vectors-300d.txt
python text2numpy.py GoogleNews-vectors-300d.txt 300
word2vec2text.py如下:
import argparse
import mmap
import os
import struct
import sys
from array import array
from contextlib import contextmanager
# Ensures Python 3.x: the script below relies on print(..., file=...) and on
# explicit bytes -> str decoding.
# NOTE(review): assert statements are skipped when Python runs with -O.
assert sys.version_info >= (3, 0)
@contextmanager
def memorymap(filename):
    """Map a whole file read-only into memory for the duration of a ``with``.

    Args:
        filename: Path of the file to map.

    Yields:
        A read-only ``mmap.mmap`` object covering the entire file.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        ValueError: If the file is empty (an empty file cannot be mapped).
    """
    # Opening through ``with`` guarantees the descriptor is released, and a
    # failure here propagates as-is instead of being masked by the cleanup.
    with open(filename, 'rb') as source:
        # Length 0 maps the file in full.
        mapped_file = mmap.mmap(source.fileno(), 0, access=mmap.ACCESS_READ)
        try:
            yield mapped_file
        finally:
            mapped_file.close()
def export_vectors(mvec, fout, vocabulary=None):
    """Write the embeddings held in a word2vec binary buffer as plain text.

    The buffer starts with a text header line ``"<count> <dimension>"``
    followed by records of the form ``<word><space><dimension floats>``.
    Each exported record becomes one line ``word v1 v2 ... vN`` in ``fout``.

    Args:
        mvec: Bytes-like buffer (``bytes`` or ``mmap``) supporting ``find``,
            slicing and ``len``.
        fout: Writable text stream receiving the output lines.
        vocabulary: Optional collection of words to keep; ``None`` keeps all.

    Returns:
        The number of lines written to ``fout``.

    Raises:
        ValueError: If the header is missing/malformed or the last vector is
            truncated.
    """
    # The header must end within the first 100 bytes.
    end = mvec.find(b'\n', 0, 100)
    if end == -1:
        raise ValueError("Invalid file format")
    header = mvec[0:end].split()
    if len(header) != 2:
        raise ValueError("Invalid file format")
    vector_size = int(header[1])
    byte_offset = vector_size * struct.calcsize('f')

    total = len(mvec)
    exported = 0
    while end < total:
        # Reading a word: it runs up to the next space. A newline left over
        # from the previous record (or the header) is removed by strip().
        pos = end
        end = mvec.find(b' ', pos)
        if end == -1:
            break
        word = mvec[pos:end].decode('utf-8', errors='replace').strip()

        # Reading the corresponding vector.
        pos = end + 1
        end = pos + byte_offset
        raw = mvec[pos:end]
        if len(raw) != byte_offset:
            raise ValueError(
                "Invalid file format: truncated vector for %r" % word)

        if vocabulary is not None and word not in vocabulary:
            continue  # skip word if not in vocabulary
        vector = array('f', raw)
        print(word, ' '.join(map(str, vector)), file=fout)
        exported += 1
    return exported


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Export embeddings from binary word2vec files '
                    'to plain text')
    parser.add_argument('source', help='word2vec binary file')
    parser.add_argument('dest', help='destination file')
    parser.add_argument('--vocabulary', help='text file containing words to '
                                             'export (one word per line)')
    args = parser.parse_args()

    if args.vocabulary:
        print('loading vocabulary')
        # Words from the binary file are decoded as UTF-8, so the vocabulary
        # must be read with the same encoding to compare equal.
        with open(args.vocabulary, encoding='utf-8') as fin:
            vocabulary = {word.strip() for word in fin}
    else:
        vocabulary = None

    print('exporting vectors')
    with memorymap(args.source) as mvec, \
            open(args.dest, 'w', encoding='utf-8') as fout:
        export_vectors(mvec, fout, vocabulary)
    print('finished')
text2numpy.py如下:
import sys
import argparse
import re
import os
from pathlib import Path
from multiprocessing import Process
import numpy as np
# ensures Python 3.x
assert sys.version_info >= (3, 0)
# Matches plain decimal literals such as "0.25" or "-1.5": an optional minus
# sign, digits, a dot, digits. Exponent forms such as "1e-05" are not matched
# as a whole number by this pattern.
RE_COORD = re.compile(r'-?\d+\.\d+')
def process_batch(data_file, dimension, start, batch_size):
    """Convert one contiguous slice of a text embeddings file.

    Lines ``start`` .. ``start + batch_size - 1`` (0-based) of ``data_file``
    are parsed as ``word v1 ... v<dimension>``. The words are written, one
    per line, to ``vocabulary-<start>.voc`` and the vectors are saved as a
    ``(batch_size, dimension)`` float64 array in ``matrix-<start>.npy``; both
    files are created in the current working directory.

    Args:
        data_file: Path of the UTF-8 text embeddings file.
        dimension: Number of coordinates per vector.
        start: Index of the first line of the batch.
        batch_size: Number of lines in the batch.

    Raises:
        ValueError: If a line has fewer than ``dimension`` coordinates or a
            coordinate is not a valid float.
    """
    vocab_file = 'vocabulary-%05d.voc' % start
    matrix_file = 'matrix-%05d.npy' % start
    # np.float64 is the explicit spelling of the removed ``np.float`` alias.
    matrix = np.zeros((batch_size, dimension), dtype=np.float64)
    stop = start + batch_size
    with open(data_file, encoding='utf-8') as fin, \
            open(vocab_file, 'w', encoding='utf-8') as fout:
        for i, line in enumerate(fin):
            if i < start:
                continue
            if i >= stop:  # finished batch
                break
            # Split from the right so the word may itself contain spaces or
            # digits, and let float() parse every notation str(float) emits
            # (including exponents such as "5e-05").
            fields = line.rstrip('\n').rsplit(' ', dimension)
            if len(fields) != dimension + 1:
                raise ValueError(
                    'line %d of %s does not hold %d coordinates'
                    % (i + 1, data_file, dimension))
            print(fields[0], file=fout)
            matrix[i - start, :] = [float(x) for x in fields[1:]]
    np.save(matrix_file, matrix)
def plan_batches(n_lines, n_workers):
    """Split ``n_lines`` rows into contiguous ``(start, size)`` batches.

    The batch size is ``ceil(n_lines / n_workers)`` so it is never zero for a
    non-empty input and at most ``n_workers`` batches are produced.

    Args:
        n_lines: Total number of rows to distribute.
        n_workers: Desired number of parallel workers (values < 1 act as 1).

    Returns:
        A list of ``(start, size)`` tuples covering ``range(n_lines)`` in order.
    """
    if n_lines <= 0:
        return []
    batch_size = -(-n_lines // max(1, n_workers))  # ceiling division
    return [(start, min(batch_size, n_lines - start))
            for start in range(0, n_lines, batch_size)]


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Export embeddings from text to binary '
                    'NumPy arrays')
    parser.add_argument('source', help='embeddings text file')
    parser.add_argument('dimension', help='embedding dimension', type=int)
    args = parser.parse_args()

    source = Path(args.source)
    # Output files are written to the current directory, named after the
    # source file with its suffix replaced.
    vocab_file = source.with_suffix('.voc').name
    matrix_file = source.with_suffix('.npy').name
    dimension = args.dimension

    print('computing matrix dimensions')
    with source.open(encoding='utf-8') as fin:
        n_lines = sum(1 for _ in fin)
    if n_lines == 0:
        sys.exit('%s contains no embeddings' % source)

    print('starting workers...')
    # os.cpu_count() may return None when the count cannot be determined.
    plan = plan_batches(n_lines, os.cpu_count() or 1)
    batches = [start for start, _ in plan]
    workers = []
    for start, this_batch in plan:
        p = Process(target=process_batch,
                    args=(args.source, dimension, start, this_batch))
        p.start()
        workers.append(p)

    print('waiting...')
    for p in workers:
        p.join()
    # A crashed worker leaves its partial files missing or incomplete; stop
    # here rather than fail later with a confusing error.
    if any(p.exitcode != 0 for p in workers):
        sys.exit('at least one worker failed; aborting')

    print('concatenating vocabulary...')
    with open(vocab_file, 'w', encoding='utf-8') as fout:
        for batch in batches:
            batch_file = Path('vocabulary-%05d.voc' % batch)
            with batch_file.open(encoding='utf-8') as fin:
                for line in fin:
                    print(line.strip(), file=fout)
            batch_file.unlink()

    print('concatenating partial matrices...')
    # np.float64 is the explicit spelling of the removed ``np.float`` alias.
    matrix = np.zeros((n_lines, dimension), dtype=np.float64)
    i = 0
    for batch in batches:
        batch_file = Path('matrix-%05d.npy' % batch)
        partial = np.load(batch_file.as_posix())
        matrix[i: i + len(partial), :] = partial
        i += len(partial)
        batch_file.unlink()

    print('saving matrix...')
    np.save(matrix_file, matrix)
    print('finished')
然后将句中各词向量的平均值作为句向量,并用两个句向量夹角的余弦值来判断句子的相似度;程序最终输出 (cos+1)/2,即把余弦值从 [-1, 1] 映射到 [0, 1]
import argparse
from pathlib import Path
import numpy as np
import math
import string
import sys
# Set of single ASCII punctuation characters; '$', '%' and '#' are excluded
# from the set. vectorize() drops any whitespace-separated token that is
# exactly one of these characters.
PUNCT = set(string.punctuation) - set('$%#')
class EmbeddingMatrix:
    """Word-to-vector lookup table backed by a 2-D array.

    Supports ``word in m``, ``m[word]`` and ``len(m)``; ``dimension`` holds
    the number of columns of the underlying matrix.
    """

    def __init__(self, vocabulary, matrix):
        """Index ``vocabulary`` so that word number ``i`` maps to row ``i``.

        When a word occurs more than once, the last occurrence wins.
        """
        lookup = {}
        for row, word in enumerate(vocabulary):
            lookup[word] = row
        self.index = lookup
        self.matrix = matrix
        self.dimension = matrix.shape[1]

    @classmethod
    def load(cls, array_file, vocabulary_file):
        """Build an instance from a ``.npy`` file and a one-word-per-line file.

        The array is opened memory-mapped in read-only mode; every vocabulary
        line is stripped of surrounding whitespace.
        """
        with open(vocabulary_file, 'r', encoding='utf-8') as fin:
            words = list(map(str.strip, fin))
        vectors = np.load(array_file, mmap_mode='r')
        return cls(words, vectors)

    def __len__(self):
        """Return the number of distinct indexed words."""
        return len(self.index)

    def __contains__(self, item):
        """Return True when ``item`` is an indexed word."""
        return item in self.index

    def __getitem__(self, word):
        """Return the matrix row for ``word``; raises KeyError if unknown."""
        row = self.index[word]
        return self.matrix[row]
class Unit:
    """Mutable pair of a word and a similarity value."""

    def __init__(self, word='', similarity=0):
        """Store ``word`` and ``similarity`` on the instance."""
        self.word, self.similarity = word, similarity

    def init(self, _new):
        """Copy ``word`` and ``similarity`` from another unit-like object."""
        self.word, self.similarity = _new.word, _new.similarity

    def SetValue(self, word, similarity):
        """Overwrite both stored values."""
        self.word, self.similarity = word, similarity

    def GetSimilarity(self):
        """Return the stored similarity."""
        return self.similarity

    def GetWord(self):
        """Return the stored word."""
        return self.word

    def __str__(self):
        """Return the stored word as the string form of the unit."""
        return self.word
def load_embeddings(filename):
    """Load the embedding matrix stored next to ``filename``.

    The suffix of ``filename`` is replaced by ``.npy`` (vectors) and ``.voc``
    (vocabulary). Both files must exist; this is checked with ``assert``, so
    a missing file raises ``AssertionError`` (and the check disappears when
    Python runs with ``-O``).

    Returns:
        The object produced by ``EmbeddingMatrix.load`` for the two files.
    """
    base = Path(filename)
    vectors_file, vocabulary_file = (
        base.with_suffix(ext) for ext in ('.npy', '.voc'))

    # validation of vector files
    for required in (vectors_file, vocabulary_file):
        assert required.is_file(), '%s is not a file' % required

    return EmbeddingMatrix.load(
        vectors_file.as_posix(), vocabulary_file.as_posix())
def vectorize(sentence, embeddings, ignore_case=False):
    """Return the mean word vector of ``sentence``.

    The sentence is split on whitespace and tokens that are a single
    punctuation character from ``PUNCT`` are dropped. Known words use their
    embedding; every out-of-vocabulary token gets a fresh random vector drawn
    uniformly from [-0.25, 0.25), so results for sentences with unknown words
    are not reproducible between calls.

    Args:
        sentence: Text to vectorize.
        embeddings: Object supporting ``word in embeddings``,
            ``embeddings[word]`` and a ``dimension`` attribute.
        ignore_case: Lower-case the sentence before the lookup.

    Returns:
        A 1-D array of length ``embeddings.dimension``. A sentence without
        any usable token yields the zero vector instead of dividing by zero.
    """
    if ignore_case:
        sentence = sentence.lower()
    tokens = [w for w in sentence.split() if w not in PUNCT]
    if not tokens:
        return np.zeros(embeddings.dimension)

    vectors = [
        embeddings[t] if t in embeddings
        else np.random.uniform(-0.25, 0.25, size=embeddings.dimension)  # OOV
        for t in tokens
    ]
    return np.mean(vectors, axis=0)
def cos(vec1, vec2):
    """Return the cosine of the angle between two vectors.

    Works for vectors of any (equal) length instead of assuming exactly 300
    components.

    Args:
        vec1: First vector (any sequence or array of numbers).
        vec2: Second vector, same number of components as ``vec1``.

    Returns:
        A float in [-1, 1] up to rounding error. When either vector has zero
        length the angle is undefined and 0.0 is returned.

    Raises:
        ValueError: If the vectors differ in size.
    """
    a = np.asarray(vec1, dtype=float).ravel()
    b = np.asarray(vec2, dtype=float).ravel()
    if a.shape != b.shape:
        raise ValueError(
            'vectors must have the same size, got %d and %d'
            % (a.size, b.size))

    # Same formula as before: dot / sqrt(|a|^2 * |b|^2).
    norm_product = math.sqrt(float(np.dot(a, a)) * float(np.dot(b, b)))
    if norm_product == 0.0:
        return 0.0
    return float(np.dot(a, b)) / norm_product
if __name__ == '__main__':
    # Sentences are read from the command line, e.g.
    #   python SS.py "I have a pen" "I have an apple"
    # Any further arguments are ignored.
    if len(sys.argv) < 3:
        sys.exit('usage: %s FIRST_SENTENCE SECOND_SENTENCE' % sys.argv[0])
    first_sentence = sys.argv[1]
    second_sentence = sys.argv[2]

    # Expects GoogleNews-vectors-300d.npy / .voc in the parent directory.
    embeddings = load_embeddings('../GoogleNews-vectors-300d')

    fs = vectorize(first_sentence, embeddings)
    ss = vectorize(second_sentence, embeddings)
    ans = cos(fs, ss)
    # Map the cosine from [-1, 1] onto a similarity score in [0, 1].
    print((ans+1)/2)