用gensim-word2vec实现词矢量化

'''
文本特征学习-词矢量化 Word2vec 用Python包gensim实现
'''

import gensim
import numpy as np
from gensim.models import word2vec, Word2Vec

# Toy corpus: one chapter title per line. It is defined *before* the model
# because Word2Vec is trained on it. Each line is stripped so the indentation
# inside the triple-quoted literal does not leak into the sentences.
sentences = [
    line.strip()
    for line in """How to Sound Like a Data Scientist
    Types of Data
    The Five Steps of Data Science
    Basic Mathematics
    A Gentle Introduction to Probability
    Advanced Probability
    Basic Statistics
    Advanced Statistics
    Communicating Data
    Machine Learning Essentials
    Beyond the Essentials
    Case Studies """.split('\n')
]

# Word2Vec expects an iterable of token lists. Passing the raw strings would
# make it iterate over characters and learn character vectors instead of
# word vectors, so every sentence is tokenised on whitespace first.
# NOTE(review): `size` is the gensim 3.x keyword; gensim >= 4.0 renamed it to
# `vector_size` -- adjust to match the installed version.
model = gensim.models.Word2Vec(
    [sentence.split() for sentence in sentences],
    min_count=1,
    size=20,
)

def get_embedding(string):
    """Return the word embedding for ``string``, or ``None`` if it is unknown.

    The lookup goes through the module-level ``model``'s keyed vectors
    (``model.wv``).

    Args:
        string: The token to look up.

    Returns:
        Whatever ``model.wv[string]`` yields for a known token, otherwise
        ``None``.
    """
    try:
        return model.wv[string]
    except KeyError:
        # Only the "token not in vocabulary" case is treated as a miss.
        # Anything else (e.g. ``model`` not being defined) is a real error
        # and is allowed to propagate instead of being silently hidden.
        return None

# Sentence embeddings: one row per sentence, each row the mean of the
# embeddings of the sentence's known words. The width is read from the model
# rather than hard-coded so it always matches the trained vector size.
vectorized_sentences = np.zeros((len(sentences), model.wv.vector_size))
for i, sentence in enumerate(sentences):
    # split() without an argument ignores leading/trailing/repeated spaces,
    # so no empty-string tokens are produced.
    embedded_words = [
        vector
        for vector in (get_embedding(w) for w in sentence.split())
        if vector is not None
    ]
    if not embedded_words:
        # No word of this sentence is in the vocabulary: keep the zero row
        # instead of failing on an empty average.
        continue
    # Assign to row i only (the slice `i:` would overwrite all later rows).
    vectorized_sentences[i] = np.mean(embedded_words, axis=0)
print(vectorized_sentences.shape)

def _most_similar_sentence_indices(reference_word, top_n=3):
    """Return indices of the ``top_n`` sentences closest to ``reference_word``.

    Closeness is the dot product between each row of the module-level
    ``vectorized_sentences`` and the embedding of ``reference_word``;
    indices are ordered from best to worst match.

    Args:
        reference_word: The query token.
        top_n: How many sentence indices to return at most.

    Returns:
        A sequence of row indices into ``sentences``; empty when
        ``reference_word`` has no embedding.
    """
    reference_vector = get_embedding(reference_word)
    if reference_vector is None:
        # Out-of-vocabulary query: there is nothing to compare against.
        return []
    scores = np.dot(vectorized_sentences, reference_vector)
    # argsort is ascending, so take the tail and reverse it.
    return scores.argsort()[-top_n:][::-1]


# Find the sentences most similar to 'math'.
# NOTE(review): neither 'math' nor 'AI' occurs in the corpus literal above, so
# with a model trained only on that corpus both queries print an empty list.
reference_word = 'math'
best_sentence_idx = _most_similar_sentence_indices(reference_word)
print([sentences[b] for b in best_sentence_idx])

# Find the sentences most similar to 'AI'.
reference_word = 'AI'
best_sentence_idx = _most_similar_sentence_indices(reference_word)
print([sentences[b] for b in best_sentence_idx])

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值