# vocabulary_embedding.py

# coding: utf-8

# Generate the initial word embedding for headlines and descriptions

# The embedding matrix is limited to a fixed vocabulary size (`vocab_size`), but
# a vocabulary of all the words that appear in the data is also built.

# In[3]:


FN = 'vocabulary-embedding'


# In[4]:


seed=42


# In[5]:


vocab_size = 40000


# In[6]:


embedding_dim = 100


# In[8]:


lower = False # don't lowercase the text


# # read tokenized headlines and descriptions

# In[7]:


import pickle
FN0 = 'tokens' # name of the tokenized data file, which is assumed to already exist
with open('data/%s.pkl'%FN0, 'rb') as fp:
    heads, desc, keywords = pickle.load(fp) # keywords are not used in this project


# In[9]:


if lower:
    heads = [h.lower() for h in heads]


# In[10]:


if lower:
    desc = [d.lower() for d in desc]


# In[11]:


i=0
heads[i]


# In[12]:


desc[i]


# In[13]:


keywords[i]


# In[14]:


len(heads),len(set(heads))


# In[15]:


len(desc),len(set(desc))


# # build vocabulary

# In[16]:


from collections import Counter
from itertools import chain
def get_vocab(lst):
    vocabcount = Counter(w for txt in lst for w in txt.split())
    vocab = [w for w, _ in vocabcount.most_common()]  # words sorted by decreasing frequency
    return vocab, vocabcount
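
# A tiny illustration (not from the original notebook) of what get_vocab returns:
# the words sorted by decreasing frequency, plus the raw counts (tie order may vary).
_toy_vocab, _toy_count = get_vocab(['a b b', 'b c'])
print(_toy_vocab)   # ['b', 'a', 'c']
print(_toy_count)   # Counter({'b': 3, 'a': 1, 'c': 1})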


# In[17]:


vocab, vocabcount = get_vocab(heads+desc)


# most popular tokens

# In[18]:

print(vocab[:50])
print('...', len(vocab))


# In[19]:


import matplotlib.pyplot as plt
#get_ipython().magic('matplotlib inline')
plt.plot([vocabcount[w] for w in vocab]);
plt.gca().set_xscale("log", nonposx='clip')
plt.gca().set_yscale("log", nonposy='clip')
plt.title('word distribution in headlines and descriptions')
plt.xlabel('rank')
plt.ylabel('total appearances');
plt.show()

# always nice to see [Zipf's law](https://en.wikipedia.org/wiki/Zipf%27s_law)

# # Index words

# In[21]:


empty = 0 # RNN mask of no data
eos = 1  # end of sentence
start_idx = eos+1 # first real word


# In[22]:


def get_idx(vocab, vocabcount):
    word2idx = dict((word, idx+start_idx) for idx,word in enumerate(vocab))
    word2idx['<empty>'] = empty
    word2idx['<eos>'] = eos
    
    idx2word = dict((idx,word) for word,idx in word2idx.items())

    return word2idx, idx2word


# In[23]:


word2idx, idx2word = get_idx(vocab, vocabcount)
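
# Quick sanity check (not in the original notebook): the most frequent word should get the
# first real index, idx2word should invert word2idx, and the table should hold every word
# plus the two special tokens (assuming '<empty>'/'<eos>' never occur as real tokens).
assert word2idx[vocab[0]] == start_idx          # most frequent word gets the first real index
assert idx2word[word2idx['<eos>']] == '<eos>'   # idx2word inverts word2idx
print(len(word2idx), 'entries =', len(vocab), 'words + 2 special tokens')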


# # Word Embedding

# ## read GloVe

# In[24]:


fname = 'glove.6B.%dd.txt'%embedding_dim
import os
datadir_base = os.path.expanduser(os.path.join('~', '.keras'))  # expand '~' to the user's home directory
if not os.access(datadir_base, os.W_OK):  # fall back to /tmp if ~/.keras is not writable
    datadir_base = os.path.join('/tmp', '.keras')
datadir = os.path.join(datadir_base, 'datasets')  # join into the datasets path
glove_name = os.path.join(datadir, fname)
#print(glove_name)

# if not os.path.exists(glove_name):  # download and unzip GloVe if it is not already there
#     path = 'glove.6B.zip'
#     #path = get_file(path, origin="http://nlp.stanford.edu/data/glove.6B.zip")
#     os.system('unzip %s/%s' % (datadir, path))

# In[25]:
import subprocess
# count the number of word vectors in the GloVe file (one vector per line)
glove_n_symbols = subprocess.getoutput('wc -l %s' % glove_name)
glove_n_symbols = int(glove_n_symbols.split()[0])
print(glove_n_symbols)

# In[26]:

import numpy as np
glove_index_dict = {}
glove_embedding_weights = np.empty((glove_n_symbols, embedding_dim))
global_scale = .1
with open(glove_name, 'r') as fp:
    i = 0
    for l in fp:
        l = l.strip().split()
        w = l[0]
        glove_index_dict[w] = i
        glove_embedding_weights[i,:] = np.asarray(l[1:], dtype=float)
        i += 1
glove_embedding_weights *= global_scale
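
# A minimal check of the loaded GloVe vectors (not part of the original notebook).
# 'king' and 'queen' are only illustrative probe words assumed to exist in glove.6B;
# the global rescaling above does not change cosine similarity.
def _cos_sim(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

if 'king' in glove_index_dict and 'queen' in glove_index_dict:
    _k = glove_embedding_weights[glove_index_dict['king']]
    _q = glove_embedding_weights[glove_index_dict['queen']]
    print('cos(king, queen) =', _cos_sim(_k, _q))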


# In[27]:


glove_embedding_weights.std()


# In[28]:


# add a lowercase alias for any GloVe word that only appears in a cased form
for w, i in list(glove_index_dict.items()):
    w = w.lower()
    if w not in glove_index_dict:
        glove_index_dict[w] = i

# ## embedding matrix

# use GloVe to initialize the embedding matrix

# In[30]:


# generate a random embedding with the same scale as GloVe
np.random.seed(seed)
shape = (vocab_size, embedding_dim)
scale = glove_embedding_weights.std()*np.sqrt(12)/2 # uniform and not normal
embedding = np.random.uniform(low=-scale, high=scale, size=shape)
print('random-embedding/glove scale', scale, 'std', embedding.std())

# copy GloVe weights for words that appear in our short vocabulary (idx2word)
c = 0
for i in range(vocab_size):
    w = idx2word[i]
    g = glove_index_dict.get(w, glove_index_dict.get(w.lower()))
    if g is None and w.startswith('#'): # GloVe has no hashtags (I think...)
        w = w[1:]
        g = glove_index_dict.get(w, glove_index_dict.get(w.lower()))
    if g is not None:
        embedding[i,:] = glove_embedding_weights[g,:]
        c += 1
print('number of tokens, in small vocab, found in glove and copied to embedding', c, c/float(vocab_size))

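
# Why sqrt(12)/2 above: a uniform distribution on [-a, a] has standard deviation a/sqrt(3),
# so matching the GloVe std sigma requires a = sigma*sqrt(3) = sigma*sqrt(12)/2.
# Quick numeric check (illustrative, not in the original notebook):
print(np.random.uniform(-1.0, 1.0, 100000).std(), 1.0/np.sqrt(3))  # the two should be close
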
# lots of words in the full vocabulary (word2idx) are outside `vocab_size`.
# Build an alternative mapping that sends them to their closest match in GloVe, but only if the
# match is good enough (cosine similarity above `glove_thr`)

# In[31]:


glove_thr = 0.5


# In[34]:


word2glove = {}
for w in word2idx:
    if w in glove_index_dict:
        g = w
    elif w.lower() in glove_index_dict:
        g = w.lower()
    elif w.startswith('#') and w[1:] in glove_index_dict:
        g = w[1:]
    elif w.startswith('#') and w[1:].lower() in glove_index_dict:
        g = w[1:].lower()
    else:
        continue
    word2glove[w] = g

# for every word outside the embedding matrix find the closest word inside the embedding matrix,
# using cosine similarity of GloVe vectors.
#
# Allow the last `nb_unknown_words` words inside the embedding matrix to be treated as outside.
# Don't accept matches with similarity below `glove_thr`

# In[47]:


normed_embedding = embedding/np.array([np.sqrt(np.dot(gweight,gweight)) for gweight in embedding])[:,None]

nb_unknown_words = 100

glove_match = []
for w, idx in word2idx.items():
    if idx >= vocab_size-nb_unknown_words and w.isalpha() and w in word2glove:
        gidx = glove_index_dict[word2glove[w]]
        gweight = glove_embedding_weights[gidx,:].copy()
        # find the row in embedding with the highest cosine score against gweight
        gweight /= np.sqrt(np.dot(gweight,gweight))
        score = np.dot(normed_embedding[:vocab_size-nb_unknown_words], gweight)
        while True:
            embedding_idx = score.argmax()
            s = score[embedding_idx]
            if s < glove_thr:
                break
            if idx2word[embedding_idx] in word2glove:
                glove_match.append((w, embedding_idx, s))
                break
            score[embedding_idx] = -1
glove_match.sort(key=lambda x: -x[2])
print('# of glove substitutes found', len(glove_match))


# manually check that the worst substitutions we are going to make are good enough

# In[48]:


for orig, sub, score in glove_match[-10:]:
    print(score, orig, '=>', idx2word[sub])


# build a lookup table from the index of an outside word to the index of its inside substitute

# In[49]:


glove_idx2idx = dict((word2idx[w], embedding_idx) for w, embedding_idx, _ in glove_match)

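
# A sketch (not from the original notebook) of how glove_idx2idx could be used when encoding
# text for the model: indices outside vocab_size are remapped to their closest in-vocabulary
# GloVe match when one was found; oov_idx is a purely illustrative fallback slot.
def _to_model_idx(word, oov_idx=vocab_size - 1):
    i = word2idx[word]
    if i < vocab_size:
        return i
    return glove_idx2idx.get(i, oov_idx)
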
# # Data

# In[50]:


Y = [[word2idx[token] for token in headline.split()] for headline in heads]
len(Y)


# In[51]:


plt.hist([len(y) for y in Y], bins=50);
plt.show()


# In[52]:


X = [[word2idx[token] for token in d.split()] for d in desc]
len(X)


# In[53]:


plt.hist([len(x) for x in X], bins=50);
plt.show()


# In[54]:


with open('data/%s.pkl'%FN, 'wb') as fp:
    pickle.dump((embedding, idx2word, word2idx, glove_idx2idx), fp, -1)


# In[4]:


with open('data/%s.data.pkl'%FN, 'wb') as fp:
    pickle.dump((X, Y), fp, -1)

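
# Sanity check (not in the original notebook): reload the two pickles written above, the same
# way a downstream training script would, and confirm the pieces line up.
with open('data/%s.pkl'%FN, 'rb') as fp:
    _embedding, _idx2word, _word2idx, _glove_idx2idx = pickle.load(fp)
with open('data/%s.data.pkl'%FN, 'rb') as fp:
    _X, _Y = pickle.load(fp)
print(_embedding.shape, len(_X), len(_Y))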
