So that more friends can experience the same thrill I did!
First, make sure your TensorFlow is up to date! (I am on 1.9.0.)
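If you are not sure what you are running, a quick throwaway check from Python (this snippet is mine, not part of the course files) is:

import tensorflow as tf
# the scripts below were run with TensorFlow 1.9.0; print your installed version to compare
print(tf.__version__)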
Then:
(The code below comes from Stanford's CS 20 course, "TensorFlow for Deep Learning Research".)
utils.py
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 02 15:39:24 2018
@author: lulu
"""
import os
import gzip
import shutil
import struct
import urllib.request

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

from matplotlib import pyplot as plt
import numpy as np
import tensorflow as tf


def huber_loss(labels, predictions, delta=14.0):
    """ Huber loss: quadratic for small residuals, linear for large ones.
    Note: tf.cond needs a scalar predicate, so labels/predictions should be scalars here.
    """
    residual = tf.abs(labels - predictions)
    def f1(): return 0.5 * tf.square(residual)
    def f2(): return delta * residual - 0.5 * tf.square(delta)
    return tf.cond(residual < delta, f1, f2)


def safe_mkdir(path):
    """ Create a directory if there isn't one already. """
    try:
        os.mkdir(path)
    except OSError:
        pass


def read_birth_life_data(filename):
    """
    Read in birth_life_2010.txt and return:
    data in the form of NumPy array
    n_samples: number of samples
    """
    text = open(filename, 'r').readlines()[1:]
    data = [line[:-1].split('\t') for line in text]
    births = [float(line[1]) for line in data]
    lifes = [float(line[2]) for line in data]
    data = list(zip(births, lifes))
    n_samples = len(data)
    data = np.asarray(data, dtype=np.float32)
    return data, n_samples


def download_one_file(download_url,
                      local_dest,
                      expected_byte=None,
                      unzip_and_remove=False):
    """
    Download the file from download_url into local_dest
    if the file doesn't already exist.
    If expected_byte is provided, check that
    the downloaded file has that number of bytes.
    If unzip_and_remove is True, unzip the file and remove the archive.
    """
    if os.path.exists(local_dest) or os.path.exists(local_dest[:-3]):
        print('%s already exists' % local_dest)
    else:
        print('Downloading %s' % download_url)
        local_file, _ = urllib.request.urlretrieve(download_url, local_dest)
        file_stat = os.stat(local_dest)
        if expected_byte:
            if file_stat.st_size == expected_byte:
                print('Successfully downloaded %s' % local_dest)
                if unzip_and_remove:
                    with gzip.open(local_dest, 'rb') as f_in, open(local_dest[:-3], 'wb') as f_out:
                        shutil.copyfileobj(f_in, f_out)
                    os.remove(local_dest)
            else:
                print('The downloaded file has an unexpected number of bytes')
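As a quick sanity check of huber_loss (a toy example of my own, not part of the course code): for a residual below delta the quadratic branch applies, otherwise the linear one.

import tensorflow as tf
from utils import huber_loss

# residual = |3.0 - 1.0| = 2.0 < delta, quadratic branch: 0.5 * 2.0**2 = 2.0
small = huber_loss(tf.constant(3.0), tf.constant(1.0), delta=14.0)
# residual = 20.0 >= delta, linear branch: 14 * 20 - 0.5 * 14**2 = 182.0
large = huber_loss(tf.constant(21.0), tf.constant(1.0), delta=14.0)

with tf.Session() as sess:
    print(sess.run([small, large]))   # expect [2.0, 182.0]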
word2vec_utils.py
from collections import Counter
import random
import os
import sys
sys.path.append('..')
import zipfile

import numpy as np
from six.moves import urllib
import tensorflow as tf

import utils


def read_data(file_path):
    """ Read data into a list of tokens
    There should be 17,005,207 tokens
    """
    with zipfile.ZipFile(file_path) as f:
        words = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return words


def build_vocab(words, vocab_size, visual_fld):
    """ Build vocabulary of VOCAB_SIZE most frequent words and write it to
    visualization/vocab.tsv
    """
    utils.safe_mkdir(visual_fld)
    file = open(os.path.join(visual_fld, 'vocab.tsv'), 'w')

    dictionary = dict()
    count = [('UNK', -1)]
    index = 0
    count.extend(Counter(words).most_common(vocab_size - 1))

    for word, _ in count:
        dictionary[word] = index
        index += 1
        file.write(word + '\n')

    index_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    file.close()
    return dictionary, index_dictionary


def convert_words_to_index(words, dictionary):
    """ Replace each word in the dataset with its index in the dictionary """
    return [dictionary[word] if word in dictionary else 0 for word in words]


def generate_sample(index_words, context_window_size):
    """ Form training pairs according to the skip-gram model. """
    for index, center in enumerate(index_words):
        context = random.randint(1, context_window_size)
        # get a random target before the center word
        for target in index_words[max(0, index - context): index]:
            yield center, target
        # get a random target after the center word
        for target in index_words[index + 1: index + context + 1]:
            yield center, target


def most_common_words(visual_fld, num_visualize):
    """ Create a list of the num_visualize most frequent words to visualize on TensorBoard.
    Saved to visualization/vocab_[num_visualize].tsv
    """
    words = open(os.path.join(visual_fld, 'vocab.tsv'), 'r').readlines()[:num_visualize]
    words = [word for word in words]
    file = open(os.path.join(visual_fld, 'vocab_' + str(num_visualize) + '.tsv'), 'w')
    for word in words:
        file.write(word)
    file.close()


def batch_gen(download_url, expected_byte, vocab_size, batch_size,
              skip_window, visual_fld):
    """ Download text8 if needed, then yield (center, target) batches forever. """
    local_dest = 'data/text8.zip'
    utils.download_one_file(download_url, local_dest, expected_byte)
    words = read_data(local_dest)
    dictionary, _ = build_vocab(words, vocab_size, visual_fld)
    index_words = convert_words_to_index(words, dictionary)
    del words  # to save memory
    single_gen = generate_sample(index_words, skip_window)

    while True:
        center_batch = np.zeros(batch_size, dtype=np.int32)
        target_batch = np.zeros([batch_size, 1])
        for index in range(batch_size):
            center_batch[index], target_batch[index] = next(single_gen)
        yield center_batch, target_batch
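To see what generate_sample produces, here is a small check I find helpful (a toy example with made-up indices, not part of the original files): each center word is paired with neighbours inside a randomly chosen window of at most context_window_size words on each side.

import random
from word2vec_utils import generate_sample

random.seed(0)                       # make the random window sizes repeatable
toy_sentence = [4, 8, 15, 16, 23]    # word indices for a 5-token "sentence"
pairs = list(generate_sample(toy_sentence, context_window_size=2))
print(pairs)   # e.g. [(4, 8), (8, 4), (8, 15), ...], i.e. (center, context) index pairs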
word2vec.py
""" starter code for word2vec skip-gram model with NCE loss
CS 20: "TensorFlow for Deep Learning Research"
cs20.stanford.edu
Chip Huyen (chiphuyen@cs.stanford.edu)
Lecture 04
"""
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import numpy as np
from tensorflow.contrib.tensorboard.plugins import projector
import tensorflow as tf

import utils
import word2vec_utils

# Model hyperparameters
VOCAB_SIZE = 50000
BATCH_SIZE = 128
EMBED_SIZE = 128            # dimension of the word embedding vectors
SKIP_WINDOW = 1             # the context window
NUM_SAMPLED = 64            # number of negative examples to sample
LEARNING_RATE = 1.0
NUM_TRAIN_STEPS = 100000
VISUAL_FLD = 'visualization'
SKIP_STEP = 5000

# Parameters for downloading data
DOWNLOAD_URL = 'http://mattmahoney.net/dc/text8.zip'
EXPECTED_BYTES = 31344016
NUM_VISUALIZE = 2000        # number of tokens to visualize


def word2vec(dataset):
    """ Build the graph for word2vec model and train it """
    # Step 1: get input, output from the dataset
    iterator = dataset.make_initializable_iterator()
    center_words, target_words = iterator.get_next()

    # Step 2: define weights.
    # In word2vec, it's the weights that we care about
    embed_matrix = tf.get_variable('embed_matrix',
                                   shape=[VOCAB_SIZE, EMBED_SIZE],
                                   initializer=tf.random_uniform_initializer())

    # Step 3: define the inference
    embed = tf.nn.embedding_lookup(embed_matrix, center_words, name='embed')

    # Step 4: define loss function
    # construct variables for NCE loss
    nce_weight = tf.get_variable('nce_weight',
                                 shape=[VOCAB_SIZE, EMBED_SIZE],
                                 initializer=tf.truncated_normal_initializer(stddev=1.0 / (EMBED_SIZE ** 0.5)))
    nce_bias = tf.get_variable('nce_bias', initializer=tf.zeros([VOCAB_SIZE]))

    # define loss function to be NCE loss function
    loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weight,
                                         biases=nce_bias,
                                         labels=target_words,
                                         inputs=embed,
                                         num_sampled=NUM_SAMPLED,
                                         num_classes=VOCAB_SIZE), name='loss')

    # Step 5: define optimizer that follows gradient descent update rule
    # to minimize loss
    optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(loss)

    utils.safe_mkdir('checkpoints')

    with tf.Session() as sess:
        # Step 6: initialize iterator and variables
        sess.run(iterator.initializer)
        sess.run(tf.global_variables_initializer())

        total_loss = 0.0  # we use this to calculate the average loss over the last SKIP_STEP steps
        writer = tf.summary.FileWriter('graphs/word2vec_simple', sess.graph)

        for index in range(NUM_TRAIN_STEPS):
            try:
                # Step 7: execute optimizer and fetch loss
                loss_batch, _ = sess.run([loss, optimizer])
                total_loss += loss_batch
                if (index + 1) % SKIP_STEP == 0:
                    print('Average loss at step {}: {:5.1f}'.format(index, total_loss / SKIP_STEP))
                    total_loss = 0.0
            except tf.errors.OutOfRangeError:
                sess.run(iterator.initializer)
        writer.close()


def gen():
    yield from word2vec_utils.batch_gen(DOWNLOAD_URL, EXPECTED_BYTES, VOCAB_SIZE,
                                        BATCH_SIZE, SKIP_WINDOW, VISUAL_FLD)


def main():
    utils.safe_mkdir('data')
    dataset = tf.data.Dataset.from_generator(gen,
                                             (tf.int32, tf.int32),
                                             (tf.TensorShape([BATCH_SIZE]), tf.TensorShape([BATCH_SIZE, 1])))
    word2vec(dataset)


if __name__ == '__main__':
    main()
Run word2vec.py, and it trains successfully!
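The interesting output is embed_matrix. The script above only writes the graph for TensorBoard (view it with tensorboard --logdir graphs/word2vec_simple), but if you additionally fetch the matrix at the end of word2vec(), for example final_embeddings = sess.run(embed_matrix), a plain NumPy cosine-similarity lookup is enough to eyeball nearest neighbours. The sketch below is my own and assumes you have that matrix plus the dictionary and index_dictionary returned by build_vocab.

import numpy as np

def nearest_neighbors(final_embeddings, dictionary, index_dictionary, word, top_k=8):
    """ Return the top_k words whose embeddings are most cosine-similar to `word`.
    Assumes final_embeddings is the [VOCAB_SIZE, EMBED_SIZE] matrix fetched after training. """
    norm = final_embeddings / np.linalg.norm(final_embeddings, axis=1, keepdims=True)
    query = norm[dictionary[word]]         # normalized vector of the query word
    sims = norm @ query                    # cosine similarity against every row
    best = np.argsort(-sims)[1:top_k + 1]  # skip the first hit, which is the word itself
    return [index_dictionary[i] for i in best]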