import collections
import math
import os
import random
import datetime

import numpy as np
import tensorflow as tf

from config import *
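# Skip-gram style item embedding training with side-information features:
# per-item attention weights merge the per-feature embeddings into one item
# vector, trained against a sampled-softmax objective.
# `config` is expected to provide DATA_PATH, MODEL_PATH and a configured
# `logger` (assumption based on how they are used below).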
data = []
id_map_raw = np.loadtxt(os.path.join(DATA_PATH, 'id_mapping'), dtype=str, delimiter='\t')
id_map = {id_map_raw[i, 0]: int(id_map_raw[i, 1]) for i in range(id_map_raw.shape[0])}
reversed_id_map = {int(id_map_raw[i, 1]): id_map_raw[i, 0] for i in range(id_map_raw.shape[0])}
side_info = np.loadtxt(os.path.join(DATA_PATH, 'side_info_feature'), dtype=int, delimiter='\t')
#side_info = side_info[:, :-1]
item_size, feature_size = side_info.shape
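# side_info: one row per item; column 0 appears to hold the (mapped) item id,
# the remaining columns hold categorical side-information feature ids.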
embedding_size = 128
n_sampled = 500
batch_size = 1024
num_steps = 250001  # roughly data_size / batch_size * n_epoch
every_k_step = 10000
num_skips = 2 # batch_size % num_skips == 0
window_size = 3
os.environ["TF_CPP_MIN_LOG_LEVEL"] = '2'
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
click_item = set()
def read_data(filename):
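    """Load walk sequences from `filename` into the global `data` list.

    Each line is a space-separated sequence of item ids; `window_size` empty
    markers ('') are appended after every sequence so that skip-gram windows
    never cross sequence boundaries.
    """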
global click_item
with open(filename) as f:
for line in f.readlines():
line = list(map(lambda x: int(x), line.strip().split(' ')))
line.extend([''] * window_size)
data.extend(line)
click_item = set(filter(lambda x: x != '', data))
logger.info("click item size: {0}".format(len(click_item)))
return data
def load_embedding():
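    """Load pre-trained product title embeddings into a dense matrix keyed by mapped item id."""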
rows = max(side_info[:, 0])+1
emb = np.zeros((rows, embedding_size))
hit = 0
with open(os.path.join(DATA_PATH, 'product_title_embedding')) as f:
for line in f.readlines():
line = line.strip().split('\t')
product_id = line[0]
row = id_map.get(product_id, -1)
if row >= 0:
hit += 1
for k in range(embedding_size):
emb[row][k] = float(line[k+1])
print("title embedding hit rate: {0}".format(hit/len(id_map)))
return emb
data_index = 0
offset = -window_size
def generate_batch(batch_size, num_skip, skip_window):
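    """Generate one skip-gram batch of (center item, context item) pairs.

    Walks are scanned left to right; every context position within
    `skip_window` of the center yields one pair, e.g. for [a, b, c] with
    skip_window=1: (a, b), (b, a), (b, c), (c, b). The '' sequence markers
    are skipped. `num_skip` is unused and kept only for signature
    compatibility with generate_batch_no_use.
    """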
global data_index
global offset
batch = np.ndarray(shape=(batch_size), dtype=np.int32)
labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
ind = 0
flag = False
while ind < batch_size:
for j in range(offset, skip_window+1):
if j == 0 or data_index+j < 0 or data[data_index] == '' or data[data_index+j] == '':
continue
batch[ind] = data[data_index]
labels[ind, 0] = data[data_index+j]
ind += 1
if ind == batch_size:
offset = j + 1
flag = True
break
if flag:
break
else:
data_index = (data_index + 1) % len(data)
offset = -skip_window
return batch, labels
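# The three helpers below build (feature, label) pairs inside the TF graph,
# an alternative to generate_batch; they are not used by the training loop
# in __main__.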
def ctx_idxx(target_idx, window_size, tokens):
"""Return positions of context words."""
ctx_range = tf.range(start=tf.maximum(tf.constant(0, dtype=tf.int32),
target_idx-window_size),
limit=tf.minimum(tf.size(input=tokens, out_type=tf.int32),
target_idx+window_size+1),
delta=1, dtype=tf.int32)
idx = tf.case({tf.less_equal(target_idx, window_size): lambda: target_idx,
tf.greater(target_idx, window_size): lambda: window_size},
exclusive=True)
t0 = lambda: tf.constant([], dtype=tf.int32)
t1 = lambda: ctx_range[idx+1:]
t2 = lambda: ctx_range[0:idx]
t3 = lambda: tf.concat([ctx_range[0:idx], ctx_range[idx+1:]], axis=0)
c1 = tf.logical_and(tf.equal(idx, 0),
tf.less(idx+1, tf.size(input=ctx_range, out_type=tf.int32)))
c2 = tf.logical_and(tf.greater(idx, 0),
tf.equal(idx+1, tf.size(input=ctx_range, out_type=tf.int32)))
c3 = tf.logical_and(tf.greater(idx, 0),
tf.less(idx+1, tf.size(input=ctx_range, out_type=tf.int32)))
return tf.case({c1: t1, c2: t2, c3: t3}, default=t0, exclusive=True)
def concat_to_features_and_labels(tokens, window_size):
"""Concatenate features and labels into Tensor."""
def internal_func(features, labels, target_idx):
ctxs = ctx_idxx(target_idx, window_size, tokens)
label = tf.reshape(tf.gather(tokens, ctxs), [-1, 1])
feature = tf.fill([tf.size(input=label)], tokens[target_idx])
return tf.concat([features, feature], axis=0), \
tf.concat([labels, label], axis=0), target_idx+1
return internal_func
def extract_examples(tokens, window_size, p_num_threads):
"""Extract (features, labels) examples from a list of tokens."""
features = tf.constant([], dtype=tf.int32)
labels = tf.constant([], shape=[0, 1], dtype=tf.int32)
target_idx = tf.constant(0, dtype=tf.int32)
concat_func = concat_to_features_and_labels(tokens, window_size)
max_size = tf.size(input=tokens, out_type=tf.int32)
idx_below_tokens_size = lambda w, x, idx: tf.less(idx, max_size)
result = tf.while_loop(
cond=idx_below_tokens_size,
body=concat_func,
loop_vars=[features, labels, target_idx],
shape_invariants=[tf.TensorShape([None]),
tf.TensorShape([None, 1]),
target_idx.get_shape()],
parallel_iterations=p_num_threads)
return result[0], result[1]
def generate_batch_no_use(batch_size, num_skips, skip_window):
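    """Classic word2vec batch generator, kept for reference and unused.

    Unlike generate_batch above, it does not handle the '' sequence markers
    inserted by read_data.
    """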
global data_index
batch = np.ndarray(shape=(batch_size), dtype=np.int32)
labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
span = 2 * skip_window + 1 # [ skip_window target skip_window ]
buffer = collections.deque(maxlen=span)
if data_index + span > len(data):
data_index = 0
buffer.extend(data[data_index:data_index + span])
data_index += span
for i in range(batch_size // num_skips):
target = skip_window
targets_to_avoid = [skip_window]
for j in range(num_skips):
while target in targets_to_avoid:
target = random.randint(0, span - 1)
targets_to_avoid.append(target)
batch[i * num_skips + j] = buffer[skip_window]
labels[i * num_skips + j, 0] = buffer[target]
if data_index == len(data):
for k in range(span):
buffer.append(data[k])
data_index = span
else:
buffer.append(data[data_index])
data_index += 1
data_index = (data_index + len(data) - span) % len(data)
return batch, labels
def get_cold_start_embedding():
    """Compute merged embeddings for items that never appear in the walk data.

    Mirrors the inline cold-start block in __main__ (currently unused there);
    it relies on the `session`, `merge_embedding`, `train_inputs`,
    `train_labels` and `is_train` objects created in __main__.
    """
    global click_item
    global ck_embedding
    cold_start_item = list(set(side_info[:, 0]).difference(click_item))
    item_cnt = len(cold_start_item)
    cnt = item_cnt // batch_size
    remain = (cnt + 1) * batch_size - item_cnt
    cold_start_item.extend([0] * remain)  # pad to a multiple of batch_size
    cold_start_item = np.array(cold_start_item)
    for i in range(cnt + 1):
        eval_input = cold_start_item[i * batch_size: (i + 1) * batch_size]
        eval_label = np.zeros((batch_size, 1))
        eval_embedding = session.run(merge_embedding,
                                     feed_dict={train_inputs: eval_input,
                                                train_labels: eval_label,
                                                is_train: 0})
        eval_embedding = eval_embedding.tolist()
        if i == cnt:
            eval_embedding = eval_embedding[:-remain]  # drop the padded entries
        ck_embedding.update({eval_input[k]: eval_embedding[k]
                             for k in range(len(eval_embedding))})
def dump_embedding(embedding_result, output_file):
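    """Merge `embedding_result` (keyed by mapped item id) into `output_file`, keyed by original id."""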
merged = {}
new = {}
    if os.path.exists(output_file):  # keep embeddings from a previous dump, if any
        with open(output_file) as f:
            for line in f.readlines():
                line = line.strip()
                k = line.split(" ")[0]
                merged[k] = [float(x) for x in line.split(" ")[1:]]
for k, v in embedding_result.items():
new[reversed_id_map[k]] = v
merged.update(new)
with open(output_file, 'w') as f:
for k, v in merged.items():
f.write("{0} {1}\n".format(k, " ".join(list(map(lambda x: str(x), v)))))
if __name__ == '__main__':
d = read_data(os.path.join(DATA_PATH, 'walk_seq'))
title_emb = load_embedding()
print("title embeding loaded")
graph = tf.Graph()
ck_embedding = {}
with graph.as_default():
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
embedding_list = []
is_train = tf.placeholder(tf.int32, shape=[])
for i in range(feature_size):
embedding = tf.Variable(tf.random_uniform((max(side_info[:, i])+2, embedding_size), -1, 1),
name='embedding_{}'.format(str(i)), trainable=True)
side_info_feature = tf.gather(side_info[:, i], train_inputs)
side_info_embed = tf.nn.embedding_lookup(embedding, side_info_feature)
embedding_list.append(side_info_embed)
        # pass trainable=False here to freeze the pre-trained title embedding
        title_embedding = tf.get_variable(name="embedding_{}".format(str(feature_size)),
                                          shape=title_emb.shape,
                                          initializer=tf.constant_initializer(title_emb))
side_info_title_emb = tf.nn.embedding_lookup(title_embedding, train_inputs)
embedding_list.append(side_info_title_emb)
feature_size += 1
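        # Per-item attention: alpha_embedding holds one logit per (item, feature);
        # the merged item embedding is a softmax(alpha)-weighted sum over the
        # stacked feature embeddings.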
alpha_embedding = tf.Variable(tf.random_uniform((item_size, feature_size), 0, 1), name='alpha')
stacked_embed = tf.stack(embedding_list, axis=-1)
alpha_index = tf.gather(side_info[:, 0], train_inputs)
alpha_embed = tf.nn.embedding_lookup(alpha_embedding, alpha_index)
alpha_embed_expand = tf.expand_dims(alpha_embed, 1)
alpha_i_sum = tf.reduce_sum(tf.exp(alpha_embed_expand), axis=-1)
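        # Hard-coded per-feature multipliers (one entry per feature, so the length
        # must equal feature_size); training and inference use different weights.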
feature_weight = tf.cond(is_train > 0, lambda: tf.constant([[1.0, 1.0, 1.0, 0.4, 1.0, 1.0, 1.0, 1.0, 1.8]]),\
lambda: tf.constant([[1.0, 1.0, 1.0, 1.5, 1.0, 1.0, 1.0, 1.0, 1.0]]))
weight_drop = tf.tile(feature_weight, [batch_size, 1])
weight_drop = tf.expand_dims(weight_drop, 1)
weight_drop = tf.tile(weight_drop, [1, embedding_size, 1])
merge_embedding = tf.reduce_sum(stacked_embed * tf.exp(alpha_embed_expand) * weight_drop, axis=-1) / alpha_i_sum
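        # nce_weights / nce_biases are defined but unused; the loss below uses
        # softmax_w / softmax_b instead.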
nce_weights = tf.Variable(tf.truncated_normal([item_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([item_size]))
softmax_w = tf.Variable(tf.truncated_normal((item_size, embedding_size), stddev=0.1), name='softmax_w')
softmax_b = tf.Variable(tf.zeros(item_size), name='softmax_b')
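        # Sampled-softmax loss: n_sampled classes are sampled per batch instead of
        # normalizing over all item_size classes.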
loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(
weights=softmax_w,
biases=softmax_b,
labels=train_labels,
inputs=merge_embedding,
num_sampled=n_sampled,
num_classes=item_size))
global_step = tf.Variable(0, trainable=False)
initial_learning_rate = 0.005
decay_steps = 20000
decay_rate = 0.9
learning_rate = tf.train.exponential_decay(initial_learning_rate,
global_step=global_step,
decay_steps=decay_steps,
decay_rate=decay_rate)
add_global = global_step.assign_add(1)
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)
config = tf.ConfigProto(allow_soft_placement=True)
config.gpu_options.allow_growth = True
with tf.Session(graph=graph, config=config) as session:
session.run(tf.global_variables_initializer())
logger.info('Model built')
trainable_var = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'title_embedding')
        avg_loss = 0
        final_loss = 0
        start_time = datetime.datetime.now()
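        # Training loop: ck_embedding caches the latest merged embedding for every
        # item that appears as a batch input.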
for step in range(1, num_steps):
batch_inputs, batch_labels = generate_batch(batch_size, num_skips, window_size)
feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
feed_dict[is_train] = 1
_, loss_val, merged_emb, _, lr = session.run([optimizer, loss, merge_embedding, add_global, learning_rate], feed_dict=feed_dict)
for i in range(batch_inputs.shape[0]):
ck_embedding[batch_inputs[i]] = merged_emb[i]
avg_loss += loss_val
final_loss += loss_val
            if step % every_k_step == 0:
                end_time = datetime.datetime.now()
                avg_loss /= every_k_step
                logger.info("step: {0}, loss: {1}, lr: {2}, elapsed: {3}".format(
                    step, avg_loss, lr, end_time - start_time))
                avg_loss = 0
                start_time = datetime.datetime.now()
logger.info("click item, {0}".format(len(ck_embedding)))
# cold start item embedding
cold_start_embedding = {}
visited_item = set(ck_embedding.keys())
#cold_start_item = list(set(side_info[:, 0]).difference(click_item))
cold_start_item = list(set(side_info[:, 0]).difference(visited_item))
item_size = len(cold_start_item)
cnt = item_size // batch_size
remain = (cnt + 1) * batch_size - item_size
cold_start_item.extend([0] * remain)
cold_start_item = np.array(cold_start_item)
for i in range(cnt+1):
eval_input = cold_start_item[i*batch_size: (i+1)*batch_size]
eval_label = np.zeros((batch_size, 1))
eval_embedding = session.run(merge_embedding, feed_dict={train_inputs: eval_input, train_labels: eval_label, is_train: 0})
eval_embedding = eval_embedding.tolist()
if i == cnt:
eval_embedding = eval_embedding[:-remain]
for k in range(len(eval_embedding)):
cold_start_embedding[eval_input[k]] = eval_embedding[k]
logger.info("cold start, {0}".format(len(cold_start_embedding)))
ck_embedding.update(cold_start_embedding)
dump_embedding(ck_embedding, os.path.join(MODEL_PATH, 'final_embedding'))
logger.info("{0}Finish{0}".format("="*10))
saver = tf.train.Saver()
saver.save(session, save_path=os.path.join(MODEL_PATH, "model"))