# Predict user quality with an LSTM: use users' page-click behaviour
# sequence data to classify users as good or bad.
# Code:
import numpy as np
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(2)
import pdb
import sys
import json
import numpy
import time
import pickle
import os
from sklearn.metrics import roc_auc_score
import tensorflow as tf
import keras
from keras.preprocessing import sequence
from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, Input, Lambda,LSTM, Bidirectional
from keras import backend as K
from attention2 import Attention
# Keys of the per-sample feature dict built during preprocessing; the first
# three are sequence inputs, 'input_normal' is the one-hot order-type vector.
INPUTS_NAME = ['input_sequence_dur_os', 'input_sequence_page', 'input_sequence_point', 'input_normal']
# Key of the binary label (1 = overdue/bad user, 0 = good user).
OUTPUTS_NAME = ['output']
# Default training-data path (NOTE(review): __main__ hardcodes its own path,
# so this constant appears unused here — confirm before removing).
TRAIN_FILE_NAME = "train_data_demo"
EPOCH_NUM = 3       # number of training epochs
CH_NUM = 3          # NOTE(review): not referenced in this file — purpose unclear
MAXLEN = 400        # sequence length after padding/truncation
BATCH_SIZE = 256    # mini-batch size for training
DUROS_LENGTH = 2    # per-timestep dense features kept: duration + os type
PAGE_ID_LENGTH=29   # page-id vocabulary size (embedding uses +1 for the pad id)
POINT_ID_LENGTH = 186   # point-id vocabulary size (embedding uses +1 for the pad id)
ORDER_TYPE_LENGTH = 4   # size of the order-type one-hot vector
def get_one_hot_vector(idx, length):
    """Return a one-hot list of size ``length`` with a 1 at position ``idx``.

    ``idx`` may be any value accepted by ``int()`` (e.g. a numeric string).
    """
    vector = [0 for _ in range(length)]
    vector[int(idx)] = 1
    return vector
def sequence_preprocess(sequences_ori):
    """Turn one user's raw event list into padded, model-ready sequences.

    Each raw event is expected to be indexable as
    [event_diff, duration, os_type, page_id, point_id] — TODO confirm
    against the producer of the input file.

    Returns a tuple ``(dur_os_sequence, page_sequence, point_sequence)``:
    a (MAXLEN, 2) float array of duration/os features plus two id
    sequences of length MAXLEN for the embedding layers.
    """
    features = []
    for event in sequences_ori:
        # +1 shifts every value except the time gap so that 0 stays
        # reserved for padding.
        features.append([
            float(event[0]),         # time gap to previous event
            float(event[1]) + 1,     # duration
            float(event[2]) + 1,     # os type
            float(event[3]) + 1,     # page id
            float(event[4]) + 1,     # point id
        ])
    feature_rows = numpy.array(features).T
    if not sequences_ori:
        # Empty session: substitute a single all-zero timestep.
        feature_rows = numpy.array([[0.0] * 5]).T
    padded = sequence.pad_sequences(feature_rows, maxlen=MAXLEN, dtype='float32', value=0.0)
    dur_os_sequence = padded[:3, :].T
    # Leading (pre-)padded timesteps have a 0 time gap; overwrite them with a
    # large sentinel (presumably 30 days in seconds — verify) so padding is
    # distinguishable from a genuine small gap. Stop at the first real event.
    for timestep in dur_os_sequence:
        if timestep[0] != 0:
            break
        timestep[0] = 2592000.0
    # Drop the event_diff column; only duration and os type feed the model.
    dur_os_sequence = dur_os_sequence[:, 1:]
    page_sequence = padded[3, :]
    point_sequence = padded[4, :]
    return dur_os_sequence, page_sequence, point_sequence
def line_preprocess(line, input_output_dict):
    """Parse one tab-separated sample line and append its features and label.

    Expected columns: cid, sequence length, date, label, order-type id,
    JSON-encoded event sequence. The cid / length / date columns are
    present in the data but unused here, so they are no longer bound to
    locals (the original created three dead variables).

    Returns the same ``input_output_dict``, mutated in place.
    """
    fields = line.strip().split('\t')
    # fields[0]=cid, fields[1]=sequence length, fields[2]=date — unused.
    label = int(fields[3])  # 1 -> overdue days > 0 (bad user), 0 otherwise
    cid_type_vector = get_one_hot_vector(int(fields[4]), ORDER_TYPE_LENGTH)
    sequences_ori = json.loads(fields[5])
    dur_os_sequence, page_sequence, point_sequence = sequence_preprocess(sequences_ori)
    input_output_dict['input_sequence_dur_os'].append(dur_os_sequence)
    input_output_dict['input_sequence_page'].append(page_sequence)
    input_output_dict['input_sequence_point'].append(point_sequence)
    input_output_dict['input_normal'].append(cid_type_vector)
    input_output_dict['output'].append(label)
    return input_output_dict
def initial_input_output():
    """Return a fresh dict mapping every input/output name to an empty list.

    Replaces the original comprehension-plus-loop with a single dict
    comprehension over the concatenated name lists (same result).
    """
    return {name: [] for name in INPUTS_NAME + OUTPUTS_NAME}
def trans_np(input_output_dict):
    """Convert every accumulated list into a float32 numpy array and split
    the dict into ``(inputs, outputs)`` keyed by the model's tensor names.

    Mutates ``input_output_dict`` in place (lists become arrays).
    """
    for key in input_output_dict:
        input_output_dict[key] = numpy.array(input_output_dict[key]).astype('float32')
    inputs = {name: input_output_dict[name] for name in INPUTS_NAME}
    outputs = {name: input_output_dict[name] for name in OUTPUTS_NAME}
    return (inputs, outputs)
def generate_validation_set(path):
    """Load a whole tab-separated data file into model-ready numpy arrays.

    Each line is parsed by ``line_preprocess``; the accumulated lists are
    converted by ``trans_np``. Returns ``(inputs_dict, outputs_dict)``.

    Fix: the original opened the file without a context manager, leaking
    the handle if any line failed to parse; ``with`` guarantees closure.
    """
    input_output_dict = initial_input_output()
    with open(path, 'r') as fin:
        for line in fin:
            input_output_dict = line_preprocess(line, input_output_dict)
    return trans_np(input_output_dict)
def lstm_weighted_attention(inputs):
    """Collapse per-timestep LSTM outputs into one vector via attention.

    ``inputs`` is ``[lstm_out, attention]`` — presumably shapes
    (batch, T, units) and (batch, T); TODO confirm against the Attention
    layer. Computes ``sum_t a_t * h_t`` over the time axis.
    The prints log symbolic shapes once at graph-build time.
    """
    lstm_out, attention = inputs
    print("attention:", attention.shape)
    # Add a trailing axis so the weights broadcast over the hidden units.
    a = K.expand_dims(attention, axis=-1)
    print("a.shape:", a.shape)
    print("lstm_out:", lstm_out.shape)
    weighted = lstm_out * a
    print("h_after_a:", weighted.shape)
    pooled = K.sum(weighted, axis=1)
    print("h_after_a:", pooled.shape)
    return pooled
def auc(y_true, y_pred):
    """Keras metric: ROC-AUC computed by sklearn wrapped in ``tf.py_func``.

    Runs eagerly on each batch's labels/predictions inside the TF1 graph.
    """
    score = tf.py_func(roc_auc_score, (y_true, y_pred), tf.double)
    return score
if __name__ == "__main__":
    path = "train_data_demo"
    test_path = "test_data_demo"
    x_train, y_train = generate_validation_set(path)
    x_test, y_test = generate_validation_set(test_path)

    # --- Model inputs ---
    input_sequence_dur_os = Input(shape=(MAXLEN, DUROS_LENGTH,), dtype='float32', name='input_sequence_dur_os')
    input_sequence_page = Input(shape=(MAXLEN,), dtype='float32', name='input_sequence_page')
    input_sequence_point = Input(shape=(MAXLEN,), dtype='float32', name='input_sequence_point')

    # Embed the id sequences; vocab is +1 because ids were shifted to keep 0
    # as the padding value during preprocessing.
    page_embedding = Embedding(output_dim=16, input_dim=PAGE_ID_LENGTH + 1, input_length=MAXLEN, name='page_embedding')(input_sequence_page)
    point_embedding = Embedding(output_dim=128, input_dim=POINT_ID_LENGTH + 1, input_length=MAXLEN, name='point_embedding')(input_sequence_point)

    # Per-timestep feature vector: 2 (duration, os) + 16 + 128 dims.
    input_sequence = keras.layers.concatenate([input_sequence_dur_os, page_embedding, point_embedding])

    lstm_out = LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)(input_sequence)
    attention = Attention(name='attention')(lstm_out)
    lstm_after_attention = Lambda(lstm_weighted_attention, output_shape=(128,), name='lstm_weighted_a')([lstm_out, attention])
    x = Dense(128, activation='relu')(lstm_after_attention)
    main_output = Dense(1, activation='sigmoid', name='output')(x)

    # NOTE(review): x_train also contains 'input_normal' (the order-type
    # one-hot), but it is not wired into the model — confirm whether it was
    # meant to be a fourth input.
    model = Model(inputs=[input_sequence_dur_os, input_sequence_page, input_sequence_point], outputs=[main_output])
    print(model.summary())
    model.compile(loss='binary_crossentropy',
                  optimizer='adagrad',  # 'adam' was the noted alternative
                  metrics=[auc])
    # BUG FIX: the original passed `steps_per_epoch = BATCH_NUM`, but
    # BATCH_NUM is never defined anywhere in this file (NameError at fit
    # time). Since the data is fully in memory, train with the configured
    # BATCH_SIZE instead.
    history = model.fit(x_train, y_train,
                        epochs=EPOCH_NUM,
                        batch_size=BATCH_SIZE,
                        verbose=1)
    pred_test = model.predict(x_test, verbose=1)[:, 0]
    print('pred_test shape: %s' % str(pred_test.shape))
    true_test = y_test['output']
    print('TEST_AUC: ', roc_auc_score(true_test, pred_test))