一.准备工作
import argparse
import datetime
import time
import tensorflow as tf
import numpy as np
import scipy.sparse as sp
import sys
import json
1.随机数
# Seed both NumPy and TensorFlow from the wall clock so each run differs
# between invocations while the two libraries share a single seed value.
seed = int(time.time())
tf.set_random_seed(seed)
np.random.seed(seed)
2.设置命令行属性默认值
# Command-line interface: every hyper-parameter gets a flag with a default,
# so running with no arguments reproduces the standard ml_100k setup.
ap = argparse.ArgumentParser()

# dataset name
ap.add_argument("-d", "--dataset", type=str, default="ml_100k", help="Dataset string.")
# optimizer learning rate
ap.add_argument("-lr", "--learning_rate", type=float, default=0.01, help="Learning rate")
# number of training epochs
ap.add_argument("-e", "--epochs", type=int, default=2500, help="Number training epochs")
# hidden units of the two graph-convolution layers
ap.add_argument("-hi", "--hidden", type=int, nargs=2, default=[500, 75], help="Number hidden units in 1st and 2nd layer")
# hidden units of the dense side-feature layer
ap.add_argument("-fhi", "--feat_hidden", type=int, default=64, help="Number hidden units in the dense layer for features")
# accumulation function across rating-specific supports
ap.add_argument("-ac", "--accumulation", type=str, default="stack", help="Accumulation function: sum or stack.")
# dropout fraction
ap.add_argument("-do", "--dropout", type=float, default=0.7, help="Dropout fraction")
# number of basis functions for the mixture-model GCN
ap.add_argument("-nb", "--num_basis_functions", type=int, default=2, help="Number of basis functions for Mixture Model GCN.")
# seed used when shuffling the larger datasets
ap.add_argument("-ds", "--data_seed", type=int, default=1234,
                help="""Seed used to shuffle data in data_utils, taken from cf-nade (1234, 2341, 3412, 4123, 1324).
Only used for ml_1m and ml_10m datasets. """)
# summaries directory: 'logs/' plus the current timestamp
ap.add_argument("-sdir", "--summaries_dir", type=str,
                default='logs/' + str(datetime.datetime.now()).replace(' ', '_'),
                help="Directory for saving tensorflow summaries.")
_StoreAction(option_strings=['-sdir', '--summaries_dir'], dest='summaries_dir', nargs=None, const=None, default='logs/2021-05-07_15:36:47.432757', type=<class 'str'>, choices=None, help='Directory for saving tensorflow summaries.', metavar=None)
设置布尔量默认值
# Mutually exclusive boolean switches; set_defaults supplies the value
# used when neither flag of a pair is passed on the command line.

# global normalization: symmetric (default) vs left
group = ap.add_mutually_exclusive_group(required=False)
group.add_argument('-nsym', '--norm_symmetric', dest='norm_symmetric',
                   help="Option to turn on symmetric global normalization", action='store_true')
group.add_argument('-nleft', '--norm_left', dest='norm_symmetric',
                   help="Option to turn on left global normalization", action='store_false')
ap.set_defaults(norm_symmetric=True)

# side features on/off (default: off)
group = ap.add_mutually_exclusive_group(required=False)
group.add_argument('-f', '--features', dest='features',
                   help="Whether to use features (1) or not (0)", action='store_true')
group.add_argument('-no_f', '--no_features', dest='features',
                   help="Whether to use features (1) or not (0)", action='store_false')
ap.set_defaults(features=False)

# summary writing on/off (default: off)
group = ap.add_mutually_exclusive_group(required=False)
group.add_argument('-ws', '--write_summary', dest='write_summary',
                   help="Option to turn on summary writing", action='store_true')
group.add_argument('-no_ws', '--no_write_summary', dest='write_summary',
                   help="Option to turn off summary writing", action='store_false')
ap.set_defaults(write_summary=False)

# evaluate on the test set vs the validation set (default: validation)
group = ap.add_mutually_exclusive_group(required=False)
group.add_argument('-t', '--testing', dest='testing',
                   help="Option to turn on test set evaluation", action='store_true')
group.add_argument('-v', '--validation', dest='testing',
                   help="Option to only use validation set evaluation", action='store_false')
ap.set_defaults(testing=False)
打印默认设置
# Parse an empty argv (notebook-safe: ignores the kernel's own flags)
# and expose the settings as a plain dict for printing/lookup.
parsed = ap.parse_args(args=[])
args = vars(parsed)
print('Settings:')
print(args, '\n')
Settings:
{'dataset': 'ml_100k',
'learning_rate': 0.01,
'epochs': 2500,
'hidden': [500, 75],
'feat_hidden': 64,
'accumulation': 'stack',
'dropout': 0.7,
'num_basis_functions': 2,
'data_seed': 1234,
'summaries_dir': 'logs/2021-05-07_15:36:47.432757',
'norm_symmetric': True,
'features': False,
'write_summary': False,
'testing': False}
自设参数
# --- Manual hyper-parameter overrides (used instead of the argparse values) ---
DATASET = 'ml_100k'
DATASEED = 1234            # shuffle seed (only relevant for ml_1m / ml_10m)
NB_EPOCH = 3000            # number of training epochs
DO = 0.7                   # dropout fraction
HIDDEN = [500, 75]         # hidden units of the two graph-convolution layers
FEATHIDDEN = 64            # hidden units of the dense side-feature layer
BASES = 2                  # number of basis functions for the Mixture Model GCN
LR = 0.01                  # learning rate
WRITESUMMARY = True        # write tensorflow summaries
# BUG FIX: the original trailing `.replace(':', ':')` was a no-op; replace
# colons with '-' so the timestamped directory name is also a legal path on
# Windows (':' is forbidden in Windows file names).
SUMMARIESDIR = './logs/' + str(datetime.datetime.now()).replace(' ', '_').replace(':', '-')
FEATURES = False           # do not use side features (one-hot node ids only)
SYM = False                # left normalization instead of symmetric
TESTING = False            # evaluate on the validation set, not the test set
ACCUM = 'stack'            # accumulation function: 'sum' or 'stack'
SELFCONNECTIONS = False    # no self connections in the bipartite graph
SPLITFROMFILE = True       # use the split files shipped with the dataset
VERBOSE = True
NUMCLASSES = 5             # number of distinct rating levels
3. 数据预处理
# Load the official MovieLens u1.base/u1.test split; 20% of the training
# ratings are held back as a validation set. Returns sparse user/item
# feature matrices, the training adjacency matrix, and per-split label and
# index arrays (train/val/test).
# NOTE(review): load_official_trainvaltest_split is not imported in this
# excerpt — per the surrounding notes it lives in the project's
# `preprocessing` module; confirm the import exists in the full file.
print("Using official MovieLens dataset split u1.base/u1.test with 20% validation set size...")
u_features, v_features, adj_train, train_labels, train_u_indices, train_v_indices, \
    val_labels, val_u_indices, val_v_indices, test_labels, \
    test_u_indices, test_v_indices, class_values = load_official_trainvaltest_split(DATASET, TESTING)
1)调用preprocessing中的load_official_trainvaltest_split函数
参数:原始数据集,采用验证集
返回:
u_features,v_features: 用户和电影各自的特征矩阵943×23,1682×18
rating_mx_train: 用户和电影的邻接矩阵943×1682
train_labels: 训练集的索引列表,64000
u_train_idx, v_train_idx: 64000
val_labels:16000
u_val_idx, v_val_idx:16000
test_labels:20000
u_test_idx, v_test_idx:20000
class_values:5dtypes = { 'u_nodes': np.int32, 'v_nodes': np.int32, 'ratings': np.float32, 'timestamp': np.float64} filename_train = 'data/' + dataset + '/u1.base' filename_test = 'data/' + dataset + '/u1.test' # 将base和test文件读取成字典,训练集为80000*4,测试集为20000*4 data_train = pd.read_csv( filename_train, sep=sep, header=None, names=['u_nodes', 'v_nodes', 'ratings', 'timestamp'], dtype=dtypes) data_test = pd.read_csv( filename_test, sep=sep, header=None, names=['u_nodes', 'v_nodes', 'ratings', 'timestamp'], dtype=dtypes) # 转成np格式的矩阵 data_array_train = data_train.values.tolist() data_array_train = np.array(data_array_train) data_array_test = data_test.values.tolist() data_array_test = np.array(data_array_test) #将训练集,测试集两个矩阵上下拼接 data_array = np.concatenate([data_array_train, data_array_test], axis=0) #得到用户序号·电影序号·评级的三列向量 u_nodes_ratings = data_array[:, 0].astype(dtypes['u_nodes']) v_nodes_ratings = data_array[:, 1].astype(dtypes['v_nodes']) ratings = data_array[:, 2].astype(dtypes['ratings']) u_nodes_ratings, u_dict, num_users = map_data(u_nodes_ratings) v_nodes_ratings, v_dict, num_items = map_data(v_nodes_ratings)
①调用data_utils中的map_data函数
参数: data 一列数据
返回: data 数据替换成排序后对应的索引
id_dict 索引字典
n 去重后的数组长度#对该数组去重并排序,即1~943 uniq = list(set(data)) #得到该数组对应的{索引:数值}字典,即索引=数值-1 id_dict = {old: new for new, old in enumerate(sorted(uniq))} #将原本的data中每个数值替换成它的索引 data = np.array(list(map(lambda x: id_dict[x], data))) n = len(uniq)
u_nodes_ratings, v_nodes_ratings = u_nodes_ratings.astype(np.int64), v_nodes_ratings.astype(np.int32) ratings = ratings.astype(np.float64) u_nodes = u_nodes_ratings v_nodes = v_nodes_ratings neutral_rating = -1 #得到评级的索引字典 rating_dict = {r: i for i, r in enumerate(np.sort(np.unique(ratings)).tolist())} #初始化一个全为-1的943*1682的矩阵,然后填入对应评级,得到二部图邻接矩阵 labels = np.full((num_users, num_items), neutral_rating, dtype=np.int32) labels[u_nodes, v_nodes] = np.array([rating_dict[r] for r in ratings]) for i in range(len(u_nodes)): assert(labels[u_nodes[i], v_nodes[i]] == rating_dict[ratings[i]]) #再将这个矩阵拉成一行 labels = labels.reshape([-1]) #划分数据集 num_train = data_array_train.shape[0] num_test = data_array_test.shape[0] num_val = int(np.ceil(num_train * 0.2)) num_train = num_train - num_val #得到【用户,电影】的对组成的数组 pairs_nonzero = np.array([[u, v] for u, v in zip(u_nodes, v_nodes)]) #得到 第u号用户对第v号电影的评级在labels数组中的索引 组成的数组,相当于是邻接矩阵中有评级的值在拉长后的数组中的索引 idx_nonzero = np.array([u * num_items + v for u, v in pairs_nonzero]) for i in range(len(ratings)): assert(labels[idx_nonzero[i]] == rating_dict[ratings[i]]) #再将其对应划分数据集 idx_nonzero_train = idx_nonzero[0:num_train+num_val] idx_nonzero_test = idx_nonzero[num_train+num_val:] pairs_nonzero_train = pairs_nonzero[0:num_train+num_val] pairs_nonzero_test = pairs_nonzero[num_train+num_val:] #将训练集打乱 rand_idx = list(range(len(idx_nonzero_train))) np.random.seed(42) np.random.shuffle(rand_idx) idx_nonzero_train = idx_nonzero_train[rand_idx] pairs_nonzero_train = pairs_nonzero_train[rand_idx] #将打乱后的训练集与测试集上下拼接得到新的总数据集 idx_nonzero = np.concatenate([idx_nonzero_train, idx_nonzero_test], axis=0) pairs_nonzero = np.concatenate([pairs_nonzero_train, pairs_nonzero_test], axis=0) #再对打乱后的数据划分数据集 val_idx = idx_nonzero[0:num_val] train_idx = idx_nonzero[num_val:num_train + num_val] test_idx = idx_nonzero[num_train + num_val:] assert(len(test_idx) == num_test) val_pairs_idx = pairs_nonzero[0:num_val] train_pairs_idx = pairs_nonzero[num_val:num_train + num_val] test_pairs_idx 
= pairs_nonzero[num_train + num_val:] #通过求转置,得到分别的用户和电影的索引,然后得到的评级 u_test_idx, v_test_idx = test_pairs_idx.transpose() u_val_idx, v_val_idx = val_pairs_idx.transpose() u_train_idx, v_train_idx = train_pairs_idx.transpose() train_labels = labels[train_idx] val_labels = labels[val_idx] test_labels = labels[test_idx] if testing: #如果选择测试集分法,那么把训练集和验证合并 u_train_idx = np.hstack([u_train_idx, u_val_idx]) v_train_idx = np.hstack([v_train_idx, v_val_idx]) train_labels = np.hstack([train_labels, val_labels]) # for adjacency matrix construction train_idx = np.hstack([train_idx, val_idx]) #初始化训练集的邻接矩阵,填入对应的真实评级,即索引+1,然后做成压缩矩阵csr rating_mx_train = np.zeros(num_users * num_items, dtype=np.float32) rating_mx_train[train_idx] = labels[train_idx].astype(np.float32) + 1. rating_mx_train = sp.csr_matrix(rating_mx_train.reshape(num_users, num_items)) #得到去重并排序后的评级数组 class_values = np.sort(np.unique(ratings)) # 整理电影辅助信息,即特征,将电影的题材构造成0,1表示的18维向量,即生成1682*18的矩阵 sep = r'|' movie_file = 'data/' + dataset + '/u.item' movie_headers = ['movie id', 'movie title', 'release date', 'video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'] movie_df = pd.read_csv(movie_file, sep=sep, header=None, names=movie_headers, engine='python') genre_headers = movie_df.columns.values[6:] num_genres = genre_headers.shape[0] v_features = np.zeros((num_items, num_genres), dtype=np.float32) for movie_id, g_vec in zip(movie_df['movie id'].values.tolist(), movie_df[genre_headers].values.tolist()): if movie_id in v_dict.keys(): v_features[v_dict[movie_id], :] = g_vec # 整理用户辅助信息,即特征,生成943*23的矩阵 sep = r'|' users_file = 'data/' + dataset + '/u.user' users_headers = ['user id', 'age', 'gender', 'occupation', 'zip code'] users_df = pd.read_csv(users_file, sep=sep, header=None, names=users_headers, engine='python') occupation = 
set(users_df['occupation'].values.tolist()) age = users_df['age'].values #年龄做标准化 age_max = age.max() gender_dict = {'M': 0., 'F': 1.} #职业的计数字典,共21个职业,从2开始计数 occupation_dict = {f: i for i, f in enumerate(occupation, start=2)} num_feats = 2 + len(occupation_dict) u_features = np.zeros((num_users, num_feats), dtype=np.float32) for _, row in users_df.iterrows(): u_id = row['user id'] if u_id in u_dict.keys(): # age u_features[u_dict[u_id], 0] = row['age'] / np.float(age_max) # gender u_features[u_dict[u_id], 1] = gender_dict[row['gender']] # occupation u_features[u_dict[u_id], occupation_dict[row['occupation']]] = 1. u_features = sp.csr_matrix(u_features) v_features = sp.csr_matrix(v_features) print("User features shape: "+str(u_features.shape)) print("Item features shape: "+str(v_features.shape))
Using official MovieLens dataset split u1.base/u1.test with 20% validation set size…
User features shape: (943, 23)
Item features shape: (1682, 18)
# Graph dimensions taken from the training adjacency matrix
# (943 users x 1682 items for ml_100k).
num_users, num_items = adj_train.shape
# Width of the side-feature input; stays 0 unless FEATURES is enabled below.
num_side_features = 0
4.判断是否使用特征
# Build node input features. Without side information the model feeds
# one-hot node ids; with side information the features are normalized and
# kept in a separate *_side matrix while the node inputs stay one-hot.
if not FEATURES:
    # No side information: identity matrices act as one-hot node features,
    # stacked into one block-diagonal matrix over users and items.
    u_features = sp.identity(num_users, format='csr')
    v_features = sp.identity(num_items, format='csr')
    u_features, v_features = preprocess_user_item_features(u_features, v_features)
elif FEATURES and u_features is not None and v_features is not None:
    # Side information available: row-normalize it (D^-1 * X), stack the
    # user/item blocks, and densify for the dense side-feature layer.
    print("Normalizing feature vectors...")
    u_features_side = normalize_features(u_features)
    v_features_side = normalize_features(v_features)
    u_features_side, v_features_side = preprocess_user_item_features(u_features_side, v_features_side)
    u_features_side = np.array(u_features_side.todense(), dtype=np.float32)
    v_features_side = np.array(v_features_side.todense(), dtype=np.float32)
    num_side_features = u_features_side.shape[1]
    # node id's for node input features
    id_csr_v = sp.identity(num_items, format='csr')
    id_csr_u = sp.identity(num_users, format='csr')
    u_features, v_features = preprocess_user_item_features(id_csr_u, id_csr_v)
else:
    raise ValueError('Features flag is set to true but no features are loaded from dataset ' + DATASET)
2)从preprocessing调用normalize_features
即 $D^{-1}A$
#943行每行求和,得到943维的度向量 degree = np.asarray(feat.sum(1)).flatten() #防止除零 degree[degree == 0.] = np.inf degree_inv = 1. / degree #将度向量画成对角阵 degree_inv_mat = sp.diags([degree_inv], [0]) feat_norm = degree_inv_mat.dot(feat) if feat_norm.nnz == 0: print('ERROR: normalized adjacency matrix has only zero entries!!!!!') exit
3)从preprocessing调用preprocess_user_item_features
zero_csr_u = sp.csr_matrix((u_features.shape[0], v_features.shape[1]), dtype=u_features.dtype) zero_csr_v = sp.csr_matrix((v_features.shape[0], u_features.shape[1]), dtype=v_features.dtype) u_features = sp.hstack([u_features, zero_csr_u], format='csr') v_features = sp.hstack([zero_csr_v, v_features], format='csr')
5.
将5个评级分别作出邻接矩阵
# Build one bipartite adjacency matrix per rating class (1..NUMCLASSES),
# then globally normalize the list of supports and their transposes.
support = []
support_t = []
adj_train_int = sp.csr_matrix(adj_train, dtype=np.int32)
for rating in range(NUMCLASSES):
    # Boolean mask of the entries whose rating equals `rating + 1`.
    unnormalized = sp.csr_matrix(adj_train_int == rating + 1, dtype=np.float32)
    # An all-zero class matrix means this rating never occurs in training.
    if unnormalized.nnz == 0:
        sys.exit('ERROR: normalized bipartite adjacency matrix has only zero entries!!!!!')
    support.append(unnormalized)
    support_t.append(unnormalized.T)
# support now holds NUMCLASSES sparse user-by-item CSR matrices (943x1682 each).
support = globally_normalize_bipartite_adjacency(support, symmetric=SYM)
support_t = globally_normalize_bipartite_adjacency(support_t, symmetric=SYM)
4)从preprocessing调用globally_normalize_bipartite_adjacency
对矩阵做标准化,即 $D^{-\frac{1}{2}}AD^{-\frac{1}{2}}$
参数: 邻接矩阵的列表
verbose=true
symmetric=falseif verbose: #if True print('Symmetrically normalizing bipartite adj') #对当前的邻接矩阵 adj_tot = np.sum(adj for adj in adjacencies) #行度,相当于943个用户每人评价的电影数 degree_u = np.asarray(adj_tot.sum(1)).flatten() #列度,相当于1682个电影每部收到多少个用户评论 degree_v = np.asarray(adj_tot.sum(0)).flatten() degree_u[degree_u == 0.] = np.inf degree_v[degree_v == 0.] = np.inf degree_u_inv_sqrt = 1. / np.sqrt(degree_u) degree_v_inv_sqrt = 1. / np.sqrt(degree_v) degree_u_inv_sqrt_mat = sp.diags([degree_u_inv_sqrt], [0]) degree_v_inv_sqrt_mat = sp.diags([degree_v_inv_sqrt], [0]) degree_u_inv = degree_u_inv_sqrt_mat.dot(degree_u_inv_sqrt_mat) if symmetric: adj_norm = [degree_u_inv_sqrt_mat.dot(adj).dot(degree_v_inv_sqrt_mat) for adj in adjacencies] else: #当前symmetric为false adj_norm = [degree_u_inv.dot(adj) for adj in adjacencies]
自连接
# Optionally add identity matrices so each node also aggregates its own
# previous-layer representation (disabled here: SELFCONNECTIONS = False).
if SELFCONNECTIONS:
    support.append(sp.identity(u_features.shape[0], format='csr'))
    support_t.append(sp.identity(v_features.shape[0], format='csr'))

# Concatenate the per-rating supports horizontally into two wide matrices
# (users x items*R and items x users*R, e.g. 943x8410 and 1682x4715).
num_support = len(support)
support = sp.hstack(support, format='csr')
support_t = sp.hstack(support_t, format='csr')

if ACCUM == 'stack':
    # Stack accumulation splits the first hidden layer evenly across the
    # supports (500 units / 5 ratings = 100 per support); round down and
    # warn if the division is not exact.
    div = HIDDEN[0] // num_support
    if HIDDEN[0] % num_support != 0:
        print("""\nWARNING: HIDDEN[0] (=%d) of stack layer is adjusted to %d such that
it can be evenly split in %d splits.\n""" % (HIDDEN[0], num_support * div, num_support))
    HIDDEN[0] = num_support * div
# The support matrices are row-indexed by global user/item id, but each data
# split only touches a subset of nodes. The identical relabel-and-slice
# logic was triplicated for test/val/train; it is factored into one helper.
def _localize_split(u_indices, v_indices, support_mx, support_t_mx):
    """Relabel one split's global user/item ids to compact local positions.

    Parameters
    ----------
    u_indices, v_indices : iterable of int
        Global user / item ids of the split's rating pairs.
    support_mx, support_t_mx : row-sliceable matrix
        Support matrix (user rows) and its transpose counterpart (item rows).

    Returns
    -------
    tuple
        (local_u, local_v, u_ids, v_ids, split_support, split_support_t):
        the relabeled index arrays, the unique-id lists that define the
        local ordering, and the per-split row slices of the supports.
    """
    u_ids = list(set(u_indices))
    v_ids = list(set(v_indices))
    u_pos = {n: i for i, n in enumerate(u_ids)}
    v_pos = {n: i for i, n in enumerate(v_ids)}
    local_u = np.array([u_pos[o] for o in u_indices])
    local_v = np.array([v_pos[o] for o in v_indices])
    return (local_u, local_v, u_ids, v_ids,
            support_mx[np.array(u_ids)], support_t_mx[np.array(v_ids)])

# Test split matrices (e.g. 459x8410 and 1410x4715 for ml_100k).
test_u_indices, test_v_indices, test_u, test_v, test_support, test_support_t = \
    _localize_split(test_u_indices, test_v_indices, support, support_t)
# Validation split matrices (e.g. 933x8410 and 1351x4715).
val_u_indices, val_v_indices, val_u, val_v, val_support, val_support_t = \
    _localize_split(val_u_indices, val_v_indices, support, support_t)
# Training split matrices (e.g. 943x8410 and 1614x4715).
train_u_indices, train_v_indices, train_u, train_v, train_support, train_support_t = \
    _localize_split(train_u_indices, train_v_indices, support, support_t)
# Slice the per-split rows of the side-feature matrices; when side features
# are disabled, use None placeholders so the feed-dict code stays uniform.
if FEATURES:
    train_u_features_side = u_features_side[np.array(train_u)]
    train_v_features_side = v_features_side[np.array(train_v)]
    val_u_features_side = u_features_side[np.array(val_u)]
    val_v_features_side = v_features_side[np.array(val_v)]
    test_u_features_side = u_features_side[np.array(test_u)]
    test_v_features_side = v_features_side[np.array(test_v)]
else:
    # Side features disabled for this run.
    test_u_features_side = test_v_features_side = None
    val_u_features_side = val_v_features_side = None
    train_u_features_side = train_v_features_side = None