Graph Convolutional Matrix Completion: Understanding the TensorFlow Code, Part 1 (Preparation)

I. Preparation

import argparse
import datetime
import time
import tensorflow as tf
import numpy as np
import scipy.sparse as sp
import sys
import json

1. Random seeds

seed = int(time.time())
np.random.seed(seed)
tf.set_random_seed(seed)

2. Command-line argument defaults

ap = argparse.ArgumentParser()
# dataset: "ml_100k"
ap.add_argument("-d", "--dataset", type=str, default="ml_100k",
                help="Dataset string.")
# learning rate: 0.01
ap.add_argument("-lr", "--learning_rate", type=float, default=0.01,
                help="Learning rate")
# number of epochs: 2500
ap.add_argument("-e", "--epochs", type=int, default=2500,
                help="Number training epochs")
# hidden units in the two GCN layers: [500, 75]
ap.add_argument("-hi", "--hidden", type=int, nargs=2, default=[500, 75],
                help="Number hidden units in 1st and 2nd layer")
# hidden units in the dense feature layer: 64
ap.add_argument("-fhi", "--feat_hidden", type=int, default=64,
                help="Number hidden units in the dense layer for features")
# accumulation function: stack
ap.add_argument("-ac", "--accumulation", type=str, default="stack",
                help="Accumulation function: sum or stack.")
# dropout fraction: 0.7
ap.add_argument("-do", "--dropout", type=float, default=0.7,
                help="Dropout fraction")
# number of basis functions: 2
ap.add_argument("-nb", "--num_basis_functions", type=int, default=2,
                help="Number of basis functions for Mixture Model GCN.")
# seed used to shuffle the data: 1234
ap.add_argument("-ds", "--data_seed", type=int, default=1234,
                help="""Seed used to shuffle data in data_utils, taken from cf-nade (1234, 2341, 3412, 4123, 1324).
                     Only used for ml_1m and ml_10m datasets. """)
# directory for saved summaries: 'logs/' + current time
ap.add_argument("-sdir", "--summaries_dir", type=str, default='logs/' + str(datetime.datetime.now()).replace(' ', '_'),
                help="Directory for saving tensorflow summaries.")

_StoreAction(option_strings=['-sdir', '--summaries_dir'], dest='summaries_dir', nargs=None, const=None, default='logs/2021-05-07_15:36:47.432757', type=<class 'str'>, choices=None, help='Directory for saving tensorflow summaries.', metavar=None)

Setting defaults for the boolean flags

fp = ap.add_mutually_exclusive_group(required=False)
# normalization method; default True, i.e. -nsym (symmetric)
fp.add_argument('-nsym', '--norm_symmetric', dest='norm_symmetric',
                help="Option to turn on symmetric global normalization", action='store_true')
fp.add_argument('-nleft', '--norm_left', dest='norm_symmetric',
                help="Option to turn on left global normalization", action='store_false')
ap.set_defaults(norm_symmetric=True)
# whether to use side features; default False, i.e. -no_f (no features)
fp = ap.add_mutually_exclusive_group(required=False)
fp.add_argument('-f', '--features', dest='features',
                help="Whether to use features (1) or not (0)", action='store_true')
fp.add_argument('-no_f', '--no_features', dest='features',
                help="Whether to use features (1) or not (0)", action='store_false')
ap.set_defaults(features=False)
# whether to write summaries; default False, i.e. -no_ws (no summaries)
fp = ap.add_mutually_exclusive_group(required=False)
fp.add_argument('-ws', '--write_summary', dest='write_summary',
                help="Option to turn on summary writing", action='store_true')
fp.add_argument('-no_ws', '--no_write_summary', dest='write_summary',
                help="Option to turn off summary writing", action='store_false')
ap.set_defaults(write_summary=False)
# test vs. validation evaluation; default False, i.e. -v (validation)
fp = ap.add_mutually_exclusive_group(required=False)
fp.add_argument('-t', '--testing', dest='testing',
                help="Option to turn on test set evaluation", action='store_true')
fp.add_argument('-v', '--validation', dest='testing',
                help="Option to only use validation set evaluation", action='store_false')
ap.set_defaults(testing=False)
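To see how these paired flags behave, here is a minimal, self-contained check (a throwaway parser built just for illustration, not part of the original script):

import argparse

p = argparse.ArgumentParser()
g = p.add_mutually_exclusive_group(required=False)
g.add_argument('-nsym', dest='norm_symmetric', action='store_true')
g.add_argument('-nleft', dest='norm_symmetric', action='store_false')
p.set_defaults(norm_symmetric=True)

print(p.parse_args([]).norm_symmetric)          # True (the default)
print(p.parse_args(['-nleft']).norm_symmetric)  # False (left normalization)
# p.parse_args(['-nsym', '-nleft']) would exit with an error: the flags are mutually exclusive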

Print the default settings

args = vars(ap.parse_args(args=[]))

print('Settings:')
print(args, '\n')

Settings:
{'dataset': 'ml_100k',
'learning_rate': 0.01,
'epochs': 2500,
'hidden': [500, 75],
'feat_hidden': 64,
'accumulation': 'stack',
'dropout': 0.7,
'num_basis_functions': 2,
'data_seed': 1234,
'summaries_dir': 'logs/2021-05-07_15:36:47.432757',
'norm_symmetric': True,
'features': False,
'write_summary': False,
'testing': False}

Manually-set parameters (note that some differ from the argparse defaults above, e.g. NB_EPOCH=3000, WRITESUMMARY=True)

DATASET = 'ml_100k'
DATASEED = 1234
NB_EPOCH = 3000
DO = 0.7                    # dropout fraction
HIDDEN = [500, 75]
FEATHIDDEN = 64
BASES = 2                   # number of basis functions for Mixture Model GCN
LR = 0.01
WRITESUMMARY = True         # turn summary writing on
SUMMARIESDIR = './logs/' + str(datetime.datetime.now()).replace(' ', '_').replace(':', '-')  # replace colons, which are not allowed in Windows paths
FEATURES = False
SYM = False                 # left normalization
TESTING = False
ACCUM = 'stack'
SELFCONNECTIONS = False     # no self-connections
SPLITFROMFILE = True
VERBOSE = True
NUMCLASSES = 5              # number of rating levels

3. Data preprocessing

print("Using official MovieLens dataset split u1.base/u1.test with 20% validation set size...")
u_features, v_features, adj_train, train_labels, train_u_indices, train_v_indices, \
    val_labels, val_u_indices, val_v_indices, test_labels, \
    test_u_indices, test_v_indices, class_values = load_official_trainvaltest_split(DATASET, TESTING)

1) Calling load_official_trainvaltest_split from preprocessing

Arguments: the dataset name and the TESTING flag (here False, i.e. use the validation split)
Returns:
u_features, v_features: user and movie feature matrices, 943×23 and 1682×18
rating_mx_train: the 943×1682 user-movie adjacency matrix
train_labels: rating labels of the training set, length 64000
u_train_idx, v_train_idx: length 64000
val_labels: length 16000
u_val_idx, v_val_idx: length 16000
test_labels: length 20000
u_test_idx, v_test_idx: length 20000
class_values: length 5

dtypes = {
    'u_nodes': np.int32, 'v_nodes': np.int32,
    'ratings': np.float32, 'timestamp': np.float64}

filename_train = 'data/' + dataset + '/u1.base'
filename_test = 'data/' + dataset + '/u1.test'

# Read u1.base and u1.test (tab-separated) into DataFrames: train is 80000×4, test is 20000×4
data_train = pd.read_csv(
    filename_train, sep=sep, header=None,
    names=['u_nodes', 'v_nodes', 'ratings', 'timestamp'], dtype=dtypes) 
data_test = pd.read_csv(
    filename_test, sep=sep, header=None,
    names=['u_nodes', 'v_nodes', 'ratings', 'timestamp'], dtype=dtypes)

# Convert to NumPy arrays
data_array_train = data_train.values.tolist() 
data_array_train = np.array(data_array_train) 
data_array_test = data_test.values.tolist() 
data_array_test = np.array(data_array_test)

# Stack the train and test arrays vertically
data_array = np.concatenate([data_array_train, data_array_test], axis=0)

# Extract the user-id, movie-id and rating columns
u_nodes_ratings = data_array[:, 0].astype(dtypes['u_nodes']) 
v_nodes_ratings = data_array[:, 1].astype(dtypes['v_nodes']) 
ratings = data_array[:, 2].astype(dtypes['ratings'])

u_nodes_ratings, u_dict, num_users = map_data(u_nodes_ratings)
v_nodes_ratings, v_dict, num_items = map_data(v_nodes_ratings) 
① Calling map_data from data_utils

Argument: data, a one-dimensional array of raw ids
Returns: data, with every raw id replaced by its sorted index
id_dict, the id-to-index dictionary
n, the number of distinct ids

# deduplicate and sort the ids (for users: 1..943)
uniq = list(set(data))
# build the {raw id: new index} dictionary; for contiguous ids, index = id - 1
id_dict = {old: new for new, old in enumerate(sorted(uniq))}
# replace every raw id in data with its index
data = np.array(list(map(lambda x: id_dict[x], data)))
n = len(uniq)
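Wrapping the body above into a self-contained function and running it on a toy input (the wrapper is reconstructed here for illustration; only the body comes from data_utils):

import numpy as np

def map_data(data):
    # map raw ids to contiguous 0-based indices
    uniq = list(set(data))
    id_dict = {old: new for new, old in enumerate(sorted(uniq))}
    data = np.array(list(map(lambda x: id_dict[x], data)))
    n = len(uniq)
    return data, id_dict, n

ids = np.array([10, 3, 10, 7])
mapped, id_dict, n = map_data(ids)
print(mapped)   # [2 0 2 1]
print(id_dict)  # {3: 0, 7: 1, 10: 2}
print(n)        # 3

Back in load_official_trainvaltest_split, the mapped ids are cast and the label matrix is built: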
u_nodes_ratings, v_nodes_ratings = u_nodes_ratings.astype(np.int64), v_nodes_ratings.astype(np.int32)
ratings = ratings.astype(np.float64)
u_nodes = u_nodes_ratings
v_nodes = v_nodes_ratings
neutral_rating = -1 

# index dictionary for the rating values
rating_dict = {r: i for i, r in enumerate(np.sort(np.unique(ratings)).tolist())}

# initialize a 943×1682 matrix filled with -1, then write in the label indices
# to obtain the bipartite adjacency matrix
labels = np.full((num_users, num_items), neutral_rating, dtype=np.int32)
labels[u_nodes, v_nodes] = np.array([rating_dict[r] for r in ratings])
for i in range(len(u_nodes)):
    assert(labels[u_nodes[i], v_nodes[i]] == rating_dict[ratings[i]])
# flatten the matrix into one long vector
labels = labels.reshape([-1])

# dataset split sizes
num_train = data_array_train.shape[0]
num_test = data_array_test.shape[0]
num_val = int(np.ceil(num_train * 0.2))
num_train = num_train - num_val

# array of (user, movie) pairs
pairs_nonzero = np.array([[u, v] for u, v in zip(u_nodes, v_nodes)])
# for each pair, the position of user u's rating of movie v in the flattened
# labels vector, i.e. the positions of the observed entries of the adjacency matrix
idx_nonzero = np.array([u * num_items + v for u, v in pairs_nonzero])
for i in range(len(ratings)):
    assert(labels[idx_nonzero[i]] == rating_dict[ratings[i]])
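The index arithmetic u * num_items + v is ordinary row-major flattening; a quick check with toy sizes (not the real data):

import numpy as np

num_users, num_items = 3, 4
labels = np.arange(num_users * num_items).reshape(num_users, num_items)
flat = labels.reshape(-1)

u, v = 2, 1
assert flat[u * num_items + v] == labels[u, v]  # row-major flattening identity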
# split these in the same proportions
idx_nonzero_train = idx_nonzero[0:num_train+num_val]
idx_nonzero_test = idx_nonzero[num_train+num_val:]
pairs_nonzero_train = pairs_nonzero[0:num_train+num_val]
pairs_nonzero_test = pairs_nonzero[num_train+num_val:]

# shuffle the training portion
rand_idx = list(range(len(idx_nonzero_train)))
np.random.seed(42)
np.random.shuffle(rand_idx)
idx_nonzero_train = idx_nonzero_train[rand_idx]
pairs_nonzero_train = pairs_nonzero_train[rand_idx]

# re-concatenate the shuffled training portion with the test portion
idx_nonzero = np.concatenate([idx_nonzero_train, idx_nonzero_test], axis=0)
pairs_nonzero = np.concatenate([pairs_nonzero_train, pairs_nonzero_test], axis=0)
# then split the shuffled data into validation/train/test
val_idx = idx_nonzero[0:num_val]
train_idx = idx_nonzero[num_val:num_train + num_val]
test_idx = idx_nonzero[num_train + num_val:]
assert(len(test_idx) == num_test)
val_pairs_idx = pairs_nonzero[0:num_val]
train_pairs_idx = pairs_nonzero[num_val:num_train + num_val]
test_pairs_idx = pairs_nonzero[num_train + num_val:]
# transpose the pair arrays to get separate user and movie index arrays, then look up the labels
u_test_idx, v_test_idx = test_pairs_idx.transpose()
u_val_idx, v_val_idx = val_pairs_idx.transpose()
u_train_idx, v_train_idx = train_pairs_idx.transpose()
train_labels = labels[train_idx]
val_labels = labels[val_idx]
test_labels = labels[test_idx]
if testing:  # when evaluating on the test set, merge the training and validation sets
        u_train_idx = np.hstack([u_train_idx, u_val_idx])
        v_train_idx = np.hstack([v_train_idx, v_val_idx])
        train_labels = np.hstack([train_labels, val_labels])
        # for adjacency matrix construction
        train_idx = np.hstack([train_idx, val_idx])
        
# build the training adjacency matrix: fill in the real ratings (label index + 1), then convert to a compressed CSR matrix
rating_mx_train = np.zeros(num_users * num_items, dtype=np.float32)
rating_mx_train[train_idx] = labels[train_idx].astype(np.float32) + 1.
rating_mx_train = sp.csr_matrix(rating_mx_train.reshape(num_users, num_items))
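The same construction on a toy example, to show the +1 shift that frees up 0 to mean "no rating" (toy sizes and values, for illustration only):

import numpy as np
import scipy.sparse as sp

num_users, num_items = 2, 3
train_idx = np.array([0, 4])           # flattened positions of observed ratings
labels = np.array([2, 0, 0, 0, 4, 0])  # label indices (rating - 1) at those positions

mx = np.zeros(num_users * num_items, dtype=np.float32)
mx[train_idx] = labels[train_idx].astype(np.float32) + 1.  # store real ratings 1..5
mx = sp.csr_matrix(mx.reshape(num_users, num_items))
print(mx.toarray())  # [[3. 0. 0.]
                     #  [0. 5. 0.]]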

# the sorted array of distinct rating values
class_values = np.sort(np.unique(ratings))

# Movie side information (features): encode each movie's genres as an 18-dimensional 0/1 vector, giving a 1682×18 matrix
sep = r'|'
movie_file = 'data/' + dataset + '/u.item'
movie_headers = ['movie id', 'movie title', 'release date', 'video release date',
                         'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation',
                         'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
                         'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
                         'Thriller', 'War', 'Western']
movie_df = pd.read_csv(movie_file, sep=sep, header=None,
                               names=movie_headers, engine='python')
genre_headers = movie_df.columns.values[6:]
num_genres = genre_headers.shape[0]
v_features = np.zeros((num_items, num_genres), dtype=np.float32)
for movie_id, g_vec in zip(movie_df['movie id'].values.tolist(), movie_df[genre_headers].values.tolist()):
    if movie_id in v_dict.keys():
        v_features[v_dict[movie_id], :] = g_vec
                
# User side information (features): a 943×23 matrix
sep = r'|'
users_file = 'data/' + dataset + '/u.user'
users_headers = ['user id', 'age', 'gender', 'occupation', 'zip code']
users_df = pd.read_csv(users_file, sep=sep, header=None,
                               names=users_headers, engine='python')
occupation = set(users_df['occupation'].values.tolist())
age = users_df['age'].values
# normalize age by its maximum
age_max = age.max()
gender_dict = {'M': 0., 'F': 1.}
# occupation index dictionary: 21 occupations, indices starting at 2
occupation_dict = {f: i for i, f in enumerate(occupation, start=2)}
num_feats = 2 + len(occupation_dict)
u_features = np.zeros((num_users, num_feats), dtype=np.float32)
for _, row in users_df.iterrows():
    u_id = row['user id']
    if u_id in u_dict.keys():
        # age
        u_features[u_dict[u_id], 0] = row['age'] / np.float(age_max)
        # gender
        u_features[u_dict[u_id], 1] = gender_dict[row['gender']]
        # occupation
        u_features[u_dict[u_id], occupation_dict[row['occupation']]] = 1.
                
u_features = sp.csr_matrix(u_features)
v_features = sp.csr_matrix(v_features)

print("User features shape: "+str(u_features.shape))
print("Item features shape: "+str(v_features.shape))

Using official MovieLens dataset split u1.base/u1.test with 20% validation set size…
User features shape: (943, 23)
Item features shape: (1682, 18)
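To make the 23-dimension layout concrete (column 0: normalized age, column 1: gender, columns 2..22: occupation one-hot), here is a hand-rolled single user vector with toy values:

import numpy as np

occupations = ['artist', 'doctor', 'engineer']  # toy list; the real data has 21
occupation_dict = {f: i for i, f in enumerate(occupations, start=2)}
num_feats = 2 + len(occupation_dict)

u_feat = np.zeros(num_feats, dtype=np.float32)
u_feat[0] = 35 / 73.0                   # age / age_max (toy values)
u_feat[1] = 0.                          # gender: 'M' -> 0., 'F' -> 1.
u_feat[occupation_dict['doctor']] = 1.  # occupation one-hot
print(u_feat)  # [0.479... 0. 0. 1. 0.]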

num_users, num_items = adj_train.shape #943, 1682
num_side_features = 0

4. Whether to use side features

if not FEATURES:
    u_features = sp.identity(num_users, format='csr')
    v_features = sp.identity(num_items, format='csr')

    u_features, v_features = preprocess_user_item_features(u_features, v_features)

elif FEATURES and u_features is not None and v_features is not None:
    
    print("Normalizing feature vectors...")
    u_features_side = normalize_features(u_features)
    v_features_side = normalize_features(v_features)

    u_features_side, v_features_side = preprocess_user_item_features(u_features_side, v_features_side)

    u_features_side = np.array(u_features_side.todense(), dtype=np.float32)
    v_features_side = np.array(v_features_side.todense(), dtype=np.float32)

    num_side_features = u_features_side.shape[1]

    # node id's for node input features
    id_csr_v = sp.identity(num_items, format='csr')
    id_csr_u = sp.identity(num_users, format='csr')

    u_features, v_features = preprocess_user_item_features(id_csr_u, id_csr_v)

else:
    raise ValueError('Features flag is set to true but no features are loaded from dataset ' + DATASET)

2) Calling normalize_features from preprocessing

Row-normalizes the feature matrix: $D^{-1}A$

# sum each of the 943 rows to get a 943-dimensional degree vector
degree = np.asarray(feat.sum(1)).flatten()
# guard against division by zero
degree[degree == 0.] = np.inf
degree_inv = 1. / degree
# put the inverse degrees on a diagonal matrix
degree_inv_mat = sp.diags([degree_inv], [0])
feat_norm = degree_inv_mat.dot(feat)
if feat_norm.nnz == 0:
    print('ERROR: normalized adjacency matrix has only zero entries!!!!!')
    exit()

3) Calling preprocess_user_item_features from preprocessing

zero_csr_u = sp.csr_matrix((u_features.shape[0], v_features.shape[1]), dtype=u_features.dtype)
zero_csr_v = sp.csr_matrix((v_features.shape[0], u_features.shape[1]), dtype=v_features.dtype)
u_features = sp.hstack([u_features, zero_csr_u], format='csr')
v_features = sp.hstack([zero_csr_v, v_features], format='csr')
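The effect is to lay user and item features side by side on a shared feature axis, [U | 0] and [0 | V]; a quick shape check with toy matrices:

import numpy as np
import scipy.sparse as sp

u = sp.csr_matrix(np.ones((2, 3), dtype=np.float32))  # 2 users, 3 user features
v = sp.csr_matrix(np.ones((4, 5), dtype=np.float32))  # 4 items, 5 item features

zero_u = sp.csr_matrix((u.shape[0], v.shape[1]), dtype=u.dtype)
zero_v = sp.csr_matrix((v.shape[0], u.shape[1]), dtype=v.dtype)
u_full = sp.hstack([u, zero_u], format='csr')  # (2, 8): [U | 0]
v_full = sp.hstack([zero_v, v], format='csr')  # (4, 8): [0 | V]
print(u_full.shape, v_full.shape)  # (2, 8) (4, 8)

With identity features (FEATURES=False above), this yields 943×2625 and 1682×2625 matrices, since 943 + 1682 = 2625.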

5. Building one adjacency matrix per rating level

support = []
support_t = []
adj_train_int = sp.csr_matrix(adj_train, dtype=np.int32)
for i in range(NUMCLASSES):
    support_unnormalized = sp.csr_matrix(adj_train_int == i + 1, dtype=np.float32)
    # if the matrix has no nonzero entries, it is an all-zero matrix
    if support_unnormalized.nnz == 0:
        sys.exit('ERROR: normalized bipartite adjacency matrix has only zero entries!!!!!')
    support_unnormalized_transpose = support_unnormalized.T
    support.append(support_unnormalized)
    support_t.append(support_unnormalized_transpose)
	
# support is now a list of five 943×1682 CSR matrices
support = globally_normalize_bipartite_adjacency(support, symmetric=SYM)
support_t = globally_normalize_bipartite_adjacency(support_t, symmetric=SYM)
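A toy illustration of the per-rating slicing above, turning one integer rating matrix into a 0/1 adjacency per rating level:

import numpy as np
import scipy.sparse as sp

adj = sp.csr_matrix(np.array([[1, 0, 3],
                              [0, 2, 0]], dtype=np.int32))
support = [sp.csr_matrix(adj == r, dtype=np.float32) for r in (1, 2, 3)]
print(support[0].toarray())  # rating 1: [[1. 0. 0.]
                             #            [0. 0. 0.]]
print(support[2].toarray())  # rating 3: [[0. 0. 1.]
                             #            [0. 0. 0.]]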

4) Calling globally_normalize_bipartite_adjacency from preprocessing

Normalizes the adjacency matrices: $D_u^{-\frac{1}{2}} A D_v^{-\frac{1}{2}}$ when symmetric, $D_u^{-1} A$ when not.
Arguments: the list of adjacency matrices,
verbose=True,
symmetric=False (so left normalization is used here)

if verbose:  # True here
    print('Symmetrically normalizing bipartite adj')
# element-wise sum of the per-rating adjacency matrices
adj_tot = np.sum(adj for adj in adjacencies)
# row degrees: how many movies each of the 943 users rated
degree_u = np.asarray(adj_tot.sum(1)).flatten()
# column degrees: how many ratings each of the 1682 movies received
degree_v = np.asarray(adj_tot.sum(0)).flatten()
degree_u[degree_u == 0.] = np.inf
degree_v[degree_v == 0.] = np.inf
degree_u_inv_sqrt = 1. / np.sqrt(degree_u)
degree_v_inv_sqrt = 1. / np.sqrt(degree_v)
degree_u_inv_sqrt_mat = sp.diags([degree_u_inv_sqrt], [0])
degree_v_inv_sqrt_mat = sp.diags([degree_v_inv_sqrt], [0])
degree_u_inv = degree_u_inv_sqrt_mat.dot(degree_u_inv_sqrt_mat)  # D_u^{-1}
if symmetric:
    adj_norm = [degree_u_inv_sqrt_mat.dot(adj).dot(degree_v_inv_sqrt_mat) for adj in adjacencies]
else:  # symmetric is False here
    adj_norm = [degree_u_inv.dot(adj) for adj in adjacencies]
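The left-normalization branch on a toy matrix (a minimal sketch, assuming a single rating level; the builtin sum stands in for np.sum over the list):

import numpy as np
import scipy.sparse as sp

adjacencies = [sp.csr_matrix(np.array([[1., 0.],
                                       [1., 1.]]))]
adj_tot = sum(adjacencies)                       # element-wise sum over rating levels
degree_u = np.asarray(adj_tot.sum(1)).flatten()  # user degrees: [1. 2.]
degree_u[degree_u == 0.] = np.inf
degree_u_inv = sp.diags([1. / degree_u], [0])    # D_u^{-1}
adj_norm = [degree_u_inv.dot(adj) for adj in adjacencies]
print(adj_norm[0].toarray())  # [[1.  0. ]
                              #  [0.5 0.5]]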

Self-connections

if SELFCONNECTIONS:
    support.append(sp.identity(u_features.shape[0], format='csr'))
    support_t.append(sp.identity(v_features.shape[0], format='csr'))
    
# horizontally concatenate the 5 matrices into 943×8410 and 1682×4715 block matrices
num_support = len(support)
support = sp.hstack(support, format='csr')
support_t = sp.hstack(support_t, format='csr')

if ACCUM == 'stack':
    # 500 // 5 = 100
    div = HIDDEN[0] // num_support
    if HIDDEN[0] % num_support != 0:
        print("""\nWARNING: HIDDEN[0] (=%d) of stack layer is adjusted to %d such that
                  it can be evenly split in %d splits.\n""" % (HIDDEN[0], num_support * div, num_support))
    HIDDEN[0] = num_support * div
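For instance, had HIDDEN[0] not been divisible by the number of supports, it would be rounded down to the nearest lower multiple (a quick arithmetic check with hypothetical values):

hidden0, num_support = 499, 5
div = hidden0 // num_support  # 99
print(num_support * div)      # 495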

# test-set support matrices: 459×8410 and 1410×4715
test_u = list(set(test_u_indices))
test_v = list(set(test_v_indices))
test_u_dict = {n: i for i, n in enumerate(test_u)}
test_v_dict = {n: i for i, n in enumerate(test_v)}
test_u_indices = np.array([test_u_dict[o] for o in test_u_indices])
test_v_indices = np.array([test_v_dict[o] for o in test_v_indices])
test_support = support[np.array(test_u)]
test_support_t = support_t[np.array(test_v)]

# validation-set support matrices: 933×8410 and 1351×4715
val_u = list(set(val_u_indices))
val_v = list(set(val_v_indices))
val_u_dict = {n: i for i, n in enumerate(val_u)}
val_v_dict = {n: i for i, n in enumerate(val_v)}
val_u_indices = np.array([val_u_dict[o] for o in val_u_indices])
val_v_indices = np.array([val_v_dict[o] for o in val_v_indices])
val_support = support[np.array(val_u)]
val_support_t = support_t[np.array(val_v)]

# training-set support matrices: 943×8410 and 1614×4715
train_u = list(set(train_u_indices))
train_v = list(set(train_v_indices))
train_u_dict = {n: i for i, n in enumerate(train_u)}
train_v_dict = {n: i for i, n in enumerate(train_v)}
train_u_indices = np.array([train_u_dict[o] for o in train_u_indices])
train_v_indices = np.array([train_v_dict[o] for o in train_v_indices])
train_support = support[np.array(train_u)]
train_support_t = support_t[np.array(train_v)]
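The dict-based reindexing pattern used three times above, on toy data (note that set() iteration order is implementation-dependent, so the exact remapped values may vary):

import numpy as np

train_u_indices = np.array([5, 9, 5, 2])  # global user ids of the training pairs
train_u = list(set(train_u_indices))      # unique ids, in arbitrary order
train_u_dict = {n: i for i, n in enumerate(train_u)}
remapped = np.array([train_u_dict[o] for o in train_u_indices])
print(remapped)  # e.g. [1 2 1 0]: row positions within support[np.array(train_u)]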

if FEATURES:
    test_u_features_side = u_features_side[np.array(test_u)]
    test_v_features_side = v_features_side[np.array(test_v)]
    val_u_features_side = u_features_side[np.array(val_u)]
    val_v_features_side = v_features_side[np.array(val_v)]
    train_u_features_side = u_features_side[np.array(train_u)]
    train_v_features_side = v_features_side[np.array(train_v)]
else:  # no side features
    test_u_features_side = None
    test_v_features_side = None
    val_u_features_side = None
    val_v_features_side = None
    train_u_features_side = None
    train_v_features_side = None