一、背景与实际运行的代码
最近在利用tensorflow框架训练CNN模型的时候,出现了ValueError: No gradients provided for any variable错误。
1.1自定义数据集文件夹层级
随意打开一个文件夹:
1.2代码如下:
1、导入相关模块
import tensorflow as tf
import numpy as np
import cv2 as cv
import matplotlib.pyplot as plt
%matplotlib inline
import os
from pathlib import Path
import time
2、超参数赋值
# Dataset root directory.
# BUG FIX: use a raw string — in a plain string literal, sequences like
# '\B', '\p', '\d', '\s' are invalid escapes (SyntaxWarning since Python 3.12,
# and a silent trap for paths like '\t...' or '\n...').
DATA_ROOT = r'D:\BaiduNetdiskDownload\pycv-learning\data\spot_data_cls'
# Train/validation split ratio, 8:2
train_ratio = 0.8
num_epochs = 30
learning_rate = 1e-3
# Target (H, W, C) shape every image is resized to
resized_img_shape = (224, 224, 3)
batch_size = 16
# Path object for the dataset root
DATA_ROOT_PATH = Path(DATA_ROOT)
3、数据集处理
dict_num2name = {}    # class index -> class name, e.g. {0: 'copper', 1: 'edge', ...}
dict_name2num = {}    # class name -> class index, e.g. {'copper': 0, 'edge': 1, ...}
sep_class_paths = {}  # per-class list of image file paths
train_whole_paths, train_whole_labels, val_whole_paths, val_whole_labels = [], [], [], []
for i, path in enumerate(DATA_ROOT_PATH.iterdir()):
    dict_num2name[i] = path.name
    dict_name2num[path.name] = i
    sep_class_paths[path.name] = [str(p) for p in path.iterdir()]
    # Shuffle inside each class BEFORE splitting so the split is random per class
    np.random.shuffle(sep_class_paths[path.name])
    split_idx = int(train_ratio * len(sep_class_paths[path.name]))
    train_whole_paths.extend(sep_class_paths[path.name][:split_idx])
    val_whole_paths.extend(sep_class_paths[path.name][split_idx:])
# Shuffle the combined train/val path lists so classes are interleaved
np.random.shuffle(train_whole_paths)
np.random.shuffle(val_whole_paths)
# The label is the name of each image's parent directory.
# (os.path.dirname replaces the original `path + '/../'` + abspath trick.)
for path in train_whole_paths:
    train_whole_labels.append(dict_name2num[os.path.basename(os.path.dirname(path))])
for path in val_whole_paths:
    val_whole_labels.append(dict_name2num[os.path.basename(os.path.dirname(path))])
train_whole_imgs, val_whole_imgs = [], []
for path in train_whole_paths:
    img = cv.imread(path)
    # BUG FIX: the original normalized only the validation images (/255).
    # Train and validation data must be preprocessed identically, otherwise
    # validation metrics are meaningless — normalize both.
    train_whole_imgs.append(cv.resize(img, (224, 224)) / 255)
for path in val_whole_paths:
    img = cv.imread(path)
    val_whole_imgs.append(cv.resize(img, (224, 224)) / 255)
train_whole_imgs = tf.constant(train_whole_imgs)
train_whole_labels = tf.constant(train_whole_labels)
val_whole_imgs = tf.constant(val_whole_imgs)
val_whole_labels = tf.constant(val_whole_labels)
# One-hot encode the integer labels (6 classes)
train_whole_labels = tf.keras.utils.to_categorical(train_whole_labels, num_classes=6)
val_whole_labels = tf.keras.utils.to_categorical(val_whole_labels, num_classes=6)
4、开始训练模型
# Grid search over three architecture hyperparameters:
#   conv_count  -> number of repeated conv blocks per stage
#   units       -> width multiplier for conv filters / dense units
#   dense_count -> number of hidden Dense layers before the classifier
conv_count = [1, 2]
units = [1, 2]
dense_count = [1, 2]
for conv in conv_count:
    for unit in units:
        for den in dense_count:
            # Unique TensorBoard log directory per configuration
            # (time is imported at the top of the file; the per-iteration
            # `import time` was redundant and has been removed).
            tb_log_dir = f'./logs_Tb/conv_{3*conv:02d}+dense_{den:02d}+units{64*unit}_{int(time.time())}'
            # Checkpoint file pattern; {epoch}/{val_loss} are filled in by Keras
            checkpoint_dir = f'./models/conv_{3*conv:02d}+dense_{den:02d}+units{64*unit}_'+'{epoch:02d}-{val_loss:.2f}.hdf5'
            tensorboard = tf.keras.callbacks.TensorBoard(log_dir=tb_log_dir)
            check_point = tf.keras.callbacks.ModelCheckpoint(checkpoint_dir, monitor='val_loss')

            model = tf.keras.Sequential()
            model.add(tf.keras.layers.Conv2D(32*unit, 3, activation='relu', input_shape=resized_img_shape))
            model.add(tf.keras.layers.MaxPooling2D())
            model.add(tf.keras.layers.Dropout(0.2))
            for _ in range(conv):
                model.add(tf.keras.layers.Conv2D(64*unit, 5, activation='relu'))
                model.add(tf.keras.layers.MaxPooling2D())
                model.add(tf.keras.layers.Dropout(0.2))
            for _ in range(conv):
                model.add(tf.keras.layers.Conv2D(128*unit, 5, activation='relu'))
                model.add(tf.keras.layers.MaxPooling2D())
                model.add(tf.keras.layers.Dropout(0.2))
            model.add(tf.keras.layers.Flatten())
            for _ in range(den):
                model.add(tf.keras.layers.Dense(64*unit, activation='relu'))
            model.add(tf.keras.layers.Dense(6, activation='softmax'))
            # BUG FIX 1: the loss must come from tf.keras.losses, not
            # tf.keras.metrics — a metric object provides no gradient and
            # triggers "ValueError: No gradients provided for any variable".
            # BUG FIX 2: tf.map_fn is not a metric and must not appear in
            # metrics=[...]; it has been removed.
            model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                          loss=tf.keras.losses.CategoricalCrossentropy(),
                          metrics=[tf.keras.metrics.CategoricalAccuracy(), tf.keras.metrics.Recall()])
            model.fit(train_whole_imgs,
                      train_whole_labels,
                      batch_size=batch_size,
                      epochs=num_epochs,
                      callbacks=[tensorboard, check_point],
                      validation_data=(val_whole_imgs, val_whole_labels))
二、解决方案
在网上找解决方案的时候,大多数回答都说是因为loss的计算问题。
于是我就仔细观察了一下代码,发现loss参数竟然用的是tf.keras.metrics.CategoricalCrossentropy(),这其实是评估指标(metric),不提供梯度,不能当作损失函数使用。
然后,改成了tf.keras.losses.CategoricalCrossentropy()。另外,metrics列表中误写入的tf.map_fn并不是评估指标,也需要一并删除。
问题解决,可以正常训练了!!!