Noisy Label Learning
1.f’create_noisy_data’
- Included packages:
numpy, random, matplotlib, pandas, tensorflow, os, sklearn
- Functions:
- class cluster_data_preprocess:
- visualize_data: show the distribution of the points we generate
- get_centroids: cluster the data points generated with sklearn's make_blobs (3 clusters) using KMeans, and return the predicted cluster labels together with the coordinates of the cluster centroids
- calculate_avg_matric: calculate the mean distance between every point and the centroid of its cluster
- calculate_score: unused
- generate_noisy_data: use the average distance to generate suitably placed noisy data points
- create_clear_data: unused
- create_ecxel: split each coordinate tuple into its two components, x and y, so that the coordinates can be written to a spreadsheet (CSV) file more conveniently
- create_data_main: main function of the create data
- class cluster_data_preprocess:
import numpy as np
import random
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
import os
# Matplotlib colour codes, indexed by cluster / label id when the points are plotted.
color = ['r', 'g', 'b', 'y', 'm', 'c']
# Destination of the CSV written by create_ecxel (absolute, machine-specific path).
path = r"F:\00PYTHON\noisy_label_learning\noisy_label.csv"
class cluster_data_preprocess():
    """Build a three-cluster toy data set and derive noisy points from it.

    Attributes:
        data: list of the 2-D sample points produced by ``make_blobs``.
        target: list with the blob index of every sample.
    """

    def __init__(self):
        self.data, self.target = make_blobs(n_samples=300, n_features=2, centers=3, random_state=1)
        # Kept as plain lists because callers append extra points to them.
        self.data, self.target = list(self.data), list(self.target)

    def visualize_data(self):
        """Show a scatter plot of the samples coloured by their blob index."""
        # self.data is a list of points, so build an array before slicing columns.
        points = np.asarray(self.data)
        plt.scatter(points[:, 0], points[:, 1], c=self.target)
        plt.show()

    def get_centroids(self):
        """Cluster the samples with K-Means.

        Returns:
            (labels, centroids): the cluster index of every sample as a list,
            and the array of cluster centres.
        """
        cluster = KMeans(n_clusters=3, random_state=8)
        # fit_predict fits the model itself; a separate fit() call is redundant.
        y_pred = cluster.fit_predict(self.data)
        centroid = cluster.cluster_centers_
        return list(y_pred), centroid

    def calculate_score(self, y_pred, target):
        """Compare predicted cluster ids with the blob ids.

        A prediction counts as correct when it equals the target, or when the
        pair is (1, 0) or (0, 1), i.e. ids 0 and 1 are treated as interchangeable.

        Returns:
            (accuracy, errors): fraction of correct predictions and the list of
            points from ``self.data`` that were counted as wrong.
        """
        correct = 0
        error = []
        for i, pred in enumerate(y_pred):
            swapped = (pred == 1 and target[i] == 0) or (pred == 0 and target[i] == 1)
            if swapped or pred == target[i]:
                correct += 1
            else:
                error.append(self.data[i])
        return (correct / len(y_pred)), error

    def calculate_avg_matric(self, data, centroid, y_pred):
        """Return the mean absolute x / y distance to the centroid for every cluster.

        Args:
            data: sequence of (x, y) points.
            centroid: one (x, y) centre per cluster.
            y_pred: cluster index of every point in ``data``.

        Returns:
            A list with one ``[mean_dx, mean_dy]`` entry per centroid. A cluster
            without any point yields ``[0.0, 0.0]``.
        """
        n_clusters = len(centroid)
        avg_matirc = [[0.0, 0.0] for _ in range(n_clusters)]
        counts = [0] * n_clusters
        for point, k in zip(data, y_pred):
            avg_matirc[k][0] += abs(float(centroid[k][0]) - point[0])
            avg_matirc[k][1] += abs(float(centroid[k][1]) - point[1])
            counts[k] += 1
        for k in range(n_clusters):
            # Average over the members of this cluster, not over all points.
            if counts[k]:
                avg_matirc[k][0] /= counts[k]
                avg_matirc[k][1] /= counts[k]
        return avg_matirc

    def generate_noisy_data(self, avg_matric, centroid, noisy_scale):
        """Create ``noisy_scale`` mislabelled points around every centroid.

        Each point is offset from its centroid, per axis, by the cluster's mean
        distance plus a random value in [0.01, 0.99], with a random sign.

        Returns:
            (label, noisy_data): the assigned labels and the matching (x, y) tuples.
        """
        n_clusters = len(centroid)
        noisy_data = []
        label = []
        for i in range(n_clusters):
            # Points around cluster i receive the id of the previous cluster
            # (cyclically): 0 -> 2, 1 -> 0, 2 -> 1.
            wrong_label = (i - 1) % n_clusters
            for _ in range(noisy_scale):
                label.append(wrong_label)
                offset_x = random.choice([-1, 1]) * (avg_matric[i][0] + float(np.random.randint(1, 100) / 100))
                offset_y = random.choice([-1, 1]) * (avg_matric[i][1] + float(np.random.randint(1, 100) / 100))
                noisy_data.append((centroid[i][0] + offset_x, centroid[i][1] + offset_y))
        return label, noisy_data
def create_clear_data():
    """Draw 1000 random points for each of four square regions around the origin.

    Returns:
        Four lists of (x, y) tuples: x >= 0 / y >= 0, x < 0 / y >= 0,
        x < 0 / y < 0 and x >= 0 / y < 0, in that order.
    """
    # (x_low, x_high, y_low, y_high) of the integer part for every region.
    bounds = [(0, 100, 0, 100), (-100, 0, 0, 100), (-100, 0, -100, 0), (0, 100, -100, 0)]
    regions = [[] for _ in bounds]
    for _ in range(1000):
        for points, (x_low, x_high, y_low, y_high) in zip(regions, bounds):
            # Integer part first, then the fractional part, x before y.
            x = np.random.randint(x_low, high=x_high) + np.random.random_sample()
            y = np.random.randint(y_low, high=y_high) + np.random.random_sample()
            points.append((x, y))
    a_class_T, b_class_T, c_class_T, d_class_T = regions
    return a_class_T, b_class_T, c_class_T, d_class_T
def create_ecxel(data, label, path):
    """Write the points to ``path`` as CSV, one x row and one y row per class.

    Rows indexed 0, 1, 2 hold the x coordinates of the points labelled 0, 1, 2;
    rows indexed -1, -2, -3 hold the y coordinates of the same classes.

    Args:
        data: sequence of (x, y) points.
        label: class id (0, 1 or 2) of every point in ``data``.
        path: destination file.
    """
    rows = {0: [], 1: [], 2: [], -1: [], -2: [], -3: []}
    for idx, point in enumerate(data):
        x_key = label[idx]
        # The y row of class k is stored under the key -(k + 1).
        y_key = (x_key + 1) * -1
        rows[x_key].append(point[0])
        rows[y_key].append(point[1])
    frame = pd.DataFrame(data=rows.values(), columns=None, index=rows.keys())
    frame.to_csv(path, sep=',')
def create_data_main(noisy_scale):
    """Generate the clustered points plus noisy points, plot them and export them.

    Args:
        noisy_scale: number of noisy points generated around every centroid.

    Side effects:
        Saves the scatter plot to ``./points_showing.png``, shows it, and writes
        all points with their labels to the CSV at the module-level ``path``.
    """
    plt.figure(figsize=(20, 8), dpi=100)
    get_data = cluster_data_preprocess()
    data = get_data.data
    y_pred, centroid = get_data.get_centroids()
    avg_matric = get_data.calculate_avg_matric(data, centroid, y_pred)
    label, noisy = get_data.generate_noisy_data(avg_matric, centroid, noisy_scale)

    # One scatter call per group instead of one call per point.
    plt.scatter([p[0] for p in data], [p[1] for p in data],
                c=[color[k] for k in y_pred])
    plt.scatter([p[0] for p in noisy], [p[1] for p in noisy],
                c=[color[k] for k in label], marker='p')

    # The exported data set is the clustered points followed by the noisy ones.
    data.extend(noisy)
    y_pred.extend(label)
    plt.savefig('./points_showing.png')
    plt.show()
    create_ecxel(data, y_pred, path)
2.f’simulate_neural_network’
- purpose of this file:
Because our PC does not have a GPU powerful enough to train ResNet-34 or ResNet-50 on CIFAR-10, we simulate the training process of a deep neural network while skipping the feature extraction performed by the convolutional neural network. We generate enough clean data and noisy data of different scales in f’create_noisy_data’, mix them together, and then use a single fully connected layer as our backbone network.
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
import warnings
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import tensorflow as tf
import create_noisy_data
from matplotlib import rcParams
# Plot titles and legends in this file contain Chinese text, hence the SimHei font.
rcParams['font.family']='simhei'  # display Chinese characters
# Line colours used for the curves plotted in the __main__ section.
color = ['black', 'red', 'green', 'blue', 'mistyrose', 'cyan']
# CSV file loaded by read_data (absolute, machine-specific path).
path = r"F:\00PYTHON\noisy_label_learning\noisy_label.csv"
# Suppress all warnings.
warnings.filterwarnings('ignore')
def read_data(path, n_classes=3, n_test=30):
    """Load the per-class coordinate rows from the CSV and split them into train / test.

    The file is expected to hold one row of x values per class followed by one
    row of y values per class; the first column is the saved row index.

    Args:
        path: CSV file to read.
        n_classes: number of classes stored in the file.
        n_test: number of leading points of every class used as test samples.

    Returns:
        (x_train, x_test, y_train, y_test): lists of (x, y) tuples and the
        matching lists of integer class ids.
    """
    frame = pd.read_csv(path)
    x_train, y_train, x_test, y_test = [], [], [], []
    for cls in range(n_classes):
        # Column 0 is the row index, the coordinates start at column 1.
        xs = frame.iloc[cls, :].values[1:]
        ys = frame.iloc[cls + n_classes, :].values[1:]
        # Shorter rows are padded with NaN in the CSV; padding is not a sample.
        points = [(x, y) for x, y in zip(xs, ys) if not (pd.isna(x) or pd.isna(y))]
        test_points, train_points = points[:n_test], points[n_test:]
        x_test += test_points
        y_test += [cls] * len(test_points)
        x_train += train_points
        y_train += [cls] * len(train_points)
    return x_train, x_test, y_train, y_test
def shuffle_data(x_train, x_test, y_train, y_test):
    """Shuffle the four sequences in place and return them.

    NumPy's global generator is re-seeded with 16 before every shuffle, in the
    order x_train, y_train, x_test, y_test.
    """
    for sequence in (x_train, y_train, x_test, y_test):
        np.random.seed(16)
        np.random.shuffle(sequence)
    return x_train, x_test, y_train, y_test
class CrossEntropy2d():
    """Mean softmax cross-entropy over every position of a 4-D score tensor."""

    def __init__(self):
        super(CrossEntropy2d, self).__init__()
        # Integer class targets and unnormalised scores, averaged over all
        # elements (the TensorFlow counterpart of a mean-reduced cross-entropy).
        self.criterion = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    def forward(self, out, target):
        """Return the mean cross-entropy between ``out`` and ``target``.

        Args:
            out: scores of shape (n, c, h, w); n is the batch size, c the class count.
            target: integer class ids holding n * h * w elements.
        """
        n, c, h, w = out.shape  # n:batch_size, c:class
        # Move the class axis last so that every row holds the scores of one position.
        out = tf.reshape(tf.transpose(out, perm=[0, 2, 3, 1]), [-1, c])  # (n*h*w, c)
        target = tf.reshape(target, [-1])  # (n*h*w)
        loss = self.criterion(target, out)
        return loss
def Loss_o(x1, x2):
    """Return the negated sum of ``x1 * log(x2)`` over all entries, divided by the row width of ``x1``."""
    shape = np.array(x1).shape
    n_rows, n_cols = shape[0], shape[1]
    total = 0
    for row in range(n_rows):
        for col in range(n_cols):
            total += x1[row][col] * tf.math.log(x2[row][col])
    return -total / len(x1[0])
def Loss_c(x1, x2):
    """Return the sum of ``x1 * log(x1 / x2)`` over all entries, divided by the row width of ``x1``."""
    shape = np.array(x1).shape
    n_rows, n_cols = shape[0], shape[1]
    total = 0
    for row in range(n_rows):
        for col in range(n_cols):
            ratio = x1[row][col] / x2[row][col]
            total += x1[row][col] * tf.math.log(ratio)
    return total / len(x1[0])
def Loss_e(x1):
    """Return the negated sum of ``x1 * log(x1)`` over all entries, divided by the row width of ``x1``."""
    shape = np.array(x1).shape
    n_rows, n_cols = shape[0], shape[1]
    total = 0
    for row in range(n_rows):
        for col in range(n_cols):
            total += x1[row][col] * tf.math.log(x1[row][col])
    return -total / len(x1[0])
def fit_model_with_pencil(x_train, x_test, y_train, y_test):
    """Train the one-layer softmax classifier in two phases.

    Epochs 0-120 train on the given labels while a label distribution ``y_d``
    is kept for every batch. At epoch 120 the arg-max of ``y_d`` is written
    back as the training label of every sample, and epochs 121-239 train on
    those labels. Test accuracy is measured from epoch 121 onward only.

    Args:
        x_train, x_test: sequences of 2-D points.
        y_train, y_test: integer class ids (0, 1 or 2) of the points.

    Returns:
        (acc_list, loss_list, epoch_list): test accuracy and epoch index for
        epochs 121-239, and the mean training loss of all 240 epochs.
    """
    batch_size = 32
    x_train = tf.cast(x_train, tf.float32)
    x_test = tf.cast(x_test, tf.float32)
    # Work on a copy so that the caller's label list is not rewritten.
    y_list = list(y_train)
    x_list = x_train
    test_db = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(batch_size)
    w1 = tf.Variable(tf.random.truncated_normal([2, 3], stddev=0.1, seed=1))
    b = tf.Variable(tf.random.truncated_normal([3], stddev=0.1, seed=1))
    loss_list, acc_list, epoch_list = [], [], []
    epoches = 120 * 2
    lr = 0.3
    decay_epochs = [40 * x for x in range(1, 6)]
    # Label distribution of every batch, keyed by batch index and created in
    # epoch 0 (no fixed upper bound on the number of batches).
    Y_d = {}
    cce = tf.keras.losses.CategoricalCrossentropy()
    train_db = tf.data.Dataset.from_tensor_slices((x_list, y_list)).batch(batch_size)
    for epoch in range(epoches):
        if epoch in decay_epochs:
            lr *= 0.10
        if epoch == 121:
            # From here on train on the labels derived from y_d at epoch 120.
            train_db = tf.data.Dataset.from_tensor_slices((x_list, y_list)).batch(batch_size)
        loss_all, n_batches = 0, 0
        for step, (x_batch, y_batch) in enumerate(train_db):
            with tf.GradientTape(persistent=True) as tape:
                y = tf.nn.softmax(tf.matmul(x_batch, w1) + b)
                y_ = tf.one_hot(y_batch, depth=3)
                if epoch == 0:
                    # Initial label distribution: softmax of the scaled one-hot label.
                    Y_d[step] = tf.nn.softmax(3 * y_)
                y_d = Y_d[step]
                # y_d is a plain tensor; it must be watched or its gradient is None.
                tape.watch(y_d)
                loss = cce(y_, y)
                if epoch <= 120:
                    loss_sum_ = 1/3 * cce(y, y/y_d) - 0.1 * cce(y_, y_d) - (0.8/3) * cce(y, y)
            loss_all += loss.numpy()
            n_batches += 1
            grad = tape.gradient(loss, [w1, b])
            if epoch <= 120:
                grad_pencil = tape.gradient(loss_sum_, y_d)
                # NOTE(review): the subtraction is applied to a temporary
                # tf.Variable that is never stored, so y_d / Y_d[step] keep
                # their epoch-0 values. Persisting the update also requires
                # re-checking the sign and form of loss_sum_ - confirm the
                # intended behaviour before changing this line.
                tf.Variable(y_d, dtype=tf.float32).assign_sub(200 * grad_pencil)
            del tape  # release the resources held by the persistent tape
            w1.assign_sub(lr * grad[0])
            b.assign_sub(lr * grad[1])
            if epoch == 120:
                # Write the labels of this batch back at the batch's own offset.
                start = step * batch_size
                new_labels = [int(v) for v in tf.argmax(y_d, axis=1).numpy()]
                y_list[start:start + len(new_labels)] = new_labels
        # Mean loss over the batches actually processed in this epoch.
        loss_list.append(loss_all / max(n_batches, 1))
        if epoch >= 121:
            total_correct = 0
            total_number = 0
            for x_eval, y_eval in test_db:
                y = tf.nn.softmax(tf.matmul(x_eval, w1) + b)
                pred = tf.cast(tf.argmax(y, axis=1), dtype=y_eval.dtype)
                correct = tf.reduce_sum(tf.cast(tf.equal(pred, y_eval), dtype=tf.int32))
                total_correct += int(correct)
                total_number += x_eval.shape[0]
            acc = total_correct / total_number
            print(f'准确率为{acc}')
            acc_list.append(acc)
            epoch_list.append(epoch)
    return acc_list, loss_list, epoch_list
def fit_model(x_train, x_test, y_train, y_test):
    """Train a single dense softmax layer with mini-batch gradient descent.

    The learning rate starts at 0.3 and is multiplied by 0.1 at epochs 40 and 80.

    Args:
        x_train, x_test: sequences of 2-D points.
        y_train, y_test: integer class ids (0, 1 or 2) of the points.

    Returns:
        (acc_list, loss_list, epoch_list): test accuracy, mean training loss
        and epoch index, one entry per epoch.
    """
    x_train = tf.cast(x_train, tf.float32)
    x_test = tf.cast(x_test, tf.float32)
    train_db = tf.data.Dataset.from_tensor_slices((x_train, y_train)).batch(32)
    test_db = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(32)
    w1 = tf.Variable(tf.random.truncated_normal([2, 3], stddev=0.1, seed=1))
    b = tf.Variable(tf.random.truncated_normal([3], stddev=0.1, seed=1))
    loss_list, acc_list, epoch_list = [], [], []
    epoches = 120
    lr = 0.3
    decay_epochs = [40 * x for x in range(1, 4)]
    cce = tf.keras.losses.CategoricalCrossentropy()
    for epoch in range(epoches):
        if epoch in decay_epochs:
            lr *= 0.10
        loss_all, n_batches = 0, 0
        for x_batch, y_batch in train_db:
            with tf.GradientTape() as tape:
                y = tf.nn.softmax(tf.matmul(x_batch, w1) + b)
                y_ = tf.one_hot(y_batch, depth=3)
                loss = cce(y_, y)
            loss_all += loss.numpy()
            n_batches += 1
            grad = tape.gradient(loss, [w1, b])
            w1.assign_sub(lr * grad[0])
            b.assign_sub(lr * grad[1])
        # Mean loss over the batches actually processed in this epoch.
        loss_list.append(loss_all / max(n_batches, 1))
        total_correct = 0
        total_number = 0
        for x_eval, y_eval in test_db:
            y = tf.nn.softmax(tf.matmul(x_eval, w1) + b)
            pred = tf.cast(tf.argmax(y, axis=1), dtype=y_eval.dtype)
            correct = tf.reduce_sum(tf.cast(tf.equal(pred, y_eval), dtype=tf.int32))
            total_correct += int(correct)
            total_number += x_eval.shape[0]
        acc = total_correct / total_number
        print(f'准确率为{acc}')
        acc_list.append(acc)
        epoch_list.append(epoch)
    return acc_list, loss_list, epoch_list
def fit_model_api(x_train, x_test, y_train, y_test):
    """Train the same one-layer classifier through the Keras API and plot its history.

    Args:
        x_train, x_test: arrays of 2-D points.
        y_train, y_test: arrays of integer class ids (0, 1 or 2).

    Shows one figure with the training / validation accuracy and loss curves.
    """
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(3, activation='softmax', kernel_regularizer=tf.keras.regularizers.l2())
    ])
    # The labels are integer class ids, so the sparse loss has to be paired
    # with the sparse accuracy metric.
    model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.3),
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                  metrics=['sparse_categorical_accuracy'])
    history = model.fit(x_train, y_train, batch_size=32, epochs=120, validation_data=(x_test, y_test))
    model.summary()
    acc = history.history['sparse_categorical_accuracy']
    val_acc = history.history['val_sparse_categorical_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    plt.subplot(1, 2, 1)
    plt.plot(acc, label='Training Accuracy')
    plt.plot(val_acc, label='Validation Accuracy')
    plt.title('Training and Validation Accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(loss, label='Training Loss')
    plt.plot(val_loss, label='Validation Loss')
    plt.title('Training and Validation Loss')
    plt.legend()
    plt.show()
if __name__ == '__main__':
    # Number of noisy points generated around every centroid, one run per entry.
    choicen_scale = [1, 40, 80, 100, 100]
    loss_lists, acc_lists = [], []
    labels, labels_1 = [], []
    for noisy_scale in choicen_scale:
        create_noisy_data.create_data_main(noisy_scale)
        x_train, x_test, y_train, y_test = read_data(path)
        print(len(x_train), len(x_test), len(y_train), len(y_test), sep='\n')
        x_train, x_test, y_train, y_test = shuffle_data(x_train, x_test, y_train, y_test)
        #fit_model_api(np.array(x_train), np.array(x_test), np.array(y_train), np.array(y_test))
        acc_list, loss_list, epoch_list = fit_model_with_pencil(x_train, x_test, y_train, y_test)
        loss_lists.append(loss_list)
        acc_lists.append(acc_list)
        labels.append(f'{noisy_scale}规模的噪声的损失')
        labels_1.append(f'{noisy_scale}规模的噪声的准确率')

    def _plot_runs(series_per_run, legend_labels, title, file_name):
        """Plot one curve per run against epoch_list and save the figure."""
        handles = []
        for series, line_color in zip(series_per_run, color):
            # The loss is recorded for every epoch but the accuracy only for
            # the epochs in epoch_list, so keep the trailing entries that
            # belong to epoch_list.
            tail = series[len(series) - len(epoch_list):]
            line, = plt.plot(epoch_list, tail, color=line_color, linewidth=2.0)
            handles.append(line)
        plt.title(title)
        plt.legend(handles=handles, labels=legend_labels)
        plt.savefig(file_name)
        plt.show()

    _plot_runs(loss_lists, labels, '不同噪声下的损失', './different_noisy_loss_with_pencil.png')
    _plot_runs(acc_lists, labels_1, '不同噪声下的准确率', './different_noisy_acc_with_pencil.png')
############### used to visualize the point coordinates ###############
'''
for i in range(len(x_test)):
plt.scatter(x_test[i][0], x_test[i][1], c=color[int(y_test[i])])
plt.show()
for i in range(len(x_train)):
plt.scatter(x_train[i][0], x_train[i][1], c=color[y_train[i]])
plt.show()
'''
############### used to visualize the point coordinates ###############
############### used to visualize the point coordinates ###############
'''
for i in range(len(x_test)):
plt.scatter(x_test[i][0], x_test[i][1], c=color[int(y_test[i])])
plt.show()
for i in range(len(x_train)):
plt.scatter(x_train[i][0], x_train[i][1], c=color[y_train[i]])
plt.show()
'''
############### used to visualize the point coordinates ###############
3.Result Showing of one FC layer
-
data showing:
-
loss in backbone network showing:
-
accuracy of backbone network showing
4.Noisy data learning
- reference paper
Probabilistic End-to-end Noise Correction for Learning with Noisy Labels
-
Backbone learning: The authors trained on the CIFAR-10 dataset with ResNet-34. However, because we were doing research at home during COVID-19 and had limited computing resources, I skip the feature-extraction stage of ResNet and generate the data directly: we generate 600 points that are automatically divided into 3 clusters and train on them with a network that has just one fully connected layer. As for the noisy data, we generate noise at different scales, namely scale 1 (approximately noise-free), 120, 240, 300 and 400. All of these data sets use symmetric noise, i.e. noisy points are added to every class with the same probability. Many papers report that a model cannot fit the data well when the learning rate stays fairly large, so we reduce the learning rate to 10 percent of its value every 40 epochs; the effect can be seen in the two figures above.
-
Pencil learning: In the DLDL paper, the authors proposed a method that updates the labels during back-propagation, and the PENCIL method was inspired by this idea. At the beginning, we initialize each label (without knowing whether it is a clean label or not) as a one-hot encoding. Then, in the forward computation, we calculate three kinds of loss.
-
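Compatibility loss: Every sample, noisy or not, comes with an original one-hot label $\hat{y}$, from which we build the label distribution $y^d$.
To initialize $y^d$ we multiply $\hat{y}$ by a constant $K$ (here the number of classes) and apply softmax, $y^d = \mathrm{softmax}(K\hat{y})$, so that the softmaxed label stays as close as possible to the original label. The compatibility loss for one sample is then the cross-entropy between the original label and the label distribution: $L_o = -\sum_{j} \hat{y}_j \log y^d_j$.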
-
Classification loss
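First of all, we need to know the KL loss (KL divergence) between two distributions $p$ and $q$, which is $KL(p \,\|\, q) = \sum_{j} p_j \log \frac{p_j}{q_j}$.
Then the classification loss for one sample is the KL divergence between the network prediction $f(x;\theta)$ and the label distribution $y^d$: $L_c = KL\big(f(x;\theta) \,\|\, y^d\big) = \sum_{j} f_j(x;\theta) \log \frac{f_j(x;\theta)}{y^d_j}$.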
-
Entropy loss
-
The overall PENCIL framework
-
-
Update the parameters
1 ) network parameters updates: Nothing changed compared with normal Neural Network.
2 ) label probability updates: We need to update label with taking the gradient of label using sum-loss.
5.Results showing with pencil framework
-
loss of pencil framework
-
accuracy of pencil framework
-
loss of fine-tune learning
-
accuracy of backbone learning(epoch 0 - epoch 120) and fine-tune(epoch 121 - epoch 240)
6.Waiting solved questions
- Asymmetric Noise:
As for asymmetric noise, following [16] the noisy labels were generated by mapping truck → automobile, bird → airplane, deer → horse and cat ↔ dog with probability r. These noise generation methods coincide with confusions that often happen in the real world.
bird → airplane? These two classes have similar visual features. If we do not use an attention mechanism and only extract features with a CNN, I believe their corresponding network parameters will be very close, so PENCIL can deal with this kind of confusable image. But what about samples that are simply mislabeled as a class that is visually very different? I think that if we train our model on this kind of asymmetric noisy data, it will perform worse on mislabeled samples whose noisy class lies far away from their true class.
- Vanishing gradient: When I use one fully connected layer as the backbone network, the gradient vanishes when the dataset is too large and the learning rate is too high. I have not yet found a way to solve this.
7.Question solved
- When we request a gradient and the result is None, we need to add the variable to the gradient tape's watch list (tape.watch) first.