Friday, 13 September 2019

Case Study: Avoid Dead ReLU Units, Exploding Gradients, and Vanishing Gradients

The previous blog article (https://blog.abivin.vn/2019/09/case-study-separate-heavily-mixed-up-ys.html) is about separating heavily mixed-up classes. However, it uses sigmoid as the activation function, which slows the computation down. The extra cost is not in the matrix multiplications themselves but in the activation step, and it is only a mild slowdown because the number of activations is much smaller than the number of multiplications in the matrix products.
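
As a rough check of that claim, here is an illustrative count (a sketch only, assuming the 2-30-20-3 layer sizes and batch size 14 used in the source code below) of scalar multiplications in the matrix products versus activation evaluations per forward pass:

#Illustrative cost count (assumed 2->30->20->3 architecture, batch size 14,
#matching the model defined in the source code below).
batch  = 14
layers = [(2, 30), (30, 20), (20, 3)]   #(fan_in, units) per layer

mults = sum(batch*fan_in*units for fan_in, units in layers)
acts  = sum(batch*units for _, units in layers)

print("Scalar multiplications per forward pass:", mults)   #10080
print("Activation evaluations per forward pass:", acts)    #742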

Another issue is that sigmoid is not useful when the network is deep, especially very deep, because sigmoid-like activation functions suffer from the vanishing gradient problem.
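
A minimal standalone sketch of why this happens (illustrative only, not part of the model below): sigmoid's derivative is at most 0.25, so the gradient factor that backpropagates through a stack of sigmoid layers shrinks roughly geometrically with depth.

#Vanishing gradients with sigmoid: the backpropagated gradient picks up one
#sigmoid'(z) <= 0.25 factor per layer, so it decays geometrically with depth.
import numpy as np

def sigmoid(z):
  return 1.0/(1.0 + np.exp(-z))

np.random.seed(0)
grad_factor = 1.0
for depth in range(1, 21):
  z            = np.random.uniform(-1, 1)         #a typical pre-activation value
  grad_factor *= sigmoid(z)*(1.0 - sigmoid(z))    #sigmoid'(z), at most 0.25
  if depth%5 == 0:
    print("Depth", depth, "-> gradient factor ~", grad_factor)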

To achieve better performance, a ReLU activation should be used instead, but plain ReLU suffers from the dead ReLU unit problem, so Leaky ReLU is the better choice.
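
For reference, a minimal sketch of the two activations (alpha=0.2 matches the default of tf.nn.leaky_relu used below): a unit whose pre-activation goes negative gets zero output and zero gradient under plain ReLU, so it can never recover ("dead"), whereas Leaky ReLU keeps a small non-zero slope.

#Dead ReLU illustration: for a negative pre-activation, ReLU's output and
#gradient are both 0, so the unit stops learning; Leaky ReLU keeps slope alpha.
def relu(z):
  return max(0.0, z)

def relu_grad(z):
  return 1.0 if z > 0 else 0.0

def leaky_relu(z, alpha=0.2):        #alpha=0.2 is tf.nn.leaky_relu's default
  return z if z > 0 else alpha*z

def leaky_relu_grad(z, alpha=0.2):
  return 1.0 if z > 0 else alpha

z = -3.0                             #a negative pre-activation
print("ReLU output/grad:      ", relu(z), relu_grad(z))              #0.0 0.0
print("Leaky ReLU output/grad:", leaky_relu(z), leaky_relu_grad(z))  #-0.6 0.2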

There is still one more problem: because ReLU and Leaky ReLU are unbounded, a network that is deeper than necessary can grow the activations until every output saturates at its maximum value. Reducing the number of layers is therefore important to make Leaky ReLU work as expected, as the sketch below illustrates.
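
Below is a standalone numpy sketch of that effect (a toy stack, not the exact model in the source code): with weights drawn uniformly from [-1, 1], as in the code below, each extra Leaky ReLU layer multiplies the typical activation magnitude, so the values feeding a final sigmoid become huge, the sigmoid saturates at 0 or 1, its gradient collapses, and the loss gets stuck.

#Growing activations in an unnecessarily deep Leaky ReLU stack (toy example:
#8 layers of 40 units, weights uniform in [-1, 1] as in the model below).
import numpy as np

def leaky_relu(z, alpha=0.2):
  return np.where(z > 0, z, alpha*z)

np.random.seed(1)
H = np.random.uniform(0, 1, size=(14, 2))      #a batch of normalised inputs
for layer in range(1, 9):
  W = np.random.uniform(-1, 1, size=(H.shape[1], 40))
  H = leaky_relu(H @ W)
  print("Layer", layer, "max |activation| ~", round(float(np.abs(H).max()), 1))

#Once activations are this large, a final sigmoid saturates at 0 or 1 and its
#gradient s*(1-s) is effectively zero, so gradient descent barely moves.
S = 1.0/(1.0 + np.exp(-np.clip(H, -50, 50)))   #clip only to avoid overflow warnings
print("Fraction of saturated sigmoid outputs:", float(((S > 0.99) | (S < 0.01)).mean()))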

Training data & separation (diagram: https://i.imgur.com/uVOxZR7.png):

Source code:
#core
import time,os,sys;

#libs
import tensorflow        as tf;
import matplotlib.pyplot as pyplot;

#exit
def exit():
  #os._exit(1);
  sys.exit();
#end def

#mockup to emphasize value name
def units(Num):
  return Num;
#end def

#PROGRAMME ENTRY POINT==========================================================
#data
#https://i.imgur.com/uVOxZR7.png
X = [[1,1],[1,2],[1,3],[2,1],[2,2],[2,3],[3,1],[3,2],[3,3],[4,1],[4,2],[4,3],[5,1],[6,1]];
Y = [[0],  [1],  [0],  [1],  [0],  [1],  [0],  [2],  [1],  [1],  [1],  [0],  [0],  [1]  ];
Max_X      = 6;
Max_Y      = 2;
Batch_Size = 14;

#convert Y to probabilities P
In   = tf.placeholder(dtype=tf.int32, shape=[Batch_Size]);
Out  = tf.one_hot(In, depth=Max_Y+1);
Temp = [];
for I in range(len(Y)): Temp+=[Y[I][0]];

Sess = tf.Session();
P = Sess.run(Out, feed_dict={In:Temp}).tolist();
Sess.close();
#print(P);

#normalise
for I in range(len(X)):
  X[I][0] /= Max_X;
  X[I][1] /= Max_X;
  Y[I][0] /= Max_Y; #unused when using probs
#end for

#model
Input     = tf.placeholder(dtype=tf.float32, shape=[Batch_Size,2]);
#regress:
#Expected  = tf.placeholder(dtype=tf.float32, shape=[Batch_Size,1]);
#probs:
Expected  = tf.placeholder(dtype=tf.float32, shape=[Batch_Size,Max_Y+1]);

#SIGMOID WORKS BUT IS SLOW. USE LEAKY_RELU FOR PERFORMANCE, BUT REDUCE THE
#NUMBER OF LAYERS TO AVOID THE LOSS GETTING STUCK WITH ALL OUTPUT PROBS AT 1
#(EXPLODING ACTIVATIONS/GRADIENTS).
activation_fn = tf.nn.leaky_relu;
#activation_fn = tf.sigmoid;
'''
#1
Weight1   = tf.Variable(tf.random_uniform(shape=[2,units(60)], minval=-1, maxval=1));
Bias1     = tf.Variable(tf.random_uniform(shape=[  units(60)], minval=-1, maxval=1));
Hidden1   = activation_fn(tf.matmul(Input,Weight1) + Bias1);

#2
Weight2   = tf.Variable(tf.random_uniform(shape=[60,units(50)], minval=-1, maxval=1));
Bias2     = tf.Variable(tf.random_uniform(shape=[   units(50)], minval=-1, maxval=1));
Hidden2   = activation_fn(tf.matmul(Hidden1,Weight2) + Bias2);

#3
Weight3   = tf.Variable(tf.random_uniform(shape=[50,units(40)], minval=-1, maxval=1));
Bias3     = tf.Variable(tf.random_uniform(shape=[   units(40)], minval=-1, maxval=1));
Hidden3   = activation_fn(tf.matmul(Hidden2,Weight3) + Bias3);
'''
#4
Weight4   = tf.Variable(tf.random_uniform(shape=[2,units(30)], minval=-1, maxval=1));
Bias4     = tf.Variable(tf.random_uniform(shape=[  units(30)], minval=-1, maxval=1));
Hidden4   = activation_fn(tf.matmul(Input,Weight4) + Bias4);

#5
Weight5   = tf.Variable(tf.random_uniform(shape=[30,units(20)], minval=-1, maxval=1));
Bias5     = tf.Variable(tf.random_uniform(shape=[   units(20)], minval=-1, maxval=1));
Hidden5   = activation_fn(tf.matmul(Hidden4,Weight5) + Bias5);

#out
#regress:
#Weight6   = tf.Variable(tf.random_uniform(shape=[20,units(1)], minval=-1, maxval=1));
#Bias6     = tf.Variable(tf.random_uniform(shape=[   units(1)], minval=-1, maxval=1));

#probs:
#N_Classes = Max_Y+1
Weight6   = tf.Variable(tf.random_uniform(shape=[20,units(Max_Y+1)], minval=-1, maxval=1));
Bias6     = tf.Variable(tf.random_uniform(shape=[   units(Max_Y+1)], minval=-1, maxval=1));
Output    = tf.sigmoid(tf.matmul(Hidden5,Weight6) + Bias6);

Loss      = tf.reduce_sum(tf.square(Expected-Output));
Optimiser = tf.train.GradientDescentOptimizer(1e-1);
Training  = Optimiser.minimize(Loss);

#training
Sess = tf.Session();
Init = tf.global_variables_initializer();
Sess.run(Init);

#regress:
#Feed = {Input:X, Expected:Y};

#probs:
Feed = {Input:X, Expected:P};

Epochs = 5000;
Losses = [];
Start  = time.time();
for I in range(Epochs):
  if (I%(Epochs//10)==0):
    Lossvalue = Sess.run(Loss, feed_dict=Feed);
    Losses   += [Lossvalue];
    
    if (I==0):
      print("Loss:",Lossvalue,"(first)");
    else:
      print("Loss:",Lossvalue);
  #end if
  
  Sess.run(Training, feed_dict=Feed);
#end for

Lastloss = Sess.run(Loss, feed_dict=Feed);
Losses  += [Lastloss];
print("Loss:",Lastloss,"(last)");

Finish = time.time();
print("Time:",Finish-Start,"seconds");

#eval
Evalresults = Sess.run(Output,feed_dict=Feed).tolist();

Sse = 0;
for I in range(len(P)):
  for J in range(len(P[I])):
    Sse    += (P[I][J]-Evalresults[I][J])**2;
    P[I][J] = round(P[I][J]);
  #end for
#end for

for I in range(len(Evalresults)):
  for J in range(len(Evalresults[I])):
    #regress:
    #Evalresults[I][J] = round(Evalresults[I][J]*Max_Y);
    #probs:
    Evalresults[I][J] = round(Evalresults[I][J]);    
  #end for
#end for

print("\nSSE = {}".format(Sse));
print("Probs (Expected):");
print(P);    
print("Probs (Eval):");
print(Evalresults);
Sess.close();

#result: diagram
print("\nLoss curve:");
pyplot.plot(Losses,"-bo");
pyplot.show();
#eof

Reference:
Colab link:
https://colab.research.google.com/drive/1mixW4_wPM3_c_KQwh-hWLuW5Aay92Pit

Colab link (Regression version, instead of class probabilities):
https://colab.research.google.com/drive/1h6yrLPbGnzj5cPW0er9LBLRR5rNM3rmE
