The previous blog article (https://blog.abivin.vn/2019/09/case-study-separate-heavily-mixed-up-ys.html) showed how to separate heavily mixed-up classes. However, it used sigmoid as the activation function, which slows computation down. The cost is not in the matrix multiplications themselves but in the activations, and it stays small because the number of activations is much lower than the number of multiplications. The bigger problem is that sigmoid is not useful when the network is deep, very deep: sigmoid-like activation functions suffer from the vanishing gradients problem.
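To see the scale of the problem: the derivative of sigmoid is s(x)*(1-s(x)), which never exceeds 0.25, so backpropagating through N sigmoid layers scales the gradient by at most 0.25^N. A minimal sketch of that arithmetic (illustration only, not part of the article's model):

#each sigmoid layer shrinks the gradient by a factor of at most 0.25
Max_Grad = 0.25;
for N in [1, 5, 10, 20]:
    print(N, "layers -> gradient scale at most", Max_Grad**N);
#end for
#20 layers -> about 9.1e-13, practically no learning signal left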
To achieve better performance, the ReLU activation should be used instead, but plain ReLU suffers from the dying ReLU problem (units that get stuck outputting zero), so Leaky ReLU is the better choice. One problem remains: ReLU and Leaky ReLU may drive all output values to their maximum when the network is unnecessarily deep, so reducing the number of layers is important to make Leaky ReLU work as expected.
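For reference, Leaky ReLU is simply max(alpha*x, x), so negative inputs are scaled down instead of being zeroed out; tf.nn.leaky_relu in TensorFlow 1.x defaults to alpha=0.2. A minimal sketch with illustrative input values:

#leaky_relu passes positives through and scales negatives by alpha
import tensorflow as tf;
X = tf.constant([-2.0, -0.5, 0.0, 1.0, 3.0]);
Y1 = tf.nn.leaky_relu(X, alpha=0.2);
Y2 = tf.maximum(0.2*X, X); #hand-rolled equivalent
Sess = tf.Session();
print(Sess.run(Y1)); #[-0.4 -0.1  0.   1.   3. ]
print(Sess.run(Y2)); #same values
Sess.close();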
Training data & separation:
Source code:
#core
import time,os,sys;
#libs
import tensorflow as tf;
import matplotlib.pyplot as pyplot;
#exit
def exit():
    #os._exit(1);
    sys.exit();
#end def
#mockup to emphasize value name
def units(Num):
    return Num;
#end def
#PROGRAMME ENTRY POINT==========================================================
#data
#https://i.imgur.com/uVOxZR7.png
X = [[1,1],[1,2],[1,3],[2,1],[2,2],[2,3],[3,1],[3,2],[3,3],[4,1],[4,2],[4,3],[5,1],[6,1]];
Y = [[0], [1], [0], [1], [0], [1], [0], [2], [1], [1], [1], [0], [0], [1] ];
Max_X = 6;
Max_Y = 2;
Batch_Size = 14;
#convert Y to probabilities P
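#one_hot maps each class index to a probability row, e.g. 1 -> [0,1,0]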
In = tf.placeholder(dtype=tf.int32, shape=[Batch_Size]);
Out = tf.one_hot(In, depth=Max_Y+1);
Temp = [];
for I in range(len(Y)): Temp+=[Y[I][0]];
Sess = tf.Session();
P = Sess.run(Out, feed_dict={In:Temp}).tolist();
Sess.close();
#print(P);
#normalise
for I in range(len(X)):
    X[I][0] /= Max_X;
    X[I][1] /= Max_X;
    Y[I][0] /= Max_Y; #unused when using probs
#end for
#model
Input = tf.placeholder(dtype=tf.float32, shape=[Batch_Size,2]);
#regress:
#Expected = tf.placeholder(dtype=tf.float32, shape=[Batch_Size,1]);
#probs:
Expected = tf.placeholder(dtype=tf.float32, shape=[Batch_Size,Max_Y+1]);
#SIGMOID WORKS BUT IS SLOW. USE LEAKY_RELU FOR PERFORMANCE, BUT REDUCE THE
#NUMBER OF LAYERS TO AVOID THE LOSS GETTING STUCK WITH ALL OUTPUT PROBS AT 1
#(EXPLODING GRADIENTS?)
activation_fn = tf.nn.leaky_relu;
#activation_fn = tf.sigmoid;
'''
#1
Weight1 = tf.Variable(tf.random_uniform(shape=[2,units(60)], minval=-1, maxval=1));
Bias1 = tf.Variable(tf.random_uniform(shape=[ units(60)], minval=-1, maxval=1));
Hidden1 = activation_fn(tf.matmul(Input,Weight1) + Bias1);
#2
Weight2 = tf.Variable(tf.random_uniform(shape=[60,units(50)], minval=-1, maxval=1));
Bias2 = tf.Variable(tf.random_uniform(shape=[ units(50)], minval=-1, maxval=1));
Hidden2 = activation_fn(tf.matmul(Hidden1,Weight2) + Bias2);
#3
Weight3 = tf.Variable(tf.random_uniform(shape=[50,units(40)], minval=-1, maxval=1));
Bias3 = tf.Variable(tf.random_uniform(shape=[ units(40)], minval=-1, maxval=1));
Hidden3 = activation_fn(tf.matmul(Hidden2,Weight3) + Bias3);
'''
#4
Weight4 = tf.Variable(tf.random_uniform(shape=[2,units(30)], minval=-1, maxval=1));
Bias4 = tf.Variable(tf.random_uniform(shape=[ units(30)], minval=-1, maxval=1));
Hidden4 = activation_fn(tf.matmul(Input,Weight4) + Bias4);
#5
Weight5 = tf.Variable(tf.random_uniform(shape=[30,units(20)], minval=-1, maxval=1));
Bias5 = tf.Variable(tf.random_uniform(shape=[ units(20)], minval=-1, maxval=1));
Hidden5 = activation_fn(tf.matmul(Hidden4,Weight5) + Bias5);
#out
#regress:
#Weight6 = tf.Variable(tf.random_uniform(shape=[20,units(1)], minval=-1, maxval=1));
#Bias6 = tf.Variable(tf.random_uniform(shape=[ units(1)], minval=-1, maxval=1));
#probs:
#N_Classes = Max_Y+1
Weight6 = tf.Variable(tf.random_uniform(shape=[20,units(Max_Y+1)], minval=-1, maxval=1));
Bias6 = tf.Variable(tf.random_uniform(shape=[ units(Max_Y+1)], minval=-1, maxval=1));
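#sigmoid at the output keeps each prob in (0,1) to match the one-hot targets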
Output = tf.sigmoid(tf.matmul(Hidden5,Weight6) + Bias6);
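#loss: sum of squared errors between expected and predicted probabilities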
Loss = tf.reduce_sum(tf.square(Expected-Output));
Optimiser = tf.train.GradientDescentOptimizer(1e-1);
Training = Optimiser.minimize(Loss);
#training
Sess = tf.Session();
Init = tf.global_variables_initializer();
Sess.run(Init);
#regress:
#Feed = {Input:X, Expected:Y};
#probs:
Feed = {Input:X, Expected:P};
Epochs = 5000;
Losses = [];
Start = time.time();
for I in range(Epochs):
    #log the loss 10 times over the whole run
    if (I%(Epochs//10)==0):
        Lossvalue = Sess.run(Loss, feed_dict=Feed);
        Losses += [Lossvalue];
        if (I==0):
            print("Loss:",Lossvalue,"(first)");
        else:
            print("Loss:",Lossvalue);
    #end if
    Sess.run(Training, feed_dict=Feed);
#end for
Lastloss = Sess.run(Loss, feed_dict=Feed);
Losses += [Lastloss];
print("Loss:",Lastloss,"(last)");
Finish = time.time();
print("Time:",Finish-Start,"seconds");
#eval
Evalresults = Sess.run(Output,feed_dict=Feed).tolist();
Sse = 0;
for I in range(len(P)):
    for J in range(len(P[I])):
        Sse += (P[I][J]-Evalresults[I][J])**2;
        P[I][J] = round(P[I][J]);
    #end for
#end for
for I in range(len(Evalresults)):
    for J in range(len(Evalresults[I])):
        #regress:
        #Evalresults[I][J] = round(Evalresults[I][J]*Max_Y);
        #probs:
        Evalresults[I][J] = round(Evalresults[I][J]);
    #end for
#end for
print("\nSSE = {}".format(Sse));
print("Probs (Expected):");
print(P);
print("Probs (Eval):");
print(Evalresults);
Sess.close();
#result: diagram
print("\nLoss curve:");
pyplot.plot(Losses,"-bo");
#eof
References:
Colab link:
https://colab.research.google.com/drive/1mixW4_wPM3_c_KQwh-hWLuW5Aay92Pit
Colab link (regression version, instead of class probabilities):
https://colab.research.google.com/drive/1h6yrLPbGnzj5cPW0er9LBLRR5rNM3rmE