Another issue is that sigmoid is a poor choice when the network is deep, because sigmoid-like activation functions suffer from the vanishing gradient problem.
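To see why, note that the derivative of sigmoid is s*(1-s), which is at most 0.25, and backprop multiplies one such factor per layer. A minimal NumPy sketch (illustration only, separate from the network below):

#sigmoid'(x) = s*(1-s) <= 0.25, so the gradient decays geometrically with depth
import numpy as np

def sigmoid(x):
    return 1.0/(1.0 + np.exp(-x))

def sigmoid_grad(x):
    s = sigmoid(x)
    return s*(1.0 - s)

grad = 1.0  #gradient arriving at the output layer
for layer in range(10):
    grad *= sigmoid_grad(0.0)  #0.25 is the BEST case, at x=0
    print("after layer", layer+1, "gradient factor =", grad)
#after 10 layers the factor is 0.25**10, roughly 1e-6: effectively no learning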
For better performance, ReLU should be used instead; however, ReLU suffers from the dying-ReLU problem (units whose output and gradient get stuck at zero), so Leaky ReLU is the better choice.
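For reference, the two activations differ only for negative inputs; a quick NumPy sketch (tf.nn.leaky_relu, used in the code below, defaults to slope alpha=0.2):

import numpy as np

def relu(x):
    return np.maximum(0.0, x)           #gradient is 0 for x < 0: units can "die"

def leaky_relu(x, alpha=0.2):
    return np.where(x > 0, x, alpha*x)  #small negative slope keeps units trainable

print(relu(np.array([-2.0, 3.0])))        #[0. 3.]
print(leaky_relu(np.array([-2.0, 3.0])))  #[-0.4  3. ]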
There is still one more problem: when the network is unnecessarily deep, ReLU and Leaky ReLU can let the activations grow so large that every sigmoid output saturates at its maximum value of 1 and the loss gets stuck (possibly exploding gradients). Reducing the number of layers is therefore important to make Leaky ReLU work as expected.
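A rough sketch of that failure mode, assuming weights drawn uniformly from [-1,1] as in the code below (layer sizes illustrative, biases omitted, not the exact architecture):

import numpy as np

rng = np.random.default_rng(0)
x = rng.uniform(-1, 1, size=(14, 2))  #a batch shaped like the 14 samples below
for depth in range(8):
    w = rng.uniform(-1, 1, size=(x.shape[1], 30))
    z = x @ w
    x = np.where(z > 0, z, 0.2*z)     #leaky_relu, TF's default slope
    print("depth", depth+1, "mean |activation| =", float(np.abs(x).mean()))
#once |activation| >> 1, sigmoid(activation) is pinned near 0 or 1 everywhere,
#matching the "all output probs are 1" situation noted in the code comments

Once the magnitudes are far above 1, the final sigmoid can no longer separate the classes, which is why trimming layers helps here.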
Training data & separation (plotted at https://i.imgur.com/uVOxZR7.png):
Source code:
#core
import time,os,sys;

#libs
import tensorflow as tf;
import matplotlib.pyplot as pyplot;

#exit
def exit():
  #os._exit(1);
  sys.exit();
#end def

#mockup to emphasize value name
def units(Num):
  return Num;
#end def

#PROGRAMME ENTRY POINT==========================================================
#data
#https://i.imgur.com/uVOxZR7.png
X = [[1,1],[1,2],[1,3],[2,1],[2,2],[2,3],[3,1],[3,2],[3,3],[4,1],[4,2],[4,3],[5,1],[6,1]];
Y = [[0], [1], [0], [1], [0], [1], [0], [2], [1], [1], [1], [0], [0], [1] ];

Max_X      = 6;
Max_Y      = 2;
Batch_Size = 14;

#convert Y to probabilities P (one-hot vectors)
In  = tf.placeholder(dtype=tf.int32, shape=[Batch_Size]);
Out = tf.one_hot(In, depth=Max_Y+1);

Temp = [];
for I in range(len(Y)):
  Temp += [Y[I][0]];
#end for

Sess = tf.Session();
P    = Sess.run(Out, feed_dict={In:Temp}).tolist();
Sess.close();
#print(P);

#normalise
for I in range(len(X)):
  X[I][0] /= Max_X;
  X[I][1] /= Max_X;
  Y[I][0] /= Max_Y; #unused when using probs
#end for

#model
Input = tf.placeholder(dtype=tf.float32, shape=[Batch_Size,2]);
#regress:
#Expected = tf.placeholder(dtype=tf.float32, shape=[Batch_Size,1]);
#probs:
Expected = tf.placeholder(dtype=tf.float32, shape=[Batch_Size,Max_Y+1]);

#SIGMOID WORKS BUT SLOW. USE LEAKY_RELU FOR PERFORMANCE, BUT REDUCE NUMBER OF
#LAYERS TO AVOID LOSS STUCK AS ALL OUTPUT PROBS ARE 1, EXPLODING GRADIENTS?
activation_fn = tf.nn.leaky_relu;
#activation_fn = tf.sigmoid;

'''
#1
Weight1 = tf.Variable(tf.random_uniform(shape=[2,units(60)], minval=-1, maxval=1));
Bias1   = tf.Variable(tf.random_uniform(shape=[  units(60)], minval=-1, maxval=1));
Hidden1 = activation_fn(tf.matmul(Input,Weight1) + Bias1);

#2
Weight2 = tf.Variable(tf.random_uniform(shape=[60,units(50)], minval=-1, maxval=1));
Bias2   = tf.Variable(tf.random_uniform(shape=[   units(50)], minval=-1, maxval=1));
Hidden2 = activation_fn(tf.matmul(Hidden1,Weight2) + Bias2);

#3
Weight3 = tf.Variable(tf.random_uniform(shape=[50,units(40)], minval=-1, maxval=1));
Bias3   = tf.Variable(tf.random_uniform(shape=[   units(40)], minval=-1, maxval=1));
Hidden3 = activation_fn(tf.matmul(Hidden2,Weight3) + Bias3);
'''

#4
Weight4 = tf.Variable(tf.random_uniform(shape=[2,units(30)], minval=-1, maxval=1));
Bias4   = tf.Variable(tf.random_uniform(shape=[  units(30)], minval=-1, maxval=1));
Hidden4 = activation_fn(tf.matmul(Input,Weight4) + Bias4);

#5
Weight5 = tf.Variable(tf.random_uniform(shape=[30,units(20)], minval=-1, maxval=1));
Bias5   = tf.Variable(tf.random_uniform(shape=[   units(20)], minval=-1, maxval=1));
Hidden5 = activation_fn(tf.matmul(Hidden4,Weight5) + Bias5);

#out
#regress:
#Weight6 = tf.Variable(tf.random_uniform(shape=[20,units(1)], minval=-1, maxval=1));
#Bias6   = tf.Variable(tf.random_uniform(shape=[   units(1)], minval=-1, maxval=1));
#probs: N_Classes = Max_Y+1
Weight6 = tf.Variable(tf.random_uniform(shape=[20,units(Max_Y+1)], minval=-1, maxval=1));
Bias6   = tf.Variable(tf.random_uniform(shape=[   units(Max_Y+1)], minval=-1, maxval=1));
Output  = tf.sigmoid(tf.matmul(Hidden5,Weight6) + Bias6);

Loss      = tf.reduce_sum(tf.square(Expected-Output));
Optimiser = tf.train.GradientDescentOptimizer(1e-1);
Training  = Optimiser.minimize(Loss);

#training
Sess = tf.Session();
Init = tf.global_variables_initializer();
Sess.run(Init);

#regress:
#Feed = {Input:X, Expected:Y};
#probs:
Feed = {Input:X, Expected:P};

Epochs = 5000;
Losses = [];
Start  = time.time();

for I in range(Epochs):
  #log the loss 10 times over the whole run
  if (I%(Epochs//10)==0):
    Lossvalue = Sess.run(Loss, feed_dict=Feed);
    Losses   += [Lossvalue];
    if (I==0):
      print("Loss:",Lossvalue,"(first)");
    else:
      print("Loss:",Lossvalue);
  #end if
  Sess.run(Training, feed_dict=Feed);
#end for

Lastloss = Sess.run(Loss, feed_dict=Feed);
Losses  += [Lastloss];
print("Loss:",Lastloss,"(last)");

Finish = time.time();
print("Time:",Finish-Start,"seconds");

#eval
Evalresults = Sess.run(Output, feed_dict=Feed).tolist();

Sse = 0;
for I in range(len(P)):
  for J in range(len(P[I])):
    Sse    += (P[I][J]-Evalresults[I][J])**2;
    P[I][J] = round(P[I][J]);
  #end for
#end for

for I in range(len(Evalresults)):
  for J in range(len(Evalresults[I])):
    #regress:
    #Evalresults[I][J] = round(Evalresults[I][J]*Max_Y);
    #probs:
    Evalresults[I][J] = round(Evalresults[I][J]);
  #end for
#end for

print("\nSSE = {}".format(Sse));
print("Probs (Expected):");
print(P);
print("Probs (Eval):");
print(Evalresults);
Sess.close();

#result: diagram
print("\nLoss curve:");
pyplot.plot(Losses,"-bo");
#eof
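A note for running this today: the listing uses the TensorFlow 1.x graph API (tf.placeholder, tf.Session, tf.random_uniform), which TensorFlow 2.x removed from the top-level namespace. If the Colab runtime ships TF 2.x, the usual compat shim should restore these names (untested against this exact notebook):

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior(); #restores placeholders, sessions, and TF1 graph mode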
Reference:
Colab link:
https://colab.research.google.com/drive/1mixW4_wPM3_c_KQwh-hWLuW5Aay92Pit
Colab link (Regression version, instead of class probabilities):
https://colab.research.google.com/drive/1h6yrLPbGnzj5cPW0er9LBLRR5rNM3rmE