I have done a lot of reading about deep learning, and it just happens to be that
the more I read, the more confused I get, because I'm having a hard time understanding the math language. I decided to take a peek into Caffe to extract AdaDelta.
Caffe's code is really big and complicated, which is understandable when jumping into a big library to find something, without being sure what that something even looks like.
If someone knows how AdaDelta works, and what other learning algorithms might be hidden in Caffe, could you please help me apply them to my
simple neural network header, so I can understand the math behind all this?
(i.e., converting the math language to C++)
// Forward declaration: a connection stores a pointer to a neuron.
class neuron;
// One weighted link between two neurons.
class connection { public:
double weight;
double delta; // Previous weight change, reused as the momentum term in UpdatingWeights().
neuron *target; // Neuron on the other end. NOTE(review): for incoming connections this appears to be the upstream (source) neuron — see CalculateOutput(); confirm.
};
class neuron { public:
vector<connection *> IncomingConnections;
vector<connection *> OutcomingConnections;
double output;
double bias;
double bias_delta; // bias weight change momentum;
double gradient;
// calculating output value for current neuron
void CalculateOutput() {
double sum = 0.0;
for (int i = 0; i < IncomingConnections.size(); i++) {
sum += IncomingConnections[i]->weight * IncomingConnections[i]->target->output;
}
output = tanh(output + bias);
}
// calculating gradient value for neuron what is inside the hidden layer
void CalculatHiddenGradient() {
double sum = 0.0;
for (int i = 0; i < IncomingConnections.size(); i++) {
sum += OutcomingConnections[i]->weight * OutcomingConnections[i]->target->gradient;
}
gradient = (1.0 - output * output) * sum;
}
// calculating gradient value for output neurons where we know the desired output value
void CalculatGradient(double TargetOutput) {
gradient = (TargetOutput - output) * (1.0 - output * output);
}
};
// One layer of the network: simply an ordered list of neurons.
class layer { public:
vector<neuron *> neurons;
};
// The whole network as a sequence of layers; presumably network[0] is the input
// layer and network.back() the output layer — confirm against the (unseen) construction code.
vector<layer> network;
// In my current algorithms, the order of weight change doesn't matter and we update all connection weights in one epoch
void UpdatingWeights() {
// We're gonna ignore last layer where there are output neuron's because they don't have outcoming connections.
for (int iLayer = 0; iLayer < network.size() - 1; iLayer++) {
for (int iNeuron = 0; iNeuron < network[iLayer].neurons.size(); iNeuron++) {
// Calculating only outcoming connections so we won't overlap. We only want to calculate each weight variable once in one epoch.
for (int iConnection = 0; iConnection < network[iLayer].neurons[iNeuron]->OutcomingConnections.size(); iConnection++) {
// calculating weight change:
// Outcoming Connection's Target Gradient * current neuron output * learning rate + previous weight change * momentum;
network[iLayer].neurons[iNeuron]->OutcomingConnections[iConnection]->delta = network[iLayer].neurons[iNeuron]->OutcomingConnections[iConnection]->target->gradient * network[iLayer].neurons[iNeuron]->output * 0.15 + network[iLayer].neurons[iNeuron]->OutcomingConnections[iConnection]->delta * 0.5;
// Applying weight change:
network[iLayer].neurons[iNeuron]->OutcomingConnections[iConnection]->weight += network[iLayer].neurons[iNeuron]->OutcomingConnections[iConnection]->delta;
}
// Calculating neuron's bias:
// Current Neuron's output * Current neuron's gradient * learning rate + previus bias change * momentum;
network[iLayer].neurons[iNeuron]->bias_delta = network[iLayer].neurons[iNeuron]->output * network[iLayer].neurons[iNeuron]->gradient * 0.15 + network[iLayer].neurons[iNeuron]->bias_delta * 0.5;
// Applying bias's weight change:
network[iLayer].neurons[iNeuron]->bias += network[iLayer].neurons[iNeuron]->bias_delta;
}
}
}