implementing the backpropagation algorithm

closed account (E093605o)
I have written a basic linear algebra library and an MLP (I started learning C++ a week ago). When I run the code I get an assertion error: in the backprop() method I multiply two matrices whose inner dimensions (columns of the first, rows of the second) don't match. I strictly adhered to this article http://neuralnetworksanddeeplearning.com/chap2.html
For the sake of completeness I have included the Main.cpp file from which backprop() is called, but the error is in the backprop() method itself; I have marked the offending line with a caps-lock comment. Can somebody tell me where the logic flaw is?
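
For reference, the four equations from the article that backprop() is meant to implement are (in the article's notation, with $\odot$ the elementwise product):

$$\delta^L = \nabla_a C \odot \sigma'(z^L) \quad \text{(BP1)} \qquad \delta^l = \left((w^{l+1})^T \delta^{l+1}\right) \odot \sigma'(z^l) \quad \text{(BP2)}$$
$$\frac{\partial C}{\partial b^l_j} = \delta^l_j \quad \text{(BP3)} \qquad \frac{\partial C}{\partial w^l_{jk}} = a^{l-1}_k \, \delta^l_j \quad \text{(BP4)}$$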
//MLP.h
#pragma once
#include "Matrix.h"
#include <tuple>

template<typename T>
class MLP {
 public:
  std::vector<size_t> units_per_layer;
  std::vector<Matrix<T>> bias_vectors;
  std::vector<Matrix<T>> weight_matrices;
  std::vector<Matrix<T>> activations;
  std::vector<Matrix<T>> zs;
  

  double lr = .002;

  MLP(std::vector<size_t> units_per_layer):
    units_per_layer(units_per_layer),
    weight_matrices(),
    bias_vectors(),
    zs(),
    activations()
    {

  // one weight matrix, bias vector, and z cache per layer transition
  for (size_t i = 0; i < units_per_layer.size() - 1; ++i) {
    size_t in_channels{units_per_layer[i]};
    size_t out_channels{units_per_layer[i+1]};

    // initialize to random Gaussian
    auto W = mtx<T>::randn(out_channels, in_channels);
    weight_matrices.push_back(W);

    auto b = mtx<T>::randn(out_channels, 1);
    bias_vectors.push_back(b);

    auto z = mtx<T>::randn(out_channels, 1);
    zs.push_back(z);
  }
  // one activation slot per layer, including the input
  activations.resize(units_per_layer.size());
}

// logistic sigmoid
static inline auto sigmoid(double x) {
  return 1.0 / (1 + exp(-x));
}

// sigmoid derivative, written in terms of the sigmoid *output* s: s * (1 - s)
static inline auto d_sigmoid(double x){
  return (x * (1 - x));
}


auto forward(Matrix<T> x) {
  assert(std::get<0>(x.shape) == units_per_layer[0] && std::get<1>(x.shape) == 1);

  activations[0] = x;
  Matrix<T> prev(x);
  for (size_t i = 0; i < units_per_layer.size() - 1; i++) {
    // a^{i+1} = sigmoid(W_i * a^i + b_i)
    Matrix<T> y = weight_matrices[i].matmul(prev);
    y = y + bias_vectors[i];
    y = y.apply_function(sigmoid);
    activations[i+1] = y;
    prev = y;
  }
  return prev;
}

std::tuple<std::vector<Matrix<T>>,std::vector<Matrix<T>>> backprop(Matrix<T> &target) {
  assert(std::get<0>(target.shape) == units_per_layer.back());

  // determine the simple error, error = target - output
  Matrix<T> error = (target - activations.back());
  Matrix<T> last_z = zs[zs.size()-1];
  Matrix<T> last_z_transformed = last_z.apply_function(d_sigmoid);
  Matrix<T> delta_L = error.multiply_elementwise(last_z_transformed);

  //the weights and bias gradients
  std::vector<Matrix<T>> nabla_w(weight_matrices.size());
  std::vector<Matrix<T>> nabla_b(bias_vectors.size());
 
  
  // backprop the error from output to input and step the weights
  for(int i = weight_matrices.size() - 1; i > 0; i--) {
    Matrix<T> z_transformed = zs[i].apply_function(d_sigmoid);
    Matrix<T> w_transposed = weight_matrices[i].T();
    
    // calculating error for previous layer
    // ERROR IN THIS LINE; MATRICES ARE NOT OF EQUAL COLUMN/ROW DIMENSION
    Matrix<T> delta_l = w_transposed.matmul(delta_L).multiply_elementwise(z_transformed);

    // calculating the change of weights and biases
    Matrix<T> a_transposed = activations[i].T();
    nabla_w[i] = delta_l.matmul(a_transposed);
    nabla_b[i] = delta_l;

    // updating the error term delta
    delta_L = delta_l;
  }
  return std::tuple<std::vector<Matrix<T>>,std::vector<Matrix<T>>>(nabla_w,nabla_b);
}

void online_GD(Matrix<T> x, Matrix<T> y){
  forward(x);
  auto grads = backprop(y);      // keep the gradients in a named lvalue;
  update_parameters(grads, 1.0); // update_parameters takes them by reference
}

void update_parameters(std::tuple<std::vector<Matrix<T>>,std::vector<Matrix<T>>> &params, double batch_size){
  for (int i = weight_matrices.size()-1; i > 0; i--){
    Matrix<T> nabla_w_scaled = std::get<0>(params)[i].multiply_scalar(lr/batch_size);
    Matrix<T> nabla_b_scaled = std::get<1>(params)[i].multiply_scalar(lr/batch_size);
    weight_matrices[i] = weight_matrices[i] - nabla_w_scaled;
    bias_vectors[i] = bias_vectors[i] - nabla_b_scaled;
  }
}
};

//Main.cpp
#include "Matrix.h"
#include "MLP.h"
#include <vector>
#include <iostream>
#include <fstream>
#include <math.h>

template<typename T>
void log (std::ostream& file, const Matrix<T>& x, const Matrix<T>& y, const Matrix<T>& y_hat) {
    file << x;
    file << y;
    file << y_hat;
    file << "------------" << std::endl;
}

int main() {

  // init model
  std::vector<size_t> layers = {1,8,8,8,1};


  // open file to save loss, x, y, and model(x)
  std::ofstream my_file; 
  my_file.open ("data.txt");

  int max_iter{1};

  const double PI {3.14159};
  

  MLP<double> model(layers);
  std::cout << model.bias_vectors[0];

  
  for (int i = 0; i < max_iter; i++){
    auto x = mtx<double>::randn(1, 1).multiply_scalar(PI);
    auto y = x.apply_function([](double v) -> double { return sin(v) * sin(v); });

    // forward and backward
    auto y_hat = model.forward(x);
    auto weights_biases = model.backprop(y); 
    model.update_parameters(weights_biases,1.0); // loss and grads computed in here
    

    log<double>(my_file,x,y,y_hat);
  
  }

  my_file.close();
  
}
"when I run the code I get an assertion error - in the backprop() method"


What debugging of the code have you done? In cases like this you should use the debugger to trace through the code, watch the contents of the variables, and see where execution deviates from what you expect, or where the assertion fires. Then you know which line is causing the issue and the contents of the variables involved, which gives you a clue as to the cause. If you still don't understand, post the line in the above code causing the issue and the values of the relevant variables.
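
That said, you have already marked the failing line, and checking the shapes against the article's equations suggests several problems:

1. forward() never stores the pre-activations, so zs keeps the random values it was given in the constructor. Save z before the sigmoid is applied (see the sketch below).
2. The line that fires: for weight_matrices[i], w_transposed.matmul(delta_L) has units_per_layer[i] rows, but zs[i] has units_per_layer[i+1] rows. Equation (BP2) needs the z of the layer you are propagating back *into*, i.e. zs[i-1], not zs[i].
3. nabla_w[i] is built from the freshly backpropagated delta_l, giving a units_per_layer[i] x units_per_layer[i] matrix, while weight_matrices[i] is units_per_layer[i+1] x units_per_layer[i]. Per (BP4) it should use the delta of the layer ahead, taken before delta_L is overwritten: delta_L.matmul(activations[i].T()).
4. The backprop loop and update_parameters both stop at i > 0, so nabla_w[0] and nabla_b[0] are never filled and the first layer is never trained.
5. d_sigmoid(x) = x * (1 - x) is the sigmoid derivative in terms of the sigmoid *output*; applied to a raw z it is not sigma'(z).
6. error = target - output is the negative of the article's (a - y); since update_parameters subtracts the gradient, the current sign pushes the loss up rather than down.

Below is a minimal sketch of a corrected backprop() along those lines. It is untested against your Matrix class and assumes the operations you use elsewhere (matmul, T(), multiply_elementwise, apply_function, operator-) do what their names suggest:

// sigma'(z), taken w.r.t. the raw pre-activation z
static inline double d_sigmoid(double z) {
  double s = 1.0 / (1.0 + exp(-z));
  return s * (1.0 - s);
}

std::tuple<std::vector<Matrix<T>>, std::vector<Matrix<T>>> backprop(Matrix<T>& target) {
  assert(std::get<0>(target.shape) == units_per_layer.back());

  std::vector<Matrix<T>> nabla_w(weight_matrices.size());
  std::vector<Matrix<T>> nabla_b(bias_vectors.size());

  // (BP1): delta^L = (a^L - y) ⊙ sigma'(z^L) -- note: output minus target
  Matrix<T> delta = (activations.back() - target)
      .multiply_elementwise(zs.back().apply_function(d_sigmoid));

  // walk back over every weight matrix, including index 0
  for (int i = static_cast<int>(weight_matrices.size()) - 1; i >= 0; i--) {
    // (BP3)/(BP4): gradients for weight_matrices[i] use the delta of the layer ahead
    nabla_w[i] = delta.matmul(activations[i].T()); // units[i+1] x units[i], same shape as W[i]
    nabla_b[i] = delta;

    if (i > 0) {
      // (BP2): delta^i = (W[i])^T delta^{i+1} ⊙ sigma'(z^i)
      // zs[i-1] has units_per_layer[i] rows; using zs[i] here is what fired your assert
      delta = weight_matrices[i].T().matmul(delta)
          .multiply_elementwise(zs[i - 1].apply_function(d_sigmoid));
    }
  }
  return std::make_tuple(nabla_w, nabla_b);
}

And in forward(), remember the pre-activation before squashing it:

Matrix<T> y = weight_matrices[i].matmul(prev);
y = y + bias_vectors[i];
zs[i] = y;                       // save z^(i+1) for backprop
y = y.apply_function(sigmoid);

With those changes, also let the loop in update_parameters run down to i >= 0 so the first layer is updated too.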