Двоичная перекрестная энтропийная потеря не сходится в пользовательской реализации ANN (C++). Тело:

Двоичная перекрестная энтропийная потеря не сходится в пользовательской реализации ANN (C++). Тело: ⇐ C++

1 сообщение • Страница 1 из 1

Anonymous

Двоичная перекрестная энтропийная потеря не сходится в пользовательской реализации ANN (C++). Тело:

Цитата

Сообщение Anonymous » 17 ноя 2024, 16:12

Я реализую простую искусственную нейронную сеть (ИНС) с нуля на C++. Моя сетевая архитектура следующая:
Входной уровень: 3 входа
Скрытый слой: 7 нейронов с активацией ReLU
Выходной уровень: 1 нейрон с сигмовидной активацией
Функция потерь: двоичная перекрестная энтропия (BCE)
Я использовал инициализацию He для весов и смещений и реализовал свой собственный автодифференциал в обратном режиме для вычисления градиента. Градиенты кажутся правильными при тестировании с различными функциями, включающими логарифмы.
Проблема в том, что потери BCE не сходятся во время обучения, а вместо этого увеличиваются. Однако, когда я переключаюсь на среднеквадратическую ошибку (MSE) в качестве функции потерь, потери сходятся, как и ожидалось.
Основные примечания:
Градиенты от autodiff вычисляются правильно.
Обучение работает нормально с MSE, но не работает с BCE. Я считаю, что проблема заключается в части двоичной кросс-энтропии, особенно в функции журнала, поскольку категориальная кросс-энтропия также работает неправильно.
Что может быть причиной проблемы с BCE не сходятся?
Код:

Код: Выделить всё

#include
#include
#include
#include
#include
#include
#include
#include
#include

using namespace std;

class VariableImpl {
public:
double value;
double grad;
function _backward;
vector _parents;
bool visited;

VariableImpl(double value){
this->value = value;
grad = 0.0;
_backward = [](){};
visited = false;
}
};

class Variable {
private:
shared_ptr impl;
explicit Variable(shared_ptr impl){
this->impl = move(impl);
}

public:
Variable(){}
Variable(double value){
impl = make_shared(value);
}

double getValue(){
return impl->value;
}

double getGrad(){
return impl->grad;
}

Variable operator+(const Variable& other){
auto out = make_shared(impl->value + other.impl->value);

out->_parents.push_back(impl);
out->_parents.push_back(other.impl);
out->_backward = [out, this_impl=impl, other_impl=other.impl]() {
this_impl->grad += out->grad;
other_impl->grad += out->grad;
};

return Variable(out);
}

friend Variable operator +(const double n, const Variable& other){
return Variable(n) + other;
}

Variable operator-(const Variable& other){
auto out = make_shared(impl->value - other.impl->value);

out->_parents.push_back(impl);
out->_parents.push_back(other.impl);
out->_backward = [out, this_impl=impl, other_impl=other.impl]() {
this_impl->grad += out->grad;
other_impl->grad -= out->grad;
};

return Variable(out);
}

friend Variable operator -(const double n, const Variable& other){
return Variable(n) - other;
}

Variable operator-(){
auto out = make_shared(-impl->value);
out->_parents = {impl};
out->_backward = [out, this_impl=impl]() {
this_impl->grad -= out->grad;
};
return Variable(out);
}

Variable operator*(const Variable& other) {
auto out = make_shared(impl->value * other.impl->value);

out->_parents.push_back(impl);
out->_parents.push_back(other.impl);
out->_backward = [out, this_impl=impl, other_impl=other.impl]() {
this_impl->grad += other_impl->value * out->grad;
other_impl->grad += this_impl->value * out->grad;
};

return Variable(out);
}

friend Variable operator *(const double n, const Variable& other){
return Variable(n) * other;
}

Variable operator/(const Variable& other) {
auto out = make_shared(impl->value / other.impl->value);

out->_parents.push_back(impl);
out->_parents.push_back(other.impl);
out->_backward = [out, this_impl=impl, other_impl=other.impl]() {
this_impl->grad += (1 / other_impl->value) * out->grad;
other_impl->grad -= (this_impl->value / (other_impl->value * other_impl->value)) * out->grad;
};

return Variable(out);
}

friend Variable operator /(const double n, const Variable&  other){
return Variable(n) / other;
}

Variable exponential(){
auto out = make_shared(exp(impl->value));

out->_parents.push_back(impl);
out->_backward = [out, this_impl=impl, out_impl=out]() {
this_impl->grad += out_impl->value * out->grad;
};

return Variable(out);
}

Variable logarithm(){
auto out = make_shared(log(impl->value));

out->_parents.push_back(impl);
out->_backward = [out, this_impl=impl, out_impl=out](){
this_impl->grad += (1 / this_impl->value) * out->grad;
};
return Variable(out);
}

void backward() {
impl->grad = 1.0;
vector funcs = { impl };

while (!funcs.empty()) {
auto f = funcs.back();
funcs.pop_back();
if (f->visited){
continue;
}
f->visited = true;

if (f->_backward) f->_backward();

for (const auto& parent : f->_parents) {
funcs.push_back(parent);
}
}
}
};

class Dense {
private:
vector bias;
vector weights;
int sizeRow, sizeCol;
string activationFunctionName;

public:
Dense(int sizeRow, int sizeCol, const string& activationFunctionName) {
this->sizeRow = sizeRow;
this->sizeCol = sizeCol;
this->activationFunctionName = activationFunctionName;
weights.resize(sizeRow, vector(sizeCol));
bias.resize(sizeCol);
int fan_in = sizeRow;
int fan_out = sizeCol;
long double limit = sqrt(2.0 /(fan_in));

mt19937 gen(static_cast(42));
uniform_real_distribution distrib(-limit, limit);

for (int i = 0; i < sizeRow; ++i) {
for (int j = 0; j < sizeCol; ++j) {
weights[i][j] = distrib(gen);
}
}

for (int i = 0; i < sizeCol; ++i) {
bias[i] = distrib(gen);
}

}

vector forwardPass(vector& inputs) {
vector sumArray(inputs.size(), vector(sizeCol));

for (int i = 0; i < inputs.size(); ++i) {
for (int j = 0; j < sizeCol; ++j) {
sumArray[i][j] = bias[j];

for (int k = 0; k < sizeRow; ++k) {
Variable prod = inputs[i][k] * weights[k][j];
sumArray[i][j] = sumArray[i][j] + prod;
}
}
}

auto result = activationFunction(sumArray);
return result;
}

vector activationFunction(vector& z) {
vector activatedZ(z.size(), vector(z[0].size()));
if (activationFunctionName == "sigmoid") {
for (int i = 0; i < z.size(); ++i) {
for (int j = 0; j < z[0].size(); ++j) {
activatedZ[i][j] = 1 / (1 + (-z[i][j]).exponential());
}
}
return activatedZ;
}

else if (activationFunctionName == "relu") {
for (int i = 0; i < z.size(); ++i) {
for (int j = 0; j < z[0].size(); ++j) {
if (z[i][j].getValue() > 0) {
activatedZ[i][j] = z[i][j];
}
else {
activatedZ[i][j] = 0;
}
}
}
return activatedZ;
}

}

void updateWeightsAndBiases(long double learningRate) {
for (int i = 0; i < sizeRow; ++i) {
for (int j = 0; j < sizeCol; ++j) {
weights[i][j] = weights[i][j] - Variable(learningRate) * Variable(weights[i][j].getGrad());
}
}
for (int i = 0; i < sizeCol;  ++i) {
bias[i] = bias[i] - Variable(learningRate) * Variable(bias[i].getGrad());
}
}

void getWeightsSample() {
cout 

Подробнее здесь: [url]https://stackoverflow.com/questions/79197262/binary-cross-entropy-loss-not-converging-in-custom-ann-implementation-c-body[/url]

1731849146

Anonymous

Я реализую простую искусственную нейронную сеть (ИНС) с нуля на C++. Моя сетевая архитектура следующая:
Входной уровень: 3 входа
Скрытый слой: 7 нейронов с активацией ReLU
Выходной уровень: 1 нейрон с сигмовидной активацией
Функция потерь: двоичная перекрестная энтропия (BCE)
Я использовал инициализацию He для весов и смещений и реализовал свой собственный автодифференциал в обратном режиме для вычисления градиента. Градиенты кажутся правильными при тестировании с различными функциями, включающими логарифмы.
Проблема в том, что потери BCE не сходятся во время обучения, а вместо этого увеличиваются. Однако, когда я переключаюсь на среднеквадратическую ошибку (MSE) в качестве функции потерь, потери сходятся, как и ожидалось.
Основные примечания:
Градиенты от autodiff вычисляются правильно.
Обучение работает нормально с MSE, но не работает с BCE.  Я считаю, что проблема заключается в части двоичной кросс-энтропии, особенно в функции журнала, поскольку категориальная кросс-энтропия также работает неправильно.
Что может быть причиной проблемы с BCE не сходятся?
Код:
[code]#include
#include
#include
#include
#include
#include
#include
#include
#include

using namespace std;

class VariableImpl {
public:
double value;
double grad;
function _backward;
vector _parents;
bool visited;

VariableImpl(double value){
this->value = value;
grad = 0.0;
_backward = [](){};
visited = false;
}
};

class Variable {
private:
shared_ptr impl;
explicit Variable(shared_ptr impl){
this->impl = move(impl);
}

public:
Variable(){}
Variable(double value){
impl = make_shared(value);
}

double getValue(){
return impl->value;
}

double getGrad(){
return impl->grad;
}

Variable operator+(const Variable& other){
auto out = make_shared(impl->value + other.impl->value);

out->_parents.push_back(impl);
out->_parents.push_back(other.impl);
out->_backward = [out, this_impl=impl, other_impl=other.impl]() {
this_impl->grad += out->grad;
other_impl->grad += out->grad;
};

return Variable(out);
}

friend Variable operator +(const double n, const Variable& other){
return Variable(n) + other;
}

Variable operator-(const Variable& other){
auto out = make_shared(impl->value - other.impl->value);

out->_parents.push_back(impl);
out->_parents.push_back(other.impl);
out->_backward = [out, this_impl=impl, other_impl=other.impl]() {
this_impl->grad += out->grad;
other_impl->grad -= out->grad;
};

return Variable(out);
}

friend Variable operator -(const double n, const Variable& other){
return Variable(n) - other;
}

Variable operator-(){
auto out = make_shared(-impl->value);
out->_parents = {impl};
out->_backward = [out, this_impl=impl]() {
this_impl->grad -= out->grad;
};
return Variable(out);
}

Variable operator*(const Variable& other) {
auto out = make_shared(impl->value * other.impl->value);

out->_parents.push_back(impl);
out->_parents.push_back(other.impl);
out->_backward = [out, this_impl=impl, other_impl=other.impl]() {
this_impl->grad += other_impl->value * out->grad;
other_impl->grad += this_impl->value * out->grad;
};

return Variable(out);
}

friend Variable operator *(const double n, const Variable& other){
return Variable(n) * other;
}

Variable operator/(const Variable& other) {
auto out = make_shared(impl->value / other.impl->value);

out->_parents.push_back(impl);
out->_parents.push_back(other.impl);
out->_backward = [out, this_impl=impl, other_impl=other.impl]() {
this_impl->grad += (1 / other_impl->value) * out->grad;
other_impl->grad -= (this_impl->value / (other_impl->value * other_impl->value)) * out->grad;
};

return Variable(out);
}

friend Variable operator /(const double n, const Variable&  other){
return Variable(n) / other;
}

Variable exponential(){
auto out = make_shared(exp(impl->value));

out->_parents.push_back(impl);
out->_backward = [out, this_impl=impl, out_impl=out]() {
this_impl->grad += out_impl->value * out->grad;
};

return Variable(out);
}

Variable logarithm(){
auto out = make_shared(log(impl->value));

out->_parents.push_back(impl);
out->_backward = [out, this_impl=impl, out_impl=out](){
this_impl->grad += (1 / this_impl->value) * out->grad;
};
return Variable(out);
}

void backward() {
impl->grad = 1.0;
vector funcs = { impl };

while (!funcs.empty()) {
auto f = funcs.back();
funcs.pop_back();
if (f->visited){
continue;
}
f->visited = true;

if (f->_backward) f->_backward();

for (const auto& parent : f->_parents) {
funcs.push_back(parent);
}
}
}
};

class Dense {
private:
vector bias;
vector weights;
int sizeRow, sizeCol;
string activationFunctionName;

public:
Dense(int sizeRow, int sizeCol, const string& activationFunctionName) {
this->sizeRow = sizeRow;
this->sizeCol = sizeCol;
this->activationFunctionName = activationFunctionName;
weights.resize(sizeRow, vector(sizeCol));
bias.resize(sizeCol);
int fan_in = sizeRow;
int fan_out = sizeCol;
long double limit = sqrt(2.0 /(fan_in));

mt19937 gen(static_cast(42));
uniform_real_distribution distrib(-limit, limit);

for (int i = 0; i < sizeRow; ++i) {
for (int j = 0; j < sizeCol; ++j) {
weights[i][j] = distrib(gen);
}
}

for (int i = 0; i < sizeCol; ++i) {
bias[i] = distrib(gen);
}

}

vector forwardPass(vector& inputs) {
vector sumArray(inputs.size(), vector(sizeCol));

for (int i = 0; i < inputs.size(); ++i) {
for (int j = 0; j < sizeCol; ++j) {
sumArray[i][j] = bias[j];

for (int k = 0; k < sizeRow; ++k) {
Variable prod = inputs[i][k] * weights[k][j];
sumArray[i][j] = sumArray[i][j] + prod;
}
}
}

auto result = activationFunction(sumArray);
return result;
}

vector activationFunction(vector& z) {
vector activatedZ(z.size(), vector(z[0].size()));
if (activationFunctionName == "sigmoid") {
for (int i = 0; i < z.size(); ++i) {
for (int j = 0; j < z[0].size(); ++j) {
activatedZ[i][j] = 1 / (1 + (-z[i][j]).exponential());
}
}
return activatedZ;
}

else if (activationFunctionName == "relu") {
for (int i = 0; i < z.size(); ++i) {
for (int j = 0; j < z[0].size(); ++j) {
if (z[i][j].getValue() > 0) {
activatedZ[i][j] = z[i][j];
}
else {
activatedZ[i][j] = 0;
}
}
}
return activatedZ;
}

}

void updateWeightsAndBiases(long double learningRate) {
for (int i = 0; i < sizeRow; ++i) {
for (int j = 0; j < sizeCol; ++j) {
weights[i][j] = weights[i][j] - Variable(learningRate) * Variable(weights[i][j].getGrad());
}
}
for (int i = 0; i < sizeCol;  ++i) {
bias[i] = bias[i] - Variable(learningRate) * Variable(bias[i].getGrad());
}
}

void getWeightsSample() {
cout 

Подробнее здесь: [url]https://stackoverflow.com/questions/79197262/binary-cross-entropy-loss-not-converging-in-custom-ann-implementation-c-body[/url]