Нейронная сеть для MNIST, написанная с нуля, застревает на плато около 40% точности

Нейронная сеть для MNIST, написанная с нуля, застревает на плато около 40% точности ⇐ Javascript

1 сообщение • Страница 1 из 1

Anonymous

Нейронная сеть для MNIST, написанная с нуля, застревает на плато около 40% точности

Цитата

Сообщение Anonymous » 13 авг 2025, 03:23

В рамках своего стремления узнать больше об ИИ и машинном обучении я решил написать нейронную сеть с нуля, используя в JS только математические библиотеки. Моя цель — обучить её до высокой точности на наборе данных MNIST. Я экспериментировал с гиперпараметрами — скоростью обучения, размером батча и даже способом инициализации весов, — но так и не смог добиться сколько-нибудь значимой точности выше 50%. Даже когда я обучаю и тестирую сеть на одних и тех же обучающих данных, через несколько эпох точность перестаёт заметно расти, а потери — снижаться. Мой код ниже (пожалуйста, будьте снисходительны: я знаю, что он не следует лучшим практикам, но такая структура помогает мне понять каждый отдельный шаг работы нейронной сети):
Если у кого-то есть идеи, почему сеть не обучается выше определённого процента точности, или вы видите какие-либо проблемы в коде, я буду очень признателен за помощь. Код запускается через Node и выводит точность и потери после каждой эпохи, а также размеры и диапазоны значений ключевых величин, таких как веса и градиенты каждого слоя.
const math = require('mathjs');
const mnist = require('mnist');

// Trainable parameters of the three layers; init() assigns them.
let weights1;
let biases1;
let weights2;
let biases2;
let weights3;
let biases3;

// Step size for parameter updates.
let learningRate = 0.01;

// Layer widths.
const inputSize = 784;
const hiddenSize = 128; // first hidden layer
const hiddenSize2 = 12; // second hidden layer
const outputSize = 10; // digits 0-9

/**
 * Loads the MNIST samples and creates the initial network parameters.
 *
 * Side effects: stores the normalized samples on `global.trainingData` and
 * `global.testData`, and assigns the module-level `weights1..3` and
 * `biases1..3`.
 */
function init() {
  const { training, test } = mnist.set(10000, 2000);

  // Save data globally
  global.trainingData = normalizeDataset(training);
  global.testData = normalizeDataset(test);

  /**
   * He-uniform initialization: draws every weight from U(-limit, limit) with
   * limit = sqrt(6 / fanIn), which gives a variance of 2 / fanIn.
   *
   * The previous version computed the He standard deviation but never applied
   * it (both scaling steps multiplied by 1), so every layer started with
   * weights in [-0.5, 0.5) regardless of its fan-in.
   *
   * @param {number} rows - Number of units in the layer (fan-out).
   * @param {number} cols - Number of inputs to the layer (fan-in).
   * @returns {number[][]} A rows x cols array of initial weights.
   */
  function heInit(rows, cols) {
    const limit = Math.sqrt(6 / cols);
    return math.random([rows, cols], -limit, limit);
  }

  // Layer 1: input -> hidden1
  weights1 = heInit(hiddenSize, inputSize);
  biases1 = math.zeros([hiddenSize, 1]);

  // Layer 2: hidden1 -> hidden2
  weights2 = heInit(hiddenSize2, hiddenSize);
  biases2 = math.zeros([hiddenSize2, 1]);

  // Layer 3: hidden2 -> output
  weights3 = heInit(outputSize, hiddenSize2);
  biases3 = math.zeros([outputSize, 1]);

  console.log("Initialized weights and biases.");
}

/**
 * Element-wise ReLU: every entry of `x` is replaced by max(0, entry).
 */
function relu(x) {
  const clampAtZero = (value) => Math.max(0, value);
  return math.map(x, clampAtZero);
}
/**
 * Element-wise derivative of ReLU: 1 where the entry is strictly positive,
 * 0 everywhere else (including at exactly 0).
 */
function reluDerivative(x) {
  const isActive = (value) => Number(value > 0);
  return math.map(x, isActive);
}

/**
 * Softmax over all entries of `x`: exp(entry) divided by the sum of the
 * exponentials of every entry.
 */
function softmax(x) {
  // Shifting by the largest entry keeps exp() from overflowing and does not
  // change the result, because the shift cancels in the ratio.
  const peak = math.max(x);
  const exps = math.map(math.subtract(x, peak), math.exp);
  return math.divide(exps, math.sum(exps));
}

/**
 * Logs the minimum, maximum and mean of all entries of a matrix.
 *
 * @param {string} name - Label printed in front of the statistics.
 * @param {Array|{toArray: Function}} matrix - Nested array of numbers, or an
 *   object exposing `toArray()` that returns one.
 */
function logStats(name, matrix) {
  // Accept matrix objects as well as plain arrays; objects have no .flat().
  const nested = typeof matrix.toArray === 'function' ? matrix.toArray() : matrix;
  const values = nested.flat(Infinity);

  // A plain loop instead of Math.min(...values): spreading a large weight
  // matrix into a call can exceed the engine's argument limit and throw.
  let min = Infinity;
  let max = -Infinity;
  let sum = 0;
  for (const value of values) {
    min = Math.min(min, value);
    max = Math.max(max, value);
    sum += value;
  }
  const avg = sum / values.length;

  console.log(`${name} - min: ${min.toFixed(5)}, max: ${max.toFixed(5)}, avg: ${avg.toFixed(5)}`);
}

/**
 * Mean cross-entropy loss of the current network over a dataset.
 *
 * @param {Array<{input: number[], output: number[]}>} dataset - Samples with
 *   `inputSize` input values and an `outputSize`-long target vector each.
 * @returns {number} Total loss divided by `dataset.length`.
 */
function calculateLoss(dataset) {
  let totalLoss = 0;
  // Keeps Math.log away from log(0) when a probability underflows to 0.
  const epsilon = 1e-12;

  for (let i = 0; i < dataset.length; i++) {
    // Each iteration must read its own sample; reading `dataset.input`
    // (without the index) yields undefined on an array of samples.
    const sample = dataset[i];
    const X = math.matrix(sample.input).reshape([inputSize, 1]);
    const Y = math.matrix(sample.output).reshape([outputSize, 1]);

    const { a3 } = forward_prop(X);
    const logProbs = math.map(a3, v => Math.log(v + epsilon));
    const loss = -math.sum(math.dotMultiply(Y, logProbs));

    totalLoss += loss;
  }

  return totalLoss / dataset.length;
}

/**
 * Runs one sample through the three layers of the network.
 *
 * @param {*} input - Sample to classify; it is resized to an
 *   `inputSize` x 1 column before use.
 * @returns {{z1, a1, z2, a2, z3, a3}} Pre-activations (`z`) and activations
 *   (`a`) of every layer; `a3` is the softmax output.
 */
function forward_prop(input) {
  const column = math.resize(input, [inputSize, 1]);

  // One affine step: W * a + b.
  const affine = (W, a, b) => math.add(math.multiply(W, a), b);

  const z1 = affine(weights1, column, biases1);
  const a1 = relu(z1);

  const z2 = affine(weights2, a1, biases2);
  const a2 = relu(z2);

  const z3 = affine(weights3, a2, biases3);
  const a3 = softmax(z3);

  return { z1, a1, z2, a2, z3, a3 };
}

/**
 * Shuffles `array` in place with the Fisher-Yates algorithm.
 *
 * @param {Array} array - Array to reorder; it is mutated.
 */
function shuffle(array) {
  for (let i = array.length - 1; i > 0; i--) {
    // Pick a partner anywhere in the not-yet-fixed prefix [0, i].
    const j = Math.floor(Math.random() * (i + 1));
    // Swap the two elements. The targets must be array[i] and array[j];
    // destructuring into `array` itself would overwrite the parameter.
    [array[i], array[j]] = [array[j], array[i]];
  }
}

/**
 * Computes the gradient of the loss for a single sample.
 *
 * The caller is expected to sum these per-sample gradients and average them.
 *
 * @param {*} x - Input sample; reshaped to an `inputSize` x 1 column.
 * @param {*} y - Target vector; reshaped to an `outputSize` x 1 column.
 * @param {{z1, a1, z2, a2, a3}} result - Values returned by forward_prop for
 *   the same sample.
 * @returns {{dw1, db1, dw2, db2, dw3, db3}} Gradients with respect to each
 *   weight matrix and bias vector.
 */
function back_prop(x, y, result) {
  const input = math.reshape(x, [inputSize, 1]);
  const target = math.reshape(y, [outputSize, 1]);

  // dC/dW = delta * activation^T: same shape as the layer's weight matrix,
  // one entry per weight.
  const weightGrad = (delta, activation) =>
    math.multiply(delta, math.transpose(activation));

  // Chain rule through one layer: push the next layer's delta back through
  // its (transposed) weights, then gate it by the ReLU derivative of this
  // layer's pre-activation.
  const hiddenDelta = (nextWeights, nextDelta, z) =>
    math.dotMultiply(
      math.multiply(math.transpose(nextWeights), nextDelta),
      reluDerivative(z)
    );

  // With a softmax output and cross-entropy loss, dC/dz3 reduces to a3 - y.
  const dz3 = math.subtract(result.a3, target);
  const dw3 = weightGrad(dz3, result.a2);

  const dz2 = hiddenDelta(weights3, dz3, result.z2);
  const dw2 = weightGrad(dz2, result.a1);

  const dz1 = hiddenDelta(weights2, dz2, result.z1);
  const dw1 = weightGrad(dz1, input);

  // dz/db is 1, so each bias gradient is simply that layer's delta.
  return { dw1, db1: dz1, dw2, db2: dz2, dw3, db3: dz3 };
}

/**
 * Returns a copy of the dataset whose inputs are scaled to the range [0, 1].
 *
 * Raw 0-255 pixel data is divided by 255. Data that is already within [0, 1]
 * is copied unchanged: dividing it by 255 again would squash every input to
 * roughly [0, 0.004] and starve the network of signal.
 * NOTE(review): the `mnist` package is believed to deliver inputs that are
 * already normalized to 0-1 - confirm against its README.
 *
 * @param {Array<{input: number[], output: *}>} data - Samples to normalize.
 * @returns {Array<{input: number[], output: *}>} New sample objects with new
 *   `input` arrays; `output` is passed through by reference.
 */
function normalizeDataset(data) {
  // Decide once for the whole dataset so all samples share the same scale.
  let peak = 0;
  for (const sample of data) {
    for (const pixel of sample.input) {
      if (pixel > peak) {
        peak = pixel;
      }
    }
  }
  const divisor = peak > 1 ? 255 : 1;

  return data.map((d) => ({
    input: d.input.map((x) => x / divisor),
    output: d.output,
  }));
}

// Gradient accumulators, declared at module level so they are shared across
// epochs; resetGradients() fills them with zero matrices.
let dw1_sum;
let db1_sum;
let dw2_sum;
let db2_sum;
let dw3_sum;
let db3_sum;

/**
 * Replaces every gradient accumulator with a fresh zero matrix that has the
 * same size as the parameter it belongs to.
 */
function resetGradients() {
  const zerosLike = (param) => math.zeros(math.size(param));

  [dw1_sum, db1_sum, dw2_sum, db2_sum, dw3_sum, db3_sum] = [
    weights1,
    biases1,
    weights2,
    biases2,
    weights3,
    biases3,
  ].map((param) => zerosLike(param));
}

function learn(epochs){
let batchSize = 32;

for(let e=0;e

Подробнее здесь: https://stackoverflow.com/questions/797 ... 0-accuracy

1755044610

Anonymous

 В рамках своего стремления узнать больше об ИИ и машинном обучении я решил написать нейронную сеть с нуля, используя в JS только математические библиотеки. Моя цель — обучить её до высокой точности на наборе данных MNIST. Я экспериментировал с гиперпараметрами — скоростью обучения, размером батча и даже способом инициализации весов, — но так и не смог добиться сколько-нибудь значимой точности выше 50%. Даже когда я обучаю и тестирую сеть на одних и тех же обучающих данных, через несколько эпох точность перестаёт заметно расти, а потери — снижаться. Мой код ниже (пожалуйста, будьте снисходительны: я знаю, что он не следует лучшим практикам, но такая структура помогает мне понять каждый отдельный шаг работы нейронной сети):
Если у кого-то есть идеи, почему сеть не обучается выше определённого процента точности, или вы видите какие-либо проблемы в коде, я буду очень признателен за помощь. Код запускается через Node и выводит точность и потери после каждой эпохи, а также размеры и диапазоны значений ключевых величин, таких как веса и градиенты каждого слоя.
const math = require('mathjs');
const mnist = require('mnist');

// Trainable parameters of the three layers; init() assigns them.
let weights1;
let biases1;
let weights2;
let biases2;
let weights3;
let biases3;

// Step size for parameter updates.
let learningRate = 0.01;

// Layer widths.
const inputSize = 784;
const hiddenSize = 128; // first hidden layer
const hiddenSize2 = 12; // second hidden layer
const outputSize = 10; // digits 0-9

/**
 * Loads the MNIST samples and creates the initial network parameters.
 *
 * Side effects: stores the normalized samples on `global.trainingData` and
 * `global.testData`, and assigns the module-level `weights1..3` and
 * `biases1..3`.
 */
function init() {
  const { training, test } = mnist.set(10000, 2000);

  // Save data globally
  global.trainingData = normalizeDataset(training);
  global.testData = normalizeDataset(test);

  /**
   * He-uniform initialization: draws every weight from U(-limit, limit) with
   * limit = sqrt(6 / fanIn), which gives a variance of 2 / fanIn.
   *
   * The previous version computed the He standard deviation but never applied
   * it (both scaling steps multiplied by 1), so every layer started with
   * weights in [-0.5, 0.5) regardless of its fan-in.
   *
   * @param {number} rows - Number of units in the layer (fan-out).
   * @param {number} cols - Number of inputs to the layer (fan-in).
   * @returns {number[][]} A rows x cols array of initial weights.
   */
  function heInit(rows, cols) {
    const limit = Math.sqrt(6 / cols);
    return math.random([rows, cols], -limit, limit);
  }

  // Layer 1: input -> hidden1
  weights1 = heInit(hiddenSize, inputSize);
  biases1 = math.zeros([hiddenSize, 1]);

  // Layer 2: hidden1 -> hidden2
  weights2 = heInit(hiddenSize2, hiddenSize);
  biases2 = math.zeros([hiddenSize2, 1]);

  // Layer 3: hidden2 -> output
  weights3 = heInit(outputSize, hiddenSize2);
  biases3 = math.zeros([outputSize, 1]);

  console.log("Initialized weights and biases.");
}

/**
 * Element-wise ReLU: every entry of `x` is replaced by max(0, entry).
 */
function relu(x) {
  const clampAtZero = (value) => Math.max(0, value);
  return math.map(x, clampAtZero);
}
/**
 * Element-wise derivative of ReLU: 1 where the entry is strictly positive,
 * 0 everywhere else (including at exactly 0).
 */
function reluDerivative(x) {
  const isActive = (value) => Number(value > 0);
  return math.map(x, isActive);
}

/**
 * Softmax over all entries of `x`: exp(entry) divided by the sum of the
 * exponentials of every entry.
 */
function softmax(x) {
  // Shifting by the largest entry keeps exp() from overflowing and does not
  // change the result, because the shift cancels in the ratio.
  const peak = math.max(x);
  const exps = math.map(math.subtract(x, peak), math.exp);
  return math.divide(exps, math.sum(exps));
}

/**
 * Logs the minimum, maximum and mean of all entries of a matrix.
 *
 * @param {string} name - Label printed in front of the statistics.
 * @param {Array|{toArray: Function}} matrix - Nested array of numbers, or an
 *   object exposing `toArray()` that returns one.
 */
function logStats(name, matrix) {
  // Accept matrix objects as well as plain arrays; objects have no .flat().
  const nested = typeof matrix.toArray === 'function' ? matrix.toArray() : matrix;
  const values = nested.flat(Infinity);

  // A plain loop instead of Math.min(...values): spreading a large weight
  // matrix into a call can exceed the engine's argument limit and throw.
  let min = Infinity;
  let max = -Infinity;
  let sum = 0;
  for (const value of values) {
    min = Math.min(min, value);
    max = Math.max(max, value);
    sum += value;
  }
  const avg = sum / values.length;

  console.log(`${name} - min: ${min.toFixed(5)}, max: ${max.toFixed(5)}, avg: ${avg.toFixed(5)}`);
}

/**
 * Mean cross-entropy loss of the current network over a dataset.
 *
 * @param {Array<{input: number[], output: number[]}>} dataset - Samples with
 *   `inputSize` input values and an `outputSize`-long target vector each.
 * @returns {number} Total loss divided by `dataset.length`.
 */
function calculateLoss(dataset) {
  // Keeps Math.log away from log(0) when a probability underflows to 0.
  const epsilon = 1e-12;

  // Cross-entropy of a single sample: -sum(target * log(prediction)).
  const sampleLoss = (sample) => {
    const X = math.matrix(sample.input).reshape([inputSize, 1]);
    const Y = math.matrix(sample.output).reshape([outputSize, 1]);
    const { a3 } = forward_prop(X);
    const logProbs = math.map(a3, (p) => Math.log(p + epsilon));
    return -math.sum(math.dotMultiply(Y, logProbs));
  };

  let totalLoss = 0;
  for (const sample of dataset) {
    totalLoss += sampleLoss(sample);
  }

  return totalLoss / dataset.length;
}

/**
 * Runs one sample through the three layers of the network.
 *
 * @param {*} input - Sample to classify; it is resized to an
 *   `inputSize` x 1 column before use.
 * @returns {{z1, a1, z2, a2, z3, a3}} Pre-activations (`z`) and activations
 *   (`a`) of every layer; `a3` is the softmax output.
 */
function forward_prop(input) {
  const column = math.resize(input, [inputSize, 1]);

  // One affine step: W * a + b.
  const affine = (W, a, b) => math.add(math.multiply(W, a), b);

  const z1 = affine(weights1, column, biases1);
  const a1 = relu(z1);

  const z2 = affine(weights2, a1, biases2);
  const a2 = relu(z2);

  const z3 = affine(weights3, a2, biases3);
  const a3 = softmax(z3);

  return { z1, a1, z2, a2, z3, a3 };
}

/**
 * Shuffles `array` in place with the Fisher-Yates algorithm.
 *
 * @param {Array} array - Array to reorder; it is mutated.
 */
function shuffle(array) {
  let last = array.length - 1;
  while (last > 0) {
    // Pick a partner anywhere in the not-yet-fixed prefix [0, last].
    const pick = Math.floor(Math.random() * (last + 1));
    const held = array[last];
    array[last] = array[pick];
    array[pick] = held;
    last -= 1;
  }
}

/**
 * Computes the gradient of the loss for a single sample.
 *
 * The caller is expected to sum these per-sample gradients and average them.
 *
 * @param {*} x - Input sample; reshaped to an `inputSize` x 1 column.
 * @param {*} y - Target vector; reshaped to an `outputSize` x 1 column.
 * @param {{z1, a1, z2, a2, a3}} result - Values returned by forward_prop for
 *   the same sample.
 * @returns {{dw1, db1, dw2, db2, dw3, db3}} Gradients with respect to each
 *   weight matrix and bias vector.
 */
function back_prop(x, y, result) {
  const input = math.reshape(x, [inputSize, 1]);
  const target = math.reshape(y, [outputSize, 1]);

  // dC/dW = delta * activation^T: same shape as the layer's weight matrix,
  // one entry per weight.
  const weightGrad = (delta, activation) =>
    math.multiply(delta, math.transpose(activation));

  // Chain rule through one layer: push the next layer's delta back through
  // its (transposed) weights, then gate it by the ReLU derivative of this
  // layer's pre-activation.
  const hiddenDelta = (nextWeights, nextDelta, z) =>
    math.dotMultiply(
      math.multiply(math.transpose(nextWeights), nextDelta),
      reluDerivative(z)
    );

  // With a softmax output and cross-entropy loss, dC/dz3 reduces to a3 - y.
  const dz3 = math.subtract(result.a3, target);
  const dw3 = weightGrad(dz3, result.a2);

  const dz2 = hiddenDelta(weights3, dz3, result.z2);
  const dw2 = weightGrad(dz2, result.a1);

  const dz1 = hiddenDelta(weights2, dz2, result.z1);
  const dw1 = weightGrad(dz1, input);

  // dz/db is 1, so each bias gradient is simply that layer's delta.
  return { dw1, db1: dz1, dw2, db2: dz2, dw3, db3: dz3 };
}

/**
 * Returns a copy of the dataset whose inputs are scaled to the range [0, 1].
 *
 * Raw 0-255 pixel data is divided by 255. Data that is already within [0, 1]
 * is copied unchanged: dividing it by 255 again would squash every input to
 * roughly [0, 0.004] and starve the network of signal.
 * NOTE(review): the `mnist` package is believed to deliver inputs that are
 * already normalized to 0-1 - confirm against its README.
 *
 * @param {Array<{input: number[], output: *}>} data - Samples to normalize.
 * @returns {Array<{input: number[], output: *}>} New sample objects with new
 *   `input` arrays; `output` is passed through by reference.
 */
function normalizeDataset(data) {
  // Decide once for the whole dataset so all samples share the same scale.
  let peak = 0;
  for (const sample of data) {
    for (const pixel of sample.input) {
      if (pixel > peak) {
        peak = pixel;
      }
    }
  }
  const divisor = peak > 1 ? 255 : 1;

  return data.map((d) => ({
    input: d.input.map((x) => x / divisor),
    output: d.output,
  }));
}

// Gradient accumulators, declared at module level so they are shared across
// epochs; resetGradients() fills them with zero matrices.
let dw1_sum;
let db1_sum;
let dw2_sum;
let db2_sum;
let dw3_sum;
let db3_sum;

/**
 * Replaces every gradient accumulator with a fresh zero matrix that has the
 * same size as the parameter it belongs to.
 */
function resetGradients() {
  const zerosLike = (param) => math.zeros(math.size(param));

  [dw1_sum, db1_sum, dw2_sum, db2_sum, dw3_sum, db3_sum] = [
    weights1,
    biases1,
    weights2,
    biases2,
    weights3,
    biases3,
  ].map((param) => zerosLike(param));
}

function learn(epochs){
let batchSize = 32;

for(let e=0;e

Подробнее здесь: [url]https://stackoverflow.com/questions/79733744/mnist-neural-net-from-scratch-is-plateauing-hard-at-40-accuracy[/url]