Implement Finny Tables to optimize Accumulator Updates

AndyGrant · AndyGrant · commit c5ba24dfcfd4 · 2023-05-12T19:17:33.000-04:00
ELO   | 8.60 +- 4.67 (95%)
SPRT  | 10.0+0.10s Threads=1 Hash=8MB
LLR   | 2.95 (-2.94, 2.94) [0.00, 3.00]
GAMES | N: 10544 W: 2754 L: 2493 D: 5297
http://chess.grantnet.us/test/31971/

NO FUNCTIONAL CHANGE

BENCH : 3,583,142
diff --git a/src/nnue/accumulator.c b/src/nnue/accumulator.c
@@ -30,16 +30,13 @@
 #include "../thread.h"
 #include "../types.h"
 
-extern ALIGN64 int16_t in_weights[INSIZE * KPSIZE];
-extern ALIGN64 int16_t in_biases[KPSIZE];
-
 
 static int sq64_to_sq32(int sq) {
     static const int Mirror[] = { 3, 2, 1, 0, 0, 1, 2, 3 };
     return ((sq >> 1) & ~0x3) + Mirror[sq & 0x7];
 }
 
-static int nnue_index_delta(int piece, int relksq, int colour, int sq) {
+static int nnue_index(int piece, int relksq, int colour, int sq) {
 
     const int ptype   = pieceType(piece);
     const int pcolour = pieceColour(piece);
@@ -51,15 +48,10 @@ static int nnue_index_delta(int piece, int relksq, int colour, int sq) {
     return 640 * sq64_to_sq32(mksq) + (64 * (5 * (colour == pcolour) + ptype)) + mpsq;
 }
 
-static int nnue_index(Board *board, int relksq, int colour, int sq) {
-    return nnue_index_delta(board->squares[sq], relksq, colour, sq);
-}
-
-
 int nnue_can_update(NNUEAccumulator *accum, Board *board, int colour) {
 
     // Search back through the tree to find an accurate accum
-    while (accum != board->thread->nnueStack) {
+    while (accum != board->thread->nnue->stack) {
 
         // A King move prevents the entire tree from being updated
         if (   accum->changes
@@ -77,51 +69,6 @@ int nnue_can_update(NNUEAccumulator *accum, Board *board, int colour) {
     return FALSE;
 }
 
-void nnue_refresh_accumulator(NNUEAccumulator *accum, Board *board, int colour, int relsq) {
-
-    const uint64_t white = board->colours[WHITE];
-    const uint64_t black = board->colours[BLACK];
-    const uint64_t kings = board->pieces[KING];
-
-    int indices[32], count = 0;
-    uint64_t pieces = (white | black) & ~kings;
-    vepi16 *biases, *outputs, *weights, registers[NUM_REGS];
-
-    // Compute the list of indices just once, to then be used multiple
-    // times while updating the accumulator using a tiling method
-
-    while (pieces) {
-        const int sq = poplsb(&pieces);
-        indices[count++] = nnue_index(board, relsq, colour, sq);
-    }
-
-    // Refresh completely, using all pieces as inputs except the Kings
-    // We do this by tiling over the accumulator, to get the compiler to
-    // produce more optimal code that does not emit extra move instructions
-
-    for (int offset = 0; offset < KPSIZE; offset += NUM_REGS * vepi16_cnt) {
-
-        biases  = (vepi16*) &in_biases[offset];
-        outputs = (vepi16*) &accum->values[colour][offset];
-
-        for (int i = 0; i < NUM_REGS; i++)
-            registers[i] = biases[i];
-
-        for (int i = 0; i < count; i++) {
-
-            weights = (vepi16*) &in_weights[indices[i] * KPSIZE + offset];
-
-            for (int j = 0; j < NUM_REGS; j++)
-                registers[j] = vepi16_add(registers[j], weights[j]);
-        }
-
-        for (int i = 0; i < NUM_REGS; i++)
-            outputs[i] = registers[i];
-    }
-
-    accum->accurate[colour] = TRUE;
-}
-
 void nnue_update_accumulator(NNUEAccumulator *accum, Board *board, int colour, int relksq) {
 
     int add = 0, remove = 0;
@@ -141,11 +88,11 @@ void nnue_update_accumulator(NNUEAccumulator *accum, Board *board, int colour, i
 
         // Moving or placing a Piece to a Square
         if (x->to != SQUARE_NB)
-            add_list[add++] = nnue_index_delta(x->piece, relksq, colour, x->to);
+            add_list[add++] = nnue_index(x->piece, relksq, colour, x->to);
 
         // Moving or deleting a Piece from a Square
         if (x->from != SQUARE_NB)
-            remove_list[remove++] = nnue_index_delta(x->piece, relksq, colour, x->from);
+            remove_list[remove++] = nnue_index(x->piece, relksq, colour, x->from);
     }
 
     for (int offset = 0; offset < KPSIZE; offset += NUM_REGS * vepi16_cnt) {
@@ -179,3 +126,61 @@ void nnue_update_accumulator(NNUEAccumulator *accum, Board *board, int colour, i
     accum->accurate[colour] = TRUE;
     return;
 }
+
+void nnue_refresh_accumulator(NNUEEvaluator *nnue, NNUEAccumulator *accum, Board *board, int colour, int relsq) {
+    // Refresh accum->values[colour] via the table entry cached for colour's King square
+    vepi16 *outputs, *weights, registers[NUM_REGS];
+    const int ksq = getlsb(board->pieces[KING] & board->colours[colour]);
+    NNUEAccumulatorTableEntry *entry = &nnue->table[ksq];
+    // Feature indices to add / subtract. NOTE(review): capacity 32 is not bounds-checked below
+    int set_indexes[32], set_count = 0;
+    int unset_indexes[32], unset_count = 0;
+
+    for (int c = WHITE; c <= BLACK; c++) {
+
+        for (int pt = PAWN; pt <= QUEEN; pt++) {
+            // Compare the board against the occupancy recorded in the entry
+            uint64_t pieces   = board->pieces[pt] & board->colours[c];
+            uint64_t to_set   = pieces & ~entry->occupancy[colour][c][pt];
+            uint64_t to_unset = entry->occupancy[colour][c][pt] & ~pieces;
+
+            while (to_set)
+                set_indexes[set_count++] = nnue_index(makePiece(pt, c), relsq, colour, poplsb(&to_set));
+
+            while (to_unset)
+                unset_indexes[unset_count++] = nnue_index(makePiece(pt, c), relsq, colour, poplsb(&to_unset));
+            // Record the board's bitboard so the entry matches once the values are updated
+            entry->occupancy[colour][c][pt] = pieces;
+        }
+    }
+    // Apply the additions and removals to the entry, one tile of NUM_REGS vectors at a time
+    for (int offset = 0; offset < KPSIZE; offset += NUM_REGS * vepi16_cnt) {
+
+        outputs = (vepi16*) &entry->accumulator.values[colour][offset];
+        // Start from the values currently cached in the entry
+        for (int i = 0; i < NUM_REGS; i++)
+            registers[i] = outputs[i];
+
+        for (int i = 0; i < set_count; i++) {
+
+            weights = (vepi16*) &in_weights[set_indexes[i] * KPSIZE + offset];
+
+            for (int j = 0; j < NUM_REGS; j++)
+                registers[j] = vepi16_add(registers[j], weights[j]);
+        }
+
+        for (int i = 0; i < unset_count; i++) {
+
+            weights = (vepi16*) &in_weights[unset_indexes[i] * KPSIZE + offset];
+
+            for (int j = 0; j < NUM_REGS; j++)
+                registers[j] = vepi16_sub(registers[j], weights[j]);
+        }
+        // Write the finished tile back into the entry
+        for (int i = 0; i < NUM_REGS; i++)
+            outputs[i] = registers[i];
+    }
+    // Copy the entry's updated values for this colour into the caller's accumulator
+    memcpy(accum->values[colour], entry->accumulator.values[colour], sizeof(int16_t) * KPSIZE);
+    accum->accurate[colour] = TRUE;
+}
diff --git a/src/nnue/accumulator.h b/src/nnue/accumulator.h
@@ -27,30 +27,42 @@
 #include "../thread.h"
 #include "../types.h"
 
-INLINE NNUEAccumulator* nnue_create_accumulators() {
-    return align_malloc(sizeof(NNUEAccumulator) * (MAX_PLY + 4));
+extern ALIGN64 int16_t in_weights[INSIZE * KPSIZE];
+extern ALIGN64 int16_t in_biases[KPSIZE];
+
+INLINE NNUEEvaluator* nnue_create_evaluator() {
+    // Allocate an evaluator and reset every table entry: empty occupancy, values equal to in_biases
+    NNUEEvaluator* nnue = align_malloc(sizeof(NNUEEvaluator));
+    // NOTE(review): the align_malloc() result is used without a NULL check; stack and current are not set here
+    for (size_t i = 0; i < SQUARE_NB; i++) {
+        memset(nnue->table[i].occupancy, 0, sizeof(nnue->table[i].occupancy));
+        memcpy(nnue->table[i].accumulator.values[WHITE], in_biases, sizeof(int16_t) * KPSIZE);
+        memcpy(nnue->table[i].accumulator.values[BLACK], in_biases, sizeof(int16_t) * KPSIZE);
+    }
+
+    return nnue;
 }
 
-INLINE void nnue_delete_accumulators(NNUEAccumulator* ptr) {
+INLINE void nnue_delete_accumulators(NNUEEvaluator* ptr) {
     align_free(ptr);
 }
 
 INLINE void nnue_pop(Board *board) {
     if (USE_NNUE && board->thread != NULL)
-        --board->thread->nnuePointer;
+        --board->thread->nnue->current;
 }
 
 INLINE void nnue_push(Board *board) {
     if (USE_NNUE && board->thread != NULL) {
-        NNUEAccumulator *accum = ++board->thread->nnuePointer;
+        NNUEAccumulator *accum = ++board->thread->nnue->current;
         accum->accurate[WHITE] = accum->accurate[BLACK] = FALSE;
         accum->changes = 0;
     }
 }
 
 INLINE void nnue_move_piece(Board *board, int piece, int from, int to) {
     if (USE_NNUE && board->thread != NULL) {
-        NNUEAccumulator *accum = board->thread->nnuePointer;
+        NNUEAccumulator *accum = board->thread->nnue->current;
         accum->deltas[accum->changes++] = (NNUEDelta) { piece, from, to };
     }
 }
@@ -65,5 +77,5 @@ INLINE void nnue_remove_piece(Board *board, int piece, int sq) {
 }
 
 int nnue_can_update(NNUEAccumulator *accum, Board *board, int colour);
-void nnue_refresh_accumulator(NNUEAccumulator *accum, Board *board, int colour, int relksq);
 void nnue_update_accumulator(NNUEAccumulator *accum, Board *board, int colour, int relksq);
+void nnue_refresh_accumulator(NNUEEvaluator *nnue, NNUEAccumulator *accum, Board *board, int colour, int relksq);
diff --git a/src/nnue/nnue.c b/src/nnue/nnue.c
@@ -476,12 +476,11 @@ int nnue_evaluate(Thread *thread, Board *board) {
     int wrelksq = relativeSquare(WHITE, getlsb(white & kings));
     int brelksq = relativeSquare(BLACK, getlsb(black & kings));
 
-    // Large enough to handle layer computations
-    ALIGN64 uint8_t out8[L1SIZE];
-    ALIGN64 float outN1[L1SIZE];
-    ALIGN64 float outN2[L1SIZE];
+    NNUEAccumulator *accum = thread->nnue->current;
 
-    NNUEAccumulator *accum = thread->nnuePointer;
+    ALIGN64 uint8_t out8[L1SIZE];
+    ALIGN64 float   outN1[L1SIZE];
+    ALIGN64 float   outN2[L1SIZE];
 
     if (!accum->accurate[WHITE]) {
 
@@ -491,7 +490,7 @@ int nnue_evaluate(Thread *thread, Board *board) {
 
         // History is missing, we must refresh completely
         else
-            nnue_refresh_accumulator(accum, board, WHITE, wrelksq);
+            nnue_refresh_accumulator(thread->nnue, accum, board, WHITE, wrelksq);
     }
 
     if (!accum->accurate[BLACK]) {
@@ -502,14 +501,14 @@ int nnue_evaluate(Thread *thread, Board *board) {
 
         // History is missing, we must refresh completely
         else
-            nnue_refresh_accumulator(accum, board, BLACK, brelksq);
+            nnue_refresh_accumulator(thread->nnue, accum, board, BLACK, brelksq);
     }
 
     // Feed-forward the entire evaluation function
     halfkp_relu(accum, out8, board->turn);
-    quant_affine_relu(l1_weights, l1_biases, out8, outN1);
+    quant_affine_relu(l1_weights, l1_biases, out8,  outN1);
     float_affine_relu(l2_weights, l2_biases, outN1, outN2);
-    output_transform(l3_weights, l3_biases, outN2, outN1);
+    output_transform (l3_weights, l3_biases, outN2, outN1);
 
     // Perform the dequantization step and upscale the Midgame
     mg_eval = 140 * ((int)(outN1[0]) >> SHIFT_L1) / 100;
diff --git a/src/nnue/types.h b/src/nnue/types.h
@@ -50,3 +50,14 @@ typedef struct NNUEAccumulator {
     NNUEDelta deltas[3];
     ALIGN64 int16_t values[COLOUR_NB][KPSIZE];
 } NNUEAccumulator;
+
+typedef struct NNUEAccumulatorTableEntry {
+    NNUEAccumulator accumulator;                          // Cached accumulator values for this entry
+    uint64_t occupancy[COLOUR_NB][COLOUR_NB][PIECE_NB-1]; // Bitboards behind the cached values: [perspective][piece colour][piece type]
+} NNUEAccumulatorTableEntry;
+
+typedef struct NNUEEvaluator {
+    NNUEAccumulator stack[MAX_PLY + 4];         // One accumulator for each ply of search
+    NNUEAccumulator *current;                   // Pointer to the current stack location
+    NNUEAccumulatorTableEntry table[SQUARE_NB]; // Finny table: one cached entry per King square
+} NNUEEvaluator;
diff --git a/src/thread.c b/src/thread.c
@@ -43,13 +43,13 @@ Thread* createThreadPool(int nthreads) {
         for (int j = 0; j < STACK_SIZE; j++)
             threads[i].nodeStates[j].continuations = NULL;
 
-        // Must dynamically allocate for the ALIGNs needed
-        threads[i].nnueStack = nnue_create_accumulators();
-
         // Threads will know of each other
-        threads[i].index = i;
-        threads[i].threads = threads;
+        threads[i].index    = i;
+        threads[i].threads  = threads;
         threads[i].nthreads = nthreads;
+
+        // Accumulator stack and table require alignment
+        threads[i].nnue     = nnue_create_evaluator();
     }
 
     return threads;
@@ -58,7 +58,7 @@ Thread* createThreadPool(int nthreads) {
 void deleteThreadPool(Thread *threads) {
 
     for (int i = 0; i < threads->nthreads; i++)
-        nnue_delete_accumulators(threads[i].nnueStack);
+        nnue_delete_accumulators(threads[i].nnue);
 
     free(threads);
 }
@@ -100,9 +100,10 @@ void newSearchThreadPool(Thread *threads, Board *board, Limits *limits, TimeMana
         memcpy(&threads[i].board, board, sizeof(Board));
         threads[i].board.thread = &threads[i];
 
-        threads[i].nnueStack[0].accurate[WHITE] = 0;
-        threads[i].nnueStack[0].accurate[BLACK] = 0;
-        threads[i].nnuePointer = &threads[i].nnueStack[0];
+        // Reset the accumulator stack. The table can remain
+        threads[i].nnue->current = &threads[i].nnue->stack[0];
+        threads[i].nnue->current->accurate[WHITE] = 0;
+        threads[i].nnue->current->accurate[BLACK] = 0;
 
         memset(threads[i].nodeStates, 0, sizeof(NodeState) * STACK_SIZE);
     }
diff --git a/src/thread.h b/src/thread.h
@@ -64,9 +64,10 @@ struct Thread {
     uint64_t nodes, tbhits;
     int depth, seldepth, height, completed;
 
-    NodeState *states, nodeStates[STACK_SIZE];
-    NNUEAccumulator *nnueStack, *nnuePointer;
+    NNUEEvaluator *nnue;
+
     Undo undoStack[STACK_SIZE];
+    NodeState *states, nodeStates[STACK_SIZE];
 
     ALIGN64 PKTable pktable;
     ALIGN64 KillerTable killers;
diff --git a/src/uci.h b/src/uci.h
@@ -22,7 +22,7 @@
 
 #include "types.h"
 
-#define VERSION_ID "14.10"
+#define VERSION_ID "14.11"
 
 #ifndef LICENSE_OWNER
     #define LICENSE_OWNER "Unlicensed"