Skip to content

Commit c5ba24d

Browse files
committed
Implement Finny Tables to optimize Accumulator Updates
ELO   | 8.60 +- 4.67 (95%)
SPRT  | 10.0+0.10s Threads=1 Hash=8MB
LLR   | 2.95 (-2.94, 2.94) [0.00, 3.00]
GAMES | N: 10544 W: 2754 L: 2493 D: 5297
http://chess.grantnet.us/test/31971/
NO FUNCTIONAL CHANGE
BENCH : 3,583,142
1 parent 3071b40 commit c5ba24d

File tree

7 files changed

+114
-85
lines changed

7 files changed

+114
-85
lines changed

src/nnue/accumulator.c

Lines changed: 62 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -30,16 +30,13 @@
3030
#include "../thread.h"
3131
#include "../types.h"
3232

33-
extern ALIGN64 int16_t in_weights[INSIZE * KPSIZE];
34-
extern ALIGN64 int16_t in_biases[KPSIZE];
35-
3633

3734
// Maps a square index to one of 32 slots: 4 * (sq >> 3) plus a column taken
// from Mirror[], which sends the low three bits 0..7 to 3,2,1,0,0,1,2,3 so
// that low-bit values f and 7 - f share a column. For sq in 0..63 the result
// lies in 0..31.
// NOTE(review): presumably sq is 8 * rank + file, which would make this a
// left/right (file) mirror -- confirm against the board representation.
static int sq64_to_sq32(int sq) {
3835
static const int Mirror[] = { 3, 2, 1, 0, 0, 1, 2, 3 };
3936
return ((sq >> 1) & ~0x3) + Mirror[sq & 0x7];
4037
}
4138

42-
static int nnue_index_delta(int piece, int relksq, int colour, int sq) {
39+
static int nnue_index(int piece, int relksq, int colour, int sq) {
4340

4441
const int ptype = pieceType(piece);
4542
const int pcolour = pieceColour(piece);
@@ -51,15 +48,10 @@ static int nnue_index_delta(int piece, int relksq, int colour, int sq) {
5148
return 640 * sq64_to_sq32(mksq) + (64 * (5 * (colour == pcolour) + ptype)) + mpsq;
5249
}
5350

54-
static int nnue_index(Board *board, int relksq, int colour, int sq) {
55-
return nnue_index_delta(board->squares[sq], relksq, colour, sq);
56-
}
57-
58-
5951
int nnue_can_update(NNUEAccumulator *accum, Board *board, int colour) {
6052

6153
// Search back through the tree to find an accurate accum
62-
while (accum != board->thread->nnueStack) {
54+
while (accum != board->thread->nnue->stack) {
6355

6456
// A King move prevents the entire tree from being updated
6557
if ( accum->changes
@@ -77,51 +69,6 @@ int nnue_can_update(NNUEAccumulator *accum, Board *board, int colour) {
7769
return FALSE;
7870
}
7971

80-
void nnue_refresh_accumulator(NNUEAccumulator *accum, Board *board, int colour, int relsq) {
81-
82-
const uint64_t white = board->colours[WHITE];
83-
const uint64_t black = board->colours[BLACK];
84-
const uint64_t kings = board->pieces[KING];
85-
86-
int indices[32], count = 0;
87-
uint64_t pieces = (white | black) & ~kings;
88-
vepi16 *biases, *outputs, *weights, registers[NUM_REGS];
89-
90-
// Compute the list of indices just once, to then be used multiple
91-
// times while updating the accumulator using a tiling method
92-
93-
while (pieces) {
94-
const int sq = poplsb(&pieces);
95-
indices[count++] = nnue_index(board, relsq, colour, sq);
96-
}
97-
98-
// Refresh completely, using all pieces as inputs except the Kings
99-
// We do this by tiling over the accumulator, to get the compiler to
100-
// produce more optimal code that does not emit extra move instructions
101-
102-
for (int offset = 0; offset < KPSIZE; offset += NUM_REGS * vepi16_cnt) {
103-
104-
biases = (vepi16*) &in_biases[offset];
105-
outputs = (vepi16*) &accum->values[colour][offset];
106-
107-
for (int i = 0; i < NUM_REGS; i++)
108-
registers[i] = biases[i];
109-
110-
for (int i = 0; i < count; i++) {
111-
112-
weights = (vepi16*) &in_weights[indices[i] * KPSIZE + offset];
113-
114-
for (int j = 0; j < NUM_REGS; j++)
115-
registers[j] = vepi16_add(registers[j], weights[j]);
116-
}
117-
118-
for (int i = 0; i < NUM_REGS; i++)
119-
outputs[i] = registers[i];
120-
}
121-
122-
accum->accurate[colour] = TRUE;
123-
}
124-
12572
void nnue_update_accumulator(NNUEAccumulator *accum, Board *board, int colour, int relksq) {
12673

12774
int add = 0, remove = 0;
@@ -141,11 +88,11 @@ void nnue_update_accumulator(NNUEAccumulator *accum, Board *board, int colour, i
14188

14289
// Moving or placing a Piece to a Square
14390
if (x->to != SQUARE_NB)
144-
add_list[add++] = nnue_index_delta(x->piece, relksq, colour, x->to);
91+
add_list[add++] = nnue_index(x->piece, relksq, colour, x->to);
14592

14693
// Moving or deleting a Piece from a Square
14794
if (x->from != SQUARE_NB)
148-
remove_list[remove++] = nnue_index_delta(x->piece, relksq, colour, x->from);
95+
remove_list[remove++] = nnue_index(x->piece, relksq, colour, x->from);
14996
}
15097

15198
for (int offset = 0; offset < KPSIZE; offset += NUM_REGS * vepi16_cnt) {
@@ -179,3 +126,61 @@ void nnue_update_accumulator(NNUEAccumulator *accum, Board *board, int colour, i
179126
accum->accurate[colour] = TRUE;
180127
return;
181128
}
129+
130+
// Rebuilds accum->values[colour] through the cached table entry selected by
// the square of colour's King, instead of summing every input from scratch.
//
// nnue   : evaluator whose table[] holds one cached entry per King square
// accum  : accumulator to fill in for the given colour
// board  : position supplying the piece and colour bitboards
// colour : perspective being refreshed; selects values[colour] and occupancy[colour]
// relsq  : King square argument forwarded unchanged to nnue_index()
//
// Steps: diff the board's bitboards against the occupancy stored in the entry,
// add the weights of newly present pieces and subtract the weights of pieces
// no longer present, store the board's bitboards as the entry's occupancy,
// then copy the entry's values into accum and mark that colour accurate.
void nnue_refresh_accumulator(NNUEEvaluator *nnue, NNUEAccumulator *accum, Board *board, int colour, int relsq) {
131+
132+
vepi16 *outputs, *weights, registers[NUM_REGS];
133+
// The table is indexed by the board square of this colour's King (not relsq)
const int ksq = getlsb(board->pieces[KING] & board->colours[colour]);
134+
NNUEAccumulatorTableEntry *entry = &nnue->table[ksq];
135+
136+
// Input indices to add to / subtract from the cached entry.
// NOTE(review): writes below are not bounds-checked; presumably no more than
// 32 pieces can ever differ in either direction -- confirm.
int set_indexes[32], set_count = 0;
137+
int unset_indexes[32], unset_count = 0;
138+
139+
for (int c = WHITE; c <= BLACK; c++) {
140+
141+
// Only piece types PAWN through QUEEN are visited
for (int pt = PAWN; pt <= QUEEN; pt++) {
142+
143+
// to_set   : on the board now but absent from the cached occupancy
// to_unset : in the cached occupancy but no longer on the board
uint64_t pieces = board->pieces[pt] & board->colours[c];
144+
uint64_t to_set = pieces & ~entry->occupancy[colour][c][pt];
145+
uint64_t to_unset = entry->occupancy[colour][c][pt] & ~pieces;
146+
147+
while (to_set)
148+
set_indexes[set_count++] = nnue_index(makePiece(pt, c), relsq, colour, poplsb(&to_set));
149+
150+
while (to_unset)
151+
unset_indexes[unset_count++] = nnue_index(makePiece(pt, c), relsq, colour, poplsb(&to_unset));
152+
153+
// The entry now describes the current board for this (c, pt) pair
entry->occupancy[colour][c][pt] = pieces;
154+
}
155+
}
156+
157+
// Walk the entry's values in tiles of NUM_REGS * vepi16_cnt elements: load a
// tile into registers[], apply every add and subtract, then store it back
for (int offset = 0; offset < KPSIZE; offset += NUM_REGS * vepi16_cnt) {
158+
159+
outputs = (vepi16*) &entry->accumulator.values[colour][offset];
160+
161+
for (int i = 0; i < NUM_REGS; i++)
162+
registers[i] = outputs[i];
163+
164+
for (int i = 0; i < set_count; i++) {
165+
166+
weights = (vepi16*) &in_weights[set_indexes[i] * KPSIZE + offset];
167+
168+
for (int j = 0; j < NUM_REGS; j++)
169+
registers[j] = vepi16_add(registers[j], weights[j]);
170+
}
171+
172+
for (int i = 0; i < unset_count; i++) {
173+
174+
weights = (vepi16*) &in_weights[unset_indexes[i] * KPSIZE + offset];
175+
176+
for (int j = 0; j < NUM_REGS; j++)
177+
registers[j] = vepi16_sub(registers[j], weights[j]);
178+
}
179+
180+
for (int i = 0; i < NUM_REGS; i++)
181+
outputs[i] = registers[i];
182+
}
183+
184+
// Hand the refreshed values for this colour to the caller's accumulator
memcpy(accum->values[colour], entry->accumulator.values[colour], sizeof(int16_t) * KPSIZE);
185+
accum->accurate[colour] = TRUE;
186+
}

src/nnue/accumulator.h

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,30 +27,42 @@
2727
#include "../thread.h"
2828
#include "../types.h"
2929

30-
INLINE NNUEAccumulator* nnue_create_accumulators() {
31-
return align_malloc(sizeof(NNUEAccumulator) * (MAX_PLY + 4));
30+
extern ALIGN64 int16_t in_weights[INSIZE * KPSIZE];
31+
extern ALIGN64 int16_t in_biases[KPSIZE];
32+
33+
// Allocates an NNUEEvaluator with align_malloc() and puts every table entry
// into its starting state: occupancy all zero and both colours' accumulator
// values equal to in_biases. Returns the new evaluator.
// NOTE(review): the result of align_malloc() is used without a NULL check.
// NOTE(review): stack[] and current are not initialised here; presumably the
// caller resets them before use -- confirm.
// NOTE(review): the table is seeded from in_biases at creation time, so the
// network presumably has to be loaded before this runs, and the cached
// entries would go stale if the weights were replaced later -- confirm.
INLINE NNUEEvaluator* nnue_create_evaluator() {
34+
35+
NNUEEvaluator* nnue = align_malloc(sizeof(NNUEEvaluator));
36+
37+
for (size_t i = 0; i < SQUARE_NB; i++) {
38+
// Empty occupancy pairs with a biases-only accumulator for both colours
memset(nnue->table[i].occupancy, 0, sizeof(nnue->table[i].occupancy));
39+
memcpy(nnue->table[i].accumulator.values[WHITE], in_biases, sizeof(int16_t) * KPSIZE);
40+
memcpy(nnue->table[i].accumulator.values[BLACK], in_biases, sizeof(int16_t) * KPSIZE);
41+
}
42+
43+
return nnue;
3244
}
3345

34-
INLINE void nnue_delete_accumulators(NNUEAccumulator* ptr) {
46+
INLINE void nnue_delete_accumulators(NNUEEvaluator* ptr) {
3547
align_free(ptr);
3648
}
3749

3850
INLINE void nnue_pop(Board *board) {
3951
if (USE_NNUE && board->thread != NULL)
40-
--board->thread->nnuePointer;
52+
--board->thread->nnue->current;
4153
}
4254

4355
INLINE void nnue_push(Board *board) {
4456
if (USE_NNUE && board->thread != NULL) {
45-
NNUEAccumulator *accum = ++board->thread->nnuePointer;
57+
NNUEAccumulator *accum = ++board->thread->nnue->current;
4658
accum->accurate[WHITE] = accum->accurate[BLACK] = FALSE;
4759
accum->changes = 0;
4860
}
4961
}
5062

5163
INLINE void nnue_move_piece(Board *board, int piece, int from, int to) {
5264
if (USE_NNUE && board->thread != NULL) {
53-
NNUEAccumulator *accum = board->thread->nnuePointer;
65+
NNUEAccumulator *accum = board->thread->nnue->current;
5466
accum->deltas[accum->changes++] = (NNUEDelta) { piece, from, to };
5567
}
5668
}
@@ -65,5 +77,5 @@ INLINE void nnue_remove_piece(Board *board, int piece, int sq) {
6577
}
6678

6779
int nnue_can_update(NNUEAccumulator *accum, Board *board, int colour);
68-
void nnue_refresh_accumulator(NNUEAccumulator *accum, Board *board, int colour, int relksq);
6980
void nnue_update_accumulator(NNUEAccumulator *accum, Board *board, int colour, int relksq);
81+
void nnue_refresh_accumulator(NNUEEvaluator *nnue, NNUEAccumulator *accum, Board *board, int colour, int relksq);

src/nnue/nnue.c

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -476,12 +476,11 @@ int nnue_evaluate(Thread *thread, Board *board) {
476476
int wrelksq = relativeSquare(WHITE, getlsb(white & kings));
477477
int brelksq = relativeSquare(BLACK, getlsb(black & kings));
478478

479-
// Large enough to handle layer computations
480-
ALIGN64 uint8_t out8[L1SIZE];
481-
ALIGN64 float outN1[L1SIZE];
482-
ALIGN64 float outN2[L1SIZE];
479+
NNUEAccumulator *accum = thread->nnue->current;
483480

484-
NNUEAccumulator *accum = thread->nnuePointer;
481+
ALIGN64 uint8_t out8[L1SIZE];
482+
ALIGN64 float outN1[L1SIZE];
483+
ALIGN64 float outN2[L1SIZE];
485484

486485
if (!accum->accurate[WHITE]) {
487486

@@ -491,7 +490,7 @@ int nnue_evaluate(Thread *thread, Board *board) {
491490

492491
// History is missing, we must refresh completely
493492
else
494-
nnue_refresh_accumulator(accum, board, WHITE, wrelksq);
493+
nnue_refresh_accumulator(thread->nnue, accum, board, WHITE, wrelksq);
495494
}
496495

497496
if (!accum->accurate[BLACK]) {
@@ -502,14 +501,14 @@ int nnue_evaluate(Thread *thread, Board *board) {
502501

503502
// History is missing, we must refresh completely
504503
else
505-
nnue_refresh_accumulator(accum, board, BLACK, brelksq);
504+
nnue_refresh_accumulator(thread->nnue, accum, board, BLACK, brelksq);
506505
}
507506

508507
// Feed-forward the entire evaluation function
509508
halfkp_relu(accum, out8, board->turn);
510-
quant_affine_relu(l1_weights, l1_biases, out8, outN1);
509+
quant_affine_relu(l1_weights, l1_biases, out8, outN1);
511510
float_affine_relu(l2_weights, l2_biases, outN1, outN2);
512-
output_transform(l3_weights, l3_biases, outN2, outN1);
511+
output_transform (l3_weights, l3_biases, outN2, outN1);
513512

514513
// Perform the dequantization step and upscale the Midgame
515514
mg_eval = 140 * ((int)(outN1[0]) >> SHIFT_L1) / 100;

src/nnue/types.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,3 +50,14 @@ typedef struct NNUEAccumulator {
5050
NNUEDelta deltas[3];
5151
ALIGN64 int16_t values[COLOUR_NB][KPSIZE];
5252
} NNUEAccumulator;
53+
54+
// One cached table entry: an accumulator together with the piece bitboards
// its values currently correspond to.
typedef struct NNUEAccumulatorTableEntry {
55+
// Cached values; nnue_refresh_accumulator() reads and writes values[colour]
NNUEAccumulator accumulator;
56+
// Indexed as [colour][c][pt] by nnue_refresh_accumulator(): the colour being
// refreshed, then the piece colour, then the piece type
uint64_t occupancy[COLOUR_NB][COLOUR_NB][PIECE_NB-1];
57+
} NNUEAccumulatorTableEntry;
58+
59+
// Bundles the NNUE state that a Thread reaches through its nnue pointer:
// the accumulator stack, the cursor into that stack, and the Finny table.
typedef struct NNUEEvaluator {
60+
NNUEAccumulator stack[MAX_PLY + 4]; // Each ply of search
61+
NNUEAccumulator *current; // Points at the stack slot currently in use
62+
NNUEAccumulatorTableEntry table[SQUARE_NB]; // Finny table with Accumulators for each square
63+
} NNUEEvaluator;

src/thread.c

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -43,13 +43,13 @@ Thread* createThreadPool(int nthreads) {
4343
for (int j = 0; j < STACK_SIZE; j++)
4444
threads[i].nodeStates[j].continuations = NULL;
4545

46-
// Must dynamically allocate for the ALIGNs needed
47-
threads[i].nnueStack = nnue_create_accumulators();
48-
4946
// Threads will know of each other
50-
threads[i].index = i;
51-
threads[i].threads = threads;
47+
threads[i].index = i;
48+
threads[i].threads = threads;
5249
threads[i].nthreads = nthreads;
50+
51+
// Accumulator stack and table require alignment
52+
threads[i].nnue = nnue_create_evaluator();
5353
}
5454

5555
return threads;
@@ -58,7 +58,7 @@ Thread* createThreadPool(int nthreads) {
5858
void deleteThreadPool(Thread *threads) {
5959

6060
for (int i = 0; i < threads->nthreads; i++)
61-
nnue_delete_accumulators(threads[i].nnueStack);
61+
nnue_delete_accumulators(threads[i].nnue);
6262

6363
free(threads);
6464
}
@@ -100,9 +100,10 @@ void newSearchThreadPool(Thread *threads, Board *board, Limits *limits, TimeMana
100100
memcpy(&threads[i].board, board, sizeof(Board));
101101
threads[i].board.thread = &threads[i];
102102

103-
threads[i].nnueStack[0].accurate[WHITE] = 0;
104-
threads[i].nnueStack[0].accurate[BLACK] = 0;
105-
threads[i].nnuePointer = &threads[i].nnueStack[0];
103+
// Reset the accumulator stack. The table can remain
104+
threads[i].nnue->current = &threads[i].nnue->stack[0];
105+
threads[i].nnue->current->accurate[WHITE] = 0;
106+
threads[i].nnue->current->accurate[BLACK] = 0;
106107

107108
memset(threads[i].nodeStates, 0, sizeof(NodeState) * STACK_SIZE);
108109
}

src/thread.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,10 @@ struct Thread {
6464
uint64_t nodes, tbhits;
6565
int depth, seldepth, height, completed;
6666

67-
NodeState *states, nodeStates[STACK_SIZE];
68-
NNUEAccumulator *nnueStack, *nnuePointer;
67+
NNUEEvaluator *nnue;
68+
6969
Undo undoStack[STACK_SIZE];
70+
NodeState *states, nodeStates[STACK_SIZE];
7071

7172
ALIGN64 PKTable pktable;
7273
ALIGN64 KillerTable killers;

src/uci.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222

2323
#include "types.h"
2424

25-
#define VERSION_ID "14.10"
25+
#define VERSION_ID "14.11"
2626

2727
#ifndef LICENSE_OWNER
2828
#define LICENSE_OWNER "Unlicensed"

0 commit comments

Comments
 (0)