node/test_trimmed_mean.py at master · autonet-code/node · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
"""
Test script for Trimmed Mean aggregation in Aggregator node.

This script tests the trimmed mean functionality without requiring blockchain.
"""

import logging
import numpy as np

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def test_trimmed_mean_basic():
    """Test basic trimmed mean functionality."""
    print("\n=== Test 1: Basic Trimmed Mean ===")

    # Create mock weight deltas with one malicious update
    updates = []
    for i in range(10):
        if i == 0:
            # Malicious update with extreme values
            weight_delta = {
                "layer1.weight": [[100.0, 100.0], [100.0, 100.0]],
                "layer1.bias": [100.0, 100.0],
            }
        elif i == 9:
            # Another malicious update with extreme values
            weight_delta = {
                "layer1.weight": [[-100.0, -100.0], [-100.0, -100.0]],
                "layer1.bias": [-100.0, -100.0],
            }
        else:
            # Normal update
            weight_delta = {
                "layer1.weight": [[1.0 + i*0.1, 2.0 + i*0.1], [3.0 + i*0.1, 4.0 + i*0.1]],
                "layer1.bias": [0.5 + i*0.1, 1.0 + i*0.1],
            }

        updates.append({
            "weight_delta": weight_delta,
            "metrics": {
                "loss": 0.1 + i*0.01,
                "accuracy": 0.9 - i*0.01,
                "num_samples": 100,
            }
        })

    print(f"Created {len(updates)} updates (2 malicious with extreme values)")

    # Simulate trimmed mean aggregation (trim_ratio=0.2, so trim 2 from each end)
    trim_ratio = 0.2
    trim_count = int(len(updates) * trim_ratio)

    print(f"Trim ratio: {trim_ratio} -> trimming {trim_count} from each end")
    print(f"Using {len(updates) - 2*trim_count} middle updates for aggregation")

    # Aggregate layer1.weight
    deltas = [u["weight_delta"] for u in updates]
    param_key = "layer1.weight"

    # Collect all values for this parameter
    param_values = [np.array(delta[param_key]) for delta in deltas]
    stacked = np.stack(param_values, axis=0)

    print(f"\nParameter: {param_key}")
    print(f"  Shape: {stacked.shape}")
    print(f"  Min value across all updates: {stacked.min()}")
    print(f"  Max value across all updates: {stacked.max()}")

    # Sort and trim
    sorted_values = np.sort(stacked, axis=0)
    trimmed_values = sorted_values[trim_count:-trim_count]
    aggregated_value = np.mean(trimmed_values, axis=0)

    print(f"  After trimming:")
    print(f"    Min value: {trimmed_values.min()}")
    print(f"    Max value: {trimmed_values.max()}")
    print(f"    Aggregated mean:\n{aggregated_value}")

    # For comparison, show what regular mean would give
    regular_mean = np.mean(stacked, axis=0)
    print(f"  Regular mean (without trimming) would be:\n{regular_mean}")

    # The trimmed mean should be much closer to normal values
    assert aggregated_value.min() > -10, "Trimmed mean should exclude extreme negatives"
    assert aggregated_value.max() < 10, "Trimmed mean should exclude extreme positives"
    print("\n[PASS] Test passed: Trimmed mean successfully excluded malicious updates")


def test_trimmed_mean_few_updates():
    """Test trimmed mean with too few updates."""
    print("\n=== Test 2: Too Few Updates for Trimming ===")

    # Create only 4 updates (with trim_ratio=0.2, trim_count=0, so we need more)
    # Use trim_ratio=0.3 to ensure we get trim_count=1, which means we need > 2 updates
    updates = []
    for i in range(4):
        updates.append({
            "weight_delta": {
                "layer1.weight": [[1.0 + i, 2.0 + i]],
            },
            "metrics": {
                "loss": 0.1,
                "accuracy": 0.9,
                "num_samples": 100,
            }
        })

    trim_ratio = 0.3  # With 4 updates, trim_count = 1
    trim_count = int(len(updates) * trim_ratio)

    print(f"Created {len(updates)} updates")
    print(f"Trim ratio: {trim_ratio} -> would trim {trim_count} from each end")
    print(f"Total to trim: {trim_count * 2} = {trim_count * 2} updates")
    print(f"Available for mean: {len(updates) - trim_count * 2} updates")

    # With 4 updates and trim_count=1, we'd trim 2 total, leaving 2 for mean
    # This is borderline but should work
    # Let's test the case where trim_count * 2 >= num_updates
    if trim_count * 2 >= len(updates):
        print("[PASS] Correctly identified: Not enough updates for trimming")
        print("  Will use regular mean instead")
    else:
        print("[PASS] Sufficient updates for trimming")
        print(f"  Will use {len(updates) - trim_count * 2} updates for mean")


def test_trimmed_mean_metrics():
    """Test trimmed mean on metrics (loss/accuracy)."""
    print("\n=== Test 3: Trimmed Mean on Metrics ===")

    # Create updates with varying metrics
    losses = [0.05, 0.10, 0.12, 0.13, 0.14, 0.15, 0.16, 0.20, 0.50, 1.00]  # Last two are outliers
    accuracies = [0.95, 0.90, 0.88, 0.87, 0.86, 0.85, 0.84, 0.80, 0.50, 0.10]  # Last two are outliers

    print(f"Losses: {losses}")
    print(f"Accuracies: {accuracies}")

    trim_ratio = 0.2
    trim_count = int(len(losses) * trim_ratio)

    print(f"\nTrim count: {trim_count} from each end")

    # Trimmed mean
    losses_sorted = sorted(losses)
    accuracies_sorted = sorted(accuracies)
    trimmed_loss = np.mean(losses_sorted[trim_count:-trim_count])
    trimmed_accuracy = np.mean(accuracies_sorted[trim_count:-trim_count])

    # Regular mean
    regular_loss = np.mean(losses)
    regular_accuracy = np.mean(accuracies)

    print(f"\nLoss:")
    print(f"  Regular mean: {regular_loss:.4f}")
    print(f"  Trimmed mean: {trimmed_loss:.4f}")
    print(f"  Difference: {abs(regular_loss - trimmed_loss):.4f}")

    print(f"\nAccuracy:")
    print(f"  Regular mean: {regular_accuracy:.4f}")
    print(f"  Trimmed mean: {trimmed_accuracy:.4f}")
    print(f"  Difference: {abs(regular_accuracy - trimmed_accuracy):.4f}")

    # Trimmed mean should be better (less affected by outliers)
    assert trimmed_loss < regular_loss, "Trimmed mean should reduce impact of high outliers"
    assert trimmed_accuracy > regular_accuracy, "Trimmed mean should reduce impact of low outliers"
    print("\n[PASS] Test passed: Trimmed mean improved metric robustness")


def test_byzantine_resistance():
    """Test Byzantine resistance with multiple attack scenarios."""
    print("\n=== Test 4: Byzantine Resistance ===")

    # Scenario: 10 total nodes, 2 are Byzantine (20%)
    num_honest = 8
    num_byzantine = 2
    total_nodes = num_honest + num_byzantine

    print(f"Scenario: {total_nodes} nodes ({num_honest} honest, {num_byzantine} Byzantine)")

    # Honest nodes produce similar gradients
    honest_gradients = []
    for i in range(num_honest):
        honest_gradients.append(1.0 + np.random.normal(0, 0.1))  # Small noise

    # Byzantine nodes try to poison (extreme values)
    byzantine_gradients = [1000.0, -1000.0]

    all_gradients = honest_gradients + byzantine_gradients

    print(f"\nGradients:")
    print(f"  Honest: {[f'{g:.3f}' for g in honest_gradients]}")
    print(f"  Byzantine: {byzantine_gradients}")

    # Regular mean (vulnerable)
    regular_mean = np.mean(all_gradients)

    # Trimmed mean (resistant)
    trim_ratio = 0.2
    trim_count = int(len(all_gradients) * trim_ratio)
    sorted_gradients = sorted(all_gradients)
    trimmed_gradients = sorted_gradients[trim_count:-trim_count]
    trimmed_mean = np.mean(trimmed_gradients)

    # Ground truth (mean of honest nodes)
    honest_mean = np.mean(honest_gradients)

    print(f"\nResults:")
    print(f"  Ground truth (honest only): {honest_mean:.4f}")
    print(f"  Regular mean (vulnerable):  {regular_mean:.4f}")
    print(f"  Trimmed mean (resistant):   {trimmed_mean:.4f}")

    print(f"\nError from ground truth:")
    print(f"  Regular mean: {abs(regular_mean - honest_mean):.4f}")
    print(f"  Trimmed mean: {abs(trimmed_mean - honest_mean):.4f}")

    # Trimmed mean should be much closer to ground truth
    assert abs(trimmed_mean - honest_mean) < abs(regular_mean - honest_mean)
    print("\n[PASS] Test passed: Trimmed mean is Byzantine-resistant")


if __name__ == "__main__":
    print("=" * 70)
    print("Testing Trimmed Mean Aggregation for Autonet")
    print("=" * 70)

    try:
        test_trimmed_mean_basic()
        test_trimmed_mean_few_updates()
        test_trimmed_mean_metrics()
        test_byzantine_resistance()

        print("\n" + "=" * 70)
        print("ALL TESTS PASSED")
        print("=" * 70)
        print("\nTrimmed Mean aggregation is working correctly!")
        print("It successfully:")
        print("  - Trims extreme values from top and bottom")
        print("  - Handles edge cases (too few updates)")
        print("  - Improves metric robustness")
        print("  - Resists Byzantine attacks (up to 20% malicious nodes)")

    except Exception as e:
        print(f"\n[FAIL] TEST FAILED: {e}")
        import traceback
        traceback.print_exc()