Skip to content

Commit afa6958

Browse files
fjljorgemmsilva
authored andcommitted
p2p/discover: fix crash when revalidated node is removed (ethereum#29864)
In ethereum#29572, I assumed the revalidation list that the node is contained in could only ever be changed by the outcome of a revalidation request. But turns out that's not true: if the node gets removed due to FINDNODE failure, it will also be removed from the list it is in. This causes a crash. The invariant is: while node is in table, it is always in exactly one of the two lists. So it seems best to store a pointer to the current list within the node itself.
1 parent 459bad9 commit afa6958

File tree

4 files changed

+125
-28
lines changed

4 files changed

+125
-28
lines changed

p2p/discover/node.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ type BucketNode struct {
4141
// The fields of Node may not be modified.
4242
type node struct {
4343
*enode.Node
44+
revalList *revalidationList
4445
addedToTable time.Time // first time node was added to bucket or replacement list
4546
addedToBucket time.Time // time it was added in the actual bucket
4647
livenessChecks uint // how often liveness was checked

p2p/discover/table_reval.go

Lines changed: 47 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@ type tableRevalidation struct {
3939
type revalidationResponse struct {
4040
n *node
4141
newRecord *enode.Node
42-
list *revalidationList
4342
didRespond bool
4443
}
4544

@@ -60,34 +59,35 @@ func (tr *tableRevalidation) nodeAdded(tab *Table, n *node) {
6059

6160
// nodeRemoved is called when a node was removed from the table.
6261
func (tr *tableRevalidation) nodeRemoved(n *node) {
63-
if !tr.fast.remove(n) {
64-
tr.slow.remove(n)
62+
if n.revalList == nil {
63+
panic(fmt.Errorf("removed node %v has nil revalList", n.ID()))
6564
}
65+
n.revalList.remove(n)
6666
}
6767

6868
// run performs node revalidation.
6969
// It returns the next time it should be invoked, which is used in the Table main loop
7070
// to schedule a timer. However, run can be called at any time.
7171
func (tr *tableRevalidation) run(tab *Table, now mclock.AbsTime) (nextTime mclock.AbsTime) {
7272
if n := tr.fast.get(now, &tab.rand, tr.activeReq); n != nil {
73-
tr.startRequest(tab, &tr.fast, n)
73+
tr.startRequest(tab, n)
7474
tr.fast.schedule(now, &tab.rand)
7575
}
7676
if n := tr.slow.get(now, &tab.rand, tr.activeReq); n != nil {
77-
tr.startRequest(tab, &tr.slow, n)
77+
tr.startRequest(tab, n)
7878
tr.slow.schedule(now, &tab.rand)
7979
}
8080

8181
return min(tr.fast.nextTime, tr.slow.nextTime)
8282
}
8383

8484
// startRequest spawns a revalidation request for node n.
85-
func (tr *tableRevalidation) startRequest(tab *Table, list *revalidationList, n *node) {
85+
func (tr *tableRevalidation) startRequest(tab *Table, n *node) {
8686
if _, ok := tr.activeReq[n.ID()]; ok {
87-
panic(fmt.Errorf("duplicate startRequest (list %q, node %v)", list.name, n.ID()))
87+
panic(fmt.Errorf("duplicate startRequest (node %v)", n.ID()))
8888
}
8989
tr.activeReq[n.ID()] = struct{}{}
90-
resp := revalidationResponse{n: n, list: list}
90+
resp := revalidationResponse{n: n}
9191

9292
// Fetch the node while holding lock.
9393
tab.mutex.Lock()
@@ -120,11 +120,28 @@ func (tab *Table) doRevalidate(resp revalidationResponse, node *enode.Node) {
120120

121121
// handleResponse processes the result of a revalidation request.
122122
func (tr *tableRevalidation) handleResponse(tab *Table, resp revalidationResponse) {
123-
now := tab.cfg.Clock.Now()
124-
n := resp.n
125-
b := tab.bucket(n.ID())
123+
var (
124+
now = tab.cfg.Clock.Now()
125+
n = resp.n
126+
b = tab.bucket(n.ID())
127+
)
126128
delete(tr.activeReq, n.ID())
127129

130+
// If the node was removed from the table while getting checked, we need to stop
131+
// processing here to avoid re-adding it.
132+
if n.revalList == nil {
133+
return
134+
}
135+
136+
// Store potential seeds in database.
137+
// This is done via defer to avoid holding Table lock while writing to DB.
138+
defer func() {
139+
if n.isValidatedLive && n.livenessChecks > 5 {
140+
tab.db.UpdateNode(resp.n.Node)
141+
}
142+
}()
143+
144+
// Remaining logic needs access to Table internals.
128145
tab.mutex.Lock()
129146
defer tab.mutex.Unlock()
130147

@@ -134,7 +151,7 @@ func (tr *tableRevalidation) handleResponse(tab *Table, resp revalidationRespons
134151
if n.livenessChecks <= 0 {
135152
tab.deleteInBucket(b, n.ID())
136153
} else {
137-
tr.moveToList(&tr.fast, resp.list, n, now, &tab.rand)
154+
tr.moveToList(&tr.fast, n, now, &tab.rand)
138155
}
139156
return
140157
}
@@ -151,27 +168,23 @@ func (tr *tableRevalidation) handleResponse(tab *Table, resp revalidationRespons
151168
n.isValidatedLive = false
152169
}
153170
}
154-
tab.log.Debug("Revalidated node", "b", b.index, "id", n.ID(), "checks", n.livenessChecks, "q", resp.list.name)
171+
tab.log.Debug("Revalidated node", "b", b.index, "id", n.ID(), "checks", n.livenessChecks, "q", n.revalList)
155172

156173
// Move node over to slow queue after first validation.
157174
if !endpointChanged {
158-
tr.moveToList(&tr.slow, resp.list, n, now, &tab.rand)
175+
tr.moveToList(&tr.slow, n, now, &tab.rand)
159176
} else {
160-
tr.moveToList(&tr.fast, resp.list, n, now, &tab.rand)
161-
}
162-
163-
// Store potential seeds in database.
164-
if n.isValidatedLive && n.livenessChecks > 5 {
165-
tab.db.UpdateNode(resp.n.Node)
177+
tr.moveToList(&tr.fast, n, now, &tab.rand)
166178
}
167179
}
168180

169-
func (tr *tableRevalidation) moveToList(dest, source *revalidationList, n *node, now mclock.AbsTime, rand randomSource) {
170-
if source == dest {
181+
// moveToList ensures n is in the 'dest' list.
182+
func (tr *tableRevalidation) moveToList(dest *revalidationList, n *node, now mclock.AbsTime, rand randomSource) {
183+
if n.revalList == dest {
171184
return
172185
}
173-
if !source.remove(n) {
174-
panic(fmt.Errorf("moveToList(%q -> %q): node %v not in source list", source.name, dest.name, n.ID()))
186+
if n.revalList != nil {
187+
n.revalList.remove(n)
175188
}
176189
dest.push(n, now, rand)
177190
}
@@ -208,16 +221,23 @@ func (list *revalidationList) push(n *node, now mclock.AbsTime, rand randomSourc
208221
if list.nextTime == never {
209222
list.schedule(now, rand)
210223
}
224+
n.revalList = list
211225
}
212226

213-
func (list *revalidationList) remove(n *node) bool {
227+
func (list *revalidationList) remove(n *node) {
214228
i := slices.Index(list.nodes, n)
215229
if i == -1 {
216-
return false
230+
panic(fmt.Errorf("node %v not found in list", n.ID()))
217231
}
218232
list.nodes = slices.Delete(list.nodes, i, i+1)
219233
if len(list.nodes) == 0 {
220234
list.nextTime = never
221235
}
222-
return true
236+
n.revalList = nil
237+
}
238+
239+
func (list *revalidationList) contains(id enode.ID) bool {
240+
return slices.ContainsFunc(list.nodes, func(n *node) bool {
241+
return n.ID() == id
242+
})
223243
}

p2p/discover/table_reval_test.go

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
// Copyright 2024 The go-ethereum Authors
2+
// This file is part of the go-ethereum library.
3+
//
4+
// The go-ethereum library is free software: you can redistribute it and/or modify
5+
// it under the terms of the GNU Lesser General Public License as published by
6+
// the Free Software Foundation, either version 3 of the License, or
7+
// (at your option) any later version.
8+
//
9+
// The go-ethereum library is distributed in the hope that it will be useful,
10+
// but WITHOUT ANY WARRANTY; without even the implied warranty of
11+
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12+
// GNU Lesser General Public License for more details.
13+
//
14+
// You should have received a copy of the GNU Lesser General Public License
15+
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
16+
17+
package discover
18+
19+
import (
20+
"net"
21+
"testing"
22+
"time"
23+
24+
"github.com/ethereum/go-ethereum/common/mclock"
25+
)
26+
27+
// This test checks that revalidation can handle a node disappearing while
28+
// a request is active.
29+
func TestRevalidationNodeRemoved(t *testing.T) {
30+
var (
31+
clock mclock.Simulated
32+
transport = newPingRecorder()
33+
tab, db = newInactiveTestTable(transport, Config{Clock: &clock})
34+
tr = &tab.revalidation
35+
)
36+
defer db.Close()
37+
38+
// Fill a bucket.
39+
node := nodeAtDistance(tab.self().ID(), 255, net.IP{77, 88, 99, 1})
40+
tab.handleAddNode(addNodeOp{node: node})
41+
42+
// Start a revalidation request. Schedule once to get the next start time,
43+
// then advance the clock to that point and schedule again to start.
44+
next := tr.run(tab, clock.Now())
45+
clock.Run(time.Duration(next + 1))
46+
tr.run(tab, clock.Now())
47+
if len(tr.activeReq) != 1 {
48+
t.Fatal("revalidation request did not start:", tr.activeReq)
49+
}
50+
51+
// Delete the node.
52+
tab.deleteInBucket(tab.bucket(node.ID()), node.ID())
53+
54+
// Now finish the revalidation request.
55+
var resp revalidationResponse
56+
select {
57+
case resp = <-tab.revalResponseCh:
58+
case <-time.After(1 * time.Second):
59+
t.Fatal("timed out waiting for revalidation")
60+
}
61+
tr.handleResponse(tab, resp)
62+
63+
// Ensure the node was not re-added to the table.
64+
if tab.getNode(node.ID()) != nil {
65+
t.Fatal("node was re-added to Table")
66+
}
67+
if tr.fast.contains(node.ID()) || tr.slow.contains(node.ID()) {
68+
t.Fatal("removed node contained in revalidation list")
69+
}
70+
}

p2p/discover/table_util_test.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,15 @@ func init() {
4343
}
4444

4545
func newTestTable(t transport, cfg Config) (*Table, *enode.DB) {
46+
tab, db := newInactiveTestTable(t, cfg)
47+
go tab.loop()
48+
return tab, db
49+
}
50+
51+
// newInactiveTestTable creates a Table without running the main loop.
52+
func newInactiveTestTable(t transport, cfg Config) (*Table, *enode.DB) {
4653
db, _ := enode.OpenDB("")
4754
tab, _ := newTable(t, db, cfg)
48-
go tab.loop()
4955
return tab, db
5056
}
5157

0 commit comments

Comments
 (0)