Skip to content

Commit 56d085b

Browse files
authored
Fix panic on fast reconnection of node (#2536)
* Fix panic on fast reconnection of node * Use parameter captured in closure as per review request
1 parent 92e587a commit 56d085b

File tree

2 files changed

+103
-3
lines changed

2 files changed

+103
-3
lines changed

hscontrol/notifier/notifier.go

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,26 @@ func (n *Notifier) Close() {
6363
n.closed = true
6464
n.b.close()
6565

66-
for _, c := range n.nodes {
67-
close(c)
66+
// Close channels safely using the helper method
67+
for nodeID, c := range n.nodes {
68+
n.safeCloseChannel(nodeID, c)
6869
}
70+
71+
// Clear node map after closing channels
72+
n.nodes = make(map[types.NodeID]chan<- types.StateUpdate)
73+
}
74+
75+
// safeCloseChannel closes a channel and panic recovers if already closed
76+
func (n *Notifier) safeCloseChannel(nodeID types.NodeID, c chan<- types.StateUpdate) {
77+
defer func() {
78+
if r := recover(); r != nil {
79+
log.Error().
80+
Uint64("node.id", nodeID.Uint64()).
81+
Any("recover", r).
82+
Msg("recovered from panic when closing channel in Close()")
83+
}
84+
}()
85+
close(c)
6986
}
7087

7188
func (n *Notifier) tracef(nID types.NodeID, msg string, args ...any) {
@@ -90,7 +107,11 @@ func (n *Notifier) AddNode(nodeID types.NodeID, c chan<- types.StateUpdate) {
90107
// connection. Close the old channel and replace it.
91108
if curr, ok := n.nodes[nodeID]; ok {
92109
n.tracef(nodeID, "channel present, closing and replacing")
93-
close(curr)
110+
// Use the safeCloseChannel helper in a goroutine to avoid deadlocks
111+
// if/when someone is waiting to send on this channel
112+
go func(ch chan<- types.StateUpdate) {
113+
n.safeCloseChannel(nodeID, ch)
114+
}(curr)
94115
}
95116

96117
n.nodes[nodeID] = c
@@ -161,6 +182,7 @@ func (n *Notifier) IsLikelyConnected(nodeID types.NodeID) bool {
161182
return false
162183
}
163184

185+
// LikelyConnectedMap returns a thread safe map of connected nodes
164186
func (n *Notifier) LikelyConnectedMap() *xsync.MapOf[types.NodeID, bool] {
165187
return n.connected
166188
}

hscontrol/notifier/notifier_test.go

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,11 @@ package notifier
22

33
import (
44
"context"
5+
"fmt"
6+
"math/rand"
57
"net/netip"
68
"sort"
9+
"sync"
710
"testing"
811
"time"
912

@@ -263,3 +266,78 @@ func TestBatcher(t *testing.T) {
263266
})
264267
}
265268
}
269+
270+
// TestIsLikelyConnectedRaceCondition tests for a race condition in IsLikelyConnected
271+
// Multiple goroutines calling AddNode and RemoveNode cause panics when trying to
272+
// close a channel that was already closed, which can happen when a node changes
273+
// network transport quickly (eg mobile->wifi) and reconnects whilst also disconnecting
274+
func TestIsLikelyConnectedRaceCondition(t *testing.T) {
275+
// mock config for the notifier
276+
cfg := &types.Config{
277+
Tuning: types.Tuning{
278+
NotifierSendTimeout: 1 * time.Second,
279+
BatchChangeDelay: 1 * time.Second,
280+
NodeMapSessionBufferedChanSize: 30,
281+
},
282+
}
283+
284+
notifier := NewNotifier(cfg)
285+
defer notifier.Close()
286+
287+
nodeID := types.NodeID(1)
288+
updateChan := make(chan types.StateUpdate, 10)
289+
290+
var wg sync.WaitGroup
291+
292+
// Number of goroutines to spawn for concurrent access
293+
concurrentAccessors := 100
294+
iterations := 100
295+
296+
// Add node to notifier
297+
notifier.AddNode(nodeID, updateChan)
298+
299+
// Track errors
300+
errChan := make(chan string, concurrentAccessors*iterations)
301+
302+
// Start goroutines to cause a race
303+
wg.Add(concurrentAccessors)
304+
for i := 0; i < concurrentAccessors; i++ {
305+
go func(routineID int) {
306+
defer wg.Done()
307+
308+
for j := 0; j < iterations; j++ {
309+
// Simulate race by having some goroutines check IsLikelyConnected
310+
// while others add/remove the node
311+
if routineID%3 == 0 {
312+
// This goroutine checks connection status
313+
isConnected := notifier.IsLikelyConnected(nodeID)
314+
if isConnected != true && isConnected != false {
315+
errChan <- fmt.Sprintf("Invalid connection status: %v", isConnected)
316+
}
317+
} else if routineID%3 == 1 {
318+
// This goroutine removes the node
319+
notifier.RemoveNode(nodeID, updateChan)
320+
} else {
321+
// This goroutine adds the node back
322+
notifier.AddNode(nodeID, updateChan)
323+
}
324+
325+
// Small random delay to increase chance of races
326+
time.Sleep(time.Duration(rand.Intn(100)) * time.Microsecond)
327+
}
328+
}(i)
329+
}
330+
331+
wg.Wait()
332+
close(errChan)
333+
334+
// Collate errors
335+
var errors []string
336+
for err := range errChan {
337+
errors = append(errors, err)
338+
}
339+
340+
if len(errors) > 0 {
341+
t.Errorf("Detected %d race condition errors: %v", len(errors), errors)
342+
}
343+
}

0 commit comments

Comments
 (0)