diff --git a/cmd/cmd.go b/cmd/cmd.go index 33d4bf150..a82b7d93a 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -52,6 +52,7 @@ func New() *cobra.Command { newRecreatePrivateKeysCmd(runRecreatePrivateKeys), newAddOperatorsCmd(runAddOperators), newRemoveOperatorsCmd(runRemoveOperators), + newReplaceOperatorCmd(runReplaceOperator), ), newTestCmd( newTestAllCmd(runTestAll), diff --git a/cmd/edit.go b/cmd/edit.go index 1daa4f4fd..38701791e 100644 --- a/cmd/edit.go +++ b/cmd/edit.go @@ -10,7 +10,7 @@ func newEditCmd(cmds ...*cobra.Command) *cobra.Command { root := &cobra.Command{ Use: "edit", Short: "Subcommands provide functionality to modify existing cluster configurations", - Long: "Subcommands allow users to modify existing distributed validator cluster configurations, such as adding and removing operators.", + Long: "Subcommands allow users to modify existing distributed validator cluster configurations, such as adding, removing or replacing operators.", } root.AddCommand(cmds...) diff --git a/cmd/edit_replaceoperator.go b/cmd/edit_replaceoperator.go new file mode 100644 index 000000000..f77a740e1 --- /dev/null +++ b/cmd/edit_replaceoperator.go @@ -0,0 +1,163 @@ +// Copyright © 2022-2025 Obol Labs Inc. Licensed under the terms of a Business Source License 1.1 + +package cmd + +import ( + "context" + "os" + "slices" + "time" + + libp2plog "github.com/ipfs/go-log/v2" + "github.com/spf13/cobra" + + "github.com/obolnetwork/charon/app" + "github.com/obolnetwork/charon/app/errors" + "github.com/obolnetwork/charon/app/log" + "github.com/obolnetwork/charon/app/z" + "github.com/obolnetwork/charon/cluster" + "github.com/obolnetwork/charon/dkg" + "github.com/obolnetwork/charon/eth2util/enr" +) + +func newReplaceOperatorCmd(runFunc func(context.Context, dkg.ReplaceOperatorConfig, dkg.Config) error) *cobra.Command { + var ( + config dkg.ReplaceOperatorConfig + dkgConfig dkg.Config + ) + + cmd := &cobra.Command{ + Use: "replace-operator", + Short: "Replace an operator in an existing distributed validator cluster", + Long: `Replaces an operator in an existing distributed validator cluster, keeping validator public keys unchanged.`, + Args: cobra.NoArgs, + RunE: func(cmd *cobra.Command, args []string) error { //nolint:revive // keep args variable name for clarity + if err := log.InitLogger(dkgConfig.Log); err != nil { + return err + } + + libp2plog.SetPrimaryCore(log.LoggerCore()) // Set libp2p logger to use charon logger + + return runFunc(cmd.Context(), config, dkgConfig) + }, + } + + cmd.Flags().StringVar(&config.PrivateKeyPath, "private-key-file", ".charon/charon-enr-private-key", "The path to the charon enr private key file. ") + cmd.Flags().StringVar(&config.LockFilePath, "lock-file", ".charon/cluster-lock.json", "The path to the cluster lock file defining the distributed validator cluster.") + cmd.Flags().StringVar(&config.ValidatorKeysDir, "validator-keys-dir", ".charon/validator_keys", "Path to the directory containing the validator private key share files and passwords.") + cmd.Flags().StringVar(&config.OutputDir, "output-dir", "distributed_validator", "The destination folder for the new cluster data. Must be empty.") + cmd.Flags().StringVar(&config.NewENR, "new-operator-enr", "", "The new operator to be added (Charon ENR address).") + cmd.Flags().StringVar(&config.OldENR, "old-operator-enr", "", "The old operator to be replaced (Charon ENR address).") + cmd.Flags().DurationVar(&dkgConfig.Timeout, "timeout", time.Minute, "Timeout for the protocol, should be increased if protocol times out.") + + bindNoVerifyFlag(cmd.Flags(), &dkgConfig.NoVerify) + bindP2PFlags(cmd, &dkgConfig.P2P, defaultAlphaRelay) + bindLogFlags(cmd.Flags(), &dkgConfig.Log) + bindEth1Flag(cmd.Flags(), &dkgConfig.ExecutionEngineAddr) + bindShutdownDelayFlag(cmd.Flags(), &dkgConfig.ShutdownDelay) + + return cmd +} + +func runReplaceOperator(ctx context.Context, config dkg.ReplaceOperatorConfig, dkgConfig dkg.Config) error { + if err := validateReplaceOperatorConfig(ctx, &config, &dkgConfig); err != nil { + return err + } + + log.Info(ctx, "Starting replace-operator ceremony", z.Str("lockFilePath", config.LockFilePath), z.Str("outputDir", config.OutputDir)) + + if err := dkg.RunReplaceOperatorProtocol(ctx, config, dkgConfig); err != nil { + return errors.Wrap(err, "run replace operator protocol") + } + + log.Info(ctx, "Successfully completed replace-operator ceremony 🎉") + log.Info(ctx, "IMPORTANT:") + log.Info(ctx, "You need to shut down your node (charon and VC) and restart it with the new data directory: "+config.OutputDir) + + return nil +} + +func validateReplaceOperatorConfig(ctx context.Context, config *dkg.ReplaceOperatorConfig, dkgConfig *dkg.Config) error { + if config.OutputDir == "" { + return errors.New("output-dir is required") + } + + if len(config.NewENR) == 0 { + return errors.New("new-operator-enr is required") + } + + if len(config.OldENR) == 0 { + return errors.New("old-operator-enr is required") + } + + if config.OldENR == config.NewENR { + return errors.New("old-operator-enr and new-operator-enr cannot be the same") + } + + if !app.FileExists(config.LockFilePath) { + return errors.New("lock-file does not exist") + } + + if dkgConfig.Timeout < time.Minute { + return errors.New("timeout must be at least 1 minute") + } + + lock, err := dkg.LoadAndVerifyClusterLock(ctx, config.LockFilePath, dkgConfig.ExecutionEngineAddr, dkgConfig.NoVerify) + if err != nil { + return err + } + + key, err := dkg.LoadPrivKey(config.PrivateKeyPath) + if err != nil { + return err + } + + r, err := enr.New(key) + if err != nil { + return err + } + + thisENR := r.String() + + if config.OldENR == thisENR { + return errors.New("the old-operator-enr shall not participate in the ceremony") + } + + for _, o := range lock.Operators { + if o.ENR == config.NewENR { + return errors.New("new-operator-enr matches an existing operator", z.Str("enr", config.NewENR)) + } + } + + containsOldENR := slices.ContainsFunc(lock.Operators, func(op cluster.Operator) bool { + return op.ENR == config.OldENR + }) + if !containsOldENR { + return errors.New("old-operator-enr does not match any existing operator in the cluster lock") + } + + // Validate validator keys based on node role + if config.NewENR == thisENR { + // New operator should not have existing validator keys + entries, err := os.ReadDir(config.ValidatorKeysDir) + if err != nil && !os.IsNotExist(err) { + return errors.Wrap(err, "read validator keys directory") + } + + if len(entries) > 0 { + return errors.New("new operator should not have existing validator keys") + } + } else if config.OldENR != thisENR { + // Continuing operators must have validator keys + secrets, err := dkg.LoadSecrets(config.ValidatorKeysDir) + if err != nil { + return errors.Wrap(err, "load validator keys") + } + + if len(secrets) != lock.NumValidators { + return errors.New("number of secret keys does not match validators in cluster lock") + } + } + + return nil +} diff --git a/cmd/edit_replaceoperator_internal_test.go b/cmd/edit_replaceoperator_internal_test.go new file mode 100644 index 000000000..cfc43e148 --- /dev/null +++ b/cmd/edit_replaceoperator_internal_test.go @@ -0,0 +1,159 @@ +// Copyright © 2022-2025 Obol Labs Inc. Licensed under the terms of a Business Source License 1.1 + +package cmd + +import ( + "bytes" + "path" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "github.com/obolnetwork/charon/dkg" + "github.com/obolnetwork/charon/eth2util" +) + +func TestNewReplaceOperatorCmd(t *testing.T) { + cmd := newReplaceOperatorCmd(runReplaceOperator) + require.NotNil(t, cmd) + require.Equal(t, "replace-operator", cmd.Use) + require.Equal(t, "Replace an operator in an existing distributed validator cluster", cmd.Short) + require.Empty(t, cmd.Flags().Args()) +} + +func TestValidateReplaceOperatorConfig(t *testing.T) { + srcDir := t.TempDir() + conf := clusterConfig{ + ClusterDir: srcDir, + Name: t.Name(), + NumNodes: 4, + Threshold: 3, + NumDVs: 3, + Network: eth2util.Holesky.Name, + TargetGasLimit: 36000000, + FeeRecipientAddrs: []string{feeRecipientAddr, feeRecipientAddr, feeRecipientAddr}, + WithdrawalAddrs: []string{feeRecipientAddr, feeRecipientAddr, feeRecipientAddr}, + } + + var buf bytes.Buffer + + err := runCreateCluster(t.Context(), &buf, conf) + require.NoError(t, err) + + lock, err := dkg.LoadAndVerifyClusterLock(t.Context(), path.Join(nodeDir(srcDir, 0), clusterLockFile), "", true) + require.NoError(t, err) + + tests := []struct { + name string + cmdConfig dkg.ReplaceOperatorConfig + dkgConfig dkg.Config + errMsg string + }{ + { + name: "output dir is required", + cmdConfig: dkg.ReplaceOperatorConfig{}, + errMsg: "output-dir is required", + }, + { + name: "new operator enr is required", + cmdConfig: dkg.ReplaceOperatorConfig{ + OutputDir: ".", + }, + errMsg: "new-operator-enr is required", + }, + { + name: "old operator enr is required", + cmdConfig: dkg.ReplaceOperatorConfig{ + OutputDir: ".", + NewENR: "enr:-IS4QH", + }, + errMsg: "old-operator-enr is required", + }, + { + name: "old and new operator enr cannot be the same", + cmdConfig: dkg.ReplaceOperatorConfig{ + OutputDir: ".", + NewENR: "enr:-IS4QH", + OldENR: "enr:-IS4QH", + }, + errMsg: "old-operator-enr and new-operator-enr cannot be the same", + }, + { + name: "lock-file does not exist", + cmdConfig: dkg.ReplaceOperatorConfig{ + OutputDir: ".", + NewENR: "enr:-IS4QH", + OldENR: "enr:-IS4QJ", + }, + errMsg: "lock-file does not exist", + }, + { + name: "timeout too low", + cmdConfig: dkg.ReplaceOperatorConfig{ + OutputDir: ".", + LockFilePath: path.Join(nodeDir(srcDir, 0), clusterLockFile), + PrivateKeyPath: path.Join(nodeDir(srcDir, 0), enrPrivateKeyFile), + NewENR: "enr:-IS4QH", + OldENR: lock.Operators[1].ENR, + }, + dkgConfig: dkg.Config{ + Timeout: time.Second, + }, + errMsg: "timeout must be at least 1 minute", + }, + { + name: "old operator enr shall not participate in the ceremony", + cmdConfig: dkg.ReplaceOperatorConfig{ + OutputDir: ".", + LockFilePath: path.Join(nodeDir(srcDir, 0), clusterLockFile), + PrivateKeyPath: path.Join(nodeDir(srcDir, 0), enrPrivateKeyFile), + NewENR: "enr:-IS4QH", + OldENR: lock.Operators[0].ENR, + }, + dkgConfig: dkg.Config{ + Timeout: time.Minute, + }, + errMsg: "the old-operator-enr shall not participate in the ceremony", + }, + { + name: "new operator enr matches existing", + cmdConfig: dkg.ReplaceOperatorConfig{ + OutputDir: ".", + LockFilePath: path.Join(nodeDir(srcDir, 0), clusterLockFile), + PrivateKeyPath: path.Join(nodeDir(srcDir, 0), enrPrivateKeyFile), + NewENR: lock.Operators[1].ENR, + OldENR: lock.Operators[2].ENR, + }, + dkgConfig: dkg.Config{ + Timeout: time.Minute, + }, + errMsg: "new-operator-enr matches an existing operator", + }, + { + name: "old operator enr does not match any existing operator", + cmdConfig: dkg.ReplaceOperatorConfig{ + OutputDir: ".", + LockFilePath: path.Join(nodeDir(srcDir, 0), clusterLockFile), + PrivateKeyPath: path.Join(nodeDir(srcDir, 0), enrPrivateKeyFile), + NewENR: "enr:-IS4QH", + OldENR: "enr:-IS4QJ", + }, + dkgConfig: dkg.Config{ + Timeout: time.Minute, + }, + errMsg: "old-operator-enr does not match any existing operator in the cluster lock", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := validateReplaceOperatorConfig(t.Context(), &tt.cmdConfig, &tt.dkgConfig) + if tt.errMsg != "" { + require.Equal(t, tt.errMsg, err.Error()) + } else { + require.NoError(t, err) + } + }) + } +} diff --git a/dkg/protocol_replaceoperator.go b/dkg/protocol_replaceoperator.go new file mode 100644 index 000000000..136dc7a0d --- /dev/null +++ b/dkg/protocol_replaceoperator.go @@ -0,0 +1,136 @@ +// Copyright © 2022-2025 Obol Labs Inc. Licensed under the terms of a Business Source License 1.1 + +package dkg + +import ( + "context" + "slices" + + "github.com/libp2p/go-libp2p/core/peer" + + "github.com/obolnetwork/charon/app/errors" + "github.com/obolnetwork/charon/cluster" + "github.com/obolnetwork/charon/dkg/bcast" + "github.com/obolnetwork/charon/dkg/pedersen" + "github.com/obolnetwork/charon/eth2util/enr" + "github.com/obolnetwork/charon/p2p" +) + +// ReplaceOperatorConfig contains the configuration for the replace-operator protocol. +// Typically populated from command line flags. +type ReplaceOperatorConfig struct { + PrivateKeyPath string + LockFilePath string + ValidatorKeysDir string + OutputDir string + NewENR string + OldENR string +} + +// RunReplaceOperatorProtocol runs the replace-operator DKG protocol. +func RunReplaceOperatorProtocol(ctx context.Context, config ReplaceOperatorConfig, dkgConfig Config) error { + return RunProtocol(ctx, + newReplaceOperatorProtocol(config), + config.LockFilePath, + config.PrivateKeyPath, + config.ValidatorKeysDir, + dkgConfig) +} + +type replaceOperatorProtocol struct { + outputDir string + newENR string + oldENR string + replacingIndex int + newLockENRs []string + config *pedersen.Config + board *pedersen.Board +} + +var _ Protocol = (*replaceOperatorProtocol)(nil) + +func newReplaceOperatorProtocol(config ReplaceOperatorConfig) *replaceOperatorProtocol { + return &replaceOperatorProtocol{ + outputDir: config.OutputDir, + newENR: config.NewENR, + oldENR: config.OldENR, + } +} + +func (p *replaceOperatorProtocol) GetPeers(lock *cluster.Lock) ([]p2p.Peer, error) { + // Replace the old operator with the new operator at the same index position. + // This maintains share index consistency - the new operator inherits the position. + peers, err := lock.Peers() + if err != nil { + return nil, err + } + + // Find and store the index of the operator being replaced + p.replacingIndex = slices.IndexFunc(lock.Operators, func(op cluster.Operator) bool { + return op.ENR == p.oldENR + }) + if p.replacingIndex == -1 { + return nil, errors.New("old operator not found in lock") + } + + newRec, err := enr.Parse(p.newENR) + if err != nil { + return nil, errors.Wrap(err, "parse enr") + } + + // Create new peer at the same index as the old operator + newPeer, err := p2p.NewPeerFromENR(newRec, p.replacingIndex) + if err != nil { + return nil, errors.Wrap(err, "new peer from enr") + } + + // Build the peer list: keep all continuing operators at their original indices, + // and place the new operator at the replacing index + newPeers := make([]p2p.Peer, len(peers)) + copy(newPeers, peers) + newPeers[p.replacingIndex] = newPeer + + // Build allENRs for the final cluster lock + for i, op := range lock.Operators { + if i == p.replacingIndex { + p.newLockENRs = append(p.newLockENRs, p.newENR) + } else { + p.newLockENRs = append(p.newLockENRs, op.ENR) + } + } + + return newPeers, nil +} + +func (p *replaceOperatorProtocol) PostInit(ctx context.Context, pctx *ProtocolContext) error { + pctx.SigExchanger = newExchanger(pctx.ThisNode, pctx.ThisNodeIdx.PeerIdx, pctx.PeerIDs, []sigType{sigLock}, pctx.Config.Timeout) + pctx.Caster = bcast.New(pctx.ThisNode, pctx.PeerIDs, pctx.ENRPrivateKey) + pctx.NodeSigCaster = newNodeSigBcast(pctx.Peers, pctx.ThisNodeIdx, pctx.Caster) + + // For replace operator: identify the old and new peer IDs at the replacement position. + // The old operator is being removed (OldPeers), the new operator is being added (NewPeers). + // Since they occupy the same index position, this is a one-for-one swap. + // Note: replacingIndex was already calculated and validated in GetPeers. + allPeers, err := pctx.Lock.Peers() + if err != nil { + return err + } + + oldPeerID := allPeers[p.replacingIndex].ID + newPeerID := pctx.Peers[p.replacingIndex].ID + + reshareConfig := pedersen.NewReshareConfig(len(pctx.Lock.Validators), pctx.Lock.Threshold, []peer.ID{newPeerID}, []peer.ID{oldPeerID}) + p.config = pedersen.NewConfig(pctx.ThisPeerID, pctx.PeerMap, pctx.Lock.Threshold, pctx.Lock.DefinitionHash, pctx.Config.Timeout/6, reshareConfig) + p.board = pedersen.NewBoard(ctx, pctx.ThisNode, p.config, pctx.Caster) + + return nil +} + +func (p *replaceOperatorProtocol) Steps(pctx *ProtocolContext) []ProtocolStep { + return []ProtocolStep{ + &reshareProtocolStep{config: p.config, board: p.board}, + &updateLockProtocolStep{threshold: pctx.Lock.Threshold, operators: p.newLockENRs}, + &updateNodeSignaturesProtocolStep{}, + &writeArtifactsProtocolStep{outputDir: p.outputDir}, + } +} diff --git a/dkg/protocol_test.go b/dkg/protocol_test.go index dc25939cb..c3375a2b5 100644 --- a/dkg/protocol_test.go +++ b/dkg/protocol_test.go @@ -196,6 +196,68 @@ func TestRunAddOperatorsProtocol(t *testing.T) { verifyClusterValidators(t, numValidators, getNodeDirs(dstClusterDir, totalNodes)) } +func TestRunReplaceOperatorProtocol(t *testing.T) { + const ( + numValidators = 3 + numNodes = 4 + threshold = 3 + ) + + srcClusterDir := createTestCluster(t, numNodes, threshold, numValidators) + dstClusterDir := t.TempDir() + + lockFilePath := path.Join(nodeDir(srcClusterDir, 0), clusterLockFile) + lock, err := dkg.LoadAndVerifyClusterLock(t.Context(), lockFilePath, "", false) + require.NoError(t, err) + + // replacing operator at index 2 + replacingIndex := 2 + + // Create a separate directory for the new operator (since it's a different entity) + newOpNodeDir := nodeDir(srcClusterDir, numNodes) // Create directory for new operator who doesn't have existing cluster data + newOpEnr := createENR(t, newOpNodeDir) + + err = app.CopyFile(path.Join(nodeDir(srcClusterDir, 0), clusterLockFile), path.Join(newOpNodeDir, clusterLockFile)) + require.NoError(t, err) + + ctx, cancel := context.WithCancel(t.Context()) + defer cancel() + + // All 4 nodes run the protocol, but we map the new operator to replace the old one + runProtocol(t, numNodes, func(relayAddr string, n int) error { + dkgConfig := createDKGConfig(t, relayAddr) + + var ndir string + + if n == replacingIndex { + // New operator takes over index 2 (uses new operator's directory and keys) + ndir = newOpNodeDir + } else { + // Continuing operators use their original directories + ndir = nodeDir(srcClusterDir, n) + } + + replaceConfig := dkg.ReplaceOperatorConfig{ + LockFilePath: path.Join(ndir, clusterLockFile), + PrivateKeyPath: p2p.KeyPath(ndir), + ValidatorKeysDir: path.Join(ndir, validatorKeysDir), + OutputDir: nodeDir(dstClusterDir, n), + NewENR: newOpEnr, + OldENR: lock.Operators[replacingIndex].ENR, + } + + err := dkg.RunReplaceOperatorProtocol(ctx, replaceConfig, dkgConfig) + if err != nil { + cancel() + require.FailNowf(t, "Protocol failed", "Node %d failed: %v", n, err) + } + + return nil + }) + + verifyClusterValidators(t, numValidators, getNodeDirs(dstClusterDir, numNodes)) +} + func TestRunReshareProtocol(t *testing.T) { const ( numValidators = 3 @@ -386,9 +448,8 @@ func nodeDir(clusterDir string, i int) string { return fmt.Sprintf("%s/node%d", clusterDir, i) } -func getNodeDirs(clusterDir string, numNodes int, skip ...int) []string { //nolint:unparam - dirs := make([]string, 0) - +func getNodeDirs(clusterDir string, numNodes int, skip ...int) []string { + dirs := make([]string, 0, numNodes) for i := range numNodes { if slices.Contains(skip, i) { continue