This repository was archived by the owner on Nov 15, 2023. It is now read-only.
Merged
Commits
29 commits
866a690
PVF: Remove `rayon` and some uses of `tokio`
mrcnski May 1, 2023
aff4779
PVF: Vote invalid on panics in execution thread (after a retry)
mrcnski May 1, 2023
fc481bc
Merge branch 'mrcnski/pvf-remove-rayon-tokio' into mrcnski/pvf-catch-…
mrcnski May 1, 2023
b03e1b0
Address a couple of TODOs
mrcnski May 1, 2023
0fcedbe
Add some documentation to implementer's guide
mrcnski May 1, 2023
c7e44d1
Fix compile error
mrcnski May 1, 2023
dc0e9c9
Fix compile errors
mrcnski May 1, 2023
67994f5
Fix compile error
mrcnski May 1, 2023
181e89f
Update roadmap/implementers-guide/src/node/utility/candidate-validati…
mrcnski May 2, 2023
df22727
Address comments + couple other changes (see message)
mrcnski May 2, 2023
5223995
Implement proper thread synchronization
mrcnski May 2, 2023
d4eb740
Catch panics in threads so we always notify condvar
mrcnski May 2, 2023
850d2c0
Use `WaitOutcome` enum instead of bool condition variable
mrcnski May 2, 2023
5885222
Merge branch 'mrcnski/pvf-remove-rayon-tokio' into mrcnski/pvf-catch-…
mrcnski May 2, 2023
4f8f1cc
Fix retry timeouts to depend on exec timeout kind
mrcnski May 2, 2023
87437b8
Merge remote-tracking branch 'origin/mrcnski/pvf-catch-panics-in-exec…
mrcnski May 2, 2023
f4c0b4b
Address review comments
mrcnski May 2, 2023
883952c
Make the API for condvars in workers nicer
mrcnski May 2, 2023
cc89ab8
Add a doc
mrcnski May 3, 2023
1967ddb
Use condvar for memory stats thread
mrcnski May 3, 2023
6e7a13c
Small refactor
mrcnski May 3, 2023
566a438
Merge branch 'mrcnski/pvf-remove-rayon-tokio' into mrcnski/pvf-catch-…
mrcnski May 3, 2023
660dadd
Enumerate internal validation errors in an enum
mrcnski May 3, 2023
187528c
Fix comment
mrcnski May 3, 2023
6d7cbf0
Add a log
mrcnski May 3, 2023
a2d0c72
Fix test
mrcnski May 3, 2023
6c54e42
Update variant naming
mrcnski May 4, 2023
7da8cdf
Merge branch 'master' into mrcnski/pvf-catch-panics-in-execution
mrcnski May 16, 2023
da7d191
Address a missed TODO
mrcnski May 16, 2023
9 changes: 9 additions & 0 deletions node/core/candidate-validation/src/lib.rs
@@ -636,6 +636,8 @@ where
Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(
"ambiguous worker death".to_string(),
))),
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::Panic(err))) =>
Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(err))),
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::PrepareError(e))) => {
// In principle if preparation of the `WASM` fails, the current candidate can not be the
// reason for that. So we can't say whether it is invalid or not. In addition, with
@@ -709,13 +711,20 @@ trait ValidationBackend {
self.validate_candidate(pvf.clone(), exec_timeout, params.encode()).await;

// Allow limited retries for each kind of error.
//
// TODO: Should we stop retrying after some time has passed?
// We would need to know if we came from backing or approval.
Contributor Author

@s0me0ne-unkn0wn IIRC you've looked into this, WDYT?

Contributor

Why would a panic resolve itself after a retry? Maybe in case of memory pressure -> OOM? But still, if we retry and wait too much time, malicious validators might have enough time to vote valid and the including block will be finalized - we currently have a two-relay-chain-block delay on finality, which should help, but we'd have to be careful here.

Contributor

You mean that in the case of backing you want the strict timeout to be applied to both retries cumulatively, not to a single one? Looks like a valid idea. There's no dedicated field yet to tell which subsystem the request is coming from, but you can check the execution timeout kind in the incoming CandidateValidationMessage::ValidateFromExhaustive or CandidateValidationMessage::ValidateFromChainState; it will be either PvfExecTimeoutKind::Backing or PvfExecTimeoutKind::Approval, so you can tell backing from the other stuff. Sounds a bit hacky, probably, but it will work.
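The suggestion above can be sketched as follows. This is illustrative only: `PvfExecTimeoutKind` mirrors the real enum, but the budget values and the `retry_budget` helper are made up for the example, not taken from the PR.

```rust
// Infer "backing vs. approval" from the execution timeout kind, since the
// validation request carries no dedicated origin field. Budget values are
// illustrative, not the PR's actual numbers.
use std::time::Duration;

#[derive(Clone, Copy, Debug, PartialEq)]
enum PvfExecTimeoutKind {
    Backing,
    Approval,
}

/// Pick a cumulative retry budget based on where the request came from.
fn retry_budget(kind: PvfExecTimeoutKind) -> Duration {
    match kind {
        // Backing is latency-sensitive: keep retries on a short leash.
        PvfExecTimeoutKind::Backing => Duration::from_secs(1),
        // Approval can tolerate a longer wait before giving up on retries.
        PvfExecTimeoutKind::Approval => Duration::from_secs(6),
    }
}

fn main() {
    assert!(retry_budget(PvfExecTimeoutKind::Backing)
        < retry_budget(PvfExecTimeoutKind::Approval));
}
```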

Contributor Author

> Why would a panic resolve itself after a retry? Maybe in case of memory pressure -> OOM?

Maybe, or also just a hardware glitch, a spurious OS glitch, etc. We can't really tell why a panic happened, or whether it was an issue with the candidate or not.[1] Retrying lets us be more sure about our vote.

[1]: Well, we do get a stringified error, but we can't match on it or anything.

Contributor Author

Thanks @s0me0ne-unkn0wn! Pushed some changes.

Member

> Why would a panic resolve itself after a retry? Maybe in case of memory pressure -> OOM? But still, if we retry and wait too much time, malicious validators might have enough time to vote valid and the including block will be finalized - we currently have a two-relay-chain-block delay on finality, which should help, but we'd have to be careful here.

No, this should not be possible. If it takes too long, we no-show and more validators will cover for us.

Contributor

But still, if those new guys also retry, they will also no-show, and malicious votes come in?

Member

They will then be replaced by even more checkers. One no-show will be replaced by more than one new checker, which will again be 2/3 honest. Anyhow, this perfect DoS of all honest nodes is certainly the absolute worst case, so yes, we have to be careful about when not to raise a dispute. In general, if in doubt, raise one.

For hardware faults being the cause, I think we should look into having error checking. E.g. we already suggest/require ECC memory - we should have something similar for disk/db. The node should be able to detect a corrupted db and not just read garbage data and then dispute stuff.

Contributor

CPU hardware faults are extremely rare (I'm closely familiar with a single case, though a really interesting one), and memory faults are not so rare (I came across two cases in my life) but still not something that should be seriously taken into account (that's something that could be handled by governance reverting the slashes on a case-by-case basis). So do I understand correctly that we are left with a single really concerning case, OOM? Can we tell OOM apart from the other cases? AFAIR Linux's oom_kill_task() sends SIGKILL to the process instead of SIGSEGV; could we use that to detect the condition?
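The SIGKILL-detection idea can be sketched with the standard library's Unix exit-status extension. Note the hedge: a SIGKILL'd child does not prove the OOM killer fired (anyone can `kill -9`), so this is only a heuristic, and the helper name is invented for the example.

```rust
// Detect whether a worker child died from SIGKILL, which is what the
// Linux OOM killer delivers. Only a heuristic: SIGKILL can come from
// other sources too.
use std::os::unix::process::ExitStatusExt;
use std::process::{Command, ExitStatus};

const SIGKILL: i32 = 9;

/// Returns true if the child was terminated by SIGKILL (possible OOM kill).
fn was_sigkilled(status: ExitStatus) -> bool {
    // signal() is Some(sig) only when the process died from a signal.
    status.signal() == Some(SIGKILL)
}

fn main() {
    let mut child = Command::new("sleep").arg("10").spawn().expect("spawn sleep");
    // `Child::kill` delivers SIGKILL on Unix, simulating an OOM kill.
    child.kill().expect("kill child");
    let status = child.wait().expect("wait for child");
    assert!(was_sigkilled(status));
}
```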

Contributor Author

> Can we tell the OOM from the other cases?

Someone (I believe @burdges?) had the idea of setting a flag before each allocation and unsetting it after; this would be easy to do if we had the allocator wrapper that's been discussed. And I agree that we should focus on OOM, since we've seen it happen in the wild.
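The "flag around allocations" idea could look roughly like the following. This is entirely illustrative: the PR only discusses such an allocator wrapper, and all names here are invented.

```rust
// A global-allocator wrapper that records when an allocation is in
// flight, so a crash/panic handler could distinguish an allocation-time
// failure (likely memory pressure) from other faults.
use std::alloc::{GlobalAlloc, Layout, System};
use std::sync::atomic::{AtomicBool, Ordering};

/// Set while an allocation is in progress; a fault handler could read it.
static ALLOCATING: AtomicBool = AtomicBool::new(false);

struct FlaggingAllocator;

unsafe impl GlobalAlloc for FlaggingAllocator {
    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
        ALLOCATING.store(true, Ordering::SeqCst);
        let ptr = System.alloc(layout);
        ALLOCATING.store(false, Ordering::SeqCst);
        ptr
    }

    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
        System.dealloc(ptr, layout)
    }
}

#[global_allocator]
static ALLOC: FlaggingAllocator = FlaggingAllocator;

fn main() {
    let v = vec![42u8; 1024]; // goes through FlaggingAllocator
    assert!(!ALLOCATING.load(Ordering::SeqCst)); // flag cleared after alloc
    assert_eq!(v.len(), 1024);
}
```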

let mut num_internal_retries_left = 1;
let mut num_awd_retries_left = 1;
let mut num_panic_retries_left = 1;
loop {
match validation_result {
Err(ValidationError::InvalidCandidate(
WasmInvalidCandidate::AmbiguousWorkerDeath,
)) if num_awd_retries_left > 0 => num_awd_retries_left -= 1,
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::Panic(_)))
if num_panic_retries_left > 0 =>
num_panic_retries_left -= 1,
Err(ValidationError::InternalError(_)) if num_internal_retries_left > 0 =>
num_internal_retries_left -= 1,
_ => break,
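The per-error-kind retry budget from the hunk above can be exercised standalone. This sketch mirrors the diff's counters loosely; the error enum is trimmed and the attempt closure is a mock, not the real validation backend.

```rust
// Each error class gets one retry; whichever error survives its budget is
// returned. Mirrors the retry loop in the candidate-validation diff.
#[derive(Debug, Clone, PartialEq)]
enum ValidationError {
    AmbiguousWorkerDeath,
    Panic(String),
    Internal(String),
}

fn validate_with_retries(
    mut attempt: impl FnMut() -> Result<(), ValidationError>,
) -> Result<(), ValidationError> {
    // One retry per error kind, matching the counters in the diff.
    let (mut awd_left, mut panic_left, mut internal_left) = (1u32, 1u32, 1u32);
    loop {
        let result = attempt();
        match &result {
            Err(ValidationError::AmbiguousWorkerDeath) if awd_left > 0 => awd_left -= 1,
            Err(ValidationError::Panic(_)) if panic_left > 0 => panic_left -= 1,
            Err(ValidationError::Internal(_)) if internal_left > 0 => internal_left -= 1,
            _ => return result,
        }
    }
}

fn main() {
    // Same sequence as the `candidate_validation_retry_panic_errors` test:
    // a panic, then a worker death, then a second panic exhausts the budget.
    let mut results = vec![
        Err(ValidationError::Panic("foo".into())),
        Err(ValidationError::AmbiguousWorkerDeath),
        Err(ValidationError::Panic("bar".into())),
    ]
    .into_iter();
    let outcome = validate_with_retries(|| results.next().unwrap());
    assert_eq!(outcome, Err(ValidationError::Panic("bar".into())));
}
```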
57 changes: 57 additions & 0 deletions node/core/candidate-validation/src/tests.rs
@@ -540,6 +540,7 @@ fn candidate_validation_bad_return_is_invalid() {
assert_matches!(v, Ok(ValidationResult::Invalid(InvalidCandidate::Timeout)));
}

// Test that we vote valid if we get `AmbiguousWorkerDeath`, retry, and then succeed.
#[test]
fn candidate_validation_one_ambiguous_error_is_valid() {
let validation_data = PersistedValidationData { max_pov_size: 1024, ..Default::default() };
@@ -728,6 +729,62 @@ fn candidate_validation_retry_internal_errors() {
assert_matches!(v, Err(ValidationFailed(s)) if s == "bar".to_string());
}

// Test that we retry on panic errors.
#[test]
fn candidate_validation_retry_panic_errors() {
let validation_data = PersistedValidationData { max_pov_size: 1024, ..Default::default() };

let pov = PoV { block_data: BlockData(vec![1; 32]) };
let validation_code = ValidationCode(vec![2; 16]);

let descriptor = make_valid_candidate_descriptor(
ParaId::from(1_u32),
dummy_hash(),
validation_data.hash(),
pov.hash(),
validation_code.hash(),
dummy_hash(),
dummy_hash(),
Sr25519Keyring::Alice,
);

let check = perform_basic_checks(
&descriptor,
validation_data.max_pov_size,
&pov,
&validation_code.hash(),
);
assert!(check.is_ok());

let candidate_receipt = CandidateReceipt { descriptor, commitments_hash: Hash::zero() };

let pool = TaskExecutor::new();
let (mut ctx, ctx_handle) =
test_helpers::make_subsystem_context::<AllMessages, _>(pool.clone());
let metrics = Metrics::default();

let v = test_with_executor_params(ctx_handle, || {
validate_candidate_exhaustive(
ctx.sender(),
MockValidateCandidateBackend::with_hardcoded_result_list(vec![
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::Panic("foo".into()))),
// Throw an AWD error, we should still retry again.
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousWorkerDeath)),
// Throw another panic error.
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::Panic("bar".into()))),
]),
validation_data,
validation_code,
candidate_receipt,
Arc::new(pov),
PvfExecTimeoutKind::Backing,
&metrics,
)
});

assert_matches!(v, Err(ValidationFailed(s)) if s == "bar".to_string());
}

#[test]
fn candidate_validation_timeout_is_internal_error() {
let validation_data = PersistedValidationData { max_pov_size: 1024, ..Default::default() };
7 changes: 6 additions & 1 deletion node/core/pvf/src/error.rs
@@ -103,7 +103,7 @@ pub enum InvalidCandidate {
/// an `rlimit` (if set) or, again, invited OOM killer. Another possibility is a bug in
/// wasmtime allowed the PVF to gain control over the execution worker.
///
/// We attribute such an event to an invalid candidate in either case.
/// We attribute such an event to an *invalid candidate* in either case.
///
/// The rationale for this is that a glitch may lead to unfair rejecting candidate by a single
/// validator. If the glitch is somewhat more persistent the validator will reject all candidate
@@ -113,6 +113,11 @@
AmbiguousWorkerDeath,
/// PVF execution (compilation is not included) took more time than was allotted.
HardTimeout,
/// A panic occurred and we can't be sure whether the candidate is really invalid or some internal glitch occurred.
/// Whenever we are unsure, we can never treat an error as internal as we would abstain from voting. This is bad
/// because if the issue was due to the candidate, then all validators would abstain, stalling finality on the
/// chain. So we will first retry the candidate, and if the issue persists we are forced to vote invalid.
Panic(String),
}

impl From<PrepareError> for ValidationError {
8 changes: 5 additions & 3 deletions node/core/pvf/src/execute/queue.rs
@@ -334,15 +334,17 @@ fn handle_job_finish(
Err(ValidationError::InvalidCandidate(InvalidCandidate::WorkerReportedError(err))),
None,
),
Outcome::InternalError { err, idle_worker } =>
(Some(idle_worker), Err(ValidationError::InternalError(err)), None),
Outcome::InternalError { err } => (None, Err(ValidationError::InternalError(err)), None),
Outcome::HardTimeout =>
(None, Err(ValidationError::InvalidCandidate(InvalidCandidate::HardTimeout)), None),
// "Maybe invalid" errors (will retry).
Outcome::IoErr => (
None,
Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousWorkerDeath)),
None,
),
Outcome::Panic { err } =>
(None, Err(ValidationError::InvalidCandidate(InvalidCandidate::Panic(err))), None),
};

queue.metrics.execute_finished();
@@ -356,7 +358,7 @@ fn handle_job_finish(
err
);
} else {
gum::debug!(
gum::trace!(
target: LOG_TARGET,
?artifact_id,
?worker,
21 changes: 16 additions & 5 deletions node/core/pvf/src/execute/worker_intf.rs
@@ -64,6 +64,8 @@ pub async fn spawn(
}

/// Outcome of PVF execution.
///
/// If the idle worker token is not returned, it means the worker must be terminated.
pub enum Outcome {
/// PVF execution completed successfully and the result is returned. The worker is ready for
/// another job.
@@ -73,18 +75,23 @@ pub enum Outcome {
InvalidCandidate { err: String, idle_worker: IdleWorker },
/// An internal error happened during the validation. Such an error is most likely related to
/// some transient glitch.
InternalError { err: String, idle_worker: IdleWorker },
///
/// Should only ever be used for errors independent of the candidate. Therefore it may be a problem with the worker,
/// so we terminate it.
InternalError { err: String },
/// The execution time exceeded the hard limit. The worker is terminated.
HardTimeout,
/// An I/O error happened during communication with the worker. This may mean that the worker
/// process already died. The token is not returned in any case.
IoErr,
/// An unexpected panic has occurred in the execution worker.
Panic { err: String },
}

/// Given the idle token of a worker and parameters of work, communicates with the worker and
/// returns the outcome.
///
/// NOTE: Returning the `HardTimeout` or `IoErr` errors will trigger the child process being killed.
/// NOTE: Not returning the idle worker token in `Outcome` will trigger the child process being killed.
pub async fn start_work(
worker: IdleWorker,
artifact: ArtifactPathId,
@@ -171,8 +178,8 @@ pub async fn start_work(
Response::InvalidCandidate(err) =>
Outcome::InvalidCandidate { err, idle_worker: IdleWorker { stream, pid } },
Response::TimedOut => Outcome::HardTimeout,
Response::InternalError(err) =>
Outcome::InternalError { err, idle_worker: IdleWorker { stream, pid } },
Response::Panic(err) => Outcome::Panic { err },
Response::InternalError(err) => Outcome::InternalError { err },
}
}

@@ -223,7 +230,11 @@ pub enum Response {
InvalidCandidate(String),
/// The job timed out.
TimedOut,
/// Some internal error occurred. Should only be used for errors independent of the candidate.
/// An unexpected panic has occurred in the execution worker.
Panic(String),
/// Some internal error occurred.
///
/// Should only ever be used for errors independent of the candidate.
InternalError(String),
}

26 changes: 18 additions & 8 deletions node/core/pvf/worker/src/common.rs
@@ -18,15 +18,12 @@ use crate::LOG_TARGET;
use cpu_time::ProcessTime;
use futures::never::Never;
use std::{
any::Any,
path::PathBuf,
sync::mpsc::{Receiver, RecvTimeoutError},
time::Duration,
};
use tokio::{
io,
net::UnixStream,
runtime::{Handle, Runtime},
};
use tokio::{io, net::UnixStream, runtime::Runtime};

/// Some allowed overhead that we account for in the "CPU time monitor" thread's sleeps, on the
/// child process.
@@ -44,7 +41,7 @@ pub fn worker_event_loop<F, Fut>(
node_version: Option<&str>,
mut event_loop: F,
) where
F: FnMut(Handle, UnixStream) -> Fut,
F: FnMut(UnixStream) -> Fut,
Fut: futures::Future<Output = io::Result<Never>>,
{
let worker_pid = std::process::id();
@@ -68,13 +65,12 @@

// Run the main worker loop.
let rt = Runtime::new().expect("Creates tokio runtime. If this panics the worker will die and the host will detect that and deal with it.");
let handle = rt.handle();
let err = rt
.block_on(async move {
let stream = UnixStream::connect(socket_path).await?;
let _ = tokio::fs::remove_file(socket_path).await;

let result = event_loop(handle.clone(), stream).await;
let result = event_loop(stream).await;

result
})
@@ -124,6 +120,20 @@ pub fn cpu_time_monitor_loop(
}
}

/// Attempt to convert an opaque panic payload to a string.
///
/// This is a best effort, and is not guaranteed to provide the most accurate value.
pub fn stringify_panic_payload(payload: Box<dyn Any + Send + 'static>) -> String {
match payload.downcast::<&'static str>() {
Ok(msg) => msg.to_string(),
Err(payload) => match payload.downcast::<String>() {
Ok(msg) => *msg,
// At least we tried...
Err(_) => "unknown panic payload".to_string(),
},
}
}

/// In case of node and worker version mismatch (as a result of in-place upgrade), send `SIGTERM`
/// to the node to tear it down and prevent it from raising disputes on valid candidates. Node
/// restart should be handled by the node owner. As node exits, unix sockets opened to workers
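A usage sketch for the `stringify_panic_payload` helper added in the common.rs diff: paired with `std::panic::catch_unwind`, worker code can turn an opaque panic payload into a reportable string. The helper body is copied from the diff so the example is self-contained; the `main` driver is illustrative.

```rust
// Turn a caught panic payload into a human-readable string, handling the
// two payload types `panic!` actually produces.
use std::any::Any;
use std::panic;

fn stringify_panic_payload(payload: Box<dyn Any + Send + 'static>) -> String {
    match payload.downcast::<&'static str>() {
        Ok(msg) => msg.to_string(),
        Err(payload) => match payload.downcast::<String>() {
            Ok(msg) => *msg,
            // At least we tried...
            Err(_) => "unknown panic payload".to_string(),
        },
    }
}

fn main() {
    // A literal panic message arrives as `&'static str`...
    let err = panic::catch_unwind(|| panic!("boom")).unwrap_err();
    assert_eq!(stringify_panic_payload(err), "boom");

    // ...while a formatted one arrives as `String`.
    let err = panic::catch_unwind(|| panic!("code {}", 42)).unwrap_err();
    assert_eq!(stringify_panic_payload(err), "code 42");
}
```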