Skip to content
This repository was archived by the owner on Nov 15, 2023. It is now read-only.

Commit ee0abe2

Browse files
committed
Add compilation_timeout parameter for PVF preparation job
1 parent 10d1460 commit ee0abe2

4 files changed

Lines changed: 71 additions & 21 deletions

File tree

node/core/pvf/src/host.rs

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,16 @@ use std::{
3838
time::{Duration, SystemTime},
3939
};
4040

41+
/// The time period after which the precheck preparation worker is considered unresponsive and will
42+
/// be killed.
43+
// NOTE: If you change this make sure to fix the buckets of `pvf_preparation_time` metric.
44+
const PRECHECK_COMPILATION_TIMEOUT: Duration = Duration::from_secs(60);
45+
46+
/// The time period after which the execute preparation worker is considered unresponsive and will
47+
/// be killed.
48+
// NOTE: If you change this make sure to fix the buckets of `pvf_preparation_time` metric.
49+
const EXECUTE_COMPILATION_TIMEOUT: Duration = Duration::from_secs(180);
50+
4151
/// An alias to not spell the type for the oneshot sender for the PVF execution result.
4252
pub(crate) type ResultSender = oneshot::Sender<Result<ValidationResult, ValidationError>>;
4353

@@ -92,7 +102,7 @@ impl ValidationHost {
92102

93103
/// Sends a signal to the validation host requesting to prepare a list of the given PVFs.
94104
///
95-
/// This is async to accommodate the fact a possibility of back-pressure. In the vast majority of
105+
/// This is async to accommodate the possibility of back-pressure. In the vast majority of
96106
/// situations this function should return immediately.
97107
///
98108
/// Returns an error if the request cannot be sent to the validation host, i.e. if it shut down.
@@ -443,8 +453,15 @@ async fn handle_precheck_pvf(
443453
}
444454
} else {
445455
artifacts.insert_preparing(artifact_id, vec![result_sender]);
446-
send_prepare(prepare_queue, prepare::ToQueue::Enqueue { priority: Priority::Normal, pvf })
447-
.await?;
456+
send_prepare(
457+
prepare_queue,
458+
prepare::ToQueue::Enqueue {
459+
priority: Priority::Normal,
460+
pvf,
461+
compilation_timeout: PRECHECK_COMPILATION_TIMEOUT,
462+
},
463+
)
464+
.await?;
448465
}
449466
Ok(())
450467
}
@@ -470,7 +487,7 @@ async fn handle_execute_pvf(
470487

471488
if let Some(state) = artifacts.artifact_state_mut(&artifact_id) {
472489
match state {
473-
ArtifactState::Prepared { ref mut last_time_needed } => {
490+
ArtifactState::Prepared { last_time_needed } => {
474491
*last_time_needed = SystemTime::now();
475492

476493
send_execute(
@@ -495,7 +512,15 @@ async fn handle_execute_pvf(
495512
// Artifact is unknown: register it and enqueue a job with the corresponding priority and
496513
// PVF.
497514
artifacts.insert_preparing(artifact_id.clone(), Vec::new());
498-
send_prepare(prepare_queue, prepare::ToQueue::Enqueue { priority, pvf }).await?;
515+
send_prepare(
516+
prepare_queue,
517+
prepare::ToQueue::Enqueue {
518+
priority,
519+
pvf,
520+
compilation_timeout: EXECUTE_COMPILATION_TIMEOUT,
521+
},
522+
)
523+
.await?;
499524

500525
awaiting_prepare.add(artifact_id, execution_timeout, params, result_tx);
501526
}
@@ -528,7 +553,11 @@ async fn handle_heads_up(
528553

529554
send_prepare(
530555
prepare_queue,
531-
prepare::ToQueue::Enqueue { priority: Priority::Normal, pvf: active_pvf },
556+
prepare::ToQueue::Enqueue {
557+
priority: Priority::Normal,
558+
pvf: active_pvf,
559+
compilation_timeout: EXECUTE_COMPILATION_TIMEOUT,
560+
},
532561
)
533562
.await?;
534563
}

node/core/pvf/src/prepare/pool.rs

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,12 @@ pub enum ToPool {
6161
///
6262
/// In either case, the worker is considered busy and no further `StartWork` messages should be
6363
/// sent until either `Concluded` or `Rip` message is received.
64-
StartWork { worker: Worker, code: Arc<Vec<u8>>, artifact_path: PathBuf },
64+
StartWork {
65+
worker: Worker,
66+
code: Arc<Vec<u8>>,
67+
artifact_path: PathBuf,
68+
compilation_timeout: Duration,
69+
},
6570
}
6671

6772
/// A message sent from pool to its client.
@@ -205,7 +210,7 @@ fn handle_to_pool(
205210
metrics.prepare_worker().on_begin_spawn();
206211
mux.push(spawn_worker_task(program_path.to_owned(), spawn_timeout).boxed());
207212
},
208-
ToPool::StartWork { worker, code, artifact_path } => {
213+
ToPool::StartWork { worker, code, artifact_path, compilation_timeout } => {
209214
if let Some(data) = spawned.get_mut(worker) {
210215
if let Some(idle) = data.idle.take() {
211216
let preparation_timer = metrics.time_preparation();
@@ -216,6 +221,7 @@ fn handle_to_pool(
216221
code,
217222
cache_path.to_owned(),
218223
artifact_path,
224+
compilation_timeout,
219225
preparation_timer,
220226
)
221227
.boxed(),
@@ -263,9 +269,11 @@ async fn start_work_task<Timer>(
263269
code: Arc<Vec<u8>>,
264270
cache_path: PathBuf,
265271
artifact_path: PathBuf,
272+
compilation_timeout: Duration,
266273
_preparation_timer: Option<Timer>,
267274
) -> PoolEvent {
268-
let outcome = worker::start_work(idle, code, &cache_path, artifact_path).await;
275+
let outcome =
276+
worker::start_work(idle, code, &cache_path, artifact_path, compilation_timeout).await;
269277
PoolEvent::StartWork(worker, outcome)
270278
}
271279

node/core/pvf/src/prepare/queue.rs

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,10 @@ use crate::{artifacts::ArtifactId, metrics::Metrics, PrepareResult, Priority, Pv
2121
use always_assert::{always, never};
2222
use async_std::path::PathBuf;
2323
use futures::{channel::mpsc, stream::StreamExt as _, Future, SinkExt};
24-
use std::collections::{HashMap, VecDeque};
24+
use std::{
25+
collections::{HashMap, VecDeque},
26+
time::Duration,
27+
};
2528

2629
/// A request to pool.
2730
#[derive(Debug)]
@@ -30,7 +33,7 @@ pub enum ToQueue {
3033
///
3134
/// Note that it is incorrect to enqueue the same PVF again without first receiving the
3235
/// [`FromQueue`] response.
33-
Enqueue { priority: Priority, pvf: Pvf },
36+
Enqueue { priority: Priority, pvf: Pvf, compilation_timeout: Duration },
3437
}
3538

3639
/// A response from queue.
@@ -76,6 +79,8 @@ struct JobData {
7679
/// The priority of this job. Can be bumped.
7780
priority: Priority,
7881
pvf: Pvf,
82+
/// The timeout for the preparation job.
83+
compilation_timeout: Duration,
7984
worker: Option<Worker>,
8085
}
8186

@@ -203,18 +208,24 @@ impl Queue {
203208

204209
async fn handle_to_queue(queue: &mut Queue, to_queue: ToQueue) -> Result<(), Fatal> {
205210
match to_queue {
206-
ToQueue::Enqueue { priority, pvf } => {
207-
handle_enqueue(queue, priority, pvf).await?;
211+
ToQueue::Enqueue { priority, pvf, compilation_timeout } => {
212+
handle_enqueue(queue, priority, pvf, compilation_timeout).await?;
208213
},
209214
}
210215
Ok(())
211216
}
212217

213-
async fn handle_enqueue(queue: &mut Queue, priority: Priority, pvf: Pvf) -> Result<(), Fatal> {
218+
async fn handle_enqueue(
219+
queue: &mut Queue,
220+
priority: Priority,
221+
pvf: Pvf,
222+
compilation_timeout: Duration,
223+
) -> Result<(), Fatal> {
214224
gum::debug!(
215225
target: LOG_TARGET,
216226
validation_code_hash = ?pvf.code_hash,
217227
?priority,
228+
?compilation_timeout,
218229
"PVF is enqueued for preparation.",
219230
);
220231
queue.metrics.prepare_enqueued();
@@ -236,7 +247,7 @@ async fn handle_enqueue(queue: &mut Queue, priority: Priority, pvf: Pvf) -> Resu
236247
return Ok(())
237248
}
238249

239-
let job = queue.jobs.insert(JobData { priority, pvf, worker: None });
250+
let job = queue.jobs.insert(JobData { priority, pvf, compilation_timeout, worker: None });
240251
queue.artifact_id_to_job.insert(artifact_id, job);
241252

242253
if let Some(available) = find_idle_worker(queue) {
@@ -424,7 +435,12 @@ async fn assign(queue: &mut Queue, worker: Worker, job: Job) -> Result<(), Fatal
424435

425436
send_pool(
426437
&mut queue.to_pool_tx,
427-
pool::ToPool::StartWork { worker, code: job_data.pvf.code.clone(), artifact_path },
438+
pool::ToPool::StartWork {
439+
worker,
440+
code: job_data.pvf.code.clone(),
441+
artifact_path,
442+
compilation_timeout: job_data.compilation_timeout,
443+
},
428444
)
429445
.await?;
430446

node/core/pvf/src/prepare/worker.rs

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,6 @@ use parity_scale_codec::{Decode, Encode};
3232
use sp_core::hexdisplay::HexDisplay;
3333
use std::{panic, sync::Arc, time::Duration};
3434

35-
/// The time period after which the preparation worker is considered unresponsive and will be killed.
36-
// NOTE: If you change this make sure to fix the buckets of `pvf_preparation_time` metric.
37-
const COMPILATION_TIMEOUT: Duration = Duration::from_secs(60);
38-
3935
/// Spawns a new worker with the given program path that acts as the worker and the spawn timeout.
4036
///
4137
/// The program should be able to handle `<program-path> prepare-worker <socket-path>` invocation.
@@ -69,6 +65,7 @@ pub async fn start_work(
6965
code: Arc<Vec<u8>>,
7066
cache_path: &Path,
7167
artifact_path: PathBuf,
68+
compilation_timeout: Duration,
7269
) -> Outcome {
7370
let IdleWorker { mut stream, pid } = worker;
7471

@@ -103,7 +100,7 @@ pub async fn start_work(
103100
}
104101

105102
let selected =
106-
match async_std::future::timeout(COMPILATION_TIMEOUT, framed_recv(&mut stream)).await {
103+
match async_std::future::timeout(compilation_timeout, framed_recv(&mut stream)).await {
107104
Ok(Ok(response_bytes)) => {
108105
// Received bytes from worker within the time limit.
109106
// By convention we expect encoded `PrepareResult`.

0 commit comments

Comments
 (0)