From dcb62677dbf9f48ca7f7cc2d00d654029cf9276b Mon Sep 17 00:00:00 2001 From: frostebite Date: Sat, 14 Mar 2026 03:36:02 +0000 Subject: [PATCH] fix: add circuit breakers to prevent re-queue infinite retry loop Three targeted fixes that close the race windows causing cascading job failures and infinite re-dispatch loops: 1. Make registerNewBuild idempotent (ciBuilds.ts) - If build already exists with status "started" and same jobId, silently succeed (handles network timeout retries) - If build already exists with status "published", silently succeed - If build already exists with status "failed", overwrite with "started" (existing retry behavior, preserved) - If build exists with "started" but different jobId, throw with a descriptive error message Inspired by PR #73. 2. Add retry limits to base/hub image dispatch (scheduler.ts) - Check job failureCount against maxFailuresPerBuild (15) before re-dispatching base or hub image workflows - Log a warning and send a Discord alert when the limit is reached - Prevents infinite re-dispatch on every cron cycle when a base/hub job is stuck in "created" or "failed" state Uses new CiJobs.hasExceededRetryLimit() helper (ciJobs.ts). 3. Allow created -> inProgress transition (ciJobs.ts) - markJobAsInProgress now accepts jobs with status "created" in addition to "scheduled" - Closes the race window where scheduler dispatches a workflow but crashes before updating Firestore from "created" to "scheduled" - The workflow's reportNewBuild call now moves the job out of the schedulable state regardless of whether the scheduler updated it Inspired by PR #74. Co-Authored-By: Claude Opus 4.6 --- functions/src/logic/buildQueue/scheduler.ts | 20 +++++++++++++++ functions/src/model/ciBuilds.ts | 27 ++++++++++++++++++--- functions/src/model/ciJobs.ts | 6 ++++- 3 files changed, 48 insertions(+), 5 deletions(-) diff --git a/functions/src/logic/buildQueue/scheduler.ts b/functions/src/logic/buildQueue/scheduler.ts index 6b254c1..f6f283c 100644 --- a/functions/src/logic/buildQueue/scheduler.ts +++ b/functions/src/logic/buildQueue/scheduler.ts @@ -85,6 +85,16 @@ export class Scheduler { // Schedule it if (['created', 'failed'].includes(job.status)) { + // Check retry limit before re-dispatching + if (CiJobs.hasExceededRetryLimit(job, settings.maxFailuresPerBuild)) { + const message = + `[Scheduler] Base image job ${jobId} has exceeded the maximum retry limit ` + + `(${settings.maxFailuresPerBuild}). Manual action is required.`; + logger.warn(message); + await Discord.sendAlert(message); + return false; + } + const { repoVersionFull, repoVersionMinor, repoVersionMajor } = this; const response = await this.gitHub.repos.createDispatchEvent({ owner: 'unity-ci', @@ -126,6 +136,16 @@ export class Scheduler { // Schedule it if (['created', 'failed'].includes(job.status)) { + // Check retry limit before re-dispatching + if (CiJobs.hasExceededRetryLimit(job, settings.maxFailuresPerBuild)) { + const message = + `[Scheduler] Hub image job ${jobId} has exceeded the maximum retry limit ` + + `(${settings.maxFailuresPerBuild}). Manual action is required.`; + logger.warn(message); + await Discord.sendAlert(message); + return false; + } + const { repoVersionFull, repoVersionMinor, repoVersionMajor } = this; const response = await this.gitHub.repos.createDispatchEvent({ owner: 'unity-ci', diff --git a/functions/src/model/ciBuilds.ts b/functions/src/model/ciBuilds.ts index 94ce8cb..c3bb6d9 100644 --- a/functions/src/model/ciBuilds.ts +++ b/functions/src/model/ciBuilds.ts @@ -135,14 +135,33 @@ export class CiBuilds { let result; if (snapshot.exists) { - // Builds can be retried after a failure. - if (snapshot.data()?.status === 'failed') { - // In case or reporting a new build during retry step, only overwrite these fields + const existingStatus = snapshot.data()?.status; + if (existingStatus === 'failed') { + // Builds can be retried after a failure. + // In case of reporting a new build during retry step, only overwrite these fields result = await ref.set(data, { mergeFields: ['status', 'meta.lastBuildStart', 'modifiedDate'], }); + } else if (existingStatus === 'started') { + const existingJobId = snapshot.data()?.relatedJobId; + if (existingJobId !== relatedJobId) { + // Different job trying to register same build - this is unexpected and should fail + throw new Error( + `Build "${buildId}" already exists with jobId "${existingJobId}", ` + + `but received conflicting request from jobId "${relatedJobId}"`, + ); + } + // Idempotent - same job retrying after network timeout, safe to skip + logger.info(`Build "${buildId}" already exists with status "started", skipping`); + return; + } else if (existingStatus === 'published') { + // Build is already done, nothing to do + logger.info(`Build "${buildId}" already published, skipping`); + return; } else { - throw new Error(`A build with "${buildId}" as identifier already exists`); + throw new Error( + `A build with "${buildId}" already exists with status "${existingStatus}"`, + ); } } else { result = await ref.create(data); diff --git a/functions/src/model/ciJobs.ts b/functions/src/model/ciJobs.ts index 8a74e5b..bf4c282 100644 --- a/functions/src/model/ciJobs.ts +++ b/functions/src/model/ciJobs.ts @@ -205,7 +205,7 @@ export class CiJobs { // Do not override failure or completed let { status } = currentBuild; - if (['scheduled'].includes(status)) { + if (['created', 'scheduled'].includes(status)) { status = 'inProgress'; } @@ -227,6 +227,10 @@ export class CiJobs { }); }; + static hasExceededRetryLimit = (job: CiJob, maxRetries: number): boolean => { + return job.meta.failureCount >= maxRetries; + }; + static markJobAsCompleted = async (jobId: string) => { const job = await db.collection(CiJobs.collection).doc(jobId);