Add support for admin-only evaluation feedback.

veluca93 · veluca93 · commit dd3b21c5f372 · 2026-03-01T23:52:59.000+01:00
Admin-only feedback comes from one of two sources: it is either generated
automatically by the white-diff comparison step, or emitted by the checker
as additional lines on stderr that start with `ADMIN_MESSAGE:`.
diff --git a/cms/db/submission.py b/cms/db/submission.py
@@ -766,6 +766,9 @@ class Evaluation(Base):
         nullable=False,
         default=[])
 
+    # Admin-facing output from the grader.
+    admin_text: str | None = Column(String, nullable=True, default=None)
+
     # Evaluation's time and wall-clock time, in seconds.
     execution_time: float | None = Column(
         Float,
diff --git a/cms/grading/Job.py b/cms/grading/Job.py
@@ -93,6 +93,7 @@ def __init__(
         info: str | None = None,
         success: bool | None = None,
         text: list[str] | None = None,
+        admin_text: str | None = None,
         files: dict[str, File] | None = None,
         managers: dict[str, Manager] | None = None,
         executables: dict[str, Executable] | None = None,
@@ -121,6 +122,8 @@ def __init__(
             to be presented to the user. The first item is a string,
             potentially with %-escaping; the following items are the
             values to be %-formatted into the first.
+        admin_text: description of the outcome of the job,
+            to be shown to admins.
         files: files submitted by the user.
         managers: managers provided by the admins.
         executables: executables created in the compilation.
@@ -155,6 +158,7 @@ def __init__(
 
         self.success = success
         self.text = text
+        self.admin_text = admin_text
 
         self.files = files
         self.managers = managers
@@ -178,6 +182,7 @@ def export_to_dict(self) -> dict:
             'info': self.info,
             'success': self.success,
             'text': self.text,
+            'admin_text': self.admin_text,
             'files': dict((k, v.digest)
                           for k, v in self.files.items()),
             'managers': dict((k, v.digest)
@@ -316,6 +321,7 @@ def __init__(
         compilation_success: bool | None = None,
         executables: dict[str, Executable] | None = None,
         text: list[str] | None = None,
+        admin_text: str | None = None,
         plus: dict | None = None,
     ):
         """Initialization.
@@ -331,7 +337,7 @@ def __init__(
         Job.__init__(self, operation, task_type, task_type_parameters,
                      language, multithreaded_sandbox, archive_sandbox,
                      shard, keep_sandbox, sandboxes, sandbox_digests, info, success,
-                     text, files, managers, executables)
+                     text, admin_text, files, managers, executables)
         self.compilation_success = compilation_success
         self.plus = plus
 
@@ -537,6 +543,7 @@ def __init__(
         success: bool | None = None,
         outcome: str | None = None,
         text: list[str] | None = None,
+        admin_text: list[str] | None = None,
         user_output: str | None = None,
         plus: dict | None = None,
         only_execution: bool | None = False,
@@ -567,7 +574,7 @@ def __init__(
         Job.__init__(self, operation, task_type, task_type_parameters,
                      language, multithreaded_sandbox, archive_sandbox,
                      shard, keep_sandbox, sandboxes, sandbox_digests, info, success,
-                     text, files, managers, executables)
+                     text, admin_text, files, managers, executables)
         self.input = input
         self.output = output
         self.time_limit = time_limit
@@ -653,6 +660,7 @@ def to_submission(self, sr: SubmissionResult):
 
         sr.evaluations += [Evaluation(
             text=self.text,
+            admin_text=self.admin_text,
             outcome=self.outcome,
             execution_time=self.plus.get('execution_time'),
             execution_wall_clock_time=self.plus.get(
diff --git a/cms/grading/scoretypes/abc.py b/cms/grading/scoretypes/abc.py
@@ -144,8 +144,8 @@ def get_html_details(
                                             translation=translation,
                                             gettext=_, ngettext=n_)
             except Exception:
-                logger.error("Found an invalid score details string. "
-                             "Try invalidating scores.")
+                logger.exception("Found an invalid score details string. "
+                                 "Try invalidating scores.")
                 return _("Score details temporarily unavailable.")
 
     @abstractmethod
diff --git a/cms/grading/steps/trusted.py b/cms/grading/steps/trusted.py
@@ -77,13 +77,14 @@ def _sanitize_message(string: str) -> str:
     return string.replace('%', '%%')
 
 
-def extract_outcome_and_text(sandbox: Sandbox) -> tuple[float, list[str]]:
+def extract_outcome_and_text(sandbox: Sandbox) -> tuple[float, list[str], str]:
     """Extract the outcome and the text from the a standard manager output.
 
     sandbox: the sandbox whose last execution was a manager writing
         a standard manager output.
 
-    return: outcome and text.
+    return: outcome, contestant-facing text and admin-facing text (the
+        latter is not translated, and is None if the manager gave none).
 
     raise (ValueError): if cannot decode the data.
     raise (FileNotFoundError): if any of the sandbox stdout or stderr file
@@ -108,6 +109,23 @@ def extract_outcome_and_text(sandbox: Sandbox) -> tuple[float, list[str]]:
             logger.error("Manager stderr (text) is malformed. %r", error)
             raise error
 
+        # Parse special commands
+        admin_text = None
+        for line in stderr_file.readlines():
+            line = line.strip()
+            if not line:
+                continue
+
+            PREFIX = "ADMIN_MESSAGE:"
+            if line.startswith(PREFIX):
+                line = _sanitize_message(line[len(PREFIX):].strip())
+                if admin_text is not None:
+                    admin_text = admin_text + " " + line
+                else:
+                    admin_text = line
+            else:
+                logger.warning(f"Unknown special manager command `{line}`")
+
     try:
         outcome = float(outcome)
     except ValueError:
@@ -125,7 +143,7 @@ def extract_outcome_and_text(sandbox: Sandbox) -> tuple[float, list[str]]:
             logger.warning("Manager asked to translate text, but string "
                            "'%s' is not recognized." % remaining)
 
-    return outcome, [text]
+    return outcome, [text], admin_text
 
 
 def trusted_step(
@@ -213,7 +231,8 @@ def checker_step(
     extra_args: extra arguments to pass to the checker.
 
     return: success (true if the checker was able to check the solution
-        successfully), outcome and text (both None if success is False).
+        successfully), outcome, text and admin_text (all None if success
+        is False).
 
     """
     # Check that the file we are going to inject in the sandbox are not already
@@ -224,12 +243,12 @@ def checker_step(
         if sandbox.file_exists(filename):
             logger.error("File %s already in the sandbox for the checker.",
                          filename)
-            return False, None, None
+            return False, None, None, None
 
     # Copy the checker in the sandbox, after making sure it was provided.
     if checker_digest is None:
         logger.error("Configuration error: missing checker in task managers.")
-        return False, None, None
+        return False, None, None, None
     sandbox.create_file_from_storage(CHECKER_FILENAME, checker_digest,
                                      executable=True)
 
@@ -247,17 +266,17 @@ def checker_step(
     if not box_success or not success:
         logger.error("Sandbox failed during checker step. "
                      "See previous logs for the reason.")
-        return False, None, None
+        return False, None, None, None
 
     # Extract outcome and text assuming a standard manager output.
     try:
-        outcome, text = extract_outcome_and_text(sandbox)
+        outcome, text, admin_text = extract_outcome_and_text(sandbox)
     except ValueError as e:
         logger.error("Invalid output from checker: %s", e)
-        return False, None, None
+        return False, None, None, None
     except FileNotFoundError as e:
         # This should not happen, as the redirect is handled by the sandbox.
         logger.error("Missing stdout or stderr file from checker: %s", e)
-        return False, None, None
+        return False, None, None, None
 
-    return True, outcome, text
+    return True, outcome, text, admin_text
diff --git a/cms/grading/steps/whitediff.py b/cms/grading/steps/whitediff.py
@@ -89,28 +89,35 @@ def _white_diff(output: typing.BinaryIO, res: typing.BinaryIO) -> bool:
 
     """
 
+    line = 0
+
     while True:
         lout = output.readline()
         lres = res.readline()
+        line += 1
 
         # Both files finished: comparison succeded
         if len(lres) == 0 and len(lout) == 0:
-            return True
+            return True, None
 
         # Only one file finished: ok if the other contains only blanks
         elif len(lres) == 0 or len(lout) == 0:
             lout = lout.strip(b''.join(_WHITES))
             lres = lres.strip(b''.join(_WHITES))
-            if len(lout) > 0 or len(lres) > 0:
-                return False
+            if len(lout) > 0:
+                return False, "Contestant output too long"
+            if len(lres) > 0:
+                return False, "Contestant output too short"
 
         # Both file still have lines to go: ok if they agree except
         # for the number of whitespaces
         else:
             lout = _white_diff_canonicalize(lout)
             lres = _white_diff_canonicalize(lres)
             if lout != lres:
-                return False
+                lout = lout.decode("utf-8", errors='backslashreplace')
+                lres = lres.decode("utf-8", errors='backslashreplace')
+                return False, f"Expected `{lres}`, found `{lout}` on line {line}"
 
 
 def white_diff_fobj_step(
@@ -129,10 +136,11 @@ def white_diff_fobj_step(
     return: the outcome as above and a description text.
 
     """
-    if _white_diff(output_fobj, correct_output_fobj):
-        return 1.0, [EVALUATION_MESSAGES.get("success").message]
+    correct, admin_text = _white_diff(output_fobj, correct_output_fobj)
+    if correct:
+        return 1.0, [EVALUATION_MESSAGES.get("success").message], admin_text
     else:
-        return 0.0, [EVALUATION_MESSAGES.get("wrong").message]
+        return 0.0, [EVALUATION_MESSAGES.get("wrong").message], admin_text
 
 
 def white_diff_step(
diff --git a/cms/grading/tasktypes/Batch.py b/cms/grading/tasktypes/Batch.py
@@ -367,7 +367,7 @@ def _evaluate_step(self, job, file_cacher, output_file_params, outcome, text, st
         if box_success:
             assert (output_file_params is None) == (outcome is not None)
             if output_file_params is not None:
-                box_success, outcome, text = eval_output(
+                box_success, outcome, text, admin_text = eval_output(
                     file_cacher, job,
                     self.CHECKER_CODENAME
                     if self._uses_checker() else None,
@@ -378,6 +378,7 @@ def _evaluate_step(self, job, file_cacher, output_file_params, outcome, text, st
         job.outcome = str(outcome) if outcome is not None else None
         job.text = text
         job.plus = stats
+        job.admin_text = admin_text
 
         if sandbox is not None:
             delete_sandbox(sandbox, job)
diff --git a/cms/grading/tasktypes/Communication.py b/cms/grading/tasktypes/Communication.py
@@ -415,7 +415,7 @@ def evaluate(self, job, file_cacher):
 
         # Otherwise, we use the manager to obtain the outcome.
         else:
-            outcome, text = extract_outcome_and_text(sandbox_mgr)
+            outcome, text, admin_text = extract_outcome_and_text(sandbox_mgr)
 
         # If asked so, save the output file with additional information,
         # provided that it exists.
@@ -433,6 +433,7 @@ def evaluate(self, job, file_cacher):
         job.outcome = "%s" % outcome if outcome is not None else None
         job.text = text
         job.plus = stats_user
+        job.admin_text = admin_text
 
         delete_sandbox(sandbox_mgr, job)
         for s in sandbox_user:
diff --git a/cms/grading/tasktypes/OutputOnly.py b/cms/grading/tasktypes/OutputOnly.py
@@ -124,7 +124,7 @@ def evaluate(self, job, file_cacher):
             return
 
         # First and only step: eval the user output.
-        box_success, outcome, text = eval_output(
+        box_success, outcome, text, admin_text = eval_output(
             file_cacher, job,
             OutputOnly.CHECKER_CODENAME if self._uses_checker() else None,
             user_output_digest=job.files[user_output_filename].digest)
@@ -133,5 +133,6 @@ def evaluate(self, job, file_cacher):
         job.success = box_success
         job.outcome = str(outcome) if outcome is not None else None
         job.text = text
+        job.admin_text = admin_text
         # There is no actual evaluation, so no statistics.
         job.plus = {} if box_success else None
diff --git a/cms/grading/tasktypes/TwoSteps.py b/cms/grading/tasktypes/TwoSteps.py
@@ -333,7 +333,7 @@ def evaluate(self, job, file_cacher):
 
                 # Otherwise evaluate the output file.
                 else:
-                    box_success, outcome, text = eval_output(
+                    box_success, outcome, text, admin_text = eval_output(
                         file_cacher, job,
                         TwoSteps.CHECKER_CODENAME
                         if self._uses_checker() else None,
@@ -344,6 +344,7 @@ def evaluate(self, job, file_cacher):
         job.success = box_success
         job.outcome = str(outcome) if outcome is not None else None
         job.text = text
+        job.admin_text = admin_text
         job.plus = stats
 
         delete_sandbox(first_sandbox, job)
diff --git a/cms/grading/tasktypes/util.py b/cms/grading/tasktypes/util.py
@@ -221,7 +221,7 @@ def eval_output(
     user_output_digest: str | None = None,
     user_output_filename: str = "",
     extra_args: list[str] | None = None
-) -> tuple[bool, float | None, list[str] | None]:
+) -> tuple[bool, float | None, list[str] | None, str]:
     """Evaluate ("check") a user output using a white diff or a checker.
 
     file_cacher: file cacher to use to get files.
@@ -237,8 +237,8 @@ def eval_output(
     extra_args: additional arguments to pass to the checker
 
     return: tuple of success (true if the checker was
-        able to check the solution successfully), outcome and text (both None
-        if success is False).
+        able to check the solution successfully), outcome, text and admin_text
+        (all None if success is False).
 
     """
     if (user_output_path is None) == (user_output_digest is None):
@@ -256,7 +256,7 @@ def eval_output(
 
     if checker_codename is not None:
         if not check_manager_present(job, checker_codename):
-            return False, None, None
+            return False, None, None, None
 
         # Create a brand-new sandbox just for checking.
         sandbox = create_sandbox(file_cacher, name="check")
@@ -275,12 +275,12 @@ def eval_output(
 
         checker_digest = job.managers[checker_codename].digest \
             if checker_codename in job.managers else None
-        success, outcome, text = checker_step(
+        success, outcome, text, admin_text = checker_step(
             sandbox, checker_digest, job.input, job.output,
             EVAL_USER_OUTPUT_FILENAME, extra_args)
 
         delete_sandbox(sandbox, job, success)
-        return success, outcome, text
+        return success, outcome, text, admin_text
 
     else:
         if user_output_path is not None:
@@ -289,6 +289,6 @@ def eval_output(
             user_output_fobj = file_cacher.get_file(user_output_digest)
         with user_output_fobj:
             with file_cacher.get_file(job.output) as correct_output_fobj:
-                outcome, text = white_diff_fobj_step(
+                outcome, text, admin_text = white_diff_fobj_step(
                     user_output_fobj, correct_output_fobj)
-        return True, outcome, text
+        return True, outcome, text, admin_text
diff --git a/cms/server/admin/templates/submission.html b/cms/server/admin/templates/submission.html