Fix Sonar issues: cognitive complexity, params, dup, test smells

harini-venkataraman · claude · harini-venkataraman · commit e35af2f6138c · 2026-04-06T20:03:59.000+05:30
- legacy_executor: extract _run_pipeline_answer_step helper to drop
  _handle_structure_pipeline cognitive complexity from 18 to under 15
- legacy_executor: bundle 9 prompt-run arguments into a prompt_run_args
  dict so _run_line_item_extraction has 8 params (was 15, limit 13)
- legacy_executor: merge implicitly concatenated log string
- structure_tool_task: extract _write_pipeline_outputs helper used by
  both _execute_structure_tool_impl and _run_agentic_extraction to
  remove the duplicated INFILE / COPY_TO_FOLDER write block (fixes
  the 6.1% duplication on new code)
- test_context_retrieval_metrics: use pytest.approx for the float
  comparison, drop the unused executor local, and drop the always-true
  `if is_single_pass` guard

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/workers/executor/executors/legacy_executor.py b/workers/executor/executors/legacy_executor.py
@@ -574,23 +574,57 @@ def _handle_structure_pipeline(self, context: ExecutionContext) -> ExecutionResu
                 input_file_path=input_file_path,
             )
 
-        # ---- Step 4b: Force full-context retrieval for single pass ----
-        # Single pass reads the whole file in one LLM call; force
-        # chunk-size=0 so the fallback path (no cloud plugin) uses
-        # retrieve_complete_context instead of vector DB retrieval.
+        # ---- Step 5: Answer prompt / Single pass ----
+        answer_result = self._run_pipeline_answer_step(
+            context=context,
+            answer_params=answer_params,
+            is_single_pass=is_single_pass,
+            shim=shim,
+            step=step,
+        )
+        if not answer_result.success:
+            return answer_result
+
+        # ---- Step 6: Merge results ----
+        structured_output = answer_result.data
+        self._finalize_pipeline_result(
+            structured_output=structured_output,
+            source_file_name=source_file_name,
+            extracted_text=extracted_text,
+            index_metrics=index_metrics,
+        )
+
+        shim.stream_log("Pipeline completed successfully")
+        return ExecutionResult(success=True, data=structured_output)
+
+    def _run_pipeline_answer_step(
+        self,
+        context: ExecutionContext,
+        answer_params: dict,
+        is_single_pass: bool,
+        shim: ExecutorToolShim,
+        step: int,
+    ) -> ExecutionResult:
+        """Run the answer-prompt step of the structure pipeline.
+
+        For single pass, forces ``chunk-size=0`` (full-context retrieval)
+        and dispatches ``_handle_single_pass_extraction``. Otherwise
+        dispatches ``_handle_answer_prompt``.
+        """
         if is_single_pass:
+            # Single pass reads the whole file in one LLM call; force
+            # chunk-size=0 so the fallback path (no cloud plugin) uses
+            # retrieve_complete_context instead of vector DB retrieval.
             for output in answer_params.get("outputs", []):
                 output["chunk-size"] = 0
                 output["chunk-overlap"] = 0
+            operation = Operation.SINGLE_PASS_EXTRACTION.value
+            mode_label = "single pass"
+        else:
+            operation = Operation.ANSWER_PROMPT.value
+            mode_label = "prompt"
 
-        # ---- Step 5: Answer prompt / Single pass ----
-        mode_label = "single pass" if is_single_pass else "prompt"
         shim.stream_log(f"Pipeline step {step}: Running {mode_label} execution...")
-        operation = (
-            Operation.SINGLE_PASS_EXTRACTION.value
-            if is_single_pass
-            else Operation.ANSWER_PROMPT.value
-        )
         answer_ctx = ExecutionContext(
             executor_name=context.executor_name,
             operation=operation,
@@ -602,23 +636,8 @@ def _handle_structure_pipeline(self, context: ExecutionContext) -> ExecutionResu
             log_events_id=context.log_events_id,
         )
         if is_single_pass:
-            answer_result = self._handle_single_pass_extraction(answer_ctx)
-        else:
-            answer_result = self._handle_answer_prompt(answer_ctx)
-        if not answer_result.success:
-            return answer_result
-
-        # ---- Step 6: Merge results ----
-        structured_output = answer_result.data
-        self._finalize_pipeline_result(
-            structured_output=structured_output,
-            source_file_name=source_file_name,
-            extracted_text=extracted_text,
-            index_metrics=index_metrics,
-        )
-
-        shim.stream_log("Pipeline completed successfully")
-        return ExecutionResult(success=True, data=structured_output)
+            return self._handle_single_pass_extraction(answer_ctx)
+        return self._handle_answer_prompt(answer_ctx)
 
     @staticmethod
     def _inject_table_settings(
@@ -1500,15 +1519,17 @@ def _execute_single_prompt(
                 structured_output=structured_output,
                 metadata=metadata,
                 metrics=metrics,
-                run_id=run_id,
-                execution_id=execution_id,
-                execution_source=execution_source,
-                platform_api_key=platform_api_key,
-                tool_id=tool_id,
-                doc_name=doc_name,
-                prompt_name=prompt_name,
-                file_path=file_path,
-                tool_settings=tool_settings,
+                prompt_run_args={
+                    "run_id": run_id,
+                    "execution_id": execution_id,
+                    "execution_source": execution_source,
+                    "platform_api_key": platform_api_key,
+                    "tool_id": tool_id,
+                    "doc_name": doc_name,
+                    "prompt_name": prompt_name,
+                    "file_path": file_path,
+                    "tool_settings": tool_settings,
+                },
                 shim=shim,
             )
             return
@@ -1724,20 +1745,19 @@ def _run_line_item_extraction(
         structured_output: dict[str, Any],
         metadata: dict[str, Any],
         metrics: dict[str, Any],
-        run_id: str,
-        execution_id: str,
-        execution_source: str,
-        platform_api_key: str,
-        tool_id: str,
-        doc_name: str,
-        prompt_name: str,
-        file_path: str,
-        tool_settings: dict[str, Any],
+        prompt_run_args: dict[str, Any],
         shim: Any,
     ) -> None:
-        """Delegate LINE_ITEM prompt to the line_item executor plugin."""
+        """Delegate LINE_ITEM prompt to the line_item executor plugin.
+
+        ``prompt_run_args`` bundles the per-prompt scalars passed from
+        ``_handle_outputs``: ``run_id``, ``execution_id``,
+        ``execution_source``, ``platform_api_key``, ``tool_id``,
+        ``doc_name``, ``prompt_name``, ``file_path``, ``tool_settings``.
+        """
         from executor.executors.constants import PromptServiceConstants as PSKeys
 
+        prompt_name = prompt_run_args["prompt_name"]
         try:
             line_item_executor = ExecutorRegistry.get("line_item")
         except KeyError:
@@ -1750,20 +1770,20 @@ def _run_line_item_extraction(
         line_item_ctx = ExecutionContext(
             executor_name="line_item",
             operation="line_item_extract",
-            run_id=run_id,
-            execution_source=execution_source,
+            run_id=prompt_run_args["run_id"],
+            execution_source=prompt_run_args["execution_source"],
             organization_id=context.organization_id,
             request_id=context.request_id,
             executor_params={
                 "llm_adapter_instance_id": output.get(PSKeys.LLM, ""),
-                "tool_settings": tool_settings,
+                "tool_settings": prompt_run_args["tool_settings"],
                 "output": output,
                 "prompt": output.get(PSKeys.PROMPTX, ""),
-                "file_path": file_path,
-                "PLATFORM_SERVICE_API_KEY": platform_api_key,
-                "execution_id": execution_id,
-                "tool_id": tool_id,
-                "file_name": doc_name,
+                "file_path": prompt_run_args["file_path"],
+                "PLATFORM_SERVICE_API_KEY": prompt_run_args["platform_api_key"],
+                "execution_id": prompt_run_args["execution_id"],
+                "tool_id": prompt_run_args["tool_id"],
+                "file_name": prompt_run_args["doc_name"],
                 "prompt_name": prompt_name,
             },
         )
@@ -1953,7 +1973,7 @@ def _inject_context_retrieval_metrics(
             elapsed = round(time.monotonic() - start, 4)
         except Exception:
             logger.warning(
-                "Could not measure context_retrieval time for " "single_pass (run_id=%s)",
+                "Could not measure context_retrieval time for single_pass (run_id=%s)",
                 context.run_id,
             )
             return
diff --git a/workers/file_processing/structure_tool_task.py b/workers/file_processing/structure_tool_task.py
@@ -407,33 +407,19 @@ def _execute_structure_tool_impl(params: dict) -> dict:
 
     # ---- Step 7: Write output files ----
     # (metadata/metrics merging already done by executor pipeline)
-    try:
-        output_path = Path(output_dir_path) / f"{Path(source_file_name).stem}.json"
-        logger.info("Writing output to %s", output_path)
-        fs.json_dump(path=output_path, data=structured_output)
-
-        # Overwrite INFILE with JSON output (matches Docker-based tool behavior).
-        # The destination connector reads from INFILE and checks MIME type —
-        # if we don't overwrite it, INFILE still has the original PDF.
-        logger.info("Overwriting INFILE with structured output: %s", input_file_path)
-        fs.json_dump(path=input_file_path, data=structured_output)
-
-        # Write to COPY_TO_FOLDER for FS destinations.
-        # The old Docker flow created this via ToolExecutor._setup_for_run();
-        # the destination connector expects output at
-        # {file_execution_dir}/COPY_TO_FOLDER/{filename}.json
-        copy_to_folder = str(Path(execution_data_dir) / "COPY_TO_FOLDER")
-        fs.mkdir(copy_to_folder)
-        copy_output_path = str(
-            Path(copy_to_folder) / f"{Path(source_file_name).stem}.json"
-        )
-        fs.json_dump(path=copy_output_path, data=structured_output)
-        logger.info("Output written to COPY_TO_FOLDER: %s", copy_output_path)
-
-        logger.info("Output written successfully to workflow storage")
-    except Exception as e:
-        logger.error("Failed to write output files: %s", e, exc_info=True)
-        return ExecutionResult.failure(error=f"Error writing output file: {e}").to_dict()
+    write_error = _write_pipeline_outputs(
+        fs=fs,
+        structured_output=structured_output,
+        output_dir_path=output_dir_path,
+        input_file_path=input_file_path,
+        execution_data_dir=execution_data_dir,
+        source_file_name=source_file_name,
+        label="structured",
+    )
+    if write_error:
+        return ExecutionResult.failure(
+            error=f"Error writing output file: {write_error}"
+        ).to_dict()
 
     # Write tool result + tool_metadata to METADATA.json
     # (destination connector reads output_type from tool_metadata)
@@ -620,33 +606,80 @@ def _run_agentic_extraction(
     elapsed = time.monotonic() - start_time
 
     # Write output files (matches regular pipeline path)
+    write_error = _write_pipeline_outputs(
+        fs=fs,
+        structured_output=structured_output,
+        output_dir_path=output_dir_path,
+        input_file_path=input_file_path,
+        execution_data_dir=execution_data_dir,
+        source_file_name=source_file_name,
+        label="agentic",
+    )
+    if write_error:
+        return ExecutionResult.failure(
+            error=f"Error writing agentic output: {write_error}"
+        ).to_dict()
+
+    # Write tool result + tool_metadata to METADATA.json
+    _write_tool_result(fs, execution_data_dir, structured_output, elapsed)
+
+    return ExecutionResult(success=True, data=structured_output).to_dict()
+
+
+def _write_pipeline_outputs(
+    fs: Any,
+    structured_output: dict,
+    output_dir_path: str,
+    input_file_path: str,
+    execution_data_dir: str,
+    source_file_name: str,
+    label: str,
+) -> str | None:
+    """Write structure-tool / agentic outputs to disk.
+
+    Mirrors the old Docker tool's output layout so the destination
+    connector finds what it expects:
+
+    1. ``{output_dir_path}/{stem}.json`` — primary output file.
+    2. INFILE overwritten with JSON (destination connector reads INFILE
+       and checks MIME type — without this it still sees the original
+       PDF).
+    3. ``{execution_data_dir}/COPY_TO_FOLDER/{stem}.json`` — what the
+       old ``ToolExecutor._setup_for_run()`` created for FS destinations.
+
+    Args:
+        label: Short label for log lines (``"structured"`` or
+            ``"agentic"``).
+
+    Returns:
+        ``None`` on success, or the error string on failure.
+    """
     try:
-        output_path = Path(output_dir_path) / f"{Path(source_file_name).stem}.json"
-        logger.info("Writing agentic output to %s", output_path)
+        stem = Path(source_file_name).stem
+        output_path = Path(output_dir_path) / f"{stem}.json"
+        logger.info("Writing %s output to %s", label, output_path)
         fs.json_dump(path=output_path, data=structured_output)
 
-        # Overwrite INFILE with JSON output so destination connector reads JSON, not PDF
-        logger.info("Overwriting INFILE with agentic output: %s", input_file_path)
+        logger.info(
+            "Overwriting INFILE with %s output: %s", label, input_file_path
+        )
         fs.json_dump(path=input_file_path, data=structured_output)
 
-        # Write to COPY_TO_FOLDER for FS destinations (same as regular pipeline)
         copy_to_folder = str(Path(execution_data_dir) / "COPY_TO_FOLDER")
         fs.mkdir(copy_to_folder)
-        copy_output_path = str(
-            Path(copy_to_folder) / f"{Path(source_file_name).stem}.json"
-        )
+        copy_output_path = str(Path(copy_to_folder) / f"{stem}.json")
         fs.json_dump(path=copy_output_path, data=structured_output)
-        logger.info("Agentic output written to COPY_TO_FOLDER: %s", copy_output_path)
-    except Exception as e:
-        logger.error("Failed to write agentic output files: %s", e, exc_info=True)
-        return ExecutionResult.failure(
-            error=f"Error writing agentic output: {e}"
-        ).to_dict()
-
-    # Write tool result + tool_metadata to METADATA.json
-    _write_tool_result(fs, execution_data_dir, structured_output, elapsed)
+        logger.info(
+            "%s output written to COPY_TO_FOLDER: %s",
+            label.capitalize(),
+            copy_output_path,
+        )
 
-    return ExecutionResult(success=True, data=structured_output).to_dict()
+        logger.info("Output written successfully to workflow storage")
+        return None
+    except Exception as e:
+        logger.error("Failed to write %s output files: %s", label, e, exc_info=True)
+        return str(e)
 
 
 def _write_tool_result(
diff --git a/workers/tests/test_context_retrieval_metrics.py b/workers/tests/test_context_retrieval_metrics.py
@@ -111,7 +111,9 @@ def test_preserves_existing_context_retrieval(self, mock_fs):
         executor._inject_context_retrieval_metrics(result, ctx)
 
         # field_a's existing timing preserved
-        assert result.data["metrics"]["field_a"]["context_retrieval"]["time_taken(s)"] == 0.999
+        assert result.data["metrics"]["field_a"]["context_retrieval"][
+            "time_taken(s)"
+        ] == pytest.approx(0.999)
         # field_b gets new timing
         assert "context_retrieval" in result.data["metrics"]["field_b"]
 
@@ -253,8 +255,6 @@ def test_single_pass_forces_chunk_size_zero(self, mock_fs):
         fs.exists.return_value = False
         mock_fs.return_value = fs
 
-        executor = _get_executor()
-
         # Build minimal answer_params with non-zero chunk-size
         outputs = [
             {
@@ -299,11 +299,10 @@ def test_single_pass_forces_chunk_size_zero(self, mock_fs):
         }
 
         # Apply the same logic as _handle_structure_pipeline step 4b
-        is_single_pass = True
-        if is_single_pass:
-            for output in answer_params.get("outputs", []):
-                output["chunk-size"] = 0
-                output["chunk-overlap"] = 0
+        # (single pass forces chunk-size=0 to use full-context retrieval)
+        for output in answer_params.get("outputs", []):
+            output["chunk-size"] = 0
+            output["chunk-overlap"] = 0
 
         # Verify outputs were modified
         for output in answer_params["outputs"]: