diff --git a/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py b/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py index 7dfa870b4..d0ffef311 100644 --- a/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py +++ b/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py @@ -509,6 +509,49 @@ def build_index_payload( storage_type=StorageType.PERMANENT, env_name=FileStorageKeys.PERMANENT_REMOTE_STORAGE, ) + + # Compute x2text_config_hash early so the marker check below (and the + # post-success callback) can both consume the same value. + x2text_metadata = default_profile.x2text.metadata or {} + x2text_config_hash = ToolUtils.hash_str( + json.dumps(x2text_metadata, sort_keys=True) + ) + + # Manage Documents → Index: mirror the pre-async dynamic_extractor + # behaviour. If the extraction marker says this x2text_config_hash + + # enable_highlight combination is already extracted, read the existing + # extract file from disk and reuse it so the executor can skip the + # extract step. Any failure here falls back to full extraction. + reused_extracted_text: str | None = None + try: + already_extracted = PromptStudioIndexHelper.check_extraction_status( + document_id=document_id, + profile_manager=default_profile, + x2text_config_hash=x2text_config_hash, + enable_highlight=tool.enable_highlight, + ) + if already_extracted: + try: + reused_extracted_text = fs_instance.read( + path=extract_file_path, mode="r" + ) + logger.info( + "Manage Documents index: marker valid, reusing existing " + "extract file for document=%s", + document_id, + ) + except FileNotFoundError: + logger.warning( + "Marker says extracted but extract file missing: %s. 
" + "Will re-extract.", + extract_file_path, + ) + except Exception: + logger.warning( + "check_extraction_status raised; falling back to full extraction", + exc_info=True, + ) + util = PromptIdeBaseTool(log_level=LogLevel.INFO, org_id=org_id) doc_id_key = IndexingUtils.generate_index_key( vector_db=str(default_profile.vector_store.id), @@ -550,6 +593,11 @@ def build_index_payload( "platform_api_key": platform_api_key, } + # On marker-hit, pre-populate the extracted text so the executor's + # _handle_ide_index skips the extract step entirely. + if reused_extracted_text: + index_params[IKeys.EXTRACTED_TEXT] = reused_extracted_text + log_events_id = StateStore.get(Common.LOG_EVENTS_ID) or "" request_id = StateStore.get(Common.REQUEST_ID) or "" @@ -568,12 +616,9 @@ def build_index_payload( log_events_id=log_events_id, ) - # x2text config hash for extraction status tracking in callback - x2text_metadata = default_profile.x2text.metadata or {} - x2text_config_hash = ToolUtils.hash_str( - json.dumps(x2text_metadata, sort_keys=True) - ) - + # x2text_config_hash (computed above) is forwarded to the callback so + # ide_index_complete can refresh the extraction marker via + # mark_extraction_status. cb_kwargs = { "log_events_id": log_events_id, "request_id": request_id, diff --git a/backend/prompt_studio/prompt_studio_core_v2/tests/__init__.py b/backend/prompt_studio/prompt_studio_core_v2/tests/__init__.py new file mode 100644 index 000000000..41e4777a2 --- /dev/null +++ b/backend/prompt_studio/prompt_studio_core_v2/tests/__init__.py @@ -0,0 +1 @@ +# Tests for prompt_studio_core_v2. 
diff --git a/backend/prompt_studio/prompt_studio_core_v2/tests/test_build_index_payload.py b/backend/prompt_studio/prompt_studio_core_v2/tests/test_build_index_payload.py new file mode 100644 index 000000000..adb571324 --- /dev/null +++ b/backend/prompt_studio/prompt_studio_core_v2/tests/test_build_index_payload.py @@ -0,0 +1,492 @@ +"""Regression tests for ``PromptStudioHelper.build_index_payload``. + +These tests pin the Manage Documents → Index marker-reuse behaviour +introduced to fix the "extract runs every time" QA bug. The helper +must: + + 1. On a valid extraction marker + readable extract file, pre-populate + ``index_params[IKeys.EXTRACTED_TEXT]`` so the executor's + ``_handle_ide_index`` skips the extract step entirely. + 2. On a marker hit where the extract file is missing, fall back to + full extraction (do NOT pre-populate the field). + 3. On a marker miss, fall back to full extraction. + 4. On an error inside ``check_extraction_status``, swallow the error + and fall back to full extraction — the dispatch must not fail. + +The backend test environment has no ``pytest-django``, no SQLite +fallback, and the helper has a heavy Django-coupled import surface. +Rather than spin up Django, we stub every collaborator as a +``MagicMock`` on ``sys.modules`` *before* importing the helper, and +then patch ``PromptStudioHelper`` class methods per-test. This mirrors +the ``usage_v2/tests/test_helper.py`` approach. + +If the helper module cannot be imported in a given environment (for +example because the stub surface has drifted), all tests in the module +are skipped with a clear reason. +""" + +from __future__ import annotations + +import sys +import types +from typing import Any +from unittest.mock import MagicMock, patch + +import pytest + + +# --------------------------------------------------------------------------- +# Stub every collaborator module on sys.modules before importing the helper. 
+# These stubs are intentionally broad MagicMocks — the tests patch the +# specific attributes they care about via ``unittest.mock.patch``. +# --------------------------------------------------------------------------- + + +def _install(name: str, attrs: dict[str, Any] | None = None) -> types.ModuleType: + """Install (or replace) a fake module into ``sys.modules``. + + Always creates a fresh ``ModuleType``; this is important because the + real module may already have been imported before these stubs run + (via pytest collection, conftest, etc.), and we need our fake to + actually take effect. + """ + mod = types.ModuleType(name) + if attrs: + for key, value in attrs.items(): + setattr(mod, key, value) + sys.modules[name] = mod + return mod + + +def _install_package(name: str) -> types.ModuleType: + """Install a fake package (marked with ``__path__``). + + Only stubs the package if it is not already in ``sys.modules``. + This prevents clobbering packages like ``unstract.core`` that must + retain their real ``__path__`` for submodule resolution. The child + modules we care about are always replaced explicitly via + ``_install``. 
+ """ + if name in sys.modules: + return sys.modules[name] + mod = types.ModuleType(name) + mod.__path__ = [] # type: ignore[attr-defined] + sys.modules[name] = mod + return mod + + +try: + # Account / adapter stubs + _install_package("account_v2") + _install( + "account_v2.constants", + {"Common": type("Common", (), {"LOG_EVENTS_ID": "log_events_id", + "REQUEST_ID": "request_id"})}, + ) + _install("account_v2.models", {"User": MagicMock(name="User")}) + _install_package("adapter_processor_v2") + _install( + "adapter_processor_v2.constants", + {"AdapterKeys": type("AdapterKeys", (), {})}, + ) + _install( + "adapter_processor_v2.models", + {"AdapterInstance": MagicMock(name="AdapterInstance")}, + ) + + # Plugins stub + _install("plugins", {"get_plugin": MagicMock(return_value=None)}) + + # utils stubs + _install_package("utils") + _install_package("utils.file_storage") + _install( + "utils.file_storage.constants", + { + "FileStorageKeys": type( + "FileStorageKeys", + (), + {"PERMANENT_REMOTE_STORAGE": "permanent"}, + ) + }, + ) + _install_package("utils.file_storage.helpers") + _install( + "utils.file_storage.helpers.prompt_studio_file_helper", + {"PromptStudioFileHelper": MagicMock(name="PromptStudioFileHelper")}, + ) + _install( + "utils.local_context", + {"StateStore": MagicMock(name="StateStore")}, + ) + + # backend.celery_service stub + _install_package("backend") + _install( + "backend.celery_service", + {"app": MagicMock(name="celery_app")}, + ) + + # prompt_studio stubs + _install_package("prompt_studio") + _install_package("prompt_studio.prompt_profile_manager_v2") + _install( + "prompt_studio.prompt_profile_manager_v2.models", + {"ProfileManager": MagicMock(name="ProfileManager")}, + ) + _install( + "prompt_studio.prompt_profile_manager_v2.profile_manager_helper", + {"ProfileManagerHelper": MagicMock(name="ProfileManagerHelper")}, + ) + + _install_package("prompt_studio.prompt_studio_document_manager_v2") + _install( + 
"prompt_studio.prompt_studio_document_manager_v2.models", + {"DocumentManager": MagicMock(name="DocumentManager")}, + ) + + _install_package("prompt_studio.prompt_studio_index_manager_v2") + _install( + "prompt_studio.prompt_studio_index_manager_v2.prompt_studio_index_helper", + {"PromptStudioIndexHelper": MagicMock(name="PromptStudioIndexHelper")}, + ) + + _install_package("prompt_studio.prompt_studio_output_manager_v2") + _install( + "prompt_studio.prompt_studio_output_manager_v2.output_manager_helper", + {"OutputManagerHelper": MagicMock(name="OutputManagerHelper")}, + ) + + _install_package("prompt_studio.prompt_studio_v2") + _install( + "prompt_studio.prompt_studio_v2.models", + {"ToolStudioPrompt": MagicMock(name="ToolStudioPrompt")}, + ) + + # Stub the prompt_studio_core_v2 sibling modules too — several of them + # transitively import modules (like ``utils.cache_service``) that we + # don't want to pull in for these unit tests. + _install_package("prompt_studio.prompt_studio_core_v2") + _install( + "prompt_studio.prompt_studio_core_v2.document_indexing_service", + {"DocumentIndexingService": MagicMock(name="DocumentIndexingService")}, + ) + + # Real exception classes — build_index_payload uses ``raise``. 
+ class _FakeExc(Exception): + pass + + _install( + "prompt_studio.prompt_studio_core_v2.exceptions", + { + "AnswerFetchError": type("AnswerFetchError", (_FakeExc,), {}), + "DefaultProfileError": type("DefaultProfileError", (_FakeExc,), {}), + "EmptyPromptError": type("EmptyPromptError", (_FakeExc,), {}), + "ExtractionAPIError": type("ExtractionAPIError", (_FakeExc,), {}), + "IndexingAPIError": type("IndexingAPIError", (_FakeExc,), {}), + "NoPromptsFound": type("NoPromptsFound", (_FakeExc,), {}), + "OperationNotSupported": type("OperationNotSupported", (_FakeExc,), {}), + "PermissionError": type("PermissionError", (_FakeExc,), {}), + }, + ) + _install( + "prompt_studio.prompt_studio_core_v2.migration_utils", + {"SummarizeMigrationUtils": MagicMock(name="SummarizeMigrationUtils")}, + ) + _install( + "prompt_studio.prompt_studio_core_v2.models", + {"CustomTool": MagicMock(name="CustomTool")}, + ) + _install( + "prompt_studio.prompt_studio_core_v2.prompt_ide_base_tool", + {"PromptIdeBaseTool": MagicMock(name="PromptIdeBaseTool")}, + ) + _install( + "prompt_studio.prompt_studio_core_v2.prompt_variable_service", + {"PromptStudioVariableService": MagicMock(name="PromptStudioVariableService")}, + ) + + # unstract.core.pubsub_helper stub (LogPublisher isn't used by + # build_index_payload but the module-level import must succeed). + _install_package("unstract.core") + _install( + "unstract.core.pubsub_helper", + {"LogPublisher": MagicMock(name="LogPublisher")}, + ) + + # unstract.sdk1 stubs — these heavy modules transitively pull in + # ``unstract.core.cache.redis_client`` which isn't on the python + # path for the backend tests. We only need the leaf classes. 
+ _install_package("unstract.sdk1") + _install( + "unstract.sdk1.constants", + { + "LogLevel": type( + "LogLevel", (), {"INFO": "INFO", "WARN": "WARN", "ERROR": "ERROR"} + ) + }, + ) + _install( + "unstract.sdk1.exceptions", + { + "IndexingError": type("IndexingError", (Exception,), {}), + "SdkError": type("SdkError", (Exception,), {}), + }, + ) + _install_package("unstract.sdk1.execution") + + class _FakeExecutionContext: + """Minimal ExecutionContext that keeps ``executor_params`` as + the real dict we pass in (the tests inspect it).""" + + def __init__(self, **kwargs: Any) -> None: + self.executor_name = kwargs.get("executor_name") + self.operation = kwargs.get("operation") + self.run_id = kwargs.get("run_id") + self.execution_source = kwargs.get("execution_source") + self.organization_id = kwargs.get("organization_id") + self.executor_params = kwargs.get("executor_params") or {} + self.request_id = kwargs.get("request_id") + self.log_events_id = kwargs.get("log_events_id") + + _install( + "unstract.sdk1.execution.context", + {"ExecutionContext": _FakeExecutionContext}, + ) + _install( + "unstract.sdk1.execution.dispatcher", + {"ExecutionDispatcher": MagicMock(name="ExecutionDispatcher")}, + ) + _install_package("unstract.sdk1.file_storage") + _install( + "unstract.sdk1.file_storage.constants", + {"StorageType": type("StorageType", (), {"PERMANENT": "permanent"})}, + ) + _install( + "unstract.sdk1.file_storage.env_helper", + {"EnvHelper": MagicMock(name="EnvHelper")}, + ) + _install_package("unstract.sdk1.utils") + _install( + "unstract.sdk1.utils.indexing", + {"IndexingUtils": MagicMock(name="IndexingUtils")}, + ) + _install( + "unstract.sdk1.utils.tool", + {"ToolUtils": MagicMock(name="ToolUtils")}, + ) + + # Now import the helper module. If this fails, all tests below will + # be skipped via the ``_IMPORT_ERROR`` sentinel. 
+ from prompt_studio.prompt_studio_core_v2 import prompt_studio_helper as _psh_mod # noqa: E402 + + PromptStudioHelper = _psh_mod.PromptStudioHelper + IKeys = _psh_mod.IKeys + _IMPORT_ERROR: str | None = None +except Exception as exc: # pragma: no cover — environment guard + _IMPORT_ERROR = ( + f"prompt_studio_helper could not be imported in this environment: " + f"{type(exc).__name__}: {exc}" + ) + PromptStudioHelper = None # type: ignore[assignment] + IKeys = None # type: ignore[assignment] + + +pytestmark = pytest.mark.skipif( + _IMPORT_ERROR is not None, reason=_IMPORT_ERROR or "" +) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_tool(enable_highlight: bool = False, summarize_context: bool = False): + tool = MagicMock(name="CustomTool") + tool.enable_highlight = enable_highlight + tool.summarize_context = summarize_context + return tool + + +def _make_profile(): + profile = MagicMock(name="ProfileManager") + profile.x2text.id = "x2t-1" + profile.x2text.metadata = {"model": "default"} + profile.embedding_model.id = "emb-1" + profile.vector_store.id = "vdb-1" + profile.chunk_size = 512 + profile.chunk_overlap = 64 + profile.profile_id = "profile-1" + return profile + + +def _dispatch_build( + *, + check_return: bool | Exception, + read_return: str | Exception, + tool: Any = None, + profile: Any = None, +): + """Run ``build_index_payload`` with all collaborators patched. + + ``check_return`` / ``read_return`` configure the two branches we + care about: + * ``check_return`` — ``check_extraction_status`` return value + or an exception to raise. + * ``read_return`` — ``fs_instance.read`` return value or an + exception to raise. + + Returns ``(context, cb_kwargs, fs_instance, check_mock)``.
+ """ + tool = tool or _make_tool() + profile = profile or _make_profile() + + fs_instance = MagicMock(name="fs_instance") + if isinstance(read_return, Exception): + fs_instance.read.side_effect = read_return + else: + fs_instance.read.return_value = read_return + + check_mock = MagicMock(name="check_extraction_status") + if isinstance(check_return, Exception): + check_mock.side_effect = check_return + else: + check_mock.return_value = check_return + + # Patch everything via context managers so each test starts clean. + patches = [ + patch.object( + _psh_mod.CustomTool, + "objects", + MagicMock(get=MagicMock(return_value=tool)), + ), + patch.object( + _psh_mod.PromptStudioFileHelper, + "get_or_create_prompt_studio_subdirectory", + return_value="/prompt-studio/org/user/tool", + ), + patch.object( + _psh_mod.ProfileManager, + "get_default_llm_profile", + return_value=profile, + ), + patch.object( + PromptStudioHelper, + "validate_adapter_status", + return_value=None, + ), + patch.object( + PromptStudioHelper, + "validate_profile_manager_owner_access", + return_value=None, + ), + patch.object( + PromptStudioHelper, + "_get_platform_api_key", + return_value="pk-test", + ), + patch.object( + PromptStudioHelper, + "_build_summarize_params", + return_value=(None, "", MagicMock()), + ), + patch.object( + _psh_mod.EnvHelper, + "get_storage", + return_value=fs_instance, + ), + patch.object( + _psh_mod.PromptStudioIndexHelper, + "check_extraction_status", + check_mock, + ), + patch.object( + _psh_mod.IndexingUtils, + "generate_index_key", + return_value="doc-key-1", + ), + patch.object( + _psh_mod, + "PromptIdeBaseTool", + MagicMock(return_value=MagicMock()), + ), + patch.object( + _psh_mod.StateStore, + "get", + return_value="", + ), + ] + for p in patches: + p.start() + try: + context, cb_kwargs = PromptStudioHelper.build_index_payload( + tool_id="tool-1", + file_name="doc.pdf", + org_id="org-1", + user_id="user-1", + document_id="doc-1", + run_id="run-1", + ) + return 
context, cb_kwargs, fs_instance, check_mock + finally: + for p in patches: + p.stop() + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +class TestBuildIndexPayloadMarker: + """Verify that build_index_payload honours the extraction marker.""" + + def test_marker_hit_prepopulates_extracted_text(self) -> None: + """Marker True + file readable → EXTRACTED_TEXT is pre-populated.""" + context, _cb, fs_instance, check_mock = _dispatch_build( + check_return=True, + read_return="existing extracted content", + ) + index_params = context.executor_params["index_params"] + assert index_params[IKeys.EXTRACTED_TEXT] == "existing extracted content" + fs_instance.read.assert_called_once() + check_mock.assert_called_once() + + def test_marker_hit_missing_file_does_not_prepopulate(self) -> None: + """Marker True + FileNotFoundError → field NOT set, fall back to extract.""" + context, _cb, fs_instance, _check = _dispatch_build( + check_return=True, + read_return=FileNotFoundError("missing"), + ) + index_params = context.executor_params["index_params"] + assert IKeys.EXTRACTED_TEXT not in index_params + fs_instance.read.assert_called_once() + + def test_marker_miss_does_not_prepopulate(self) -> None: + """Marker False → EXTRACTED_TEXT NOT set, extract runs as before.""" + context, _cb, fs_instance, _check = _dispatch_build( + check_return=False, + read_return="should-not-be-read", + ) + index_params = context.executor_params["index_params"] + assert IKeys.EXTRACTED_TEXT not in index_params + fs_instance.read.assert_not_called() + + def test_check_extraction_status_raises_is_swallowed(self, caplog) -> None: + """check_extraction_status error → warn, field NOT set, no re-raise.""" + import logging as _logging + + caplog.set_level(_logging.WARNING, logger=_psh_mod.logger.name) + context, _cb, fs_instance, _check = _dispatch_build( + check_return=RuntimeError("db 
down"), + read_return="should-not-be-read", + ) + index_params = context.executor_params["index_params"] + assert IKeys.EXTRACTED_TEXT not in index_params + fs_instance.read.assert_not_called() + # A warning should have been emitted about the fallback. + assert any( + "falling back to full extraction" in rec.getMessage() + for rec in caplog.records + ) diff --git a/workers/executor/executors/legacy_executor.py b/workers/executor/executors/legacy_executor.py index 9a3bf710e..18e1f4774 100644 --- a/workers/executor/executors/legacy_executor.py +++ b/workers/executor/executors/legacy_executor.py @@ -408,20 +408,30 @@ def _handle_ide_index(self, context: ExecutionContext) -> ExecutionResult: error=f"ide_index missing required params: {', '.join(missing)}" ) - # Step 1: Extract - extract_ctx = ExecutionContext( - executor_name=context.executor_name, - operation=Operation.EXTRACT.value, - run_id=context.run_id, - execution_source=context.execution_source, - organization_id=context.organization_id, - executor_params=extract_params, - request_id=context.request_id, - log_events_id=context.log_events_id, - ) - extract_result = self._handle_extract(extract_ctx) - if not extract_result.success: - return extract_result + # Step 1: Extract (or reuse pre-extracted text on marker hit) + pre_extracted_text = index_params.get(IKeys.EXTRACTED_TEXT, "") or "" + if pre_extracted_text: + logger.info( + "ide_index: marker hit, skipping extract step " "(len=%d, run_id=%s)", + len(pre_extracted_text), + context.run_id, + ) + extracted_text = pre_extracted_text + else: + extract_ctx = ExecutionContext( + executor_name=context.executor_name, + operation=Operation.EXTRACT.value, + run_id=context.run_id, + execution_source=context.execution_source, + organization_id=context.organization_id, + executor_params=extract_params, + request_id=context.request_id, + log_events_id=context.log_events_id, + ) + extract_result = self._handle_extract(extract_ctx) + if not extract_result.success: + return 
extract_result + extracted_text = extract_result.data.get(IKeys.EXTRACTED_TEXT, "") # Step 2: Optional summarize summarize_params = params.get("summarize_params") @@ -433,7 +443,6 @@ def _handle_ide_index(self, context: ExecutionContext) -> ExecutionResult: return result # Step 3: Index — inject extracted text - extracted_text = extract_result.data.get(IKeys.EXTRACTED_TEXT, "") index_params[IKeys.EXTRACTED_TEXT] = extracted_text index_ctx = ExecutionContext( diff --git a/workers/tests/test_sanity_phase5.py b/workers/tests/test_sanity_phase5.py index 31675b8f9..4d00a571f 100644 --- a/workers/tests/test_sanity_phase5.py +++ b/workers/tests/test_sanity_phase5.py @@ -377,6 +377,167 @@ def test_ide_index_extract_failure( assert not result.success assert "X2Text" in result.error + @patch(_PATCH_INDEX_DEPS) + @patch(_PATCH_FS) + @patch(_PATCH_X2TEXT) + @patch(_PATCH_SHIM) + def test_ide_index_reuses_pre_extracted_text( + self, + mock_shim, + mock_x2text, + mock_fs, + mock_index_deps, + eager_app, + ): + """Marker-hit path: extracted_text pre-populated → extract is skipped.""" + # If _handle_extract runs, this will blow up — the whole point is + # that it must NOT be called when index_params carries extracted_text. + x2t_instance = MagicMock() + x2t_instance.process.side_effect = AssertionError( + "extract must not run when index_params contains pre-extracted text" + ) + mock_x2text.return_value = x2t_instance + + fs = MagicMock() + fs.exists.return_value = False + mock_fs.return_value = fs + + # Mock index — capture the extracted_text that reached _handle_index. 
+ index_inst = MagicMock() + index_inst.generate_index_key.return_value = "idx-doc-reuse" + index_inst.is_document_indexed.return_value = False + mock_index_deps.return_value = ( + MagicMock(return_value=index_inst), + MagicMock(), + MagicMock(), + ) + + ctx = ExecutionContext( + executor_name="legacy", + operation="ide_index", + run_id="run-ide-reuse", + execution_source="ide", + organization_id="org-test", + executor_params={ + "extract_params": { + "x2text_instance_id": "x2t-1", + "file_path": "/data/doc.pdf", + "enable_highlight": False, + "output_file_path": "/data/extract/doc.txt", + "platform_api_key": "pk-test", + "usage_kwargs": {}, + }, + "index_params": { + "tool_id": "tool-1", + "embedding_instance_id": "emb-1", + "vector_db_instance_id": "vdb-1", + "x2text_instance_id": "x2t-1", + "file_path": "/data/extract/doc.txt", + "file_hash": None, + "chunk_overlap": 64, + "chunk_size": 512, + "reindex": True, + "enable_highlight": False, + "usage_kwargs": {}, + "run_id": "run-ide-reuse", + "execution_source": "ide", + "platform_api_key": "pk-test", + "extracted_text": "reused extracted payload", + }, + }, + ) + + result_dict = _run_task(eager_app, ctx.to_dict()) + result = ExecutionResult.from_dict(result_dict) + + assert result.success + assert result.data["doc_id"] == "idx-doc-reuse" + # Extract adapter must never have been called. + x2t_instance.process.assert_not_called() + # perform_indexing received the pre-populated text. 
+ perform_call_kwargs = index_inst.perform_indexing.call_args.kwargs + assert ( + perform_call_kwargs.get("extracted_text") == "reused extracted payload" + ) + + @patch(_PATCH_INDEX_DEPS) + @patch(_PATCH_FS) + @patch(_PATCH_X2TEXT) + @patch(_PATCH_SHIM) + def test_ide_index_without_pre_extracted_text_runs_extract( + self, + mock_shim, + mock_x2text, + mock_fs, + mock_index_deps, + eager_app, + ): + """Marker-miss path: extract runs as before when extracted_text is absent.""" + x2t_instance = MagicMock() + x2t_instance.process.return_value = _mock_process_response( + "freshly extracted" + ) + mock_x2text.return_value = x2t_instance + + fs = MagicMock() + fs.exists.return_value = False + mock_fs.return_value = fs + + index_inst = MagicMock() + index_inst.generate_index_key.return_value = "idx-doc-fresh" + index_inst.is_document_indexed.return_value = False + mock_index_deps.return_value = ( + MagicMock(return_value=index_inst), + MagicMock(), + MagicMock(), + ) + + ctx = ExecutionContext( + executor_name="legacy", + operation="ide_index", + run_id="run-ide-fresh", + execution_source="ide", + organization_id="org-test", + executor_params={ + "extract_params": { + "x2text_instance_id": "x2t-1", + "file_path": "/data/doc.pdf", + "enable_highlight": False, + "output_file_path": "/data/extract/doc.txt", + "platform_api_key": "pk-test", + "usage_kwargs": {}, + }, + "index_params": { + "tool_id": "tool-1", + "embedding_instance_id": "emb-1", + "vector_db_instance_id": "vdb-1", + "x2text_instance_id": "x2t-1", + "file_path": "/data/extract/doc.txt", + "file_hash": None, + "chunk_overlap": 64, + "chunk_size": 512, + "reindex": True, + "enable_highlight": False, + "usage_kwargs": {}, + "run_id": "run-ide-fresh", + "execution_source": "ide", + "platform_api_key": "pk-test", + # No "extracted_text" key → must take the extract path. 
+ }, + }, + ) + + result_dict = _run_task(eager_app, ctx.to_dict()) + result = ExecutionResult.from_dict(result_dict) + + assert result.success + assert result.data["doc_id"] == "idx-doc-fresh" + # Extract adapter was called exactly once. + x2t_instance.process.assert_called_once() + # perform_indexing received the freshly extracted text. + perform_call_kwargs = index_inst.perform_indexing.call_args.kwargs + assert perform_call_kwargs.get("extracted_text") == "freshly extracted" + # --------------------------------------------------------------------------- # 5D: structure_pipeline compound operation through eager chain