Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
b2b0426
(feat:drive) oauth flow
ManishMadan2882 Aug 20, 2025
ab05726
Merge branch 'main' of https://github.com/manishmadan2882/docsgpt
ManishMadan2882 Aug 20, 2025
3b69bea
(chore:settings)addefault oath creds
ManishMadan2882 Aug 21, 2025
c2bebba
(feat:oauth/drive) raw fe integrate
ManishMadan2882 Aug 21, 2025
193d59f
Merge branch 'main' of https://github.com/arc53/DocsGPT
actions-user Aug 22, 2025
8c3f75e
(feat:ingestion) google drive loader
ManishMadan2882 Aug 22, 2025
f82be23
(feat:ingestion) external drive connect
ManishMadan2882 Aug 22, 2025
92d6ae5
(fix:google-oauth) no explicit datetime compare
ManishMadan2882 Aug 22, 2025
e7430f0
(feat:googleDrive,fe) file tree
ManishMadan2882 Aug 22, 2025
b1210c4
Merge branch 'main' of https://github.com/manishmadan2882/docsgpt
ManishMadan2882 Aug 22, 2025
2410bd8
(fix:driveLoader) folder ingesting
ManishMadan2882 Aug 22, 2025
e25b988
Merge branch 'main' of https://github.com/arc53/DocsGPT
actions-user Aug 23, 2025
15a9e97
(feat:ingest_connectors) spread config params
ManishMadan2882 Aug 25, 2025
f09f143
(feat:connectors) separate layer
ManishMadan2882 Aug 25, 2025
578c682
(feat:connectors) abstracting auth, base class
ManishMadan2882 Aug 25, 2025
f08067a
Merge branch 'main' of https://github.com/arc53/DocsGPT
actions-user Aug 27, 2025
4065041
(feat:connectors) separate routes, namespace
ManishMadan2882 Aug 27, 2025
a0cc2e4
Merge branch 'main' of https://github.com/manishmadan2882/docsgpt
ManishMadan2882 Aug 27, 2025
f39ac99
(feat:auth) follow connector-session
ManishMadan2882 Aug 27, 2025
018273c
(feat:connector) refactor, updated routes FE
ManishMadan2882 Aug 28, 2025
e55d1a5
(feat:connector,auth) consider user_id
ManishMadan2882 Aug 28, 2025
e0adc3e
Merge branch 'main' of https://github.com/arc53/DocsGPT
actions-user Aug 29, 2025
2868e47
(feat:connector) provider metadata, separate fe nested display
ManishMadan2882 Aug 29, 2025
bb4ea76
(fix:connectorTree) path navigation fn
ManishMadan2882 Sep 1, 2025
8c986aa
Merge branch 'main' of https://github.com/manishmadan2882/docsgpt
ManishMadan2882 Sep 1, 2025
384ad3e
(feat:connector) raw sync flow
ManishMadan2882 Sep 2, 2025
c2c18e8
(feat:connector,fe) sync api, notification
ManishMadan2882 Sep 2, 2025
f9b2c95
(feat:connector) sync, simply re-ingest
ManishMadan2882 Sep 2, 2025
3b3a04a
(feat:connector) sync fixes UI, minor refactor
ManishMadan2882 Sep 2, 2025
7e2cbdd
(feat:connector) redirect url as backend overhead
ManishMadan2882 Sep 3, 2025
c4a598f
(lint-fix) ruff
ManishMadan2882 Sep 3, 2025
f7f6042
(feat:connector) paginate files
ManishMadan2882 Sep 4, 2025
5a9bc6d
(feat:connector) infinite scroll file pick
ManishMadan2882 Sep 4, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 3 additions & 5 deletions application/agents/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,17 @@

from bson.objectid import ObjectId

logger = logging.getLogger(__name__)

from application.agents.tools.tool_action_parser import ToolActionParser
from application.agents.tools.tool_manager import ToolManager

from application.core.mongo_db import MongoDB
from application.core.settings import settings

from application.llm.handlers.handler_creator import LLMHandlerCreator
from application.llm.llm_creator import LLMCreator
from application.logging import build_stack_data, log_activity, LogContext
from application.retriever.base import BaseRetriever

logger = logging.getLogger(__name__)


class BaseAgent(ABC):
def __init__(
Expand Down Expand Up @@ -157,7 +155,7 @@ def _execute_tool_action(self, tools_dict, call):
}
yield {"type": "tool_call", "data": {**tool_call_data, "status": "error"}}
self.tool_calls.append(tool_call_data)
return f"Failed to parse tool call.", call_id
return "Failed to parse tool call.", call_id

# Check if tool_id exists in available tools
if tool_id not in tools_dict:
Expand Down
627 changes: 627 additions & 0 deletions application/api/connector/routes.py

Large diffs are not rendered by default.

123 changes: 89 additions & 34 deletions application/api/user/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

from application.api.user.tasks import (
ingest,
ingest_connector_task,
ingest_remote,
process_agent_webhook,
store_attachment,
Expand All @@ -46,6 +47,7 @@
)
from application.utils import num_tokens_from_string
from application.vectorstore.vector_creator import VectorCreator
from application.parser.connectors.connector_creator import ConnectorCreator

storage = StorageCreator.get_storage()

Expand Down Expand Up @@ -492,9 +494,9 @@ def get(self):
)
if not doc:
return make_response(jsonify({"status": "not found"}), 404)

storage = StorageCreator.get_storage()

try:
# Delete vector index
if settings.VECTOR_STORE == "faiss":
Expand All @@ -508,7 +510,7 @@ def get(self):
settings.VECTOR_STORE, source_id=str(doc["_id"])
)
vectorstore.delete_index()

if "file_path" in doc and doc["file_path"]:
file_path = doc["file_path"]
if storage.is_directory(file_path):
Expand All @@ -517,15 +519,15 @@ def get(self):
storage.delete_file(f)
else:
storage.delete_file(file_path)

except FileNotFoundError:
pass
except Exception as err:
current_app.logger.error(
f"Error deleting files and indexes: {err}", exc_info=True
)
return make_response(jsonify({"success": False}), 400)

sources_collection.delete_one({"_id": ObjectId(source_id)})
return make_response(jsonify({"success": True}), 200)

Expand Down Expand Up @@ -573,30 +575,30 @@ def post(self):

try:
storage = StorageCreator.get_storage()


for file in files:
original_filename = file.filename
safe_file = safe_filename(original_filename)

with tempfile.TemporaryDirectory() as temp_dir:
temp_file_path = os.path.join(temp_dir, safe_file)
file.save(temp_file_path)

if zipfile.is_zipfile(temp_file_path):
try:
with zipfile.ZipFile(temp_file_path, 'r') as zip_ref:
zip_ref.extractall(path=temp_dir)

# Walk through extracted files and upload them
for root, _, files in os.walk(temp_dir):
for extracted_file in files:
if os.path.join(root, extracted_file) == temp_file_path:
continue

rel_path = os.path.relpath(os.path.join(root, extracted_file), temp_dir)
storage_path = f"{base_path}/{rel_path}"

with open(os.path.join(root, extracted_file), 'rb') as f:
storage.save_file(f, storage_path)
except Exception as e:
Expand All @@ -610,7 +612,7 @@ def post(self):
file_path = f"{base_path}/{safe_file}"
with open(temp_file_path, 'rb') as f:
storage.save_file(f, file_path)

task = ingest.delay(
settings.UPLOAD_FOLDER,
[
Expand Down Expand Up @@ -686,8 +688,8 @@ def post(self):
try:
storage = StorageCreator.get_storage()
source_file_path = source.get("file_path", "")
parent_dir = request.form.get("parent_dir", "")
parent_dir = request.form.get("parent_dir", "")

if parent_dir and (parent_dir.startswith("/") or ".." in parent_dir):
return make_response(
jsonify({"success": False, "message": "Invalid parent directory path"}), 400
Expand All @@ -701,7 +703,7 @@ def post(self):
)

added_files = []

target_dir = source_file_path
if parent_dir:
target_dir = f"{source_file_path}/{parent_dir}"
Expand Down Expand Up @@ -877,6 +879,42 @@ def post(self):
source_data = config.get("url")
elif data["source"] == "reddit":
source_data = config
elif data["source"] in ConnectorCreator.get_supported_connectors():
session_token = config.get("session_token")
if not session_token:
return make_response(jsonify({
"success": False,
"error": f"Missing session_token in {data['source']} configuration"
}), 400)

# Process file_ids
file_ids = config.get("file_ids", [])
if isinstance(file_ids, str):
file_ids = [id.strip() for id in file_ids.split(',') if id.strip()]
elif not isinstance(file_ids, list):
file_ids = []

# Process folder_ids
folder_ids = config.get("folder_ids", [])
if isinstance(folder_ids, str):
folder_ids = [id.strip() for id in folder_ids.split(',') if id.strip()]
elif not isinstance(folder_ids, list):
folder_ids = []

config["file_ids"] = file_ids
config["folder_ids"] = folder_ids

task = ingest_connector_task.delay(
job_name=data["name"],
user=decoded_token.get("sub"),
source_type=data["source"],
session_token=session_token,
file_ids=file_ids,
folder_ids=folder_ids,
recursive=config.get("recursive", False),
retriever=config.get("retriever", "classic")
)
return make_response(jsonify({"success": True, "task_id": task.id}), 200)
task = ingest_remote.delay(
source_data=source_data,
job_name=data["name"],
Expand Down Expand Up @@ -984,7 +1022,8 @@ def get(self):
"tokens": doc.get("tokens", ""),
"retriever": doc.get("retriever", "classic"),
"syncFrequency": doc.get("sync_frequency", ""),
"isNested": bool(doc.get("directory_structure"))
"isNested": bool(doc.get("directory_structure")),
"type": doc.get("type", "file")
}
paginated_docs.append(doc_data)
response = {
Expand Down Expand Up @@ -1032,7 +1071,8 @@ def get(self):
"tokens": index.get("tokens", ""),
"retriever": index.get("retriever", "classic"),
"syncFrequency": index.get("sync_frequency", ""),
"is_nested": bool(index.get("directory_structure"))
"is_nested": bool(index.get("directory_structure")),
"type": index.get("type", "file") # Add type field with default "file"
}
)
except Exception as err:
Expand Down Expand Up @@ -1407,27 +1447,27 @@ def post(self):
except json.JSONDecodeError:
data["json_schema"] = None
print(f"Received data: {data}")

# Validate JSON schema if provided
if data.get("json_schema"):
try:
# Basic validation - ensure it's a valid JSON structure
json_schema = data.get("json_schema")
if not isinstance(json_schema, dict):
return make_response(
jsonify({"success": False, "message": "JSON schema must be a valid JSON object"}),
jsonify({"success": False, "message": "JSON schema must be a valid JSON object"}),
400
)

# Validate that it has either a 'schema' property or is itself a schema
if "schema" not in json_schema and "type" not in json_schema:
return make_response(
jsonify({"success": False, "message": "JSON schema must contain either a 'schema' property or be a valid JSON schema with 'type' property"}),
jsonify({"success": False, "message": "JSON schema must contain either a 'schema' property or be a valid JSON schema with 'type' property"}),
400
)
except Exception as e:
return make_response(
jsonify({"success": False, "message": f"Invalid JSON schema: {str(e)}"}),
jsonify({"success": False, "message": f"Invalid JSON schema: {str(e)}"}),
400
)

Expand Down Expand Up @@ -3561,7 +3601,7 @@ def get(self):
try:
store = get_vector_store(doc_id)
chunks = store.get_chunks()

filtered_chunks = []
for chunk in chunks:
metadata = chunk.get("metadata", {})
Expand All @@ -3582,9 +3622,9 @@ def get(self):
continue

filtered_chunks.append(chunk)

chunks = filtered_chunks

total_chunks = len(chunks)
start = (page - 1) * per_page
end = start + per_page
Expand Down Expand Up @@ -3905,39 +3945,54 @@ def get(self):
decoded_token = request.decoded_token
if not decoded_token:
return make_response(jsonify({"success": False}), 401)

user = decoded_token.get("sub")
doc_id = request.args.get("id")

if not doc_id:
return make_response(
jsonify({"error": "Document ID is required"}), 400
)

if not ObjectId.is_valid(doc_id):
return make_response(jsonify({"error": "Invalid document ID"}), 400)

try:
doc = sources_collection.find_one({"_id": ObjectId(doc_id), "user": user})
if not doc:
return make_response(
jsonify({"error": "Document not found or access denied"}), 404
)

directory_structure = doc.get("directory_structure", {})

base_path = doc.get("file_path", "")

provider = None
remote_data = doc.get("remote_data")
try:
if isinstance(remote_data, str) and remote_data:
remote_data_obj = json.loads(remote_data)
provider = remote_data_obj.get("provider")
except Exception as e:
current_app.logger.warning(
f"Failed to parse remote_data for doc {doc_id}: {e}")

return make_response(
jsonify({
"success": True,
"directory_structure": directory_structure,
"base_path": doc.get("file_path", "")
"base_path": base_path,
"provider": provider,
}), 200
)

except Exception as e:
current_app.logger.error(
f"Error retrieving directory structure: {e}", exc_info=True
)
return make_response(
jsonify({"success": False, "error": str(e)}), 500
)



33 changes: 33 additions & 0 deletions application/api/user/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,39 @@ def process_agent_webhook(self, agent_id, payload):
return resp


@celery.task(bind=True)
def ingest_connector_task(
    self,
    job_name,
    user,
    source_type,
    session_token=None,
    file_ids=None,
    folder_ids=None,
    recursive=True,
    retriever="classic",
    operation_mode="upload",
    doc_id=None,
    sync_frequency="never"
):
    """Celery task that ingests documents from an external connector source.

    Thin wrapper: delegates all work to ``application.worker.ingest_connector``,
    passing ``self`` (the bound task instance) so the worker can report
    progress/state on this task.

    Args:
        job_name: Human-readable name for the ingestion job.
        user: Identifier of the user who owns the ingested source.
        source_type: Connector provider key (e.g. a Google Drive connector).
        session_token: Connector session token used to authenticate the pull.
        file_ids: Specific file identifiers to ingest, if any.
        folder_ids: Folder identifiers to ingest, if any.
        recursive: Whether folders are traversed recursively.
        retriever: Retriever strategy recorded for the resulting source.
        operation_mode: Either a fresh "upload" or a re-sync of an existing doc.
        doc_id: Existing source document id when re-syncing.
        sync_frequency: How often the source should be re-synced.

    Returns:
        Whatever ``ingest_connector`` returns for this job.
    """
    # Imported lazily to avoid a circular import between the task module
    # and the worker module at load time.
    from application.worker import ingest_connector

    # Collect all optional parameters once and forward them as keywords.
    options = {
        "session_token": session_token,
        "file_ids": file_ids,
        "folder_ids": folder_ids,
        "recursive": recursive,
        "retriever": retriever,
        "operation_mode": operation_mode,
        "doc_id": doc_id,
        "sync_frequency": sync_frequency,
    }
    return ingest_connector(self, job_name, user, source_type, **options)


@celery.on_after_configure.connect
def setup_periodic_tasks(sender, **kwargs):
sender.add_periodic_task(
Expand Down
2 changes: 2 additions & 0 deletions application/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from application.api.answer import answer # noqa: E402
from application.api.internal.routes import internal # noqa: E402
from application.api.user.routes import user # noqa: E402
from application.api.connector.routes import connector # noqa: E402
from application.celery_init import celery # noqa: E402
from application.core.settings import settings # noqa: E402

Expand All @@ -30,6 +31,7 @@
app.register_blueprint(user)
app.register_blueprint(answer)
app.register_blueprint(internal)
app.register_blueprint(connector)
app.config.update(
UPLOAD_FOLDER="inputs",
CELERY_BROKER_URL=settings.CELERY_BROKER_URL,
Expand Down
Loading
Loading