Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
b2b0426
(feat:drive) oauth flow
ManishMadan2882 Aug 20, 2025
ab05726
Merge branch 'main' of https://github.com/manishmadan2882/docsgpt
ManishMadan2882 Aug 20, 2025
3b69bea
(chore:settings)addefault oath creds
ManishMadan2882 Aug 21, 2025
c2bebba
(feat:oauth/drive) raw fe integrate
ManishMadan2882 Aug 21, 2025
193d59f
Merge branch 'main' of https://github.com/arc53/DocsGPT
actions-user Aug 22, 2025
8c3f75e
(feat:ingestion) google drive loader
ManishMadan2882 Aug 22, 2025
f82be23
(feat:ingestion) external drive connect
ManishMadan2882 Aug 22, 2025
92d6ae5
(fix:google-oauth) no explicit datetime compare
ManishMadan2882 Aug 22, 2025
e7430f0
(feat:googleDrive,fe) file tree
ManishMadan2882 Aug 22, 2025
b1210c4
Merge branch 'main' of https://github.com/manishmadan2882/docsgpt
ManishMadan2882 Aug 22, 2025
2410bd8
(fix:driveLoader) folder ingesting
ManishMadan2882 Aug 22, 2025
e25b988
Merge branch 'main' of https://github.com/arc53/DocsGPT
actions-user Aug 23, 2025
15a9e97
(feat:ingest_connectors) spread config params
ManishMadan2882 Aug 25, 2025
f09f143
(feat:connectors) separate layer
ManishMadan2882 Aug 25, 2025
578c682
(feat:connectors) abstracting auth, base class
ManishMadan2882 Aug 25, 2025
f08067a
Merge branch 'main' of https://github.com/arc53/DocsGPT
actions-user Aug 27, 2025
4065041
(feat:connectors) separate routes, namespace
ManishMadan2882 Aug 27, 2025
a0cc2e4
Merge branch 'main' of https://github.com/manishmadan2882/docsgpt
ManishMadan2882 Aug 27, 2025
f39ac99
(feat:auth) follow connector-session
ManishMadan2882 Aug 27, 2025
018273c
(feat:connector) refactor, updated routes FE
ManishMadan2882 Aug 28, 2025
e55d1a5
(feat:connector,auth) consider user_id
ManishMadan2882 Aug 28, 2025
e0adc3e
Merge branch 'main' of https://github.com/arc53/DocsGPT
actions-user Aug 29, 2025
2868e47
(feat:connector) provider metadata, separate fe nested display
ManishMadan2882 Aug 29, 2025
bb4ea76
(fix:connectorTree) path navigation fn
ManishMadan2882 Sep 1, 2025
8c986aa
Merge branch 'main' of https://github.com/manishmadan2882/docsgpt
ManishMadan2882 Sep 1, 2025
384ad3e
(feat:connector) raw sync flow
ManishMadan2882 Sep 2, 2025
c2c18e8
(feat:connector,fe) sync api, notification
ManishMadan2882 Sep 2, 2025
f9b2c95
(feat:connector) sync, simply re-ingest
ManishMadan2882 Sep 2, 2025
3b3a04a
(feat:connector) sync fixes UI, minor refactor
ManishMadan2882 Sep 2, 2025
7e2cbdd
(feat:connector) redirect url as backend overhead
ManishMadan2882 Sep 3, 2025
c4a598f
(lint-fix) ruff
ManishMadan2882 Sep 3, 2025
f7f6042
(feat:connector) paginate files
ManishMadan2882 Sep 4, 2025
5a9bc6d
(feat:connector) infinite scroll file pick
ManishMadan2882 Sep 4, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 3 additions & 5 deletions application/agents/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,17 @@

from bson.objectid import ObjectId

logger = logging.getLogger(__name__)

from application.agents.tools.tool_action_parser import ToolActionParser
from application.agents.tools.tool_manager import ToolManager

from application.core.mongo_db import MongoDB
from application.core.settings import settings

from application.llm.handlers.handler_creator import LLMHandlerCreator
from application.llm.llm_creator import LLMCreator
from application.logging import build_stack_data, log_activity, LogContext
from application.retriever.base import BaseRetriever

logger = logging.getLogger(__name__)


class BaseAgent(ABC):
def __init__(
Expand Down Expand Up @@ -157,7 +155,7 @@ def _execute_tool_action(self, tools_dict, call):
}
yield {"type": "tool_call", "data": {**tool_call_data, "status": "error"}}
self.tool_calls.append(tool_call_data)
return f"Failed to parse tool call.", call_id
return "Failed to parse tool call.", call_id

# Check if tool_id exists in available tools
if tool_id not in tools_dict:
Expand Down
627 changes: 627 additions & 0 deletions application/api/connector/routes.py

Large diffs are not rendered by default.

123 changes: 89 additions & 34 deletions application/api/user/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

from application.api.user.tasks import (
ingest,
ingest_connector_task,
ingest_remote,
process_agent_webhook,
store_attachment,
Expand All @@ -46,6 +47,7 @@
)
from application.utils import num_tokens_from_string
from application.vectorstore.vector_creator import VectorCreator
from application.parser.connectors.connector_creator import ConnectorCreator

storage = StorageCreator.get_storage()

Expand Down Expand Up @@ -492,9 +494,9 @@ def get(self):
)
if not doc:
return make_response(jsonify({"status": "not found"}), 404)

storage = StorageCreator.get_storage()

try:
# Delete vector index
if settings.VECTOR_STORE == "faiss":
Expand All @@ -508,7 +510,7 @@ def get(self):
settings.VECTOR_STORE, source_id=str(doc["_id"])
)
vectorstore.delete_index()

if "file_path" in doc and doc["file_path"]:
file_path = doc["file_path"]
if storage.is_directory(file_path):
Expand All @@ -517,15 +519,15 @@ def get(self):
storage.delete_file(f)
else:
storage.delete_file(file_path)

except FileNotFoundError:
pass
except Exception as err:
current_app.logger.error(
f"Error deleting files and indexes: {err}", exc_info=True
)
return make_response(jsonify({"success": False}), 400)

sources_collection.delete_one({"_id": ObjectId(source_id)})
return make_response(jsonify({"success": True}), 200)

Expand Down Expand Up @@ -573,30 +575,30 @@ def post(self):

try:
storage = StorageCreator.get_storage()


for file in files:
original_filename = file.filename
safe_file = safe_filename(original_filename)

with tempfile.TemporaryDirectory() as temp_dir:
temp_file_path = os.path.join(temp_dir, safe_file)
file.save(temp_file_path)

if zipfile.is_zipfile(temp_file_path):
try:
with zipfile.ZipFile(temp_file_path, 'r') as zip_ref:
zip_ref.extractall(path=temp_dir)

# Walk through extracted files and upload them
for root, _, files in os.walk(temp_dir):
for extracted_file in files:
if os.path.join(root, extracted_file) == temp_file_path:
continue

rel_path = os.path.relpath(os.path.join(root, extracted_file), temp_dir)
storage_path = f"{base_path}/{rel_path}"

with open(os.path.join(root, extracted_file), 'rb') as f:
storage.save_file(f, storage_path)
except Exception as e:
Expand All @@ -610,7 +612,7 @@ def post(self):
file_path = f"{base_path}/{safe_file}"
with open(temp_file_path, 'rb') as f:
storage.save_file(f, file_path)

task = ingest.delay(
settings.UPLOAD_FOLDER,
[
Expand Down Expand Up @@ -686,8 +688,8 @@ def post(self):
try:
storage = StorageCreator.get_storage()
source_file_path = source.get("file_path", "")
parent_dir = request.form.get("parent_dir", "")
parent_dir = request.form.get("parent_dir", "")

if parent_dir and (parent_dir.startswith("/") or ".." in parent_dir):
return make_response(
jsonify({"success": False, "message": "Invalid parent directory path"}), 400
Expand All @@ -701,7 +703,7 @@ def post(self):
)

added_files = []

target_dir = source_file_path
if parent_dir:
target_dir = f"{source_file_path}/{parent_dir}"
Expand Down Expand Up @@ -877,6 +879,42 @@ def post(self):
source_data = config.get("url")
elif data["source"] == "reddit":
source_data = config
elif data["source"] in ConnectorCreator.get_supported_connectors():
session_token = config.get("session_token")
if not session_token:
return make_response(jsonify({
"success": False,
"error": f"Missing session_token in {data['source']} configuration"
}), 400)

# Process file_ids
file_ids = config.get("file_ids", [])
if isinstance(file_ids, str):
file_ids = [id.strip() for id in file_ids.split(',') if id.strip()]
elif not isinstance(file_ids, list):
file_ids = []

# Process folder_ids
folder_ids = config.get("folder_ids", [])
if isinstance(folder_ids, str):
folder_ids = [id.strip() for id in folder_ids.split(',') if id.strip()]
elif not isinstance(folder_ids, list):
folder_ids = []

config["file_ids"] = file_ids
config["folder_ids"] = folder_ids

task = ingest_connector_task.delay(
job_name=data["name"],
user=decoded_token.get("sub"),
source_type=data["source"],
session_token=session_token,
file_ids=file_ids,
folder_ids=folder_ids,
recursive=config.get("recursive", False),
retriever=config.get("retriever", "classic")
)
return make_response(jsonify({"success": True, "task_id": task.id}), 200)
task = ingest_remote.delay(
source_data=source_data,
job_name=data["name"],
Expand Down Expand Up @@ -984,7 +1022,8 @@ def get(self):
"tokens": doc.get("tokens", ""),
"retriever": doc.get("retriever", "classic"),
"syncFrequency": doc.get("sync_frequency", ""),
"isNested": bool(doc.get("directory_structure"))
"isNested": bool(doc.get("directory_structure")),
"type": doc.get("type", "file")
}
paginated_docs.append(doc_data)
response = {
Expand Down Expand Up @@ -1032,7 +1071,8 @@ def get(self):
"tokens": index.get("tokens", ""),
"retriever": index.get("retriever", "classic"),
"syncFrequency": index.get("sync_frequency", ""),
"is_nested": bool(index.get("directory_structure"))
"is_nested": bool(index.get("directory_structure")),
"type": index.get("type", "file") # Add type field with default "file"
}
)
except Exception as err:
Expand Down Expand Up @@ -1407,27 +1447,27 @@ def post(self):
except json.JSONDecodeError:
data["json_schema"] = None
print(f"Received data: {data}")

# Validate JSON schema if provided
if data.get("json_schema"):
try:
# Basic validation - ensure it's a valid JSON structure
json_schema = data.get("json_schema")
if not isinstance(json_schema, dict):
return make_response(
jsonify({"success": False, "message": "JSON schema must be a valid JSON object"}),
jsonify({"success": False, "message": "JSON schema must be a valid JSON object"}),
400
)

# Validate that it has either a 'schema' property or is itself a schema
if "schema" not in json_schema and "type" not in json_schema:
return make_response(
jsonify({"success": False, "message": "JSON schema must contain either a 'schema' property or be a valid JSON schema with 'type' property"}),
jsonify({"success": False, "message": "JSON schema must contain either a 'schema' property or be a valid JSON schema with 'type' property"}),
400
)
except Exception as e:
return make_response(
jsonify({"success": False, "message": f"Invalid JSON schema: {str(e)}"}),
jsonify({"success": False, "message": f"Invalid JSON schema: {str(e)}"}),
400
)

Expand Down Expand Up @@ -3561,7 +3601,7 @@ def get(self):
try:
store = get_vector_store(doc_id)
chunks = store.get_chunks()

filtered_chunks = []
for chunk in chunks:
metadata = chunk.get("metadata", {})
Expand All @@ -3582,9 +3622,9 @@ def get(self):
continue

filtered_chunks.append(chunk)

chunks = filtered_chunks

total_chunks = len(chunks)
start = (page - 1) * per_page
end = start + per_page
Expand Down Expand Up @@ -3905,39 +3945,54 @@ def get(self):
decoded_token = request.decoded_token
if not decoded_token:
return make_response(jsonify({"success": False}), 401)

user = decoded_token.get("sub")
doc_id = request.args.get("id")

if not doc_id:
return make_response(
jsonify({"error": "Document ID is required"}), 400
)

if not ObjectId.is_valid(doc_id):
return make_response(jsonify({"error": "Invalid document ID"}), 400)

try:
doc = sources_collection.find_one({"_id": ObjectId(doc_id), "user": user})
if not doc:
return make_response(
jsonify({"error": "Document not found or access denied"}), 404
)

directory_structure = doc.get("directory_structure", {})

base_path = doc.get("file_path", "")

provider = None
remote_data = doc.get("remote_data")
try:
if isinstance(remote_data, str) and remote_data:
remote_data_obj = json.loads(remote_data)
provider = remote_data_obj.get("provider")
except Exception as e:
current_app.logger.warning(
f"Failed to parse remote_data for doc {doc_id}: {e}")

return make_response(
jsonify({
"success": True,
"directory_structure": directory_structure,
"base_path": doc.get("file_path", "")
"base_path": base_path,
"provider": provider,
}), 200
)

except Exception as e:
current_app.logger.error(
f"Error retrieving directory structure: {e}", exc_info=True
)
return make_response(
jsonify({"success": False, "error": str(e)}), 500
)



33 changes: 33 additions & 0 deletions application/api/user/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,39 @@ def process_agent_webhook(self, agent_id, payload):
return resp


@celery.task(bind=True)
def ingest_connector_task(
    self,
    job_name,
    user,
    source_type,
    session_token=None,
    file_ids=None,
    folder_ids=None,
    recursive=True,
    retriever="classic",
    operation_mode="upload",
    doc_id=None,
    sync_frequency="never"
):
    """Celery task that ingests documents from an external connector source.

    Thin wrapper: delegates all work to ``application.worker.ingest_connector``,
    passing ``self`` (the bound task instance) so the worker can report
    progress/state on this task.

    Args:
        job_name: Human-readable name for the ingestion job.
        user: Identifier of the user who owns the ingested source.
        source_type: Connector provider key (e.g. a Google Drive connector).
        session_token: Connector session token used to authenticate the pull.
        file_ids: Specific file identifiers to ingest, if any.
        folder_ids: Folder identifiers to ingest, if any.
        recursive: Whether folders are traversed recursively.
        retriever: Retriever strategy recorded for the resulting source.
        operation_mode: Either a fresh "upload" or a re-sync of an existing doc.
        doc_id: Existing source document id when re-syncing.
        sync_frequency: How often the source should be re-synced.

    Returns:
        Whatever ``ingest_connector`` returns for this job.
    """
    # Imported lazily to avoid a circular import between the task module
    # and the worker module at load time.
    from application.worker import ingest_connector

    # Collect all optional parameters once and forward them as keywords.
    options = {
        "session_token": session_token,
        "file_ids": file_ids,
        "folder_ids": folder_ids,
        "recursive": recursive,
        "retriever": retriever,
        "operation_mode": operation_mode,
        "doc_id": doc_id,
        "sync_frequency": sync_frequency,
    }
    return ingest_connector(self, job_name, user, source_type, **options)


@celery.on_after_configure.connect
def setup_periodic_tasks(sender, **kwargs):
sender.add_periodic_task(
Expand Down
2 changes: 2 additions & 0 deletions application/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from application.api.answer import answer # noqa: E402
from application.api.internal.routes import internal # noqa: E402
from application.api.user.routes import user # noqa: E402
from application.api.connector.routes import connector # noqa: E402
from application.celery_init import celery # noqa: E402
from application.core.settings import settings # noqa: E402

Expand All @@ -30,6 +31,7 @@
app.register_blueprint(user)
app.register_blueprint(answer)
app.register_blueprint(internal)
app.register_blueprint(connector)
app.config.update(
UPLOAD_FOLDER="inputs",
CELERY_BROKER_URL=settings.CELERY_BROKER_URL,
Expand Down
Loading
Loading