From ae242de7f6258772213525bc9c6223f770712f37 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 15 Jul 2025 14:18:38 +0200 Subject: [PATCH] feat: Split add endpoint for data upload and for text upload --- cognee/api/v1/add/routers/get_add_router.py | 87 ++++++++++++++------- 1 file changed, 58 insertions(+), 29 deletions(-) diff --git a/cognee/api/v1/add/routers/get_add_router.py b/cognee/api/v1/add/routers/get_add_router.py index e18a5f322..527037aca 100644 --- a/cognee/api/v1/add/routers/get_add_router.py +++ b/cognee/api/v1/add/routers/get_add_router.py @@ -5,7 +5,6 @@ from fastapi import Form, UploadFile, Depends from fastapi.responses import JSONResponse from fastapi import APIRouter from typing import List, Optional -import subprocess from cognee.shared.logging_utils import get_logger import requests @@ -28,15 +27,58 @@ def get_add_router() -> APIRouter: """ Add data to a dataset for processing and knowledge graph construction. - This endpoint accepts various types of data (files, URLs, GitHub repositories) - and adds them to a specified dataset for processing. The data is ingested, - analyzed, and integrated into the knowledge graph. + This endpoint accepts various types of data files and adds them to a specified dataset for processing. + The data is ingested, analyzed, and integrated into the knowledge graph. ## Request Parameters - - **data** (List[UploadFile]): List of files to upload. Can also include: - - HTTP URLs (if ALLOW_HTTP_REQUESTS is enabled) - - GitHub repository URLs (will be cloned and processed) - - Regular file uploads + - **data** (List[UploadFile]): List of files to upload. Regular file uploads. + - **datasetName** (Optional[str]): Name of the dataset to add data to + - **datasetId** (Optional[UUID]): UUID of the dataset to add data to + + Either datasetName or datasetId must be provided. 
+ + ## Response + Returns information about the add operation containing: + - Status of the operation + - Details about the processed data + - Any relevant metadata from the ingestion process + + ## Error Codes + - **400 Bad Request**: Neither datasetId nor datasetName provided + - **409 Conflict**: Error during add operation + - **403 Forbidden**: User doesn't have permission to add to dataset + + ## Notes + - To add data to datasets not owned by the user, use dataset_id (when ENABLE_BACKEND_ACCESS_CONTROL is set to True) + + """ + from cognee.api.v1.add import add as cognee_add + + if not datasetId and not datasetName: + raise ValueError("Either datasetId or datasetName must be provided.") + + try: + add_run = await cognee_add(data, datasetName, user=user, dataset_id=datasetId) + return add_run.model_dump() + except Exception as error: + return JSONResponse(status_code=409, content={"error": str(error)}) + + @router.post("/text", response_model=dict) + async def add_text( + text_data: List[str] = Form(description="Plain-text data"), + datasetName: Optional[str] = Form(default=None), + datasetId: Optional[UUID] = Form(default=None), + user: User = Depends(get_authenticated_user), + ): + """ + Add text data to a dataset for processing and knowledge graph construction. + + This endpoint accepts only text and adds it to a specified dataset for processing. The text is ingested, + analyzed, and integrated into the knowledge graph. + + ## Request Parameters + - **text_data** (List[str]): List of text to process. 
Can also include: + HTTP URLs (if ALLOW_HTTP_REQUESTS is enabled) - **datasetName** (Optional[str]): Name of the dataset to add data to - **datasetId** (Optional[UUID]): UUID of the dataset to add data to @@ -55,7 +97,6 @@ def get_add_router() -> APIRouter: ## Notes - To add data to datasets not owned by the user, use dataset_id (when ENABLE_BACKEND_ACCESS_CONTROL is set to True) - - GitHub repositories are cloned and all files are processed - HTTP URLs are fetched and their content is processed - The ALLOW_HTTP_REQUESTS environment variable controls URL processing """ @@ -66,30 +107,18 @@ def get_add_router() -> APIRouter: try: if ( - isinstance(data, str) - and data.startswith("http") + isinstance(text_data, str) + and text_data.startswith("http") and (os.getenv("ALLOW_HTTP_REQUESTS", "true").lower() == "true") ): - if "github" in data: - # Perform git clone if the URL is from GitHub - repo_name = data.split("/")[-1].replace(".git", "") - subprocess.run(["git", "clone", data, f".data/{repo_name}"], check=True) - # TODO: Update add call with dataset info - await cognee_add( - "data://.data/", - f"{repo_name}", - ) - else: - # Fetch and store the data from other types of URL using curl - response = requests.get(data) - response.raise_for_status() + # Fetch and store the data from other types of URL using requests + response = requests.get(text_data) + response.raise_for_status() - file_data = await response.content() - # TODO: Update add call with dataset info - return await cognee_add(file_data) + file_data = await response.content() + return await cognee_add(file_data, datasetName, user=user, dataset_id=datasetId) else: - add_run = await cognee_add(data, datasetName, user=user, dataset_id=datasetId) - + add_run = await cognee_add(text_data, datasetName, user=user, dataset_id=datasetId) return add_run.model_dump() except Exception as error: return JSONResponse(status_code=409, content={"error": str(error)})