Compare commits


1 commit

Author: Igor Ilic
SHA1: ae242de7f6
Message: feat: Split add endpoint for data upload and for text upload
Date: 2025-07-15 14:18:38 +02:00


@@ -5,7 +5,6 @@ from fastapi import Form, UploadFile, Depends
 from fastapi.responses import JSONResponse
 from fastapi import APIRouter
 from typing import List, Optional
-import subprocess
 from cognee.shared.logging_utils import get_logger
 import requests
@@ -28,15 +27,58 @@ def get_add_router() -> APIRouter:
         """
         Add data to a dataset for processing and knowledge graph construction.

-        This endpoint accepts various types of data (files, URLs, GitHub repositories)
-        and adds them to a specified dataset for processing. The data is ingested,
-        analyzed, and integrated into the knowledge graph.
+        This endpoint accepts various types of data files and adds them to a specified dataset for processing.
+        The data is ingested, analyzed, and integrated into the knowledge graph.

         ## Request Parameters
-        - **data** (List[UploadFile]): List of files to upload. Can also include:
-          - HTTP URLs (if ALLOW_HTTP_REQUESTS is enabled)
-          - GitHub repository URLs (will be cloned and processed)
-          - Regular file uploads
+        - **data** (List[UploadFile]): List of files to upload. Regular file uploads.
+        - **datasetName** (Optional[str]): Name of the dataset to add data to
+        - **datasetId** (Optional[UUID]): UUID of the dataset to add data to
+
+        Either datasetName or datasetId must be provided.
+
+        ## Response
+        Returns information about the add operation containing:
+        - Status of the operation
+        - Details about the processed data
+        - Any relevant metadata from the ingestion process
+
+        ## Error Codes
+        - **400 Bad Request**: Neither datasetId nor datasetName provided
+        - **409 Conflict**: Error during add operation
+        - **403 Forbidden**: User doesn't have permission to add to dataset
+
+        ## Notes
+        - To add data to datasets not owned by the user, use dataset_id (when ENABLE_BACKEND_ACCESS_CONTROL is set to True)
+        """
+        from cognee.api.v1.add import add as cognee_add
+
+        if not datasetId and not datasetName:
+            raise ValueError("Either datasetId or datasetName must be provided.")
+
+        try:
+            add_run = await cognee_add(data, datasetName, user=user, dataset_id=datasetId)
+
+            return add_run.model_dump()
+        except Exception as error:
+            return JSONResponse(status_code=409, content={"error": str(error)})
+
+    @router.post("/text", response_model=dict)
+    async def add_text(
+        text_data: List[str] = Form(description="Plain-text data"),
+        datasetName: Optional[str] = Form(default=None),
+        datasetId: Optional[UUID] = Form(default=None),
+        user: User = Depends(get_authenticated_user),
+    ):
+        """
+        Add text data to a dataset for processing and knowledge graph construction.
+
+        This endpoint accepts only text and adds it to a specified dataset for processing. The text is ingested,
+        analyzed, and integrated into the knowledge graph.
+
+        ## Request Parameters
+        - **text_data** (List[str]): List of text to process. Can also include:
+          HTTP URLs (if ALLOW_HTTP_REQUESTS is enabled)
         - **datasetName** (Optional[str]): Name of the dataset to add data to
         - **datasetId** (Optional[UUID]): UUID of the dataset to add data to
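
For orientation, a client call against the reworked file-upload endpoint could look like the sketch below. The data, datasetName, and datasetId form fields come from the diff above; the base URL, the /api/v1/add mount path, and the bearer-token header are assumptions for illustration.

# Hypothetical client sketch for the file-upload endpoint. The mount path
# /api/v1/add and the auth header are assumptions, not taken from this commit.
import requests

BASE_URL = "http://localhost:8000/api/v1"  # assumed deployment address

with open("report.pdf", "rb") as upload:
    response = requests.post(
        f"{BASE_URL}/add",
        files=[("data", ("report.pdf", upload, "application/pdf"))],  # data: List[UploadFile]
        data={"datasetName": "my_dataset"},  # or {"datasetId": "<uuid>"}
        headers={"Authorization": "Bearer <token>"},  # assumed auth scheme
    )

response.raise_for_status()
print(response.json())  # add_run.model_dump() from the server

A 400, 403, or 409 from the error-code table above surfaces here as a requests.HTTPError raised by raise_for_status().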
@@ -55,7 +97,6 @@ def get_add_router() -> APIRouter:

         ## Notes
         - To add data to datasets not owned by the user, use dataset_id (when ENABLE_BACKEND_ACCESS_CONTROL is set to True)
-        - GitHub repositories are cloned and all files are processed
         - HTTP URLs are fetched and their content is processed
         - The ALLOW_HTTP_REQUESTS environment variable controls URL processing
         """
@@ -66,30 +107,18 @@ def get_add_router() -> APIRouter:
         try:
             if (
-                isinstance(data, str)
-                and data.startswith("http")
+                isinstance(text_data, str)
+                and text_data.startswith("http")
                 and (os.getenv("ALLOW_HTTP_REQUESTS", "true").lower() == "true")
             ):
-                if "github" in data:
-                    # Perform git clone if the URL is from GitHub
-                    repo_name = data.split("/")[-1].replace(".git", "")
-                    subprocess.run(["git", "clone", data, f".data/{repo_name}"], check=True)
-                    # TODO: Update add call with dataset info
-                    await cognee_add(
-                        "data://.data/",
-                        f"{repo_name}",
-                    )
-                else:
-                    # Fetch and store the data from other types of URL using curl
-                    response = requests.get(data)
-                    response.raise_for_status()
-
-                    file_data = await response.content()
-                    # TODO: Update add call with dataset info
-                    return await cognee_add(file_data)
+                # Fetch and store the content of the URL using requests
+                response = requests.get(text_data)
+                response.raise_for_status()
+
+                file_data = response.content
+                return await cognee_add(file_data, datasetName, user=user, dataset_id=datasetId)
             else:
-                add_run = await cognee_add(data, datasetName, user=user, dataset_id=datasetId)
+                add_run = await cognee_add(text_data, datasetName, user=user, dataset_id=datasetId)

                 return add_run.model_dump()
         except Exception as error:
             return JSONResponse(status_code=409, content={"error": str(error)})
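
One detail worth noting in the retained URL branch: requests.get is a blocking call inside an async def handler, so the event loop stalls for the duration of the fetch. A non-blocking variant, sketched below with asyncio.to_thread as an illustration (not part of this commit), keeps the handler responsive:

# Sketch of a non-blocking URL fetch for the /text branch. asyncio.to_thread
# pushes the synchronous requests call onto a worker thread; this is an
# illustration of the pattern, not code from the commit.
import asyncio
import os

import requests


async def fetch_url_content(url: str) -> bytes:
    # Honor the same ALLOW_HTTP_REQUESTS toggle the endpoint checks.
    if os.getenv("ALLOW_HTTP_REQUESTS", "true").lower() != "true":
        raise PermissionError("HTTP requests are disabled via ALLOW_HTTP_REQUESTS.")

    # Run the blocking requests call in a worker thread so the event loop stays free.
    response = await asyncio.to_thread(requests.get, url, timeout=30)
    response.raise_for_status()
    return response.content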