cognee/cognee/api/v1/add/routers/get_add_router.py

import os
from uuid import UUID
from fastapi import Form, UploadFile, Depends
from fastapi.responses import JSONResponse
from fastapi import APIRouter
from typing import List, Optional
import subprocess
from cognee.shared.logging_utils import get_logger
import requests
from cognee.modules.users.models import User
from cognee.modules.users.methods import get_authenticated_user
from cognee.exceptions import (
    UnsupportedFileFormatError,
    FileAccessError,
    DatasetNotFoundError,
    CogneeValidationError,
    CogneeSystemError,
)

logger = get_logger()


def get_add_router() -> APIRouter:
    router = APIRouter()

    @router.post("", response_model=dict)
    async def add(
        data: List[UploadFile],
        datasetName: Optional[str] = Form(default=None),
        datasetId: Optional[UUID] = Form(default=None),
        user: User = Depends(get_authenticated_user),
    ):
"""
Add data to a dataset for processing and knowledge graph construction.
This endpoint accepts various types of data (files, URLs, GitHub repositories)
and adds them to a specified dataset for processing. The data is ingested,
analyzed, and integrated into the knowledge graph.
## Request Parameters
- **data** (List[UploadFile]): List of files to upload. Can also include:
- HTTP URLs (if ALLOW_HTTP_REQUESTS is enabled)
- GitHub repository URLs (will be cloned and processed)
- Regular file uploads
- **datasetName** (Optional[str]): Name of the dataset to add data to
- **datasetId** (Optional[UUID]): UUID of the dataset to add data to
Either datasetName or datasetId must be provided.
## Response
Returns information about the add operation containing:
- Status of the operation
- Details about the processed data
- Any relevant metadata from the ingestion process
## Error Codes
- **400 Bad Request**: Missing required parameters or invalid input
- **422 Unprocessable Entity**: Unsupported file format or validation error
- **403 Forbidden**: User doesn't have permission to add to dataset
- **500 Internal Server Error**: System error during processing
## Notes
- To add data to datasets not owned by the user, use dataset_id (when ENABLE_BACKEND_ACCESS_CONTROL is set to True)
- GitHub repositories are cloned and all files are processed
- HTTP URLs are fetched and their content is processed
- The ALLOW_HTTP_REQUESTS environment variable controls URL processing
- Enhanced error messages provide specific guidance for fixing issues
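
        ## Usage Example
        The exact request URL depends on where this router is mounted in the
        application; the path and authentication header below are assumptions
        for illustration only. A minimal client sketch using the `requests`
        library:

            import requests

            response = requests.post(
                "http://localhost:8000/add",  # assumed mount path
                files=[("data", ("notes.txt", open("notes.txt", "rb")))],
                data={"datasetName": "my_dataset"},
                headers={"Authorization": "Bearer <access_token>"},  # assumed auth scheme
            )
            print(response.json())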
"""
        from cognee.api.v1.add import add as cognee_add

        # Input validation with enhanced exceptions
        if not datasetId and not datasetName:
            raise CogneeValidationError(
                message="Either datasetId or datasetName must be provided",
                user_message="You must specify either a dataset name or dataset ID.",
                suggestions=[
                    "Provide a datasetName parameter (e.g., 'my_dataset')",
                    "Provide a datasetId parameter with a valid UUID",
                    "Check the API documentation for parameter examples",
                ],
                docs_link="https://docs.cognee.ai/api/add",
                context={"provided_dataset_name": datasetName, "provided_dataset_id": datasetId},
                operation="add",
            )

        if not data or len(data) == 0:
            raise CogneeValidationError(
                message="No data provided for upload",
                user_message="You must provide data to add to the dataset.",
                suggestions=[
                    "Upload one or more files",
                    "Provide a valid URL (if URL processing is enabled)",
                    "Check that your request includes the data parameter",
                ],
                docs_link="https://docs.cognee.ai/guides/adding-data",
                operation="add",
            )

        logger.info(
            f"Adding {len(data)} items to dataset",
            extra={
                "dataset_name": datasetName,
                "dataset_id": datasetId,
                "user_id": user.id,
                "item_count": len(data),
            },
        )

        # Handle URL-based data (GitHub repos, HTTP URLs)
        if (
            len(data) == 1
            and hasattr(data[0], "filename")
            and isinstance(data[0].filename, str)
            and data[0].filename.startswith("http")
            and (os.getenv("ALLOW_HTTP_REQUESTS", "true").lower() == "true")
        ):
            url = data[0].filename

            if "github" in url:
                try:
                    # Perform git clone if the URL is from GitHub
                    repo_name = url.split("/")[-1].replace(".git", "")
                    subprocess.run(["git", "clone", url, f".data/{repo_name}"], check=True)
                    # TODO: Update add call with dataset info
                    result = await cognee_add(
                        "data://.data/",
                        f"{repo_name}",
                    )
                except subprocess.CalledProcessError as e:
                    raise CogneeSystemError(
                        message=f"Failed to clone GitHub repository: {e}",
                        user_message=f"Could not clone the GitHub repository '{url}'.",
                        suggestions=[
                            "Check if the repository URL is correct",
                            "Verify the repository is public or you have access",
                            "Try cloning the repository manually to test access",
                        ],
                        context={"url": url, "repo_name": repo_name, "error": str(e)},
                        operation="add",
                    )
            else:
                try:
                    # Fetch and store the data from other types of URL
                    response = requests.get(url, timeout=30)
                    response.raise_for_status()
                    file_data = response.content
                    # TODO: Update add call with dataset info
                    result = await cognee_add(file_data)
                except requests.RequestException as e:
                    raise CogneeSystemError(
                        message=f"Failed to fetch URL: {e}",
                        user_message=f"Could not fetch content from '{url}'.",
                        suggestions=[
                            "Check if the URL is accessible",
                            "Verify your internet connection",
                            "Try accessing the URL in a browser",
                            "Check if the URL requires authentication",
                        ],
                        context={"url": url, "error": str(e)},
                        operation="add",
                    )
        else:
            # Handle regular file uploads
            # Validate file types before processing
            supported_extensions = [
                ".txt",
                ".pdf",
                ".docx",
                ".md",
                ".csv",
                ".json",
                ".py",
                ".js",
                ".ts",
            ]

            for file in data:
                if file.filename:
                    file_ext = os.path.splitext(file.filename)[1].lower()
                    if file_ext and file_ext not in supported_extensions:
                        raise UnsupportedFileFormatError(
                            file_path=file.filename, supported_formats=supported_extensions
                        )

            # Process the files
            result = await cognee_add(data, datasetName, user=user, dataset_id=datasetId)

        logger.info(
            "Successfully added data to dataset",
            extra={"dataset_name": datasetName, "dataset_id": datasetId, "user_id": user.id},
        )

        return result.model_dump() if hasattr(result, "model_dump") else result

    return router
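

# Illustrative wiring sketch (an assumption, not part of this module): the router
# returned by get_add_router() is typically included in a FastAPI application
# elsewhere in the codebase. The prefix and tags below are hypothetical.
#
#     from fastapi import FastAPI
#
#     app = FastAPI()
#     app.include_router(get_add_router(), prefix="/add", tags=["add"])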