diff --git a/cognee/api/v1/add/add.py b/cognee/api/v1/add/add.py index e1dafce5f..3fd480601 100644 --- a/cognee/api/v1/add/add.py +++ b/cognee/api/v1/add/add.py @@ -16,6 +16,128 @@ async def add( graph_db_config: dict = None, dataset_id: UUID = None, ): + """ + Add data to Cognee for knowledge graph processing. + + This is the first step in the Cognee workflow - it ingests raw data and prepares it + for processing. The function accepts various data formats including text, files, and + binary streams, then stores them in a specified dataset for further processing. + + Prerequisites: + - **LLM_API_KEY**: Must be set in environment variables for content processing + - **Database Setup**: Relational and vector databases must be configured + - **User Authentication**: Uses default user if none provided (created automatically) + + Supported Input Types: + - **Text strings**: Direct text content (str) - any string not starting with "/" or "file://" + - **File paths**: Local file paths as strings in these formats: + * Absolute paths: "/path/to/document.pdf" + * File URLs: "file:///path/to/document.pdf" or "file://relative/path.txt" + * S3 paths: "s3://bucket-name/path/to/file.pdf" + - **Binary file objects**: File handles/streams (BinaryIO) + - **Lists**: Multiple files or text strings in a single call + + Supported File Formats: + - Text files (.txt, .md, .csv) + - PDFs (.pdf) + - Images (.png, .jpg, .jpeg) - extracted via OCR/vision models + - Audio files (.mp3, .wav) - transcribed to text + - Code files (.py, .js, .ts, etc.) - parsed for structure and content + - Office documents (.docx, .pptx) + + Workflow: + 1. **Data Resolution**: Resolves file paths and validates accessibility + 2. **Content Extraction**: Extracts text content from various file formats + 3. **Dataset Storage**: Stores processed content in the specified dataset + 4. **Metadata Tracking**: Records file metadata, timestamps, and user permissions + 5. **Permission Assignment**: Grants user read/write/delete/share permissions on dataset + + Args: + data: The data to ingest. Can be: + - Single text string: "Your text content here" + - Absolute file path: "/path/to/document.pdf" + - File URL: "file:///absolute/path/to/document.pdf" or "file://relative/path.txt" + - S3 path: "s3://my-bucket/documents/file.pdf" + - List of mixed types: ["text content", "/path/file.pdf", "file://doc.txt", file_handle] + - Binary file object: open("file.txt", "rb") + dataset_name: Name of the dataset to store data in. Defaults to "main_dataset". + Create separate datasets to organize different knowledge domains. + user: User object for authentication and permissions. Uses default user if None. + Default user: "default_user@example.com" (created automatically on first use). + Users can only access datasets they have permissions for. + node_set: Optional list of node identifiers for graph organization and access control. + Used for grouping related data points in the knowledge graph. + vector_db_config: Optional configuration for vector database (for custom setups). + graph_db_config: Optional configuration for graph database (for custom setups). + dataset_id: Optional specific dataset UUID to use instead of dataset_name. 
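+
+    Example (illustrative) of the less common input forms, a binary file object
+    plus node_set; the file name, dataset name, and node identifiers below are
+    placeholders, not required values:
+
+    ```python
+    import cognee
+
+    # Stream a local file and tag it with node identifiers for later filtering
+    with open("notes.txt", "rb") as file_handle:
+        await cognee.add(
+            data=file_handle,
+            dataset_name="meeting_notes",
+            node_set=["project_alpha", "q3_planning"],
+        )
+    ```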
+ + Returns: + PipelineRunInfo: Information about the ingestion pipeline execution including: + - Pipeline run ID for tracking + - Dataset ID where data was stored + - Processing status and any errors + - Execution timestamps and metadata + + Next Steps: + After successfully adding data, call `cognify()` to process the ingested content: + + ```python + import cognee + + # Step 1: Add your data (text content or file path) + await cognee.add("Your document content") # Raw text + # OR + await cognee.add("/path/to/your/file.pdf") # File path + + # Step 2: Process into knowledge graph + await cognee.cognify() + + # Step 3: Search and query + results = await cognee.search("What insights can you find?") + ``` + + Example Usage: + ```python + # Add a single text document + await cognee.add("Natural language processing is a field of AI...") + + # Add multiple files with different path formats + await cognee.add([ + "/absolute/path/to/research_paper.pdf", # Absolute path + "file://relative/path/to/dataset.csv", # Relative file URL + "file:///absolute/path/to/report.docx", # Absolute file URL + "s3://my-bucket/documents/data.json", # S3 path + "Additional context text" # Raw text content + ]) + + # Add to a specific dataset + await cognee.add( + data="Project documentation content", + dataset_name="project_docs" + ) + + # Add a single file + await cognee.add("/home/user/documents/analysis.pdf") + ``` + + Environment Variables: + Required: + - LLM_API_KEY: API key for your LLM provider (OpenAI, Anthropic, etc.) + + Optional: + - LLM_PROVIDER: "openai" (default), "anthropic", "gemini", "ollama" + - LLM_MODEL: Model name (default: "gpt-4o-mini") + - DEFAULT_USER_EMAIL: Custom default user email + - DEFAULT_USER_PASSWORD: Custom default user password + - VECTOR_DB_PROVIDER: "lancedb" (default), "chromadb", "qdrant", "weaviate" + - GRAPH_DATABASE_PROVIDER: "kuzu" (default), "neo4j", "networkx" + + Raises: + FileNotFoundError: If specified file paths don't exist + PermissionError: If user lacks access to files or dataset + UnsupportedFileTypeError: If file format cannot be processed + InvalidValueError: If LLM_API_KEY is not set or invalid + """ tasks = [ Task(resolve_data_directories), Task(ingest_data, dataset_name, user, node_set, dataset_id), diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py index a3cf645d3..bed200e13 100644 --- a/cognee/api/v1/cognify/cognify.py +++ b/cognee/api/v1/cognify/cognify.py @@ -39,6 +39,151 @@ async def cognify( graph_db_config: dict = None, run_in_background: bool = False, ): + """ + Transform ingested data into a structured knowledge graph. + + This is the core processing step in Cognee that converts raw text and documents + into an intelligent knowledge graph. It analyzes content, extracts entities and + relationships, and creates semantic connections for enhanced search and reasoning. 
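+
+    A minimal configuration sketch (the API key value is a placeholder; provider
+    and model are optional overrides whose defaults are listed under Environment
+    Variables below). In practice these are usually set in your shell or .env
+    file before the process starts:
+
+    ```python
+    import os
+
+    os.environ["LLM_API_KEY"] = "<your-llm-api-key>"  # required
+    os.environ["LLM_PROVIDER"] = "openai"             # optional, default provider
+    os.environ["LLM_MODEL"] = "gpt-4o-mini"           # optional, default model
+    ```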
+ + Prerequisites: + - **LLM_API_KEY**: Must be configured (required for entity extraction and graph generation) + - **Data Added**: Must have data previously added via `cognee.add()` + - **Vector Database**: Must be accessible for embeddings storage + - **Graph Database**: Must be accessible for relationship storage + + Input Requirements: + - **Datasets**: Must contain data previously added via `cognee.add()` + - **Content Types**: Works with any text-extractable content including: + * Natural language documents + * Structured data (CSV, JSON) + * Code repositories + * Academic papers and technical documentation + * Mixed multimedia content (with text extraction) + + Processing Pipeline: + 1. **Document Classification**: Identifies document types and structures + 2. **Permission Validation**: Ensures user has processing rights + 3. **Text Chunking**: Breaks content into semantically meaningful segments + 4. **Entity Extraction**: Identifies key concepts, people, places, organizations + 5. **Relationship Detection**: Discovers connections between entities + 6. **Graph Construction**: Builds semantic knowledge graph with embeddings + 7. **Content Summarization**: Creates hierarchical summaries for navigation + + Graph Model Customization: + The `graph_model` parameter allows custom knowledge structures: + - **Default**: General-purpose KnowledgeGraph for any domain + - **Custom Models**: Domain-specific schemas (e.g., scientific papers, code analysis) + - **Ontology Integration**: Use `ontology_file_path` for predefined vocabularies + + Args: + datasets: Dataset name(s) or dataset uuid to process. Processes all available data if None. + - Single dataset: "my_dataset" + - Multiple datasets: ["docs", "research", "reports"] + - None: Process all datasets for the user + user: User context for authentication and data access. Uses default if None. + graph_model: Pydantic model defining the knowledge graph structure. + Defaults to KnowledgeGraph for general-purpose processing. + chunker: Text chunking strategy (TextChunker, LangchainChunker). + - TextChunker: Paragraph-based chunking (default, most reliable) + - LangchainChunker: Recursive character splitting with overlap + Determines how documents are segmented for processing. + chunk_size: Maximum tokens per chunk. Auto-calculated based on LLM if None. + Formula: min(embedding_max_tokens, llm_max_tokens // 2) + Default limits: ~512-8192 tokens depending on models. + Smaller chunks = more granular but potentially fragmented knowledge. + ontology_file_path: Path to RDF/OWL ontology file for domain-specific entity types. + Useful for specialized fields like medical or legal documents. + vector_db_config: Custom vector database configuration for embeddings storage. + graph_db_config: Custom graph database configuration for relationship storage. + run_in_background: If True, starts processing asynchronously and returns immediately. + If False, waits for completion before returning. + Background mode recommended for large datasets (>100MB). + Use pipeline_run_id from return value to monitor progress. 
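+
+    Example (illustrative): tuning chunking for a single dataset. The dataset
+    name and chunk_size value below are samples only; pass chunker=TextChunker
+    or chunker=LangchainChunker (imported from your cognee installation) if you
+    need a non-default chunking strategy.
+
+    ```python
+    import cognee
+
+    await cognee.cognify(
+        datasets=["project_docs"],
+        chunk_size=1024,  # smaller chunks -> more granular, potentially fragmented graph
+    )
+    ```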
+ + Returns: + Union[dict, list[PipelineRunInfo]]: + - **Blocking mode**: Dictionary mapping dataset_id -> PipelineRunInfo with: + * Processing status (completed/failed/in_progress) + * Extracted entity and relationship counts + * Processing duration and resource usage + * Error details if any failures occurred + - **Background mode**: List of PipelineRunInfo objects for tracking progress + * Use pipeline_run_id to monitor status + * Check completion via pipeline monitoring APIs + + Next Steps: + After successful cognify processing, use search functions to query the knowledge: + + ```python + import cognee + from cognee import SearchType + + # Process your data into knowledge graph + await cognee.cognify() + + # Query for insights using different search types: + + # 1. Natural language completion with graph context + insights = await cognee.search( + "What are the main themes?", + query_type=SearchType.GRAPH_COMPLETION + ) + + # 2. Get entity relationships and connections + relationships = await cognee.search( + "connections between concepts", + query_type=SearchType.INSIGHTS + ) + + # 3. Find relevant document chunks + chunks = await cognee.search( + "specific topic", + query_type=SearchType.CHUNKS + ) + ``` + + Advanced Usage: + ```python + # Custom domain model for scientific papers + class ScientificPaper(DataPoint): + title: str + authors: List[str] + methodology: str + findings: List[str] + + await cognee.cognify( + datasets=["research_papers"], + graph_model=ScientificPaper, + ontology_file_path="scientific_ontology.owl" + ) + + # Background processing for large datasets + run_info = await cognee.cognify( + datasets=["large_corpus"], + run_in_background=True + ) + # Check status later with run_info.pipeline_run_id + ``` + + + Environment Variables: + Required: + - LLM_API_KEY: API key for your LLM provider + + Optional (same as add function): + - LLM_PROVIDER, LLM_MODEL, VECTOR_DB_PROVIDER, GRAPH_DATABASE_PROVIDER + - LLM_RATE_LIMIT_ENABLED: Enable rate limiting (default: False) + - LLM_RATE_LIMIT_REQUESTS: Max requests per interval (default: 60) + + Raises: + DatasetNotFoundError: If specified datasets don't exist + PermissionError: If user lacks processing rights + InvalidValueError: If LLM_API_KEY is not set + OntologyParsingError: If ontology file is malformed + ValueError: If chunks exceed max token limits (reduce chunk_size) + DatabaseNotCreatedError: If databases are not properly initialized + """ tasks = await get_default_tasks(user, graph_model, chunker, chunk_size, ontology_file_path) if run_in_background: diff --git a/cognee/api/v1/search/search.py b/cognee/api/v1/search/search.py index 75873dc88..eb245f545 100644 --- a/cognee/api/v1/search/search.py +++ b/cognee/api/v1/search/search.py @@ -20,6 +20,142 @@ async def search( node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, ) -> list: + """ + Search and query the knowledge graph for insights, information, and connections. + + This is the final step in the Cognee workflow that retrieves information from the + processed knowledge graph. It supports multiple search modes optimized for different + use cases - from simple fact retrieval to complex reasoning and code analysis. 
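+
+    Quick example (query text is illustrative):
+
+    ```python
+    import cognee
+    from cognee import SearchType
+
+    # Default mode (GRAPH_COMPLETION): conversational answer grounded in the graph
+    answers = await cognee.search("What are the key findings across my documents?")
+
+    # Fast chunk retrieval without LLM reasoning
+    passages = await cognee.search(
+        "neural network training tips",
+        query_type=SearchType.CHUNKS,
+    )
+    ```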
+ + Search Prerequisites: + - **LLM_API_KEY**: Required for GRAPH_COMPLETION and RAG_COMPLETION search types + - **Data Added**: Must have data previously added via `cognee.add()` + - **Knowledge Graph Built**: Must have processed data via `cognee.cognify()` + - **Dataset Permissions**: User must have 'read' permission on target datasets + - **Vector Database**: Must be accessible for semantic search functionality + + Search Types & Use Cases: + + **GRAPH_COMPLETION** (Default - Recommended): + Natural language Q&A using full graph context and LLM reasoning. + Best for: Complex questions, analysis, summaries, insights. + Returns: Conversational AI responses with graph-backed context. + + **RAG_COMPLETION**: + Traditional RAG using document chunks without graph structure. + Best for: Direct document retrieval, specific fact-finding. + Returns: LLM responses based on relevant text chunks. + + **INSIGHTS**: + Structured entity relationships and semantic connections. + Best for: Understanding concept relationships, knowledge mapping. + Returns: Formatted relationship data and entity connections. + + **CHUNKS**: + Raw text segments that match the query semantically. + Best for: Finding specific passages, citations, exact content. + Returns: Ranked list of relevant text chunks with metadata. + + **SUMMARIES**: + Pre-generated hierarchical summaries of content. + Best for: Quick overviews, document abstracts, topic summaries. + Returns: Multi-level summaries from detailed to high-level. + + **CODE**: + Code-specific search with syntax and semantic understanding. + Best for: Finding functions, classes, implementation patterns. + Returns: Structured code information with context and relationships. + + **CYPHER**: + Direct graph database queries using Cypher syntax. + Best for: Advanced users, specific graph traversals, debugging. + Returns: Raw graph query results. + + Args: + query_text: Your question or search query in natural language. + Examples: + - "What are the main themes in this research?" + - "How do these concepts relate to each other?" + - "Find information about machine learning algorithms" + - "What functions handle user authentication?" + + query_type: SearchType enum specifying the search mode. + Defaults to GRAPH_COMPLETION for conversational AI responses. + + user: User context for data access permissions. Uses default if None. + + datasets: Dataset name(s) to search within. Searches all accessible if None. + - Single dataset: "research_papers" + - Multiple datasets: ["docs", "reports", "analysis"] + - None: Search across all user datasets + + dataset_ids: Alternative to datasets - use specific UUID identifiers. + + system_prompt_path: Custom system prompt file for LLM-based search types. + Defaults to "answer_simple_question.txt". + + top_k: Maximum number of results to return (1-N) + Higher values provide more comprehensive but potentially noisy results. + + node_type: Filter results to specific entity types (for advanced filtering). + + node_name: Filter results to specific named entities (for targeted search). 
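+
+    Example (illustrative): scoping a search to specific datasets with a wider
+    result window and an entity-name filter. The dataset names and entity name
+    below are placeholders.
+
+    ```python
+    results = await cognee.search(
+        query_text="How is user authentication handled?",
+        query_type=SearchType.GRAPH_COMPLETION,
+        datasets=["backend_docs", "api_reference"],
+        top_k=20,                    # wider result window for broader context
+        node_name=["AuthService"],   # restrict to a named entity, if present
+    )
+    ```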
+ + Returns: + list: Search results in format determined by query_type: + + **GRAPH_COMPLETION/RAG_COMPLETION**: + [List of conversational AI response strings] + + **INSIGHTS**: + [List of formatted relationship descriptions and entity connections] + + **CHUNKS**: + [List of relevant text passages with source metadata] + + **SUMMARIES**: + [List of hierarchical summaries from general to specific] + + **CODE**: + [List of structured code information with context] + + + + + + Performance & Optimization: + - **GRAPH_COMPLETION**: Slower but most intelligent, uses LLM + graph context + - **RAG_COMPLETION**: Medium speed, uses LLM + document chunks (no graph traversal) + - **INSIGHTS**: Fast, returns structured relationships without LLM processing + - **CHUNKS**: Fastest, pure vector similarity search without LLM + - **SUMMARIES**: Fast, returns pre-computed summaries + - **CODE**: Medium speed, specialized for code understanding + - **top_k**: Start with 10, increase for comprehensive analysis (max 100) + - **datasets**: Specify datasets to improve speed and relevance + + Next Steps After Search: + - Use results for further analysis or application integration + - Combine different search types for comprehensive understanding + - Export insights for reporting or downstream processing + - Iterate with refined queries based on initial results + + Environment Variables: + Required for LLM-based search types (GRAPH_COMPLETION, RAG_COMPLETION): + - LLM_API_KEY: API key for your LLM provider + + Optional: + - LLM_PROVIDER, LLM_MODEL: Configure LLM for search responses + - VECTOR_DB_PROVIDER: Must match what was used during cognify + - GRAPH_DATABASE_PROVIDER: Must match what was used during cognify + + Raises: + DatasetNotFoundError: If specified datasets don't exist or aren't accessible + PermissionDeniedError: If user lacks read access to requested datasets + NoDataError: If no relevant data found for the search query + InvalidValueError: If LLM_API_KEY is not set (for LLM-based search types) + ValueError: If query_text is empty or search parameters are invalid + CollectionNotFoundError: If vector collection not found (data not processed) + """ # We use lists from now on for datasets if isinstance(datasets, UUID) or isinstance(datasets, str): datasets = [datasets]