From ada3f7b0869c63ef780b580e327f9f2e4c5e77dc Mon Sep 17 00:00:00 2001 From: Vasilije <8619304+Vasilije1990@users.noreply.github.com> Date: Thu, 3 Jul 2025 20:08:27 +0200 Subject: [PATCH 1/2] fix: Logger suppression and database logs (#1041) ## Description ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. --------- Co-authored-by: Igor Ilic <30923996+dexters1@users.noreply.github.com> Co-authored-by: Igor Ilic --- alembic.ini | 2 +- cognee-mcp/entrypoint.sh | 26 ++++++-- cognee-mcp/src/server.py | 1 + cognee/infrastructure/llm/gemini/adapter.py | 1 + .../llm/generic_llm_api/adapter.py | 2 + cognee/infrastructure/llm/utils.py | 9 ++- .../document_types/open_data_file.py | 16 ++++- cognee/shared/logging_utils.py | 61 +++++++++++++++++++ .../unit/modules/data/test_open_data_file.py | 14 +++-- 9 files changed, 117 insertions(+), 15 deletions(-) diff --git a/alembic.ini b/alembic.ini index e7cb55ee6..15cd939b3 100644 --- a/alembic.ini +++ b/alembic.ini @@ -102,7 +102,7 @@ handlers = qualname = sqlalchemy.engine [logger_alembic] -level = INFO +level = WARN handlers = qualname = alembic diff --git a/cognee-mcp/entrypoint.sh b/cognee-mcp/entrypoint.sh index 91a88c572..7a7cf70b8 100644 --- a/cognee-mcp/entrypoint.sh +++ b/cognee-mcp/entrypoint.sh @@ -4,6 +4,10 @@ set -e # Exit on error echo "Debug mode: $DEBUG" echo "Environment: $ENVIRONMENT" +# Set default transport mode if not specified +TRANSPORT_MODE=${TRANSPORT_MODE:-"stdio"} +echo "Transport mode: $TRANSPORT_MODE" + # Run Alembic migrations with proper error handling. # Note on UserAlreadyExists error handling: # During database migrations, we attempt to create a default user. If this user @@ -28,19 +32,31 @@ fi echo "Database migrations done." -echo "Starting Cognee MCP Server..." +echo "Starting Cognee MCP Server with transport mode: $TRANSPORT_MODE" # Add startup delay to ensure DB is ready sleep 2 -# Modified Gunicorn startup with error handling +# Modified startup with transport mode selection and error handling if [ "$ENVIRONMENT" = "dev" ] || [ "$ENVIRONMENT" = "local" ]; then if [ "$DEBUG" = "true" ]; then echo "Waiting for the debugger to attach..." 
- exec python -m debugpy --wait-for-client --listen 0.0.0.0:5678 -m cognee + if [ "$TRANSPORT_MODE" = "sse" ]; then + exec python -m debugpy --wait-for-client --listen 0.0.0.0:5678 -m cognee --transport sse + else + exec python -m debugpy --wait-for-client --listen 0.0.0.0:5678 -m cognee --transport stdio + fi else - exec cognee + if [ "$TRANSPORT_MODE" = "sse" ]; then + exec cognee --transport sse + else + exec cognee --transport stdio + fi fi else - exec cognee + if [ "$TRANSPORT_MODE" = "sse" ]; then + exec cognee --transport sse + else + exec cognee --transport stdio + fi fi diff --git a/cognee-mcp/src/server.py b/cognee-mcp/src/server.py index def3512b1..5a0a36b5a 100755 --- a/cognee-mcp/src/server.py +++ b/cognee-mcp/src/server.py @@ -18,6 +18,7 @@ from cognee.modules.search.types import SearchType from cognee.shared.data_models import KnowledgeGraph from cognee.modules.storage.utils import JSONEncoder + try: from codingagents.coding_rule_associations import ( add_rule_associations, diff --git a/cognee/infrastructure/llm/gemini/adapter.py b/cognee/infrastructure/llm/gemini/adapter.py index d141d1c84..db11a5ab4 100644 --- a/cognee/infrastructure/llm/gemini/adapter.py +++ b/cognee/infrastructure/llm/gemini/adapter.py @@ -1,4 +1,5 @@ import litellm +import logging from pydantic import BaseModel from typing import Type, Optional from litellm import acompletion, JSONSchemaValidationError diff --git a/cognee/infrastructure/llm/generic_llm_api/adapter.py b/cognee/infrastructure/llm/generic_llm_api/adapter.py index 9c00054f6..e74a4eb03 100644 --- a/cognee/infrastructure/llm/generic_llm_api/adapter.py +++ b/cognee/infrastructure/llm/generic_llm_api/adapter.py @@ -1,5 +1,6 @@ """Adapter for Generic API LLM provider API""" +import logging from typing import Type from pydantic import BaseModel @@ -7,6 +8,7 @@ import instructor from cognee.infrastructure.llm.llm_interface import LLMInterface from cognee.infrastructure.llm.config import get_llm_config from cognee.infrastructure.llm.rate_limiter import rate_limit_async, sleep_and_retry_async +from cognee.shared.logging_utils import get_logger import litellm diff --git a/cognee/infrastructure/llm/utils.py b/cognee/infrastructure/llm/utils.py index d25e41326..fdc8c521c 100644 --- a/cognee/infrastructure/llm/utils.py +++ b/cognee/infrastructure/llm/utils.py @@ -1,8 +1,7 @@ -from cognee.shared.logging_utils import get_logger import litellm -from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.llm.get_llm_client import get_llm_client +from cognee.shared.logging_utils import get_logger logger = get_logger() @@ -22,6 +21,9 @@ def get_max_chunk_tokens(): the smaller value of the embedding engine's max tokens and half of the LLM's maximum tokens. """ + # NOTE: Import must be done in function to avoid circular import issue + from cognee.infrastructure.databases.vector import get_vector_engine + # Calculate max chunk size based on the following formula embedding_engine = get_vector_engine().embedding_engine llm_client = get_llm_client() @@ -93,6 +95,9 @@ async def test_embedding_connection(): the exception if the connection to the embedding handler cannot be established. 
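+    Example (illustrative call; assumes an embedding engine is already configured):
+
+        await test_embedding_connection()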
""" try: + # NOTE: Vector engine import must be done in function to avoid circular import issue + from cognee.infrastructure.databases.vector import get_vector_engine + await get_vector_engine().embedding_engine.embed_text("test") except Exception as e: logger.error(e) diff --git a/cognee/modules/data/processing/document_types/open_data_file.py b/cognee/modules/data/processing/document_types/open_data_file.py index 34a8b098a..4190f4420 100644 --- a/cognee/modules/data/processing/document_types/open_data_file.py +++ b/cognee/modules/data/processing/document_types/open_data_file.py @@ -1,4 +1,6 @@ from typing import IO, Optional +from urllib.parse import urlparse +import os from cognee.api.v1.add.config import get_s3_config @@ -24,8 +26,16 @@ def open_data_file( else: return fs.open(file_path, mode=mode, encoding=encoding, **kwargs) elif file_path.startswith("file://"): - # Handle local file URLs by stripping the file:// prefix - file_path = file_path.replace("file://", "", 1) - return open(file_path, mode=mode, encoding=encoding, **kwargs) + # Handle local file URLs by properly parsing the URI + parsed_url = urlparse(file_path) + # On Windows, urlparse handles drive letters correctly + # Convert the path component to a proper file path + if os.name == "nt": # Windows + # Remove leading slash from Windows paths like /C:/Users/... + local_path = parsed_url.path.lstrip("/") + else: # Unix-like systems + local_path = parsed_url.path + + return open(local_path, mode=mode, encoding=encoding, **kwargs) else: return open(file_path, mode=mode, encoding=encoding, **kwargs) diff --git a/cognee/shared/logging_utils.py b/cognee/shared/logging_utils.py index 16084eac6..989bcba64 100644 --- a/cognee/shared/logging_utils.py +++ b/cognee/shared/logging_utils.py @@ -11,6 +11,23 @@ import importlib.metadata from cognee import __version__ as cognee_version from typing import Protocol + +# Configure external library logging +def configure_external_library_logging(): + """Configure logging for external libraries to reduce verbosity""" + # Configure LiteLLM logging to reduce verbosity + try: + import litellm + + litellm.set_verbose = False + + # Suppress LiteLLM ERROR logging using standard logging + logging.getLogger("litellm").setLevel(logging.CRITICAL) + except ImportError: + # LiteLLM not available, skip configuration + pass + + # Export common log levels DEBUG = logging.DEBUG INFO = logging.INFO @@ -148,6 +165,44 @@ def get_logger(name=None, level=None) -> LoggerInterface: return logger +def log_database_configuration(logger): + """Log the current database configuration for all database types""" + # NOTE: Has to be imporated at runtime to avoid circular import + from cognee.infrastructure.databases.relational.config import get_relational_config + from cognee.infrastructure.databases.vector.config import get_vectordb_config + from cognee.infrastructure.databases.graph.config import get_graph_config + + try: + # Log relational database configuration + relational_config = get_relational_config() + logger.info(f"Relational database: {relational_config.db_provider}") + if relational_config.db_provider == "postgres": + logger.info(f"Postgres host: {relational_config.db_host}:{relational_config.db_port}") + logger.info(f"Postgres database: {relational_config.db_name}") + elif relational_config.db_provider == "sqlite": + logger.info(f"SQLite path: {relational_config.db_path}") + logger.info(f"SQLite database: {relational_config.db_name}") + + # Log vector database configuration + vector_config = 
get_vectordb_config() + logger.info(f"Vector database: {vector_config.vector_db_provider}") + if vector_config.vector_db_provider == "lancedb": + logger.info(f"Vector database path: {vector_config.vector_db_url}") + else: + logger.info(f"Vector database URL: {vector_config.vector_db_url}") + + # Log graph database configuration + graph_config = get_graph_config() + logger.info(f"Graph database: {graph_config.graph_database_provider}") + if graph_config.graph_database_provider == "kuzu": + logger.info(f"Graph database path: {graph_config.graph_file_path}") + else: + logger.info(f"Graph database URL: {graph_config.graph_database_url}") + + except Exception as e: + logger.warning(f"Could not retrieve database configuration: {str(e)}") + + def cleanup_old_logs(logs_dir, max_files): """ Removes old log files, keeping only the most recent ones. @@ -193,6 +248,9 @@ def setup_logging(log_level=None, name=None): log_level = log_level if log_level else log_levels[os.getenv("LOG_LEVEL", "INFO")] + # Configure external library logging early to suppress verbose output + configure_external_library_logging() + def exception_handler(logger, method_name, event_dict): """Custom processor to handle uncaught exceptions.""" # Check if there's an exc_info that needs to be processed @@ -339,6 +397,9 @@ def setup_logging(log_level=None, name=None): logger.info("Want to learn more? Visit the Cognee documentation: https://docs.cognee.ai") + # Log database configuration + log_database_configuration(logger) + # Return the configured logger return logger diff --git a/cognee/tests/unit/modules/data/test_open_data_file.py b/cognee/tests/unit/modules/data/test_open_data_file.py index eea402aa3..8ad3ec813 100644 --- a/cognee/tests/unit/modules/data/test_open_data_file.py +++ b/cognee/tests/unit/modules/data/test_open_data_file.py @@ -1,6 +1,7 @@ import os import tempfile import pytest +from pathlib import Path from cognee.modules.data.processing.document_types.open_data_file import open_data_file @@ -29,7 +30,8 @@ class TestOpenDataFile: temp_file_path = f.name try: - file_url = f"file://{temp_file_path}" + # Use pathlib.Path.as_uri() for proper cross-platform file URL creation + file_url = Path(temp_file_path).as_uri() with open_data_file(file_url, mode="r") as f: content = f.read() assert content == test_content @@ -44,7 +46,8 @@ class TestOpenDataFile: temp_file_path = f.name try: - file_url = f"file://{temp_file_path}" + # Use pathlib.Path.as_uri() for proper cross-platform file URL creation + file_url = Path(temp_file_path).as_uri() with open_data_file(file_url, mode="rb") as f: content = f.read() assert content == test_content.encode() @@ -61,7 +64,8 @@ class TestOpenDataFile: temp_file_path = f.name try: - file_url = f"file://{temp_file_path}" + # Use pathlib.Path.as_uri() for proper cross-platform file URL creation + file_url = Path(temp_file_path).as_uri() with open_data_file(file_url, mode="r", encoding="utf-8") as f: content = f.read() assert content == test_content @@ -84,7 +88,9 @@ class TestOpenDataFile: try: # Even if someone accidentally adds multiple file:// prefixes - file_url = f"file://file://{temp_file_path}" + # Use proper file URL creation first + proper_file_url = Path(temp_file_path).as_uri() + file_url = f"file://{proper_file_url}" with open_data_file(file_url, mode="r") as f: content = f.read() # This should work because we only replace the first occurrence From c936f5e0a30e8009d21c5dd09ae9dd46e5dfd9c3 Mon Sep 17 00:00:00 2001 From: Vasilije <8619304+Vasilije1990@users.noreply.github.com> Date: 
Thu, 3 Jul 2025 21:24:47 +0200 Subject: [PATCH 2/2] feat: adding docstrings (#1045) ## Description ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. --- cognee/api/v1/add/add.py | 122 ++++++++++++++++++++++++++ cognee/api/v1/cognify/cognify.py | 145 +++++++++++++++++++++++++++++++ cognee/api/v1/search/search.py | 136 +++++++++++++++++++++++++++++ 3 files changed, 403 insertions(+) diff --git a/cognee/api/v1/add/add.py b/cognee/api/v1/add/add.py index e1dafce5f..3fd480601 100644 --- a/cognee/api/v1/add/add.py +++ b/cognee/api/v1/add/add.py @@ -16,6 +16,128 @@ async def add( graph_db_config: dict = None, dataset_id: UUID = None, ): + """ + Add data to Cognee for knowledge graph processing. + + This is the first step in the Cognee workflow - it ingests raw data and prepares it + for processing. The function accepts various data formats including text, files, and + binary streams, then stores them in a specified dataset for further processing. + + Prerequisites: + - **LLM_API_KEY**: Must be set in environment variables for content processing + - **Database Setup**: Relational and vector databases must be configured + - **User Authentication**: Uses default user if none provided (created automatically) + + Supported Input Types: + - **Text strings**: Direct text content (str) - any string not starting with "/" or "file://" + - **File paths**: Local file paths as strings in these formats: + * Absolute paths: "/path/to/document.pdf" + * File URLs: "file:///path/to/document.pdf" or "file://relative/path.txt" + * S3 paths: "s3://bucket-name/path/to/file.pdf" + - **Binary file objects**: File handles/streams (BinaryIO) + - **Lists**: Multiple files or text strings in a single call + + Supported File Formats: + - Text files (.txt, .md, .csv) + - PDFs (.pdf) + - Images (.png, .jpg, .jpeg) - extracted via OCR/vision models + - Audio files (.mp3, .wav) - transcribed to text + - Code files (.py, .js, .ts, etc.) - parsed for structure and content + - Office documents (.docx, .pptx) + + Workflow: + 1. **Data Resolution**: Resolves file paths and validates accessibility + 2. **Content Extraction**: Extracts text content from various file formats + 3. **Dataset Storage**: Stores processed content in the specified dataset + 4. **Metadata Tracking**: Records file metadata, timestamps, and user permissions + 5. **Permission Assignment**: Grants user read/write/delete/share permissions on dataset + + Args: + data: The data to ingest. Can be: + - Single text string: "Your text content here" + - Absolute file path: "/path/to/document.pdf" + - File URL: "file:///absolute/path/to/document.pdf" or "file://relative/path.txt" + - S3 path: "s3://my-bucket/documents/file.pdf" + - List of mixed types: ["text content", "/path/file.pdf", "file://doc.txt", file_handle] + - Binary file object: open("file.txt", "rb") + dataset_name: Name of the dataset to store data in. Defaults to "main_dataset". + Create separate datasets to organize different knowledge domains. + user: User object for authentication and permissions. Uses default user if None. + Default user: "default_user@example.com" (created automatically on first use). + Users can only access datasets they have permissions for. + node_set: Optional list of node identifiers for graph organization and access control. + Used for grouping related data points in the knowledge graph. + vector_db_config: Optional configuration for vector database (for custom setups). 
+ graph_db_config: Optional configuration for graph database (for custom setups). + dataset_id: Optional specific dataset UUID to use instead of dataset_name. + + Returns: + PipelineRunInfo: Information about the ingestion pipeline execution including: + - Pipeline run ID for tracking + - Dataset ID where data was stored + - Processing status and any errors + - Execution timestamps and metadata + + Next Steps: + After successfully adding data, call `cognify()` to process the ingested content: + + ```python + import cognee + + # Step 1: Add your data (text content or file path) + await cognee.add("Your document content") # Raw text + # OR + await cognee.add("/path/to/your/file.pdf") # File path + + # Step 2: Process into knowledge graph + await cognee.cognify() + + # Step 3: Search and query + results = await cognee.search("What insights can you find?") + ``` + + Example Usage: + ```python + # Add a single text document + await cognee.add("Natural language processing is a field of AI...") + + # Add multiple files with different path formats + await cognee.add([ + "/absolute/path/to/research_paper.pdf", # Absolute path + "file://relative/path/to/dataset.csv", # Relative file URL + "file:///absolute/path/to/report.docx", # Absolute file URL + "s3://my-bucket/documents/data.json", # S3 path + "Additional context text" # Raw text content + ]) + + # Add to a specific dataset + await cognee.add( + data="Project documentation content", + dataset_name="project_docs" + ) + + # Add a single file + await cognee.add("/home/user/documents/analysis.pdf") + ``` + + Environment Variables: + Required: + - LLM_API_KEY: API key for your LLM provider (OpenAI, Anthropic, etc.) + + Optional: + - LLM_PROVIDER: "openai" (default), "anthropic", "gemini", "ollama" + - LLM_MODEL: Model name (default: "gpt-4o-mini") + - DEFAULT_USER_EMAIL: Custom default user email + - DEFAULT_USER_PASSWORD: Custom default user password + - VECTOR_DB_PROVIDER: "lancedb" (default), "chromadb", "qdrant", "weaviate" + - GRAPH_DATABASE_PROVIDER: "kuzu" (default), "neo4j", "networkx" + + Raises: + FileNotFoundError: If specified file paths don't exist + PermissionError: If user lacks access to files or dataset + UnsupportedFileTypeError: If file format cannot be processed + InvalidValueError: If LLM_API_KEY is not set or invalid + """ tasks = [ Task(resolve_data_directories), Task(ingest_data, dataset_name, user, node_set, dataset_id), diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py index a3cf645d3..bed200e13 100644 --- a/cognee/api/v1/cognify/cognify.py +++ b/cognee/api/v1/cognify/cognify.py @@ -39,6 +39,151 @@ async def cognify( graph_db_config: dict = None, run_in_background: bool = False, ): + """ + Transform ingested data into a structured knowledge graph. + + This is the core processing step in Cognee that converts raw text and documents + into an intelligent knowledge graph. It analyzes content, extracts entities and + relationships, and creates semantic connections for enhanced search and reasoning. 
+ + Prerequisites: + - **LLM_API_KEY**: Must be configured (required for entity extraction and graph generation) + - **Data Added**: Must have data previously added via `cognee.add()` + - **Vector Database**: Must be accessible for embeddings storage + - **Graph Database**: Must be accessible for relationship storage + + Input Requirements: + - **Datasets**: Must contain data previously added via `cognee.add()` + - **Content Types**: Works with any text-extractable content including: + * Natural language documents + * Structured data (CSV, JSON) + * Code repositories + * Academic papers and technical documentation + * Mixed multimedia content (with text extraction) + + Processing Pipeline: + 1. **Document Classification**: Identifies document types and structures + 2. **Permission Validation**: Ensures user has processing rights + 3. **Text Chunking**: Breaks content into semantically meaningful segments + 4. **Entity Extraction**: Identifies key concepts, people, places, organizations + 5. **Relationship Detection**: Discovers connections between entities + 6. **Graph Construction**: Builds semantic knowledge graph with embeddings + 7. **Content Summarization**: Creates hierarchical summaries for navigation + + Graph Model Customization: + The `graph_model` parameter allows custom knowledge structures: + - **Default**: General-purpose KnowledgeGraph for any domain + - **Custom Models**: Domain-specific schemas (e.g., scientific papers, code analysis) + - **Ontology Integration**: Use `ontology_file_path` for predefined vocabularies + + Args: + datasets: Dataset name(s) or dataset uuid to process. Processes all available data if None. + - Single dataset: "my_dataset" + - Multiple datasets: ["docs", "research", "reports"] + - None: Process all datasets for the user + user: User context for authentication and data access. Uses default if None. + graph_model: Pydantic model defining the knowledge graph structure. + Defaults to KnowledgeGraph for general-purpose processing. + chunker: Text chunking strategy (TextChunker, LangchainChunker). + - TextChunker: Paragraph-based chunking (default, most reliable) + - LangchainChunker: Recursive character splitting with overlap + Determines how documents are segmented for processing. + chunk_size: Maximum tokens per chunk. Auto-calculated based on LLM if None. + Formula: min(embedding_max_tokens, llm_max_tokens // 2) + Default limits: ~512-8192 tokens depending on models. + Smaller chunks = more granular but potentially fragmented knowledge. + ontology_file_path: Path to RDF/OWL ontology file for domain-specific entity types. + Useful for specialized fields like medical or legal documents. + vector_db_config: Custom vector database configuration for embeddings storage. + graph_db_config: Custom graph database configuration for relationship storage. + run_in_background: If True, starts processing asynchronously and returns immediately. + If False, waits for completion before returning. + Background mode recommended for large datasets (>100MB). + Use pipeline_run_id from return value to monitor progress. 
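+    Chunk sizing intuition (illustrative sketch; the numbers below are assumed model limits,
+    not cognee defaults):
+
+    ```python
+    # Sketch of the default chunk_size derivation described above
+    embedding_max_tokens = 8191   # assumed embedding model token limit
+    llm_max_tokens = 128_000      # assumed LLM context window
+    chunk_size = min(embedding_max_tokens, llm_max_tokens // 2)  # -> 8191
+    ```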
+ + Returns: + Union[dict, list[PipelineRunInfo]]: + - **Blocking mode**: Dictionary mapping dataset_id -> PipelineRunInfo with: + * Processing status (completed/failed/in_progress) + * Extracted entity and relationship counts + * Processing duration and resource usage + * Error details if any failures occurred + - **Background mode**: List of PipelineRunInfo objects for tracking progress + * Use pipeline_run_id to monitor status + * Check completion via pipeline monitoring APIs + + Next Steps: + After successful cognify processing, use search functions to query the knowledge: + + ```python + import cognee + from cognee import SearchType + + # Process your data into knowledge graph + await cognee.cognify() + + # Query for insights using different search types: + + # 1. Natural language completion with graph context + insights = await cognee.search( + "What are the main themes?", + query_type=SearchType.GRAPH_COMPLETION + ) + + # 2. Get entity relationships and connections + relationships = await cognee.search( + "connections between concepts", + query_type=SearchType.INSIGHTS + ) + + # 3. Find relevant document chunks + chunks = await cognee.search( + "specific topic", + query_type=SearchType.CHUNKS + ) + ``` + + Advanced Usage: + ```python + # Custom domain model for scientific papers + class ScientificPaper(DataPoint): + title: str + authors: List[str] + methodology: str + findings: List[str] + + await cognee.cognify( + datasets=["research_papers"], + graph_model=ScientificPaper, + ontology_file_path="scientific_ontology.owl" + ) + + # Background processing for large datasets + run_info = await cognee.cognify( + datasets=["large_corpus"], + run_in_background=True + ) + # Check status later with run_info.pipeline_run_id + ``` + + + Environment Variables: + Required: + - LLM_API_KEY: API key for your LLM provider + + Optional (same as add function): + - LLM_PROVIDER, LLM_MODEL, VECTOR_DB_PROVIDER, GRAPH_DATABASE_PROVIDER + - LLM_RATE_LIMIT_ENABLED: Enable rate limiting (default: False) + - LLM_RATE_LIMIT_REQUESTS: Max requests per interval (default: 60) + + Raises: + DatasetNotFoundError: If specified datasets don't exist + PermissionError: If user lacks processing rights + InvalidValueError: If LLM_API_KEY is not set + OntologyParsingError: If ontology file is malformed + ValueError: If chunks exceed max token limits (reduce chunk_size) + DatabaseNotCreatedError: If databases are not properly initialized + """ tasks = await get_default_tasks(user, graph_model, chunker, chunk_size, ontology_file_path) if run_in_background: diff --git a/cognee/api/v1/search/search.py b/cognee/api/v1/search/search.py index 75873dc88..eb245f545 100644 --- a/cognee/api/v1/search/search.py +++ b/cognee/api/v1/search/search.py @@ -20,6 +20,142 @@ async def search( node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, ) -> list: + """ + Search and query the knowledge graph for insights, information, and connections. + + This is the final step in the Cognee workflow that retrieves information from the + processed knowledge graph. It supports multiple search modes optimized for different + use cases - from simple fact retrieval to complex reasoning and code analysis. 
+ + Search Prerequisites: + - **LLM_API_KEY**: Required for GRAPH_COMPLETION and RAG_COMPLETION search types + - **Data Added**: Must have data previously added via `cognee.add()` + - **Knowledge Graph Built**: Must have processed data via `cognee.cognify()` + - **Dataset Permissions**: User must have 'read' permission on target datasets + - **Vector Database**: Must be accessible for semantic search functionality + + Search Types & Use Cases: + + **GRAPH_COMPLETION** (Default - Recommended): + Natural language Q&A using full graph context and LLM reasoning. + Best for: Complex questions, analysis, summaries, insights. + Returns: Conversational AI responses with graph-backed context. + + **RAG_COMPLETION**: + Traditional RAG using document chunks without graph structure. + Best for: Direct document retrieval, specific fact-finding. + Returns: LLM responses based on relevant text chunks. + + **INSIGHTS**: + Structured entity relationships and semantic connections. + Best for: Understanding concept relationships, knowledge mapping. + Returns: Formatted relationship data and entity connections. + + **CHUNKS**: + Raw text segments that match the query semantically. + Best for: Finding specific passages, citations, exact content. + Returns: Ranked list of relevant text chunks with metadata. + + **SUMMARIES**: + Pre-generated hierarchical summaries of content. + Best for: Quick overviews, document abstracts, topic summaries. + Returns: Multi-level summaries from detailed to high-level. + + **CODE**: + Code-specific search with syntax and semantic understanding. + Best for: Finding functions, classes, implementation patterns. + Returns: Structured code information with context and relationships. + + **CYPHER**: + Direct graph database queries using Cypher syntax. + Best for: Advanced users, specific graph traversals, debugging. + Returns: Raw graph query results. + + Args: + query_text: Your question or search query in natural language. + Examples: + - "What are the main themes in this research?" + - "How do these concepts relate to each other?" + - "Find information about machine learning algorithms" + - "What functions handle user authentication?" + + query_type: SearchType enum specifying the search mode. + Defaults to GRAPH_COMPLETION for conversational AI responses. + + user: User context for data access permissions. Uses default if None. + + datasets: Dataset name(s) to search within. Searches all accessible if None. + - Single dataset: "research_papers" + - Multiple datasets: ["docs", "reports", "analysis"] + - None: Search across all user datasets + + dataset_ids: Alternative to datasets - use specific UUID identifiers. + + system_prompt_path: Custom system prompt file for LLM-based search types. + Defaults to "answer_simple_question.txt". + + top_k: Maximum number of results to return (1-N) + Higher values provide more comprehensive but potentially noisy results. + + node_type: Filter results to specific entity types (for advanced filtering). + + node_name: Filter results to specific named entities (for targeted search). 
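+    Example Usage (illustrative sketch; dataset names and top_k values are placeholders):
+    ```python
+    import cognee
+    from cognee import SearchType
+
+    # Conversational answer grounded in the knowledge graph
+    answers = await cognee.search(
+        "How do the main concepts relate to each other?",
+        query_type=SearchType.GRAPH_COMPLETION,
+        datasets=["research_papers"],
+        top_k=10,
+    )
+
+    # Fast retrieval of matching text chunks, no LLM post-processing
+    chunks = await cognee.search("machine learning", query_type=SearchType.CHUNKS)
+    ```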
+ + Returns: + list: Search results in format determined by query_type: + + **GRAPH_COMPLETION/RAG_COMPLETION**: + [List of conversational AI response strings] + + **INSIGHTS**: + [List of formatted relationship descriptions and entity connections] + + **CHUNKS**: + [List of relevant text passages with source metadata] + + **SUMMARIES**: + [List of hierarchical summaries from general to specific] + + **CODE**: + [List of structured code information with context] + + + + + + Performance & Optimization: + - **GRAPH_COMPLETION**: Slower but most intelligent, uses LLM + graph context + - **RAG_COMPLETION**: Medium speed, uses LLM + document chunks (no graph traversal) + - **INSIGHTS**: Fast, returns structured relationships without LLM processing + - **CHUNKS**: Fastest, pure vector similarity search without LLM + - **SUMMARIES**: Fast, returns pre-computed summaries + - **CODE**: Medium speed, specialized for code understanding + - **top_k**: Start with 10, increase for comprehensive analysis (max 100) + - **datasets**: Specify datasets to improve speed and relevance + + Next Steps After Search: + - Use results for further analysis or application integration + - Combine different search types for comprehensive understanding + - Export insights for reporting or downstream processing + - Iterate with refined queries based on initial results + + Environment Variables: + Required for LLM-based search types (GRAPH_COMPLETION, RAG_COMPLETION): + - LLM_API_KEY: API key for your LLM provider + + Optional: + - LLM_PROVIDER, LLM_MODEL: Configure LLM for search responses + - VECTOR_DB_PROVIDER: Must match what was used during cognify + - GRAPH_DATABASE_PROVIDER: Must match what was used during cognify + + Raises: + DatasetNotFoundError: If specified datasets don't exist or aren't accessible + PermissionDeniedError: If user lacks read access to requested datasets + NoDataError: If no relevant data found for the search query + InvalidValueError: If LLM_API_KEY is not set (for LLM-based search types) + ValueError: If query_text is empty or search parameters are invalid + CollectionNotFoundError: If vector collection not found (data not processed) + """ # We use lists from now on for datasets if isinstance(datasets, UUID) or isinstance(datasets, str): datasets = [datasets]