Merge remote-tracking branch 'origin/dev' into feat/modal-parallelization

This commit is contained in:
Boris Arzentar 2025-07-04 15:37:57 +02:00
commit 4eba76ca1f
No known key found for this signature in database
GPG key ID: D5CC274C784807B7
11 changed files with 518 additions and 15 deletions

View file

@@ -102,7 +102,7 @@ handlers =
qualname = sqlalchemy.engine
[logger_alembic]
level = INFO
level = WARN
handlers =
qualname = alembic

View file

@@ -4,6 +4,10 @@ set -e # Exit on error
echo "Debug mode: $DEBUG"
echo "Environment: $ENVIRONMENT"
# Set default transport mode if not specified
TRANSPORT_MODE=${TRANSPORT_MODE:-"stdio"}
echo "Transport mode: $TRANSPORT_MODE"
# Run Alembic migrations with proper error handling.
# Note on UserAlreadyExists error handling:
# During database migrations, we attempt to create a default user. If this user
@@ -28,19 +32,31 @@ fi
echo "Database migrations done."
echo "Starting Cognee MCP Server..."
echo "Starting Cognee MCP Server with transport mode: $TRANSPORT_MODE"
# Add startup delay to ensure DB is ready
sleep 2
# Modified Gunicorn startup with error handling
# Modified startup with transport mode selection and error handling
if [ "$ENVIRONMENT" = "dev" ] || [ "$ENVIRONMENT" = "local" ]; then
if [ "$DEBUG" = "true" ]; then
echo "Waiting for the debugger to attach..."
exec python -m debugpy --wait-for-client --listen 0.0.0.0:5678 -m cognee
if [ "$TRANSPORT_MODE" = "sse" ]; then
exec python -m debugpy --wait-for-client --listen 0.0.0.0:5678 -m cognee --transport sse
else
exec python -m debugpy --wait-for-client --listen 0.0.0.0:5678 -m cognee --transport stdio
fi
else
exec cognee
if [ "$TRANSPORT_MODE" = "sse" ]; then
exec cognee --transport sse
else
exec cognee --transport stdio
fi
fi
else
exec cognee
if [ "$TRANSPORT_MODE" = "sse" ]; then
exec cognee --transport sse
else
exec cognee --transport stdio
fi
fi
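For reference, a minimal Python sketch of the same selection logic the script implements: read TRANSPORT_MODE (defaulting to stdio) and pass it through as a --transport flag. The run_stdio_server/run_sse_server names are placeholders, not the actual cognee MCP entry points.

```python
# Minimal sketch of the transport selection above; run_stdio_server and
# run_sse_server are placeholder names, not the real cognee MCP functions.
import argparse
import os


def run_stdio_server() -> None:
    print("serving MCP over stdio")  # placeholder


def run_sse_server() -> None:
    print("serving MCP over SSE")  # placeholder


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--transport",
        choices=["stdio", "sse"],
        # Mirrors TRANSPORT_MODE=${TRANSPORT_MODE:-"stdio"} in the entrypoint.
        default=os.environ.get("TRANSPORT_MODE", "stdio"),
    )
    args = parser.parse_args()
    run_sse_server() if args.transport == "sse" else run_stdio_server()


if __name__ == "__main__":
    main()
```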

View file

@@ -18,6 +18,7 @@ from cognee.modules.search.types import SearchType
from cognee.shared.data_models import KnowledgeGraph
from cognee.modules.storage.utils import JSONEncoder
try:
from codingagents.coding_rule_associations import (
add_rule_associations,

View file

@@ -16,6 +16,128 @@ async def add(
graph_db_config: dict = None,
dataset_id: UUID = None,
):
"""
Add data to Cognee for knowledge graph processing.
This is the first step in the Cognee workflow - it ingests raw data and prepares it
for processing. The function accepts various data formats including text, files, and
binary streams, then stores them in a specified dataset for further processing.
Prerequisites:
- **LLM_API_KEY**: Must be set in environment variables for content processing
- **Database Setup**: Relational and vector databases must be configured
- **User Authentication**: Uses default user if none provided (created automatically)
Supported Input Types:
- **Text strings**: Direct text content (str) - any string not starting with "/", "file://", or "s3://"
- **File paths**: Local file paths as strings in these formats:
* Absolute paths: "/path/to/document.pdf"
* File URLs: "file:///path/to/document.pdf" or "file://relative/path.txt"
* S3 paths: "s3://bucket-name/path/to/file.pdf"
- **Binary file objects**: File handles/streams (BinaryIO)
- **Lists**: Multiple files or text strings in a single call
Supported File Formats:
- Text files (.txt, .md, .csv)
- PDFs (.pdf)
- Images (.png, .jpg, .jpeg) - extracted via OCR/vision models
- Audio files (.mp3, .wav) - transcribed to text
- Code files (.py, .js, .ts, etc.) - parsed for structure and content
- Office documents (.docx, .pptx)
Workflow:
1. **Data Resolution**: Resolves file paths and validates accessibility
2. **Content Extraction**: Extracts text content from various file formats
3. **Dataset Storage**: Stores processed content in the specified dataset
4. **Metadata Tracking**: Records file metadata, timestamps, and user permissions
5. **Permission Assignment**: Grants user read/write/delete/share permissions on dataset
Args:
data: The data to ingest. Can be:
- Single text string: "Your text content here"
- Absolute file path: "/path/to/document.pdf"
- File URL: "file:///absolute/path/to/document.pdf" or "file://relative/path.txt"
- S3 path: "s3://my-bucket/documents/file.pdf"
- List of mixed types: ["text content", "/path/file.pdf", "file://doc.txt", file_handle]
- Binary file object: open("file.txt", "rb")
dataset_name: Name of the dataset to store data in. Defaults to "main_dataset".
Create separate datasets to organize different knowledge domains.
user: User object for authentication and permissions. Uses default user if None.
Default user: "default_user@example.com" (created automatically on first use).
Users can only access datasets they have permissions for.
node_set: Optional list of node identifiers for graph organization and access control.
Used for grouping related data points in the knowledge graph.
vector_db_config: Optional configuration for vector database (for custom setups).
graph_db_config: Optional configuration for graph database (for custom setups).
dataset_id: Optional specific dataset UUID to use instead of dataset_name.
Returns:
PipelineRunInfo: Information about the ingestion pipeline execution including:
- Pipeline run ID for tracking
- Dataset ID where data was stored
- Processing status and any errors
- Execution timestamps and metadata
Next Steps:
After successfully adding data, call `cognify()` to process the ingested content:
```python
import cognee
# Step 1: Add your data (text content or file path)
await cognee.add("Your document content") # Raw text
# OR
await cognee.add("/path/to/your/file.pdf") # File path
# Step 2: Process into knowledge graph
await cognee.cognify()
# Step 3: Search and query
results = await cognee.search("What insights can you find?")
```
Example Usage:
```python
# Add a single text document
await cognee.add("Natural language processing is a field of AI...")
# Add multiple files with different path formats
await cognee.add([
"/absolute/path/to/research_paper.pdf", # Absolute path
"file://relative/path/to/dataset.csv", # Relative file URL
"file:///absolute/path/to/report.docx", # Absolute file URL
"s3://my-bucket/documents/data.json", # S3 path
"Additional context text" # Raw text content
])
# Add to a specific dataset
await cognee.add(
data="Project documentation content",
dataset_name="project_docs"
)
# Add a single file
await cognee.add("/home/user/documents/analysis.pdf")
```
Environment Variables:
Required:
- LLM_API_KEY: API key for your LLM provider (OpenAI, Anthropic, etc.)
Optional:
- LLM_PROVIDER: "openai" (default), "anthropic", "gemini", "ollama"
- LLM_MODEL: Model name (default: "gpt-4o-mini")
- DEFAULT_USER_EMAIL: Custom default user email
- DEFAULT_USER_PASSWORD: Custom default user password
- VECTOR_DB_PROVIDER: "lancedb" (default), "chromadb", "qdrant", "weaviate"
- GRAPH_DATABASE_PROVIDER: "kuzu" (default), "neo4j", "networkx"
Raises:
FileNotFoundError: If specified file paths don't exist
PermissionError: If user lacks access to files or dataset
UnsupportedFileTypeError: If file format cannot be processed
InvalidValueError: If LLM_API_KEY is not set or invalid
"""
tasks = [
Task(resolve_data_directories, include_subdirectories=True),
Task(ingest_data, dataset_name, user, node_set, dataset_id),

View file

@@ -39,6 +39,151 @@ async def cognify(
graph_db_config: dict = None,
run_in_background: bool = False,
):
"""
Transform ingested data into a structured knowledge graph.
This is the core processing step in Cognee that converts raw text and documents
into an intelligent knowledge graph. It analyzes content, extracts entities and
relationships, and creates semantic connections for enhanced search and reasoning.
Prerequisites:
- **LLM_API_KEY**: Must be configured (required for entity extraction and graph generation)
- **Data Added**: Must have data previously added via `cognee.add()`
- **Vector Database**: Must be accessible for embeddings storage
- **Graph Database**: Must be accessible for relationship storage
Input Requirements:
- **Datasets**: Must contain data previously added via `cognee.add()`
- **Content Types**: Works with any text-extractable content including:
* Natural language documents
* Structured data (CSV, JSON)
* Code repositories
* Academic papers and technical documentation
* Mixed multimedia content (with text extraction)
Processing Pipeline:
1. **Document Classification**: Identifies document types and structures
2. **Permission Validation**: Ensures user has processing rights
3. **Text Chunking**: Breaks content into semantically meaningful segments
4. **Entity Extraction**: Identifies key concepts, people, places, organizations
5. **Relationship Detection**: Discovers connections between entities
6. **Graph Construction**: Builds semantic knowledge graph with embeddings
7. **Content Summarization**: Creates hierarchical summaries for navigation
Graph Model Customization:
The `graph_model` parameter allows custom knowledge structures:
- **Default**: General-purpose KnowledgeGraph for any domain
- **Custom Models**: Domain-specific schemas (e.g., scientific papers, code analysis)
- **Ontology Integration**: Use `ontology_file_path` for predefined vocabularies
Args:
datasets: Dataset name(s) or dataset UUID(s) to process. Processes all available data if None.
- Single dataset: "my_dataset"
- Multiple datasets: ["docs", "research", "reports"]
- None: Process all datasets for the user
user: User context for authentication and data access. Uses default if None.
graph_model: Pydantic model defining the knowledge graph structure.
Defaults to KnowledgeGraph for general-purpose processing.
chunker: Text chunking strategy (TextChunker, LangchainChunker).
- TextChunker: Paragraph-based chunking (default, most reliable)
- LangchainChunker: Recursive character splitting with overlap
Determines how documents are segmented for processing.
chunk_size: Maximum tokens per chunk. Auto-calculated based on LLM if None.
Formula: min(embedding_max_tokens, llm_max_tokens // 2)
Default limits: ~512-8192 tokens depending on models.
Smaller chunks = more granular but potentially fragmented knowledge.
ontology_file_path: Path to RDF/OWL ontology file for domain-specific entity types.
Useful for specialized fields like medical or legal documents.
vector_db_config: Custom vector database configuration for embeddings storage.
graph_db_config: Custom graph database configuration for relationship storage.
run_in_background: If True, starts processing asynchronously and returns immediately.
If False, waits for completion before returning.
Background mode recommended for large datasets (>100MB).
Use pipeline_run_id from return value to monitor progress.
Returns:
Union[dict, list[PipelineRunInfo]]:
- **Blocking mode**: Dictionary mapping dataset_id -> PipelineRunInfo with:
* Processing status (completed/failed/in_progress)
* Extracted entity and relationship counts
* Processing duration and resource usage
* Error details if any failures occurred
- **Background mode**: List of PipelineRunInfo objects for tracking progress
* Use pipeline_run_id to monitor status
* Check completion via pipeline monitoring APIs
Next Steps:
After successful cognify processing, use search functions to query the knowledge:
```python
import cognee
from cognee import SearchType
# Process your data into knowledge graph
await cognee.cognify()
# Query for insights using different search types:
# 1. Natural language completion with graph context
insights = await cognee.search(
"What are the main themes?",
query_type=SearchType.GRAPH_COMPLETION
)
# 2. Get entity relationships and connections
relationships = await cognee.search(
"connections between concepts",
query_type=SearchType.INSIGHTS
)
# 3. Find relevant document chunks
chunks = await cognee.search(
"specific topic",
query_type=SearchType.CHUNKS
)
```
Advanced Usage:
```python
# Custom domain model for scientific papers
class ScientificPaper(DataPoint):
title: str
authors: List[str]
methodology: str
findings: List[str]
await cognee.cognify(
datasets=["research_papers"],
graph_model=ScientificPaper,
ontology_file_path="scientific_ontology.owl"
)
# Background processing for large datasets
run_info = await cognee.cognify(
datasets=["large_corpus"],
run_in_background=True
)
# Check status later with run_info.pipeline_run_id
```
Environment Variables:
Required:
- LLM_API_KEY: API key for your LLM provider
Optional (same as add function):
- LLM_PROVIDER, LLM_MODEL, VECTOR_DB_PROVIDER, GRAPH_DATABASE_PROVIDER
- LLM_RATE_LIMIT_ENABLED: Enable rate limiting (default: False)
- LLM_RATE_LIMIT_REQUESTS: Max requests per interval (default: 60)
Raises:
DatasetNotFoundError: If specified datasets don't exist
PermissionError: If user lacks processing rights
InvalidValueError: If LLM_API_KEY is not set
OntologyParsingError: If ontology file is malformed
ValueError: If chunks exceed max token limits (reduce chunk_size)
DatabaseNotCreatedError: If databases are not properly initialized
"""
tasks = await get_default_tasks(user, graph_model, chunker, chunk_size, ontology_file_path)
if run_in_background:

View file

@@ -20,6 +20,142 @@ async def search(
node_type: Optional[Type] = None,
node_name: Optional[List[str]] = None,
) -> list:
"""
Search and query the knowledge graph for insights, information, and connections.
This is the final step in the Cognee workflow that retrieves information from the
processed knowledge graph. It supports multiple search modes optimized for different
use cases - from simple fact retrieval to complex reasoning and code analysis.
Search Prerequisites:
- **LLM_API_KEY**: Required for GRAPH_COMPLETION and RAG_COMPLETION search types
- **Data Added**: Must have data previously added via `cognee.add()`
- **Knowledge Graph Built**: Must have processed data via `cognee.cognify()`
- **Dataset Permissions**: User must have 'read' permission on target datasets
- **Vector Database**: Must be accessible for semantic search functionality
Search Types & Use Cases:
**GRAPH_COMPLETION** (Default - Recommended):
Natural language Q&A using full graph context and LLM reasoning.
Best for: Complex questions, analysis, summaries, insights.
Returns: Conversational AI responses with graph-backed context.
**RAG_COMPLETION**:
Traditional RAG using document chunks without graph structure.
Best for: Direct document retrieval, specific fact-finding.
Returns: LLM responses based on relevant text chunks.
**INSIGHTS**:
Structured entity relationships and semantic connections.
Best for: Understanding concept relationships, knowledge mapping.
Returns: Formatted relationship data and entity connections.
**CHUNKS**:
Raw text segments that match the query semantically.
Best for: Finding specific passages, citations, exact content.
Returns: Ranked list of relevant text chunks with metadata.
**SUMMARIES**:
Pre-generated hierarchical summaries of content.
Best for: Quick overviews, document abstracts, topic summaries.
Returns: Multi-level summaries from detailed to high-level.
**CODE**:
Code-specific search with syntax and semantic understanding.
Best for: Finding functions, classes, implementation patterns.
Returns: Structured code information with context and relationships.
**CYPHER**:
Direct graph database queries using Cypher syntax.
Best for: Advanced users, specific graph traversals, debugging.
Returns: Raw graph query results.
Args:
query_text: Your question or search query in natural language.
Examples:
- "What are the main themes in this research?"
- "How do these concepts relate to each other?"
- "Find information about machine learning algorithms"
- "What functions handle user authentication?"
query_type: SearchType enum specifying the search mode.
Defaults to GRAPH_COMPLETION for conversational AI responses.
user: User context for data access permissions. Uses default if None.
datasets: Dataset name(s) to search within. Searches all accessible if None.
- Single dataset: "research_papers"
- Multiple datasets: ["docs", "reports", "analysis"]
- None: Search across all user datasets
dataset_ids: Alternative to datasets - use specific UUID identifiers.
system_prompt_path: Custom system prompt file for LLM-based search types.
Defaults to "answer_simple_question.txt".
top_k: Maximum number of results to return.
Higher values provide more comprehensive but potentially noisier results.
node_type: Filter results to specific entity types (for advanced filtering).
node_name: Filter results to specific named entities (for targeted search).
Returns:
list: Search results in format determined by query_type:
**GRAPH_COMPLETION/RAG_COMPLETION**:
[List of conversational AI response strings]
**INSIGHTS**:
[List of formatted relationship descriptions and entity connections]
**CHUNKS**:
[List of relevant text passages with source metadata]
**SUMMARIES**:
[List of hierarchical summaries from general to specific]
**CODE**:
[List of structured code information with context]
Performance & Optimization:
- **GRAPH_COMPLETION**: Slower but most intelligent, uses LLM + graph context
- **RAG_COMPLETION**: Medium speed, uses LLM + document chunks (no graph traversal)
- **INSIGHTS**: Fast, returns structured relationships without LLM processing
- **CHUNKS**: Fastest, pure vector similarity search without LLM
- **SUMMARIES**: Fast, returns pre-computed summaries
- **CODE**: Medium speed, specialized for code understanding
- **top_k**: Start with 10, increase for comprehensive analysis (max 100)
- **datasets**: Specify datasets to improve speed and relevance
Next Steps After Search:
- Use results for further analysis or application integration
- Combine different search types for comprehensive understanding
- Export insights for reporting or downstream processing
- Iterate with refined queries based on initial results
Environment Variables:
Required for LLM-based search types (GRAPH_COMPLETION, RAG_COMPLETION):
- LLM_API_KEY: API key for your LLM provider
Optional:
- LLM_PROVIDER, LLM_MODEL: Configure LLM for search responses
- VECTOR_DB_PROVIDER: Must match what was used during cognify
- GRAPH_DATABASE_PROVIDER: Must match what was used during cognify
Raises:
DatasetNotFoundError: If specified datasets don't exist or aren't accessible
PermissionDeniedError: If user lacks read access to requested datasets
NoDataError: If no relevant data found for the search query
InvalidValueError: If LLM_API_KEY is not set (for LLM-based search types)
ValueError: If query_text is empty or search parameters are invalid
CollectionNotFoundError: If vector collection not found (data not processed)
"""
# We use lists from now on for datasets
if isinstance(datasets, UUID) or isinstance(datasets, str):
datasets = [datasets]
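Unlike the add and cognify docstrings, the search docstring above has no usage block; a short sketch, reusing the calls already shown in the cognify docstring (the dataset name and top_k value are illustrative), inside an async context:

```python
import cognee
from cognee import SearchType

# Assumes data was already ingested and processed via cognee.add() and cognee.cognify().

# Default mode: natural-language answer grounded in the knowledge graph.
answers = await cognee.search(
    "What are the main themes?",
    query_type=SearchType.GRAPH_COMPLETION,
)

# Structured entity relationships, no LLM call involved.
relationships = await cognee.search(
    "connections between concepts",
    query_type=SearchType.INSIGHTS,
)

# Raw matching chunks, scoped to one dataset and capped at 5 results.
chunks = await cognee.search(
    "specific topic",
    query_type=SearchType.CHUNKS,
    datasets="research_papers",
    top_k=5,
)
```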

View file

@@ -1,4 +1,5 @@
import litellm
import logging
from pydantic import BaseModel
from typing import Type, Optional
from litellm import acompletion, JSONSchemaValidationError

View file

@@ -1,8 +1,7 @@
from cognee.shared.logging_utils import get_logger
import litellm
from cognee.infrastructure.databases.vector import get_vector_engine
from cognee.infrastructure.llm.get_llm_client import get_llm_client
from cognee.shared.logging_utils import get_logger
logger = get_logger()
@@ -22,6 +21,9 @@ def get_max_chunk_tokens():
the smaller value of the embedding engine's max tokens and half of the LLM's
maximum tokens.
"""
# NOTE: Import must be done in function to avoid circular import issue
from cognee.infrastructure.databases.vector import get_vector_engine
# Calculate max chunk size based on the following formula
embedding_engine = get_vector_engine().embedding_engine
llm_client = get_llm_client()
@@ -93,6 +95,9 @@ async def test_embedding_connection():
the exception if the connection to the embedding handler cannot be established.
"""
try:
# NOTE: Vector engine import must be done in function to avoid circular import issue
from cognee.infrastructure.databases.vector import get_vector_engine
await get_vector_engine().embedding_engine.embed_text("test")
except Exception as e:
logger.error(e)
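The formula this helper applies (also quoted in the cognify docstring) is simple enough to show directly; a sketch with the two limits passed in as plain integers, since the exact attribute names on the embedding engine and LLM client are not shown in this hunk:

```python
def max_chunk_tokens_sketch(embedding_max_tokens: int, llm_max_tokens: int) -> int:
    # A chunk must fit within the embedding model's limit, and should leave
    # the LLM at least half of its context window for prompt and output.
    return min(embedding_max_tokens, llm_max_tokens // 2)


# Example: an 8192-token embedding model with a 128k-token LLM allows 8192-token
# chunks, while a 512-token embedding model caps chunks at 512 tokens.
assert max_chunk_tokens_sketch(8192, 128_000) == 8192
assert max_chunk_tokens_sketch(512, 128_000) == 512
```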

View file

@@ -1,4 +1,6 @@
from typing import IO, Optional
from urllib.parse import urlparse
import os
from cognee.api.v1.add.config import get_s3_config
@@ -24,8 +26,16 @@ def open_data_file(
else:
return fs.open(file_path, mode=mode, encoding=encoding, **kwargs)
elif file_path.startswith("file://"):
# Handle local file URLs by stripping the file:// prefix
file_path = file_path.replace("file://", "", 1)
return open(file_path, mode=mode, encoding=encoding, **kwargs)
# Handle local file URLs by properly parsing the URI
parsed_url = urlparse(file_path)
# On Windows, urlparse handles drive letters correctly
# Convert the path component to a proper file path
if os.name == "nt": # Windows
# Remove leading slash from Windows paths like /C:/Users/...
local_path = parsed_url.path.lstrip("/")
else: # Unix-like systems
local_path = parsed_url.path
return open(local_path, mode=mode, encoding=encoding, **kwargs)
else:
return open(file_path, mode=mode, encoding=encoding, **kwargs)
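The round-trip the new branch relies on, sketched below: pathlib.Path.as_uri() builds a well-formed file:// URL (as the updated tests now do), and urllib.parse.urlparse recovers the local path, with the leading slash stripped on Windows. Paths are illustrative.

```python
import os
from pathlib import Path
from urllib.parse import urlparse

# Build a proper file URL, as the updated tests do with Path.as_uri().
url = Path("/tmp/example.txt").as_uri()  # "file:///tmp/example.txt" on POSIX
# On Windows: Path(r"C:\Users\me\doc.txt").as_uri() -> "file:///C:/Users/me/doc.txt"

# Recover the local path, mirroring the branch added to open_data_file above.
parsed = urlparse(url)
local_path = parsed.path.lstrip("/") if os.name == "nt" else parsed.path
print(local_path)  # "/tmp/example.txt" on POSIX; "C:/Users/me/doc.txt" on Windows
```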

View file

@@ -11,6 +11,23 @@ import importlib.metadata
from cognee import __version__ as cognee_version
from typing import Protocol
# Configure external library logging
def configure_external_library_logging():
"""Configure logging for external libraries to reduce verbosity"""
# Configure LiteLLM logging to reduce verbosity
try:
import litellm
litellm.set_verbose = False
# Suppress LiteLLM ERROR logging using standard logging
logging.getLogger("litellm").setLevel(logging.CRITICAL)
except ImportError:
# LiteLLM not available, skip configuration
pass
# Export common log levels
DEBUG = logging.DEBUG
INFO = logging.INFO
@@ -148,6 +165,44 @@ def get_logger(name=None, level=None) -> LoggerInterface:
return logger
def log_database_configuration(logger):
"""Log the current database configuration for all database types"""
# NOTE: Has to be imported at runtime to avoid circular import
from cognee.infrastructure.databases.relational.config import get_relational_config
from cognee.infrastructure.databases.vector.config import get_vectordb_config
from cognee.infrastructure.databases.graph.config import get_graph_config
try:
# Log relational database configuration
relational_config = get_relational_config()
logger.info(f"Relational database: {relational_config.db_provider}")
if relational_config.db_provider == "postgres":
logger.info(f"Postgres host: {relational_config.db_host}:{relational_config.db_port}")
logger.info(f"Postgres database: {relational_config.db_name}")
elif relational_config.db_provider == "sqlite":
logger.info(f"SQLite path: {relational_config.db_path}")
logger.info(f"SQLite database: {relational_config.db_name}")
# Log vector database configuration
vector_config = get_vectordb_config()
logger.info(f"Vector database: {vector_config.vector_db_provider}")
if vector_config.vector_db_provider == "lancedb":
logger.info(f"Vector database path: {vector_config.vector_db_url}")
else:
logger.info(f"Vector database URL: {vector_config.vector_db_url}")
# Log graph database configuration
graph_config = get_graph_config()
logger.info(f"Graph database: {graph_config.graph_database_provider}")
if graph_config.graph_database_provider == "kuzu":
logger.info(f"Graph database path: {graph_config.graph_file_path}")
else:
logger.info(f"Graph database URL: {graph_config.graph_database_url}")
except Exception as e:
logger.warning(f"Could not retrieve database configuration: {str(e)}")
def cleanup_old_logs(logs_dir, max_files):
"""
Removes old log files, keeping only the most recent ones.
@@ -193,6 +248,9 @@ def setup_logging(log_level=None, name=None):
log_level = log_level if log_level else log_levels[os.getenv("LOG_LEVEL", "INFO")]
# Configure external library logging early to suppress verbose output
configure_external_library_logging()
def exception_handler(logger, method_name, event_dict):
"""Custom processor to handle uncaught exceptions."""
# Check if there's an exc_info that needs to be processed
@@ -339,6 +397,9 @@
logger.info("Want to learn more? Visit the Cognee documentation: https://docs.cognee.ai")
# Log database configuration
log_database_configuration(logger)
# Return the configured logger
return logger

View file

@@ -1,6 +1,7 @@
import os
import tempfile
import pytest
from pathlib import Path
from cognee.modules.data.processing.document_types.open_data_file import open_data_file
@@ -29,7 +30,8 @@ class TestOpenDataFile:
temp_file_path = f.name
try:
file_url = f"file://{temp_file_path}"
# Use pathlib.Path.as_uri() for proper cross-platform file URL creation
file_url = Path(temp_file_path).as_uri()
with open_data_file(file_url, mode="r") as f:
content = f.read()
assert content == test_content
@@ -44,7 +46,8 @@ class TestOpenDataFile:
temp_file_path = f.name
try:
file_url = f"file://{temp_file_path}"
# Use pathlib.Path.as_uri() for proper cross-platform file URL creation
file_url = Path(temp_file_path).as_uri()
with open_data_file(file_url, mode="rb") as f:
content = f.read()
assert content == test_content.encode()
@@ -61,7 +64,8 @@ class TestOpenDataFile:
temp_file_path = f.name
try:
file_url = f"file://{temp_file_path}"
# Use pathlib.Path.as_uri() for proper cross-platform file URL creation
file_url = Path(temp_file_path).as_uri()
with open_data_file(file_url, mode="r", encoding="utf-8") as f:
content = f.read()
assert content == test_content
@@ -84,7 +88,9 @@ class TestOpenDataFile:
try:
# Even if someone accidentally adds multiple file:// prefixes
file_url = f"file://file://{temp_file_path}"
# Use proper file URL creation first
proper_file_url = Path(temp_file_path).as_uri()
file_url = f"file://{proper_file_url}"
with open_data_file(file_url, mode="r") as f:
content = f.read()
# This should work because we only replace the first occurrence