Merge remote-tracking branch 'origin/dev' into feat/modal-parallelization

2025-07-04 15:37:57 +02:00 · 2025-07-04 15:37:57 +02:00 · 4eba76ca1f
commit 4eba76ca1f
parent 00dd3b8d97 c936f5e0a3
11 changed files with 518 additions and 15 deletions
--- a/alembic.ini
+++ b/alembic.ini
@ -102,7 +102,7 @@ handlers =
 qualname = sqlalchemy.engine
 [logger_alembic]
-level = INFO
+level = WARN
 handlers =
 qualname = alembic
--- a/cognee-mcp/entrypoint.sh
+++ b/cognee-mcp/entrypoint.sh
@ -4,6 +4,10 @@ set -e  # Exit on error
 echo "Debug mode: $DEBUG"
 echo "Environment: $ENVIRONMENT"
 # Set default transport mode if not specified
 TRANSPORT_MODE=${TRANSPORT_MODE:-"stdio"}
 echo "Transport mode: $TRANSPORT_MODE"
 # Run Alembic migrations with proper error handling.
 # Note on UserAlreadyExists error handling:
 # During database migrations, we attempt to create a default user. If this user
@ -28,19 +32,31 @@ fi
 echo "Database migrations done."
-echo "Starting Cognee MCP Server..."
+echo "Starting Cognee MCP Server with transport mode: $TRANSPORT_MODE"
 # Add startup delay to ensure DB is ready
 sleep 2
-# Modified Gunicorn startup with error handling
+# Modified startup with transport mode selection and error handling
 if [ "$ENVIRONMENT" = "dev" ] || [ "$ENVIRONMENT" = "local" ]; then
    if [ "$DEBUG" = "true" ]; then
        echo "Waiting for the debugger to attach..."
-        exec python -m debugpy --wait-for-client --listen 0.0.0.0:5678 -m cognee
+        if [ "$TRANSPORT_MODE" = "sse" ]; then
            exec python -m debugpy --wait-for-client --listen 0.0.0.0:5678 -m cognee --transport sse
        else
            exec python -m debugpy --wait-for-client --listen 0.0.0.0:5678 -m cognee --transport stdio
        fi
    else
-        exec cognee
+        if [ "$TRANSPORT_MODE" = "sse" ]; then
            exec cognee --transport sse
        else
            exec cognee --transport stdio
        fi
    fi
 else
-    exec cognee
+    if [ "$TRANSPORT_MODE" = "sse" ]; then
        exec cognee --transport sse
    else
        exec cognee --transport stdio
    fi
 fi
--- a/cognee-mcp/src/server.py
+++ b/cognee-mcp/src/server.py
@ -18,6 +18,7 @@ from cognee.modules.search.types import SearchType
 from cognee.shared.data_models import KnowledgeGraph
 from cognee.modules.storage.utils import JSONEncoder
 try:
    from codingagents.coding_rule_associations import (
        add_rule_associations,
--- a/cognee/api/v1/add/add.py
+++ b/cognee/api/v1/add/add.py
@ -16,6 +16,128 @@ async def add(
    graph_db_config: dict = None,
    dataset_id: UUID = None,
 ):
    """
    Add data to Cognee for knowledge graph processing.
    This is the first step in the Cognee workflow - it ingests raw data and prepares it
    for processing. The function accepts various data formats including text, files, and
    binary streams, then stores them in a specified dataset for further processing.
    Prerequisites:
        - **LLM_API_KEY**: Must be set in environment variables for content processing
        - **Database Setup**: Relational and vector databases must be configured
        - **User Authentication**: Uses default user if none provided (created automatically)
    Supported Input Types:
        - **Text strings**: Direct text content (str) - any string not starting with "/" or "file://"
        - **File paths**: Local file paths as strings in these formats:
            * Absolute paths: "/path/to/document.pdf"
            * File URLs: "file:///path/to/document.pdf" or "file://relative/path.txt"
            * S3 paths: "s3://bucket-name/path/to/file.pdf"
        - **Binary file objects**: File handles/streams (BinaryIO)
        - **Lists**: Multiple files or text strings in a single call
    Supported File Formats:
        - Text files (.txt, .md, .csv)
        - PDFs (.pdf)
        - Images (.png, .jpg, .jpeg) - extracted via OCR/vision models
        - Audio files (.mp3, .wav) - transcribed to text
        - Code files (.py, .js, .ts, etc.) - parsed for structure and content
        - Office documents (.docx, .pptx)
    Workflow:
        1. **Data Resolution**: Resolves file paths and validates accessibility
        2. **Content Extraction**: Extracts text content from various file formats
        3. **Dataset Storage**: Stores processed content in the specified dataset
        4. **Metadata Tracking**: Records file metadata, timestamps, and user permissions
        5. **Permission Assignment**: Grants user read/write/delete/share permissions on dataset
    Args:
        data: The data to ingest. Can be:
            - Single text string: "Your text content here"
            - Absolute file path: "/path/to/document.pdf"
            - File URL: "file:///absolute/path/to/document.pdf" or "file://relative/path.txt"
            - S3 path: "s3://my-bucket/documents/file.pdf"
            - List of mixed types: ["text content", "/path/file.pdf", "file://doc.txt", file_handle]
            - Binary file object: open("file.txt", "rb")
        dataset_name: Name of the dataset to store data in. Defaults to "main_dataset".
                    Create separate datasets to organize different knowledge domains.
        user: User object for authentication and permissions. Uses default user if None.
              Default user: "default_user@example.com" (created automatically on first use).
              Users can only access datasets they have permissions for.
        node_set: Optional list of node identifiers for graph organization and access control.
                 Used for grouping related data points in the knowledge graph.
        vector_db_config: Optional configuration for vector database (for custom setups).
        graph_db_config: Optional configuration for graph database (for custom setups).
        dataset_id: Optional specific dataset UUID to use instead of dataset_name.
    Returns:
        PipelineRunInfo: Information about the ingestion pipeline execution including:
            - Pipeline run ID for tracking
            - Dataset ID where data was stored
            - Processing status and any errors
            - Execution timestamps and metadata
    Next Steps:
        After successfully adding data, call `cognify()` to process the ingested content:
        ```python
        import cognee
        # Step 1: Add your data (text content or file path)
        await cognee.add("Your document content")  # Raw text
        # OR
        await cognee.add("/path/to/your/file.pdf")  # File path
        # Step 2: Process into knowledge graph
        await cognee.cognify()
        # Step 3: Search and query
        results = await cognee.search("What insights can you find?")
        ```
    Example Usage:
        ```python
        # Add a single text document
        await cognee.add("Natural language processing is a field of AI...")
        # Add multiple files with different path formats
        await cognee.add([
            "/absolute/path/to/research_paper.pdf",        # Absolute path
            "file://relative/path/to/dataset.csv",         # Relative file URL
            "file:///absolute/path/to/report.docx",        # Absolute file URL
            "s3://my-bucket/documents/data.json",           # S3 path
            "Additional context text"                       # Raw text content
        ])
        # Add to a specific dataset
        await cognee.add(
            data="Project documentation content",
            dataset_name="project_docs"
        )
        # Add a single file
        await cognee.add("/home/user/documents/analysis.pdf")
        ```
    Environment Variables:
        Required:
        - LLM_API_KEY: API key for your LLM provider (OpenAI, Anthropic, etc.)
        Optional:
        - LLM_PROVIDER: "openai" (default), "anthropic", "gemini", "ollama"
        - LLM_MODEL: Model name (default: "gpt-4o-mini")
        - DEFAULT_USER_EMAIL: Custom default user email
        - DEFAULT_USER_PASSWORD: Custom default user password
        - VECTOR_DB_PROVIDER: "lancedb" (default), "chromadb", "qdrant", "weaviate"
        - GRAPH_DATABASE_PROVIDER: "kuzu" (default), "neo4j", "networkx"
    Raises:
        FileNotFoundError: If specified file paths don't exist
        PermissionError: If user lacks access to files or dataset
        UnsupportedFileTypeError: If file format cannot be processed
        InvalidValueError: If LLM_API_KEY is not set or invalid
    """
    tasks = [
        Task(resolve_data_directories, include_subdirectories=True),
        Task(ingest_data, dataset_name, user, node_set, dataset_id),
--- a/cognee/api/v1/cognify/cognify.py
+++ b/cognee/api/v1/cognify/cognify.py
@ -39,6 +39,151 @@ async def cognify(
    graph_db_config: dict = None,
    run_in_background: bool = False,
 ):
    """
    Transform ingested data into a structured knowledge graph.
    This is the core processing step in Cognee that converts raw text and documents
    into an intelligent knowledge graph. It analyzes content, extracts entities and
    relationships, and creates semantic connections for enhanced search and reasoning.
    Prerequisites:
        - **LLM_API_KEY**: Must be configured (required for entity extraction and graph generation)
        - **Data Added**: Must have data previously added via `cognee.add()`
        - **Vector Database**: Must be accessible for embeddings storage
        - **Graph Database**: Must be accessible for relationship storage
    Input Requirements:
        - **Datasets**: Must contain data previously added via `cognee.add()`
        - **Content Types**: Works with any text-extractable content including:
            * Natural language documents
            * Structured data (CSV, JSON)
            * Code repositories
            * Academic papers and technical documentation
            * Mixed multimedia content (with text extraction)
    Processing Pipeline:
        1. **Document Classification**: Identifies document types and structures
        2. **Permission Validation**: Ensures user has processing rights
        3. **Text Chunking**: Breaks content into semantically meaningful segments
        4. **Entity Extraction**: Identifies key concepts, people, places, organizations
        5. **Relationship Detection**: Discovers connections between entities
        6. **Graph Construction**: Builds semantic knowledge graph with embeddings
        7. **Content Summarization**: Creates hierarchical summaries for navigation
    Graph Model Customization:
        The `graph_model` parameter allows custom knowledge structures:
        - **Default**: General-purpose KnowledgeGraph for any domain
        - **Custom Models**: Domain-specific schemas (e.g., scientific papers, code analysis)
        - **Ontology Integration**: Use `ontology_file_path` for predefined vocabularies
    Args:
        datasets: Dataset name(s) or dataset uuid to process. Processes all available data if None.
            - Single dataset: "my_dataset"
            - Multiple datasets: ["docs", "research", "reports"]
            - None: Process all datasets for the user
        user: User context for authentication and data access. Uses default if None.
        graph_model: Pydantic model defining the knowledge graph structure.
                    Defaults to KnowledgeGraph for general-purpose processing.
        chunker: Text chunking strategy (TextChunker, LangchainChunker).
                - TextChunker: Paragraph-based chunking (default, most reliable)
                - LangchainChunker: Recursive character splitting with overlap
                Determines how documents are segmented for processing.
        chunk_size: Maximum tokens per chunk. Auto-calculated based on LLM if None.
                   Formula: min(embedding_max_tokens, llm_max_tokens // 2)
                   Default limits: ~512-8192 tokens depending on models.
                   Smaller chunks = more granular but potentially fragmented knowledge.
        ontology_file_path: Path to RDF/OWL ontology file for domain-specific entity types.
                          Useful for specialized fields like medical or legal documents.
        vector_db_config: Custom vector database configuration for embeddings storage.
        graph_db_config: Custom graph database configuration for relationship storage.
        run_in_background: If True, starts processing asynchronously and returns immediately.
                          If False, waits for completion before returning.
                          Background mode recommended for large datasets (>100MB).
                          Use pipeline_run_id from return value to monitor progress.
    Returns:
        Union[dict, list[PipelineRunInfo]]:
            - **Blocking mode**: Dictionary mapping dataset_id -> PipelineRunInfo with:
                * Processing status (completed/failed/in_progress)
                * Extracted entity and relationship counts
                * Processing duration and resource usage
                * Error details if any failures occurred
            - **Background mode**: List of PipelineRunInfo objects for tracking progress
                * Use pipeline_run_id to monitor status
                * Check completion via pipeline monitoring APIs
    Next Steps:
        After successful cognify processing, use search functions to query the knowledge:
        ```python
        import cognee
        from cognee import SearchType
        # Process your data into knowledge graph
        await cognee.cognify()
        # Query for insights using different search types:
        # 1. Natural language completion with graph context
        insights = await cognee.search(
            "What are the main themes?",
            query_type=SearchType.GRAPH_COMPLETION
        )
        # 2. Get entity relationships and connections
        relationships = await cognee.search(
            "connections between concepts",
            query_type=SearchType.INSIGHTS
        )
        # 3. Find relevant document chunks
        chunks = await cognee.search(
            "specific topic",
            query_type=SearchType.CHUNKS
        )
        ```
    Advanced Usage:
        ```python
        # Custom domain model for scientific papers
        class ScientificPaper(DataPoint):
            title: str
            authors: List[str]
            methodology: str
            findings: List[str]
        await cognee.cognify(
            datasets=["research_papers"],
            graph_model=ScientificPaper,
            ontology_file_path="scientific_ontology.owl"
        )
        # Background processing for large datasets
        run_info = await cognee.cognify(
            datasets=["large_corpus"],
            run_in_background=True
        )
        # Check status later with the pipeline_run_id of each returned PipelineRunInfo
        ```
    Environment Variables:
        Required:
        - LLM_API_KEY: API key for your LLM provider
        Optional (same as add function):
        - LLM_PROVIDER, LLM_MODEL, VECTOR_DB_PROVIDER, GRAPH_DATABASE_PROVIDER
        - LLM_RATE_LIMIT_ENABLED: Enable rate limiting (default: False)
        - LLM_RATE_LIMIT_REQUESTS: Max requests per interval (default: 60)
    Raises:
        DatasetNotFoundError: If specified datasets don't exist
        PermissionError: If user lacks processing rights
        InvalidValueError: If LLM_API_KEY is not set
        OntologyParsingError: If ontology file is malformed
        ValueError: If chunks exceed max token limits (reduce chunk_size)
        DatabaseNotCreatedError: If databases are not properly initialized
    """
    tasks = await get_default_tasks(user, graph_model, chunker, chunk_size, ontology_file_path)
    if run_in_background:
--- a/cognee/api/v1/search/search.py
+++ b/cognee/api/v1/search/search.py
@ -20,6 +20,142 @@ async def search(
    node_type: Optional[Type] = None,
    node_name: Optional[List[str]] = None,
 ) -> list:
    """
    Search and query the knowledge graph for insights, information, and connections.
    This is the final step in the Cognee workflow that retrieves information from the
    processed knowledge graph. It supports multiple search modes optimized for different
    use cases - from simple fact retrieval to complex reasoning and code analysis.
    Search Prerequisites:
        - **LLM_API_KEY**: Required for GRAPH_COMPLETION and RAG_COMPLETION search types
        - **Data Added**: Must have data previously added via `cognee.add()`
        - **Knowledge Graph Built**: Must have processed data via `cognee.cognify()`
        - **Dataset Permissions**: User must have 'read' permission on target datasets
        - **Vector Database**: Must be accessible for semantic search functionality
    Search Types & Use Cases:
        **GRAPH_COMPLETION** (Default - Recommended):
            Natural language Q&A using full graph context and LLM reasoning.
            Best for: Complex questions, analysis, summaries, insights.
            Returns: Conversational AI responses with graph-backed context.
        **RAG_COMPLETION**:
            Traditional RAG using document chunks without graph structure.
            Best for: Direct document retrieval, specific fact-finding.
            Returns: LLM responses based on relevant text chunks.
        **INSIGHTS**:
            Structured entity relationships and semantic connections.
            Best for: Understanding concept relationships, knowledge mapping.
            Returns: Formatted relationship data and entity connections.
        **CHUNKS**:
            Raw text segments that match the query semantically.
            Best for: Finding specific passages, citations, exact content.
            Returns: Ranked list of relevant text chunks with metadata.
        **SUMMARIES**:
            Pre-generated hierarchical summaries of content.
            Best for: Quick overviews, document abstracts, topic summaries.
            Returns: Multi-level summaries from detailed to high-level.
        **CODE**:
            Code-specific search with syntax and semantic understanding.
            Best for: Finding functions, classes, implementation patterns.
            Returns: Structured code information with context and relationships.
        **CYPHER**:
            Direct graph database queries using Cypher syntax.
            Best for: Advanced users, specific graph traversals, debugging.
            Returns: Raw graph query results.
    Args:
        query_text: Your question or search query in natural language.
            Examples:
            - "What are the main themes in this research?"
            - "How do these concepts relate to each other?"
            - "Find information about machine learning algorithms"
            - "What functions handle user authentication?"
        query_type: SearchType enum specifying the search mode.
                   Defaults to GRAPH_COMPLETION for conversational AI responses.
        user: User context for data access permissions. Uses default if None.
        datasets: Dataset name(s) to search within. Searches all accessible if None.
            - Single dataset: "research_papers"
            - Multiple datasets: ["docs", "reports", "analysis"]
            - None: Search across all user datasets
        dataset_ids: Alternative to datasets - use specific UUID identifiers.
        system_prompt_path: Custom system prompt file for LLM-based search types.
                          Defaults to "answer_simple_question.txt".
        top_k: Maximum number of results to return (1-N).
              Higher values provide more comprehensive but potentially noisy results.
        node_type: Filter results to specific entity types (for advanced filtering).
        node_name: Filter results to specific named entities (for targeted search).
    Returns:
        list: Search results in format determined by query_type:
            **GRAPH_COMPLETION/RAG_COMPLETION**:
                [List of conversational AI response strings]
            **INSIGHTS**:
                [List of formatted relationship descriptions and entity connections]
            **CHUNKS**:
                [List of relevant text passages with source metadata]
            **SUMMARIES**:
                [List of hierarchical summaries from general to specific]
            **CODE**:
                [List of structured code information with context]
    Performance & Optimization:
        - **GRAPH_COMPLETION**: Slower but most intelligent, uses LLM + graph context
        - **RAG_COMPLETION**: Medium speed, uses LLM + document chunks (no graph traversal)
        - **INSIGHTS**: Fast, returns structured relationships without LLM processing
        - **CHUNKS**: Fastest, pure vector similarity search without LLM
        - **SUMMARIES**: Fast, returns pre-computed summaries
        - **CODE**: Medium speed, specialized for code understanding
        - **top_k**: Start with 10, increase for comprehensive analysis (max 100)
        - **datasets**: Specify datasets to improve speed and relevance
    Next Steps After Search:
        - Use results for further analysis or application integration
        - Combine different search types for comprehensive understanding
        - Export insights for reporting or downstream processing
        - Iterate with refined queries based on initial results
    Environment Variables:
        Required for LLM-based search types (GRAPH_COMPLETION, RAG_COMPLETION):
        - LLM_API_KEY: API key for your LLM provider
        Optional:
        - LLM_PROVIDER, LLM_MODEL: Configure LLM for search responses
        - VECTOR_DB_PROVIDER: Must match what was used during cognify
        - GRAPH_DATABASE_PROVIDER: Must match what was used during cognify
    Raises:
        DatasetNotFoundError: If specified datasets don't exist or aren't accessible
        PermissionDeniedError: If user lacks read access to requested datasets
        NoDataError: If no relevant data found for the search query
        InvalidValueError: If LLM_API_KEY is not set (for LLM-based search types)
        ValueError: If query_text is empty or search parameters are invalid
        CollectionNotFoundError: If vector collection not found (data not processed)
    """
    # We use lists from now on for datasets
    if isinstance(datasets, UUID) or isinstance(datasets, str):
        datasets = [datasets]
--- a/cognee/infrastructure/llm/gemini/adapter.py
+++ b/cognee/infrastructure/llm/gemini/adapter.py
@ -1,4 +1,5 @@
 import litellm
 import logging
 from pydantic import BaseModel
 from typing import Type, Optional
 from litellm import acompletion, JSONSchemaValidationError
--- a/cognee/infrastructure/llm/utils.py
+++ b/cognee/infrastructure/llm/utils.py
@ -1,8 +1,7 @@
 from cognee.shared.logging_utils import get_logger
 import litellm
 from cognee.infrastructure.databases.vector import get_vector_engine
 from cognee.infrastructure.llm.get_llm_client import get_llm_client
 from cognee.shared.logging_utils import get_logger
 logger = get_logger()
@ -22,6 +21,9 @@ def get_max_chunk_tokens():
          the smaller value of the embedding engine's max tokens and half of the LLM's
          maximum tokens.
    """
    # NOTE: Import must be done in function to avoid circular import issue
    from cognee.infrastructure.databases.vector import get_vector_engine
    # Calculate max chunk size based on the following formula
    embedding_engine = get_vector_engine().embedding_engine
    llm_client = get_llm_client()
@ -93,6 +95,9 @@ async def test_embedding_connection():
    the exception if the connection to the embedding handler cannot be established.
    """
    try:
        # NOTE: Vector engine import must be done in function to avoid circular import issue
        from cognee.infrastructure.databases.vector import get_vector_engine
        await get_vector_engine().embedding_engine.embed_text("test")
    except Exception as e:
        logger.error(e)
--- a/cognee/modules/data/processing/document_types/open_data_file.py
+++ b/cognee/modules/data/processing/document_types/open_data_file.py
@ -1,4 +1,6 @@
 from typing import IO, Optional
 from urllib.parse import urlparse
 import os
 from cognee.api.v1.add.config import get_s3_config
@ -24,8 +26,16 @@ def open_data_file(
        else:
            return fs.open(file_path, mode=mode, encoding=encoding, **kwargs)
    elif file_path.startswith("file://"):
-        # Handle local file URLs by stripping the file:// prefix
+        # Handle local file URLs by properly parsing the URI
-        file_path = file_path.replace("file://", "", 1)
+        parsed_url = urlparse(file_path)
-        return open(file_path, mode=mode, encoding=encoding, **kwargs)
+        # On Windows, urlparse handles drive letters correctly
        # Convert the path component to a proper file path
        if os.name == "nt":  # Windows
            # Remove leading slash from Windows paths like /C:/Users/...
            local_path = parsed_url.path.lstrip("/")
        else:  # Unix-like systems
            local_path = parsed_url.path
        return open(local_path, mode=mode, encoding=encoding, **kwargs)
    else:
        return open(file_path, mode=mode, encoding=encoding, **kwargs)
--- a/cognee/shared/logging_utils.py
+++ b/cognee/shared/logging_utils.py
@ -11,6 +11,23 @@ import importlib.metadata
 from cognee import __version__ as cognee_version
 from typing import Protocol
 # Configure external library logging
 def configure_external_library_logging():
    """Reduce log noise from third-party libraries (currently only LiteLLM).

    When ``litellm`` can be imported, its ``set_verbose`` flag is switched off
    and its standard-library loggers are raised to ``CRITICAL`` so records
    below that level are dropped. When ``litellm`` is not installed the
    function does nothing.

    Returns:
        None
    """
    try:
        import litellm
    except ImportError:
        # LiteLLM not available, skip configuration
        return
    litellm.set_verbose = False
    # Suppress LiteLLM ERROR logging using standard logging.
    # Logger names are case-sensitive: "litellm" only matches loggers created
    # under that exact lowercase name, while the library's own verbose logger
    # is registered as "LiteLLM" — NOTE(review): name taken from
    # litellm/_logging.py; confirm against the pinned litellm version.
    for logger_name in ("litellm", "LiteLLM"):
        logging.getLogger(logger_name).setLevel(logging.CRITICAL)
 # Export common log levels
 DEBUG = logging.DEBUG
 INFO = logging.INFO
@ -148,6 +165,44 @@ def get_logger(name=None, level=None) -> LoggerInterface:
        return logger
 def log_database_configuration(logger):
    """Report the active relational, vector and graph database settings.

    Args:
        logger: Logger-like object; only its ``info`` and ``warning`` methods
            are called here.

    Exceptions raised inside the ``try`` block (reading a config object or
    emitting a log line) are caught and reported through ``logger.warning``.
    The deferred imports run before the ``try`` and are not covered by it.
    """
    # NOTE: Has to be imported at runtime to avoid circular import
    from cognee.infrastructure.databases.relational.config import get_relational_config
    from cognee.infrastructure.databases.vector.config import get_vectordb_config
    from cognee.infrastructure.databases.graph.config import get_graph_config
    try:
        # Relational database: provider first, then provider-specific details
        relational_config = get_relational_config()
        relational_provider = relational_config.db_provider
        logger.info(f"Relational database: {relational_config.db_provider}")
        if relational_provider == "postgres":
            logger.info(f"Postgres host: {relational_config.db_host}:{relational_config.db_port}")
            logger.info(f"Postgres database: {relational_config.db_name}")
        elif relational_provider == "sqlite":
            logger.info(f"SQLite path: {relational_config.db_path}")
            logger.info(f"SQLite database: {relational_config.db_name}")
        # Vector database: the same vector_db_url field is labelled as a path
        # for lancedb and as a URL for every other provider
        vector_config = get_vectordb_config()
        vector_provider = vector_config.vector_db_provider
        logger.info(f"Vector database: {vector_config.vector_db_provider}")
        if vector_provider != "lancedb":
            logger.info(f"Vector database URL: {vector_config.vector_db_url}")
        else:
            logger.info(f"Vector database path: {vector_config.vector_db_url}")
        # Graph database: kuzu logs graph_file_path, other providers log
        # graph_database_url
        graph_config = get_graph_config()
        graph_provider = graph_config.graph_database_provider
        logger.info(f"Graph database: {graph_config.graph_database_provider}")
        if graph_provider != "kuzu":
            logger.info(f"Graph database URL: {graph_config.graph_database_url}")
        else:
            logger.info(f"Graph database path: {graph_config.graph_file_path}")
    except Exception as e:
        # Best effort: a configuration problem is downgraded to a warning
        logger.warning(f"Could not retrieve database configuration: {str(e)}")
 def cleanup_old_logs(logs_dir, max_files):
    """
    Removes old log files, keeping only the most recent ones.
@ -193,6 +248,9 @@ def setup_logging(log_level=None, name=None):
    log_level = log_level if log_level else log_levels[os.getenv("LOG_LEVEL", "INFO")]
    # Configure external library logging early to suppress verbose output
    configure_external_library_logging()
    def exception_handler(logger, method_name, event_dict):
        """Custom processor to handle uncaught exceptions."""
        # Check if there's an exc_info that needs to be processed
@ -339,6 +397,9 @@ def setup_logging(log_level=None, name=None):
    logger.info("Want to learn more? Visit the Cognee documentation: https://docs.cognee.ai")
    # Log database configuration
    log_database_configuration(logger)
    # Return the configured logger
    return logger
--- a/cognee/tests/unit/modules/data/test_open_data_file.py
+++ b/cognee/tests/unit/modules/data/test_open_data_file.py
@ -1,6 +1,7 @@
 import os
 import tempfile
 import pytest
 from pathlib import Path
 from cognee.modules.data.processing.document_types.open_data_file import open_data_file
@ -29,7 +30,8 @@ class TestOpenDataFile:
            temp_file_path = f.name
        try:
-            file_url = f"file://{temp_file_path}"
+            # Use pathlib.Path.as_uri() for proper cross-platform file URL creation
            file_url = Path(temp_file_path).as_uri()
            with open_data_file(file_url, mode="r") as f:
                content = f.read()
                assert content == test_content
@ -44,7 +46,8 @@ class TestOpenDataFile:
            temp_file_path = f.name
        try:
-            file_url = f"file://{temp_file_path}"
+            # Use pathlib.Path.as_uri() for proper cross-platform file URL creation
            file_url = Path(temp_file_path).as_uri()
            with open_data_file(file_url, mode="rb") as f:
                content = f.read()
                assert content == test_content.encode()
@ -61,7 +64,8 @@ class TestOpenDataFile:
            temp_file_path = f.name
        try:
-            file_url = f"file://{temp_file_path}"
+            # Use pathlib.Path.as_uri() for proper cross-platform file URL creation
            file_url = Path(temp_file_path).as_uri()
            with open_data_file(file_url, mode="r", encoding="utf-8") as f:
                content = f.read()
                assert content == test_content
@ -84,7 +88,9 @@ class TestOpenDataFile:
        try:
            # Even if someone accidentally adds multiple file:// prefixes
-            file_url = f"file://file://{temp_file_path}"
+            # Use proper file URL creation first
            proper_file_url = Path(temp_file_path).as_uri()
            file_url = f"file://{proper_file_url}"
            with open_data_file(file_url, mode="r") as f:
                content = f.read()
                # This should work because urlparse treats the extra "file:" as the netloc and keeps the real path in the path component