Merge remote-tracking branch 'origin/dev' into feat/modal-parallelization
commit 4eba76ca1f

11 changed files with 518 additions and 15 deletions
@@ -102,7 +102,7 @@ handlers =
qualname = sqlalchemy.engine

[logger_alembic]
-level = INFO
+level = WARN
handlers =
qualname = alembic

@@ -4,6 +4,10 @@ set -e # Exit on error
echo "Debug mode: $DEBUG"
echo "Environment: $ENVIRONMENT"

# Set default transport mode if not specified
TRANSPORT_MODE=${TRANSPORT_MODE:-"stdio"}
echo "Transport mode: $TRANSPORT_MODE"

# Run Alembic migrations with proper error handling.
# Note on UserAlreadyExists error handling:
# During database migrations, we attempt to create a default user. If this user

@@ -28,19 +32,31 @@ fi

echo "Database migrations done."

-echo "Starting Cognee MCP Server..."
+echo "Starting Cognee MCP Server with transport mode: $TRANSPORT_MODE"

# Add startup delay to ensure DB is ready
sleep 2

-# Modified Gunicorn startup with error handling
+# Modified startup with transport mode selection and error handling
if [ "$ENVIRONMENT" = "dev" ] || [ "$ENVIRONMENT" = "local" ]; then
    if [ "$DEBUG" = "true" ]; then
        echo "Waiting for the debugger to attach..."
-        exec python -m debugpy --wait-for-client --listen 0.0.0.0:5678 -m cognee
+        if [ "$TRANSPORT_MODE" = "sse" ]; then
+            exec python -m debugpy --wait-for-client --listen 0.0.0.0:5678 -m cognee --transport sse
+        else
+            exec python -m debugpy --wait-for-client --listen 0.0.0.0:5678 -m cognee --transport stdio
+        fi
    else
-        exec cognee
+        if [ "$TRANSPORT_MODE" = "sse" ]; then
+            exec cognee --transport sse
+        else
+            exec cognee --transport stdio
+        fi
    fi
else
-    exec cognee
+    if [ "$TRANSPORT_MODE" = "sse" ]; then
+        exec cognee --transport sse
+    else
+        exec cognee --transport stdio
+    fi
fi

@@ -18,6 +18,7 @@ from cognee.modules.search.types import SearchType
from cognee.shared.data_models import KnowledgeGraph
from cognee.modules.storage.utils import JSONEncoder


try:
    from codingagents.coding_rule_associations import (
        add_rule_associations,

@@ -16,6 +16,128 @@ async def add(
    graph_db_config: dict = None,
    dataset_id: UUID = None,
):
    """
    Add data to Cognee for knowledge graph processing.

    This is the first step in the Cognee workflow - it ingests raw data and prepares it
    for processing. The function accepts various data formats including text, files, and
    binary streams, then stores them in a specified dataset for further processing.

    Prerequisites:
        - **LLM_API_KEY**: Must be set in environment variables for content processing
        - **Database Setup**: Relational and vector databases must be configured
        - **User Authentication**: Uses default user if none provided (created automatically)

    Supported Input Types:
        - **Text strings**: Direct text content (str) - any string not starting with "/" or "file://"
        - **File paths**: Local file paths as strings in these formats:
            * Absolute paths: "/path/to/document.pdf"
            * File URLs: "file:///path/to/document.pdf" or "file://relative/path.txt"
            * S3 paths: "s3://bucket-name/path/to/file.pdf"
        - **Binary file objects**: File handles/streams (BinaryIO)
        - **Lists**: Multiple files or text strings in a single call

    Supported File Formats:
        - Text files (.txt, .md, .csv)
        - PDFs (.pdf)
        - Images (.png, .jpg, .jpeg) - extracted via OCR/vision models
        - Audio files (.mp3, .wav) - transcribed to text
        - Code files (.py, .js, .ts, etc.) - parsed for structure and content
        - Office documents (.docx, .pptx)

    Workflow:
        1. **Data Resolution**: Resolves file paths and validates accessibility
        2. **Content Extraction**: Extracts text content from various file formats
        3. **Dataset Storage**: Stores processed content in the specified dataset
        4. **Metadata Tracking**: Records file metadata, timestamps, and user permissions
        5. **Permission Assignment**: Grants user read/write/delete/share permissions on dataset

    Args:
        data: The data to ingest. Can be:
            - Single text string: "Your text content here"
            - Absolute file path: "/path/to/document.pdf"
            - File URL: "file:///absolute/path/to/document.pdf" or "file://relative/path.txt"
            - S3 path: "s3://my-bucket/documents/file.pdf"
            - List of mixed types: ["text content", "/path/file.pdf", "file://doc.txt", file_handle]
            - Binary file object: open("file.txt", "rb")
        dataset_name: Name of the dataset to store data in. Defaults to "main_dataset".
            Create separate datasets to organize different knowledge domains.
        user: User object for authentication and permissions. Uses default user if None.
            Default user: "default_user@example.com" (created automatically on first use).
            Users can only access datasets they have permissions for.
        node_set: Optional list of node identifiers for graph organization and access control.
            Used for grouping related data points in the knowledge graph.
        vector_db_config: Optional configuration for vector database (for custom setups).
        graph_db_config: Optional configuration for graph database (for custom setups).
        dataset_id: Optional specific dataset UUID to use instead of dataset_name.

    Returns:
        PipelineRunInfo: Information about the ingestion pipeline execution including:
            - Pipeline run ID for tracking
            - Dataset ID where data was stored
            - Processing status and any errors
            - Execution timestamps and metadata

    Next Steps:
        After successfully adding data, call `cognify()` to process the ingested content:

        ```python
        import cognee

        # Step 1: Add your data (text content or file path)
        await cognee.add("Your document content")  # Raw text
        # OR
        await cognee.add("/path/to/your/file.pdf")  # File path

        # Step 2: Process into knowledge graph
        await cognee.cognify()

        # Step 3: Search and query
        results = await cognee.search("What insights can you find?")
        ```

    Example Usage:
        ```python
        # Add a single text document
        await cognee.add("Natural language processing is a field of AI...")

        # Add multiple files with different path formats
        await cognee.add([
            "/absolute/path/to/research_paper.pdf",  # Absolute path
            "file://relative/path/to/dataset.csv",  # Relative file URL
            "file:///absolute/path/to/report.docx",  # Absolute file URL
            "s3://my-bucket/documents/data.json",  # S3 path
            "Additional context text"  # Raw text content
        ])

        # Add to a specific dataset
        await cognee.add(
            data="Project documentation content",
            dataset_name="project_docs"
        )

        # Add a single file
        await cognee.add("/home/user/documents/analysis.pdf")
        ```

    Environment Variables:
        Required:
        - LLM_API_KEY: API key for your LLM provider (OpenAI, Anthropic, etc.)

        Optional:
        - LLM_PROVIDER: "openai" (default), "anthropic", "gemini", "ollama"
        - LLM_MODEL: Model name (default: "gpt-4o-mini")
        - DEFAULT_USER_EMAIL: Custom default user email
        - DEFAULT_USER_PASSWORD: Custom default user password
        - VECTOR_DB_PROVIDER: "lancedb" (default), "chromadb", "qdrant", "weaviate"
        - GRAPH_DATABASE_PROVIDER: "kuzu" (default), "neo4j", "networkx"

    Raises:
        FileNotFoundError: If specified file paths don't exist
        PermissionError: If user lacks access to files or dataset
        UnsupportedFileTypeError: If file format cannot be processed
        InvalidValueError: If LLM_API_KEY is not set or invalid
    """
    tasks = [
        Task(resolve_data_directories, include_subdirectories=True),
        Task(ingest_data, dataset_name, user, node_set, dataset_id),

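The docstring above documents `node_set` and `dataset_id`, but its embedded examples never exercise them. A minimal sketch of tagging ingested data with a node set, assuming only the parameters named in the docstring; the dataset name, file path, and node identifiers are made up for illustration:

```python
import asyncio

import cognee


async def main():
    # Group related documents under explicit node identifiers so they can be
    # organized and access-controlled together in the knowledge graph.
    await cognee.add(
        data=["/path/to/contract.pdf", "Summary of the negotiation meeting"],
        dataset_name="legal_docs",  # hypothetical dataset name
        node_set=["client_acme", "2024_q3"],  # hypothetical node identifiers
    )


asyncio.run(main())
```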
@@ -39,6 +39,151 @@ async def cognify(
    graph_db_config: dict = None,
    run_in_background: bool = False,
):
    """
    Transform ingested data into a structured knowledge graph.

    This is the core processing step in Cognee that converts raw text and documents
    into an intelligent knowledge graph. It analyzes content, extracts entities and
    relationships, and creates semantic connections for enhanced search and reasoning.

    Prerequisites:
        - **LLM_API_KEY**: Must be configured (required for entity extraction and graph generation)
        - **Data Added**: Must have data previously added via `cognee.add()`
        - **Vector Database**: Must be accessible for embeddings storage
        - **Graph Database**: Must be accessible for relationship storage

    Input Requirements:
        - **Datasets**: Must contain data previously added via `cognee.add()`
        - **Content Types**: Works with any text-extractable content including:
            * Natural language documents
            * Structured data (CSV, JSON)
            * Code repositories
            * Academic papers and technical documentation
            * Mixed multimedia content (with text extraction)

    Processing Pipeline:
        1. **Document Classification**: Identifies document types and structures
        2. **Permission Validation**: Ensures user has processing rights
        3. **Text Chunking**: Breaks content into semantically meaningful segments
        4. **Entity Extraction**: Identifies key concepts, people, places, organizations
        5. **Relationship Detection**: Discovers connections between entities
        6. **Graph Construction**: Builds semantic knowledge graph with embeddings
        7. **Content Summarization**: Creates hierarchical summaries for navigation

    Graph Model Customization:
        The `graph_model` parameter allows custom knowledge structures:
        - **Default**: General-purpose KnowledgeGraph for any domain
        - **Custom Models**: Domain-specific schemas (e.g., scientific papers, code analysis)
        - **Ontology Integration**: Use `ontology_file_path` for predefined vocabularies

    Args:
        datasets: Dataset name(s) or dataset uuid to process. Processes all available data if None.
            - Single dataset: "my_dataset"
            - Multiple datasets: ["docs", "research", "reports"]
            - None: Process all datasets for the user
        user: User context for authentication and data access. Uses default if None.
        graph_model: Pydantic model defining the knowledge graph structure.
            Defaults to KnowledgeGraph for general-purpose processing.
        chunker: Text chunking strategy (TextChunker, LangchainChunker).
            - TextChunker: Paragraph-based chunking (default, most reliable)
            - LangchainChunker: Recursive character splitting with overlap
            Determines how documents are segmented for processing.
        chunk_size: Maximum tokens per chunk. Auto-calculated based on LLM if None.
            Formula: min(embedding_max_tokens, llm_max_tokens // 2)
            Default limits: ~512-8192 tokens depending on models.
            Smaller chunks = more granular but potentially fragmented knowledge.
        ontology_file_path: Path to RDF/OWL ontology file for domain-specific entity types.
            Useful for specialized fields like medical or legal documents.
        vector_db_config: Custom vector database configuration for embeddings storage.
        graph_db_config: Custom graph database configuration for relationship storage.
        run_in_background: If True, starts processing asynchronously and returns immediately.
            If False, waits for completion before returning.
            Background mode recommended for large datasets (>100MB).
            Use pipeline_run_id from return value to monitor progress.

    Returns:
        Union[dict, list[PipelineRunInfo]]:
            - **Blocking mode**: Dictionary mapping dataset_id -> PipelineRunInfo with:
                * Processing status (completed/failed/in_progress)
                * Extracted entity and relationship counts
                * Processing duration and resource usage
                * Error details if any failures occurred
            - **Background mode**: List of PipelineRunInfo objects for tracking progress
                * Use pipeline_run_id to monitor status
                * Check completion via pipeline monitoring APIs

    Next Steps:
        After successful cognify processing, use search functions to query the knowledge:

        ```python
        import cognee
        from cognee import SearchType

        # Process your data into knowledge graph
        await cognee.cognify()

        # Query for insights using different search types:

        # 1. Natural language completion with graph context
        insights = await cognee.search(
            "What are the main themes?",
            query_type=SearchType.GRAPH_COMPLETION
        )

        # 2. Get entity relationships and connections
        relationships = await cognee.search(
            "connections between concepts",
            query_type=SearchType.INSIGHTS
        )

        # 3. Find relevant document chunks
        chunks = await cognee.search(
            "specific topic",
            query_type=SearchType.CHUNKS
        )
        ```

    Advanced Usage:
        ```python
        # Custom domain model for scientific papers
        class ScientificPaper(DataPoint):
            title: str
            authors: List[str]
            methodology: str
            findings: List[str]

        await cognee.cognify(
            datasets=["research_papers"],
            graph_model=ScientificPaper,
            ontology_file_path="scientific_ontology.owl"
        )

        # Background processing for large datasets
        run_info = await cognee.cognify(
            datasets=["large_corpus"],
            run_in_background=True
        )
        # Check status later with run_info.pipeline_run_id
        ```

    Environment Variables:
        Required:
        - LLM_API_KEY: API key for your LLM provider

        Optional (same as add function):
        - LLM_PROVIDER, LLM_MODEL, VECTOR_DB_PROVIDER, GRAPH_DATABASE_PROVIDER
        - LLM_RATE_LIMIT_ENABLED: Enable rate limiting (default: False)
        - LLM_RATE_LIMIT_REQUESTS: Max requests per interval (default: 60)

    Raises:
        DatasetNotFoundError: If specified datasets don't exist
        PermissionError: If user lacks processing rights
        InvalidValueError: If LLM_API_KEY is not set
        OntologyParsingError: If ontology file is malformed
        ValueError: If chunks exceed max token limits (reduce chunk_size)
        DatabaseNotCreatedError: If databases are not properly initialized
    """
    tasks = await get_default_tasks(user, graph_model, chunker, chunk_size, ontology_file_path)

    if run_in_background:

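The docstring's examples cover background mode; a complementary minimal sketch of a blocking run with an explicit chunk size, assuming only what the docstring states (blocking mode returns a dict mapping dataset_id to PipelineRunInfo); the dataset name and chunk size are illustrative:

```python
import asyncio

import cognee


async def main():
    # Blocking run with an explicit chunk size; per the docstring the default
    # is min(embedding_max_tokens, llm_max_tokens // 2) when chunk_size is None.
    results = await cognee.cognify(
        datasets=["project_docs"],  # hypothetical dataset name
        chunk_size=1024,            # illustrative override
    )
    # Blocking mode returns a dict of dataset_id -> PipelineRunInfo.
    for dataset_id, run_info in results.items():
        print(dataset_id, run_info)


asyncio.run(main())
```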
@@ -20,6 +20,142 @@ async def search(
    node_type: Optional[Type] = None,
    node_name: Optional[List[str]] = None,
) -> list:
    """
    Search and query the knowledge graph for insights, information, and connections.

    This is the final step in the Cognee workflow that retrieves information from the
    processed knowledge graph. It supports multiple search modes optimized for different
    use cases - from simple fact retrieval to complex reasoning and code analysis.

    Search Prerequisites:
        - **LLM_API_KEY**: Required for GRAPH_COMPLETION and RAG_COMPLETION search types
        - **Data Added**: Must have data previously added via `cognee.add()`
        - **Knowledge Graph Built**: Must have processed data via `cognee.cognify()`
        - **Dataset Permissions**: User must have 'read' permission on target datasets
        - **Vector Database**: Must be accessible for semantic search functionality

    Search Types & Use Cases:

        **GRAPH_COMPLETION** (Default - Recommended):
            Natural language Q&A using full graph context and LLM reasoning.
            Best for: Complex questions, analysis, summaries, insights.
            Returns: Conversational AI responses with graph-backed context.

        **RAG_COMPLETION**:
            Traditional RAG using document chunks without graph structure.
            Best for: Direct document retrieval, specific fact-finding.
            Returns: LLM responses based on relevant text chunks.

        **INSIGHTS**:
            Structured entity relationships and semantic connections.
            Best for: Understanding concept relationships, knowledge mapping.
            Returns: Formatted relationship data and entity connections.

        **CHUNKS**:
            Raw text segments that match the query semantically.
            Best for: Finding specific passages, citations, exact content.
            Returns: Ranked list of relevant text chunks with metadata.

        **SUMMARIES**:
            Pre-generated hierarchical summaries of content.
            Best for: Quick overviews, document abstracts, topic summaries.
            Returns: Multi-level summaries from detailed to high-level.

        **CODE**:
            Code-specific search with syntax and semantic understanding.
            Best for: Finding functions, classes, implementation patterns.
            Returns: Structured code information with context and relationships.

        **CYPHER**:
            Direct graph database queries using Cypher syntax.
            Best for: Advanced users, specific graph traversals, debugging.
            Returns: Raw graph query results.

    Args:
        query_text: Your question or search query in natural language.
            Examples:
            - "What are the main themes in this research?"
            - "How do these concepts relate to each other?"
            - "Find information about machine learning algorithms"
            - "What functions handle user authentication?"

        query_type: SearchType enum specifying the search mode.
            Defaults to GRAPH_COMPLETION for conversational AI responses.

        user: User context for data access permissions. Uses default if None.

        datasets: Dataset name(s) to search within. Searches all accessible if None.
            - Single dataset: "research_papers"
            - Multiple datasets: ["docs", "reports", "analysis"]
            - None: Search across all user datasets

        dataset_ids: Alternative to datasets - use specific UUID identifiers.

        system_prompt_path: Custom system prompt file for LLM-based search types.
            Defaults to "answer_simple_question.txt".

        top_k: Maximum number of results to return (1-N)
            Higher values provide more comprehensive but potentially noisy results.

        node_type: Filter results to specific entity types (for advanced filtering).

        node_name: Filter results to specific named entities (for targeted search).

    Returns:
        list: Search results in format determined by query_type:

            **GRAPH_COMPLETION/RAG_COMPLETION**:
                [List of conversational AI response strings]

            **INSIGHTS**:
                [List of formatted relationship descriptions and entity connections]

            **CHUNKS**:
                [List of relevant text passages with source metadata]

            **SUMMARIES**:
                [List of hierarchical summaries from general to specific]

            **CODE**:
                [List of structured code information with context]

    Performance & Optimization:
        - **GRAPH_COMPLETION**: Slower but most intelligent, uses LLM + graph context
        - **RAG_COMPLETION**: Medium speed, uses LLM + document chunks (no graph traversal)
        - **INSIGHTS**: Fast, returns structured relationships without LLM processing
        - **CHUNKS**: Fastest, pure vector similarity search without LLM
        - **SUMMARIES**: Fast, returns pre-computed summaries
        - **CODE**: Medium speed, specialized for code understanding
        - **top_k**: Start with 10, increase for comprehensive analysis (max 100)
        - **datasets**: Specify datasets to improve speed and relevance

    Next Steps After Search:
        - Use results for further analysis or application integration
        - Combine different search types for comprehensive understanding
        - Export insights for reporting or downstream processing
        - Iterate with refined queries based on initial results

    Environment Variables:
        Required for LLM-based search types (GRAPH_COMPLETION, RAG_COMPLETION):
        - LLM_API_KEY: API key for your LLM provider

        Optional:
        - LLM_PROVIDER, LLM_MODEL: Configure LLM for search responses
        - VECTOR_DB_PROVIDER: Must match what was used during cognify
        - GRAPH_DATABASE_PROVIDER: Must match what was used during cognify

    Raises:
        DatasetNotFoundError: If specified datasets don't exist or aren't accessible
        PermissionDeniedError: If user lacks read access to requested datasets
        NoDataError: If no relevant data found for the search query
        InvalidValueError: If LLM_API_KEY is not set (for LLM-based search types)
        ValueError: If query_text is empty or search parameters are invalid
        CollectionNotFoundError: If vector collection not found (data not processed)
    """
    # We use lists from now on for datasets
    if isinstance(datasets, UUID) or isinstance(datasets, str):
        datasets = [datasets]

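Unlike `add()` and `cognify()`, the `search()` docstring in this diff carries no usage example. A minimal sketch built only from the parameters and SearchType values it names; the query strings and dataset name are illustrative:

```python
import asyncio

import cognee
from cognee import SearchType


async def main():
    # Conversational answer grounded in the knowledge graph (the default mode).
    answers = await cognee.search(
        "What are the main themes across these documents?",
        query_type=SearchType.GRAPH_COMPLETION,
        datasets=["project_docs"],  # hypothetical dataset name
        top_k=10,
    )

    # Fast retrieval of the raw chunks that match the query semantically.
    chunks = await cognee.search(
        "authentication flow",
        query_type=SearchType.CHUNKS,
        top_k=5,
    )
    print(answers, chunks)


asyncio.run(main())
```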
@@ -1,4 +1,5 @@
import litellm
import logging
from pydantic import BaseModel
from typing import Type, Optional
from litellm import acompletion, JSONSchemaValidationError

@@ -1,8 +1,7 @@
from cognee.shared.logging_utils import get_logger
import litellm

from cognee.infrastructure.databases.vector import get_vector_engine
from cognee.infrastructure.llm.get_llm_client import get_llm_client
from cognee.shared.logging_utils import get_logger

logger = get_logger()

@@ -22,6 +21,9 @@ def get_max_chunk_tokens():
    the smaller value of the embedding engine's max tokens and half of the LLM's
    maximum tokens.
    """
    # NOTE: Import must be done in function to avoid circular import issue
    from cognee.infrastructure.databases.vector import get_vector_engine

    # Calculate max chunk size based on the following formula
    embedding_engine = get_vector_engine().embedding_engine
    llm_client = get_llm_client()

@@ -93,6 +95,9 @@ async def test_embedding_connection():
    the exception if the connection to the embedding handler cannot be established.
    """
    try:
        # NOTE: Vector engine import must be done in function to avoid circular import issue
        from cognee.infrastructure.databases.vector import get_vector_engine

        await get_vector_engine().embedding_engine.embed_text("test")
    except Exception as e:
        logger.error(e)

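The chunk-size rule behind `get_max_chunk_tokens()` is the formula also quoted in the `cognify()` docstring: the smaller of the embedding engine's max tokens and half of the LLM's max tokens. A standalone sketch of just that arithmetic; the token limits below are invented placeholders, not values read from any real engine:

```python
def max_chunk_tokens(embedding_max_tokens: int, llm_max_tokens: int) -> int:
    """Mirror of the formula above: a chunk must fit the embedding model and
    stay at or below half of the LLM's context window."""
    return min(embedding_max_tokens, llm_max_tokens // 2)


# Placeholder limits purely for illustration.
print(max_chunk_tokens(embedding_max_tokens=8192, llm_max_tokens=128000))  # -> 8192
print(max_chunk_tokens(embedding_max_tokens=8192, llm_max_tokens=8192))    # -> 4096
```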
@@ -1,4 +1,6 @@
from typing import IO, Optional
from urllib.parse import urlparse
import os
from cognee.api.v1.add.config import get_s3_config

@@ -24,8 +26,16 @@ def open_data_file(
    else:
        return fs.open(file_path, mode=mode, encoding=encoding, **kwargs)
    elif file_path.startswith("file://"):
-        # Handle local file URLs by stripping the file:// prefix
-        file_path = file_path.replace("file://", "", 1)
-        return open(file_path, mode=mode, encoding=encoding, **kwargs)
+        # Handle local file URLs by properly parsing the URI
+        parsed_url = urlparse(file_path)
+        # On Windows, urlparse handles drive letters correctly
+        # Convert the path component to a proper file path
+        if os.name == "nt":  # Windows
+            # Remove leading slash from Windows paths like /C:/Users/...
+            local_path = parsed_url.path.lstrip("/")
+        else:  # Unix-like systems
+            local_path = parsed_url.path
+
+        return open(local_path, mode=mode, encoding=encoding, **kwargs)
    else:
        return open(file_path, mode=mode, encoding=encoding, **kwargs)

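A standalone sketch of the file:// handling introduced above, using only the standard library: `pathlib.Path.as_uri()` builds the URL the same way the tests later in this diff do, and `urlparse` recovers the local path, with the Windows drive-letter case handled by stripping the leading slash as in the new `open_data_file` branch. The example path is illustrative and assumes a Unix-style absolute path.

```python
import os
from pathlib import Path
from urllib.parse import urlparse


def file_url_to_path(file_url: str) -> str:
    """Convert a file:// URL back to a local filesystem path."""
    parsed = urlparse(file_url)
    if os.name == "nt":  # Windows: the path comes back as /C:/Users/...
        return parsed.path.lstrip("/")
    return parsed.path


# Round-trip: build a URL from an absolute path, then resolve it again.
url = Path("/tmp/example.txt").as_uri()   # "file:///tmp/example.txt" on Unix
print(url, "->", file_url_to_path(url))   # -> /tmp/example.txt
```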
@@ -11,6 +11,23 @@ import importlib.metadata
from cognee import __version__ as cognee_version
from typing import Protocol


# Configure external library logging
def configure_external_library_logging():
    """Configure logging for external libraries to reduce verbosity"""
    # Configure LiteLLM logging to reduce verbosity
    try:
        import litellm

        litellm.set_verbose = False

        # Suppress LiteLLM ERROR logging using standard logging
        logging.getLogger("litellm").setLevel(logging.CRITICAL)
    except ImportError:
        # LiteLLM not available, skip configuration
        pass


# Export common log levels
DEBUG = logging.DEBUG
INFO = logging.INFO

@@ -148,6 +165,44 @@ def get_logger(name=None, level=None) -> LoggerInterface:
    return logger


def log_database_configuration(logger):
    """Log the current database configuration for all database types"""
    # NOTE: Has to be imported at runtime to avoid circular import
    from cognee.infrastructure.databases.relational.config import get_relational_config
    from cognee.infrastructure.databases.vector.config import get_vectordb_config
    from cognee.infrastructure.databases.graph.config import get_graph_config

    try:
        # Log relational database configuration
        relational_config = get_relational_config()
        logger.info(f"Relational database: {relational_config.db_provider}")
        if relational_config.db_provider == "postgres":
            logger.info(f"Postgres host: {relational_config.db_host}:{relational_config.db_port}")
            logger.info(f"Postgres database: {relational_config.db_name}")
        elif relational_config.db_provider == "sqlite":
            logger.info(f"SQLite path: {relational_config.db_path}")
            logger.info(f"SQLite database: {relational_config.db_name}")

        # Log vector database configuration
        vector_config = get_vectordb_config()
        logger.info(f"Vector database: {vector_config.vector_db_provider}")
        if vector_config.vector_db_provider == "lancedb":
            logger.info(f"Vector database path: {vector_config.vector_db_url}")
        else:
            logger.info(f"Vector database URL: {vector_config.vector_db_url}")

        # Log graph database configuration
        graph_config = get_graph_config()
        logger.info(f"Graph database: {graph_config.graph_database_provider}")
        if graph_config.graph_database_provider == "kuzu":
            logger.info(f"Graph database path: {graph_config.graph_file_path}")
        else:
            logger.info(f"Graph database URL: {graph_config.graph_database_url}")

    except Exception as e:
        logger.warning(f"Could not retrieve database configuration: {str(e)}")


def cleanup_old_logs(logs_dir, max_files):
    """
    Removes old log files, keeping only the most recent ones.

@@ -193,6 +248,9 @@ def setup_logging(log_level=None, name=None):

    log_level = log_level if log_level else log_levels[os.getenv("LOG_LEVEL", "INFO")]

    # Configure external library logging early to suppress verbose output
    configure_external_library_logging()

    def exception_handler(logger, method_name, event_dict):
        """Custom processor to handle uncaught exceptions."""
        # Check if there's an exc_info that needs to be processed

@@ -339,6 +397,9 @@ def setup_logging(log_level=None, name=None):

    logger.info("Want to learn more? Visit the Cognee documentation: https://docs.cognee.ai")

    # Log database configuration
    log_database_configuration(logger)

    # Return the configured logger
    return logger

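A small sketch of how the logging pieces added in this file fit together at startup. It assumes `setup_logging` is importable from `cognee.shared.logging_utils` alongside the `get_logger` import used elsewhere in this diff, and that the exported `INFO` level can be passed as `log_level`:

```python
# Assumed import path, matching the module edited in this diff.
from cognee.shared.logging_utils import setup_logging, INFO

# setup_logging() now quiets LiteLLM via configure_external_library_logging()
# and logs the relational/vector/graph database configuration before returning
# the configured logger.
logger = setup_logging(INFO)
logger.info("Cognee logging initialized")
```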
@@ -1,6 +1,7 @@
import os
import tempfile
import pytest
from pathlib import Path
from cognee.modules.data.processing.document_types.open_data_file import open_data_file

@@ -29,7 +30,8 @@ class TestOpenDataFile:
            temp_file_path = f.name

        try:
-            file_url = f"file://{temp_file_path}"
+            # Use pathlib.Path.as_uri() for proper cross-platform file URL creation
+            file_url = Path(temp_file_path).as_uri()
            with open_data_file(file_url, mode="r") as f:
                content = f.read()
                assert content == test_content

@@ -44,7 +46,8 @@ class TestOpenDataFile:
            temp_file_path = f.name

        try:
-            file_url = f"file://{temp_file_path}"
+            # Use pathlib.Path.as_uri() for proper cross-platform file URL creation
+            file_url = Path(temp_file_path).as_uri()
            with open_data_file(file_url, mode="rb") as f:
                content = f.read()
                assert content == test_content.encode()

@@ -61,7 +64,8 @@ class TestOpenDataFile:
            temp_file_path = f.name

        try:
-            file_url = f"file://{temp_file_path}"
+            # Use pathlib.Path.as_uri() for proper cross-platform file URL creation
+            file_url = Path(temp_file_path).as_uri()
            with open_data_file(file_url, mode="r", encoding="utf-8") as f:
                content = f.read()
                assert content == test_content

@@ -84,7 +88,9 @@ class TestOpenDataFile:

        try:
            # Even if someone accidentally adds multiple file:// prefixes
-            file_url = f"file://file://{temp_file_path}"
+            # Use proper file URL creation first
+            proper_file_url = Path(temp_file_path).as_uri()
+            file_url = f"file://{proper_file_url}"
            with open_data_file(file_url, mode="r") as f:
                content = f.read()
                # This should work because we only replace the first occurrence
