Merge remote-tracking branch 'origin/dev' into feat/modal-parallelization
This commit is contained in:
commit
4eba76ca1f
11 changed files with 518 additions and 15 deletions
|
|
@ -102,7 +102,7 @@ handlers =
|
||||||
qualname = sqlalchemy.engine
|
qualname = sqlalchemy.engine
|
||||||
|
|
||||||
[logger_alembic]
|
[logger_alembic]
|
||||||
level = INFO
|
level = WARN
|
||||||
handlers =
|
handlers =
|
||||||
qualname = alembic
|
qualname = alembic
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,10 @@ set -e # Exit on error
|
||||||
echo "Debug mode: $DEBUG"
|
echo "Debug mode: $DEBUG"
|
||||||
echo "Environment: $ENVIRONMENT"
|
echo "Environment: $ENVIRONMENT"
|
||||||
|
|
||||||
|
# Set default transport mode if not specified
|
||||||
|
TRANSPORT_MODE=${TRANSPORT_MODE:-"stdio"}
|
||||||
|
echo "Transport mode: $TRANSPORT_MODE"
|
||||||
|
|
||||||
# Run Alembic migrations with proper error handling.
|
# Run Alembic migrations with proper error handling.
|
||||||
# Note on UserAlreadyExists error handling:
|
# Note on UserAlreadyExists error handling:
|
||||||
# During database migrations, we attempt to create a default user. If this user
|
# During database migrations, we attempt to create a default user. If this user
|
||||||
|
|
@ -28,19 +32,31 @@ fi
|
||||||
|
|
||||||
echo "Database migrations done."
|
echo "Database migrations done."
|
||||||
|
|
||||||
echo "Starting Cognee MCP Server..."
|
echo "Starting Cognee MCP Server with transport mode: $TRANSPORT_MODE"
|
||||||
|
|
||||||
# Add startup delay to ensure DB is ready
|
# Add startup delay to ensure DB is ready
|
||||||
sleep 2
|
sleep 2
|
||||||
|
|
||||||
# Modified Gunicorn startup with error handling
|
# Modified startup with transport mode selection and error handling
|
||||||
if [ "$ENVIRONMENT" = "dev" ] || [ "$ENVIRONMENT" = "local" ]; then
|
if [ "$ENVIRONMENT" = "dev" ] || [ "$ENVIRONMENT" = "local" ]; then
|
||||||
if [ "$DEBUG" = "true" ]; then
|
if [ "$DEBUG" = "true" ]; then
|
||||||
echo "Waiting for the debugger to attach..."
|
echo "Waiting for the debugger to attach..."
|
||||||
exec python -m debugpy --wait-for-client --listen 0.0.0.0:5678 -m cognee
|
if [ "$TRANSPORT_MODE" = "sse" ]; then
|
||||||
|
exec python -m debugpy --wait-for-client --listen 0.0.0.0:5678 -m cognee --transport sse
|
||||||
|
else
|
||||||
|
exec python -m debugpy --wait-for-client --listen 0.0.0.0:5678 -m cognee --transport stdio
|
||||||
|
fi
|
||||||
else
|
else
|
||||||
exec cognee
|
if [ "$TRANSPORT_MODE" = "sse" ]; then
|
||||||
|
exec cognee --transport sse
|
||||||
|
else
|
||||||
|
exec cognee --transport stdio
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
exec cognee
|
if [ "$TRANSPORT_MODE" = "sse" ]; then
|
||||||
|
exec cognee --transport sse
|
||||||
|
else
|
||||||
|
exec cognee --transport stdio
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
|
||||||
|
|
@ -18,6 +18,7 @@ from cognee.modules.search.types import SearchType
|
||||||
from cognee.shared.data_models import KnowledgeGraph
|
from cognee.shared.data_models import KnowledgeGraph
|
||||||
from cognee.modules.storage.utils import JSONEncoder
|
from cognee.modules.storage.utils import JSONEncoder
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from codingagents.coding_rule_associations import (
|
from codingagents.coding_rule_associations import (
|
||||||
add_rule_associations,
|
add_rule_associations,
|
||||||
|
|
|
||||||
|
|
@ -16,6 +16,128 @@ async def add(
|
||||||
graph_db_config: dict = None,
|
graph_db_config: dict = None,
|
||||||
dataset_id: UUID = None,
|
dataset_id: UUID = None,
|
||||||
):
|
):
|
||||||
|
"""
|
||||||
|
Add data to Cognee for knowledge graph processing.
|
||||||
|
|
||||||
|
This is the first step in the Cognee workflow - it ingests raw data and prepares it
|
||||||
|
for processing. The function accepts various data formats including text, files, and
|
||||||
|
binary streams, then stores them in a specified dataset for further processing.
|
||||||
|
|
||||||
|
Prerequisites:
|
||||||
|
- **LLM_API_KEY**: Must be set in environment variables for content processing
|
||||||
|
- **Database Setup**: Relational and vector databases must be configured
|
||||||
|
- **User Authentication**: Uses default user if none provided (created automatically)
|
||||||
|
|
||||||
|
Supported Input Types:
|
||||||
|
- **Text strings**: Direct text content (str) - any string not starting with "/", "file://", or "s3://"
|
||||||
|
- **File paths**: Local file paths as strings in these formats:
|
||||||
|
* Absolute paths: "/path/to/document.pdf"
|
||||||
|
* File URLs: "file:///path/to/document.pdf" or "file://relative/path.txt"
|
||||||
|
* S3 paths: "s3://bucket-name/path/to/file.pdf"
|
||||||
|
- **Binary file objects**: File handles/streams (BinaryIO)
|
||||||
|
- **Lists**: Multiple files or text strings in a single call
|
||||||
|
|
||||||
|
Supported File Formats:
|
||||||
|
- Text files (.txt, .md, .csv)
|
||||||
|
- PDFs (.pdf)
|
||||||
|
- Images (.png, .jpg, .jpeg) - extracted via OCR/vision models
|
||||||
|
- Audio files (.mp3, .wav) - transcribed to text
|
||||||
|
- Code files (.py, .js, .ts, etc.) - parsed for structure and content
|
||||||
|
- Office documents (.docx, .pptx)
|
||||||
|
|
||||||
|
Workflow:
|
||||||
|
1. **Data Resolution**: Resolves file paths and validates accessibility
|
||||||
|
2. **Content Extraction**: Extracts text content from various file formats
|
||||||
|
3. **Dataset Storage**: Stores processed content in the specified dataset
|
||||||
|
4. **Metadata Tracking**: Records file metadata, timestamps, and user permissions
|
||||||
|
5. **Permission Assignment**: Grants user read/write/delete/share permissions on dataset
|
||||||
|
|
||||||
|
Args:
|
||||||
|
data: The data to ingest. Can be:
|
||||||
|
- Single text string: "Your text content here"
|
||||||
|
- Absolute file path: "/path/to/document.pdf"
|
||||||
|
- File URL: "file:///absolute/path/to/document.pdf" or "file://relative/path.txt"
|
||||||
|
- S3 path: "s3://my-bucket/documents/file.pdf"
|
||||||
|
- List of mixed types: ["text content", "/path/file.pdf", "file://doc.txt", file_handle]
|
||||||
|
- Binary file object: open("file.txt", "rb")
|
||||||
|
dataset_name: Name of the dataset to store data in. Defaults to "main_dataset".
|
||||||
|
Create separate datasets to organize different knowledge domains.
|
||||||
|
user: User object for authentication and permissions. Uses default user if None.
|
||||||
|
Default user: "default_user@example.com" (created automatically on first use).
|
||||||
|
Users can only access datasets they have permissions for.
|
||||||
|
node_set: Optional list of node identifiers for graph organization and access control.
|
||||||
|
Used for grouping related data points in the knowledge graph.
|
||||||
|
vector_db_config: Optional configuration for vector database (for custom setups).
|
||||||
|
graph_db_config: Optional configuration for graph database (for custom setups).
|
||||||
|
dataset_id: Optional specific dataset UUID to use instead of dataset_name.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
PipelineRunInfo: Information about the ingestion pipeline execution including:
|
||||||
|
- Pipeline run ID for tracking
|
||||||
|
- Dataset ID where data was stored
|
||||||
|
- Processing status and any errors
|
||||||
|
- Execution timestamps and metadata
|
||||||
|
|
||||||
|
Next Steps:
|
||||||
|
After successfully adding data, call `cognify()` to process the ingested content:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import cognee
|
||||||
|
|
||||||
|
# Step 1: Add your data (text content or file path)
|
||||||
|
await cognee.add("Your document content") # Raw text
|
||||||
|
# OR
|
||||||
|
await cognee.add("/path/to/your/file.pdf") # File path
|
||||||
|
|
||||||
|
# Step 2: Process into knowledge graph
|
||||||
|
await cognee.cognify()
|
||||||
|
|
||||||
|
# Step 3: Search and query
|
||||||
|
results = await cognee.search("What insights can you find?")
|
||||||
|
```
|
||||||
|
|
||||||
|
Example Usage:
|
||||||
|
```python
|
||||||
|
# Add a single text document
|
||||||
|
await cognee.add("Natural language processing is a field of AI...")
|
||||||
|
|
||||||
|
# Add multiple files with different path formats
|
||||||
|
await cognee.add([
|
||||||
|
"/absolute/path/to/research_paper.pdf", # Absolute path
|
||||||
|
"file://relative/path/to/dataset.csv", # Relative file URL
|
||||||
|
"file:///absolute/path/to/report.docx", # Absolute file URL
|
||||||
|
"s3://my-bucket/documents/data.json", # S3 path
|
||||||
|
"Additional context text" # Raw text content
|
||||||
|
])
|
||||||
|
|
||||||
|
# Add to a specific dataset
|
||||||
|
await cognee.add(
|
||||||
|
data="Project documentation content",
|
||||||
|
dataset_name="project_docs"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Add a single file
|
||||||
|
await cognee.add("/home/user/documents/analysis.pdf")
|
||||||
|
```
|
||||||
|
|
||||||
|
Environment Variables:
|
||||||
|
Required:
|
||||||
|
- LLM_API_KEY: API key for your LLM provider (OpenAI, Anthropic, etc.)
|
||||||
|
|
||||||
|
Optional:
|
||||||
|
- LLM_PROVIDER: "openai" (default), "anthropic", "gemini", "ollama"
|
||||||
|
- LLM_MODEL: Model name (default: "gpt-4o-mini")
|
||||||
|
- DEFAULT_USER_EMAIL: Custom default user email
|
||||||
|
- DEFAULT_USER_PASSWORD: Custom default user password
|
||||||
|
- VECTOR_DB_PROVIDER: "lancedb" (default), "chromadb", "qdrant", "weaviate"
|
||||||
|
- GRAPH_DATABASE_PROVIDER: "kuzu" (default), "neo4j", "networkx"
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
FileNotFoundError: If specified file paths don't exist
|
||||||
|
PermissionError: If user lacks access to files or dataset
|
||||||
|
UnsupportedFileTypeError: If file format cannot be processed
|
||||||
|
InvalidValueError: If LLM_API_KEY is not set or invalid
|
||||||
|
"""
|
||||||
tasks = [
|
tasks = [
|
||||||
Task(resolve_data_directories, include_subdirectories=True),
|
Task(resolve_data_directories, include_subdirectories=True),
|
||||||
Task(ingest_data, dataset_name, user, node_set, dataset_id),
|
Task(ingest_data, dataset_name, user, node_set, dataset_id),
|
||||||
|
|
|
||||||
|
|
@ -39,6 +39,151 @@ async def cognify(
|
||||||
graph_db_config: dict = None,
|
graph_db_config: dict = None,
|
||||||
run_in_background: bool = False,
|
run_in_background: bool = False,
|
||||||
):
|
):
|
||||||
|
"""
|
||||||
|
Transform ingested data into a structured knowledge graph.
|
||||||
|
|
||||||
|
This is the core processing step in Cognee that converts raw text and documents
|
||||||
|
into an intelligent knowledge graph. It analyzes content, extracts entities and
|
||||||
|
relationships, and creates semantic connections for enhanced search and reasoning.
|
||||||
|
|
||||||
|
Prerequisites:
|
||||||
|
- **LLM_API_KEY**: Must be configured (required for entity extraction and graph generation)
|
||||||
|
- **Data Added**: Must have data previously added via `cognee.add()`
|
||||||
|
- **Vector Database**: Must be accessible for embeddings storage
|
||||||
|
- **Graph Database**: Must be accessible for relationship storage
|
||||||
|
|
||||||
|
Input Requirements:
|
||||||
|
- **Datasets**: Must contain data previously added via `cognee.add()`
|
||||||
|
- **Content Types**: Works with any text-extractable content including:
|
||||||
|
* Natural language documents
|
||||||
|
* Structured data (CSV, JSON)
|
||||||
|
* Code repositories
|
||||||
|
* Academic papers and technical documentation
|
||||||
|
* Mixed multimedia content (with text extraction)
|
||||||
|
|
||||||
|
Processing Pipeline:
|
||||||
|
1. **Document Classification**: Identifies document types and structures
|
||||||
|
2. **Permission Validation**: Ensures user has processing rights
|
||||||
|
3. **Text Chunking**: Breaks content into semantically meaningful segments
|
||||||
|
4. **Entity Extraction**: Identifies key concepts, people, places, organizations
|
||||||
|
5. **Relationship Detection**: Discovers connections between entities
|
||||||
|
6. **Graph Construction**: Builds semantic knowledge graph with embeddings
|
||||||
|
7. **Content Summarization**: Creates hierarchical summaries for navigation
|
||||||
|
|
||||||
|
Graph Model Customization:
|
||||||
|
The `graph_model` parameter allows custom knowledge structures:
|
||||||
|
- **Default**: General-purpose KnowledgeGraph for any domain
|
||||||
|
- **Custom Models**: Domain-specific schemas (e.g., scientific papers, code analysis)
|
||||||
|
- **Ontology Integration**: Use `ontology_file_path` for predefined vocabularies
|
||||||
|
|
||||||
|
Args:
|
||||||
|
datasets: Dataset name(s) or dataset UUID to process. Processes all available data if None.
|
||||||
|
- Single dataset: "my_dataset"
|
||||||
|
- Multiple datasets: ["docs", "research", "reports"]
|
||||||
|
- None: Process all datasets for the user
|
||||||
|
user: User context for authentication and data access. Uses default if None.
|
||||||
|
graph_model: Pydantic model defining the knowledge graph structure.
|
||||||
|
Defaults to KnowledgeGraph for general-purpose processing.
|
||||||
|
chunker: Text chunking strategy (TextChunker, LangchainChunker).
|
||||||
|
- TextChunker: Paragraph-based chunking (default, most reliable)
|
||||||
|
- LangchainChunker: Recursive character splitting with overlap
|
||||||
|
Determines how documents are segmented for processing.
|
||||||
|
chunk_size: Maximum tokens per chunk. Auto-calculated based on LLM if None.
|
||||||
|
Formula: min(embedding_max_tokens, llm_max_tokens // 2)
|
||||||
|
Default limits: ~512-8192 tokens depending on models.
|
||||||
|
Smaller chunks = more granular but potentially fragmented knowledge.
|
||||||
|
ontology_file_path: Path to RDF/OWL ontology file for domain-specific entity types.
|
||||||
|
Useful for specialized fields like medical or legal documents.
|
||||||
|
vector_db_config: Custom vector database configuration for embeddings storage.
|
||||||
|
graph_db_config: Custom graph database configuration for relationship storage.
|
||||||
|
run_in_background: If True, starts processing asynchronously and returns immediately.
|
||||||
|
If False, waits for completion before returning.
|
||||||
|
Background mode recommended for large datasets (>100MB).
|
||||||
|
Use pipeline_run_id from return value to monitor progress.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Union[dict, list[PipelineRunInfo]]:
|
||||||
|
- **Blocking mode**: Dictionary mapping dataset_id -> PipelineRunInfo with:
|
||||||
|
* Processing status (completed/failed/in_progress)
|
||||||
|
* Extracted entity and relationship counts
|
||||||
|
* Processing duration and resource usage
|
||||||
|
* Error details if any failures occurred
|
||||||
|
- **Background mode**: List of PipelineRunInfo objects for tracking progress
|
||||||
|
* Use pipeline_run_id to monitor status
|
||||||
|
* Check completion via pipeline monitoring APIs
|
||||||
|
|
||||||
|
Next Steps:
|
||||||
|
After successful cognify processing, use search functions to query the knowledge:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import cognee
|
||||||
|
from cognee import SearchType
|
||||||
|
|
||||||
|
# Process your data into knowledge graph
|
||||||
|
await cognee.cognify()
|
||||||
|
|
||||||
|
# Query for insights using different search types:
|
||||||
|
|
||||||
|
# 1. Natural language completion with graph context
|
||||||
|
insights = await cognee.search(
|
||||||
|
"What are the main themes?",
|
||||||
|
query_type=SearchType.GRAPH_COMPLETION
|
||||||
|
)
|
||||||
|
|
||||||
|
# 2. Get entity relationships and connections
|
||||||
|
relationships = await cognee.search(
|
||||||
|
"connections between concepts",
|
||||||
|
query_type=SearchType.INSIGHTS
|
||||||
|
)
|
||||||
|
|
||||||
|
# 3. Find relevant document chunks
|
||||||
|
chunks = await cognee.search(
|
||||||
|
"specific topic",
|
||||||
|
query_type=SearchType.CHUNKS
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
Advanced Usage:
|
||||||
|
```python
|
||||||
|
# Custom domain model for scientific papers
|
||||||
|
class ScientificPaper(DataPoint):
|
||||||
|
title: str
|
||||||
|
authors: List[str]
|
||||||
|
methodology: str
|
||||||
|
findings: List[str]
|
||||||
|
|
||||||
|
await cognee.cognify(
|
||||||
|
datasets=["research_papers"],
|
||||||
|
graph_model=ScientificPaper,
|
||||||
|
ontology_file_path="scientific_ontology.owl"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Background processing for large datasets
|
||||||
|
run_info = await cognee.cognify(
|
||||||
|
datasets=["large_corpus"],
|
||||||
|
run_in_background=True
|
||||||
|
)
|
||||||
|
# Check status later with run_info.pipeline_run_id
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
Environment Variables:
|
||||||
|
Required:
|
||||||
|
- LLM_API_KEY: API key for your LLM provider
|
||||||
|
|
||||||
|
Optional (same as add function):
|
||||||
|
- LLM_PROVIDER, LLM_MODEL, VECTOR_DB_PROVIDER, GRAPH_DATABASE_PROVIDER
|
||||||
|
- LLM_RATE_LIMIT_ENABLED: Enable rate limiting (default: False)
|
||||||
|
- LLM_RATE_LIMIT_REQUESTS: Max requests per interval (default: 60)
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
DatasetNotFoundError: If specified datasets don't exist
|
||||||
|
PermissionError: If user lacks processing rights
|
||||||
|
InvalidValueError: If LLM_API_KEY is not set
|
||||||
|
OntologyParsingError: If ontology file is malformed
|
||||||
|
ValueError: If chunks exceed max token limits (reduce chunk_size)
|
||||||
|
DatabaseNotCreatedError: If databases are not properly initialized
|
||||||
|
"""
|
||||||
tasks = await get_default_tasks(user, graph_model, chunker, chunk_size, ontology_file_path)
|
tasks = await get_default_tasks(user, graph_model, chunker, chunk_size, ontology_file_path)
|
||||||
|
|
||||||
if run_in_background:
|
if run_in_background:
|
||||||
|
|
|
||||||
|
|
@ -20,6 +20,142 @@ async def search(
|
||||||
node_type: Optional[Type] = None,
|
node_type: Optional[Type] = None,
|
||||||
node_name: Optional[List[str]] = None,
|
node_name: Optional[List[str]] = None,
|
||||||
) -> list:
|
) -> list:
|
||||||
|
"""
|
||||||
|
Search and query the knowledge graph for insights, information, and connections.
|
||||||
|
|
||||||
|
This is the final step in the Cognee workflow that retrieves information from the
|
||||||
|
processed knowledge graph. It supports multiple search modes optimized for different
|
||||||
|
use cases - from simple fact retrieval to complex reasoning and code analysis.
|
||||||
|
|
||||||
|
Search Prerequisites:
|
||||||
|
- **LLM_API_KEY**: Required for GRAPH_COMPLETION and RAG_COMPLETION search types
|
||||||
|
- **Data Added**: Must have data previously added via `cognee.add()`
|
||||||
|
- **Knowledge Graph Built**: Must have processed data via `cognee.cognify()`
|
||||||
|
- **Dataset Permissions**: User must have 'read' permission on target datasets
|
||||||
|
- **Vector Database**: Must be accessible for semantic search functionality
|
||||||
|
|
||||||
|
Search Types & Use Cases:
|
||||||
|
|
||||||
|
**GRAPH_COMPLETION** (Default - Recommended):
|
||||||
|
Natural language Q&A using full graph context and LLM reasoning.
|
||||||
|
Best for: Complex questions, analysis, summaries, insights.
|
||||||
|
Returns: Conversational AI responses with graph-backed context.
|
||||||
|
|
||||||
|
**RAG_COMPLETION**:
|
||||||
|
Traditional RAG using document chunks without graph structure.
|
||||||
|
Best for: Direct document retrieval, specific fact-finding.
|
||||||
|
Returns: LLM responses based on relevant text chunks.
|
||||||
|
|
||||||
|
**INSIGHTS**:
|
||||||
|
Structured entity relationships and semantic connections.
|
||||||
|
Best for: Understanding concept relationships, knowledge mapping.
|
||||||
|
Returns: Formatted relationship data and entity connections.
|
||||||
|
|
||||||
|
**CHUNKS**:
|
||||||
|
Raw text segments that match the query semantically.
|
||||||
|
Best for: Finding specific passages, citations, exact content.
|
||||||
|
Returns: Ranked list of relevant text chunks with metadata.
|
||||||
|
|
||||||
|
**SUMMARIES**:
|
||||||
|
Pre-generated hierarchical summaries of content.
|
||||||
|
Best for: Quick overviews, document abstracts, topic summaries.
|
||||||
|
Returns: Multi-level summaries from detailed to high-level.
|
||||||
|
|
||||||
|
**CODE**:
|
||||||
|
Code-specific search with syntax and semantic understanding.
|
||||||
|
Best for: Finding functions, classes, implementation patterns.
|
||||||
|
Returns: Structured code information with context and relationships.
|
||||||
|
|
||||||
|
**CYPHER**:
|
||||||
|
Direct graph database queries using Cypher syntax.
|
||||||
|
Best for: Advanced users, specific graph traversals, debugging.
|
||||||
|
Returns: Raw graph query results.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
query_text: Your question or search query in natural language.
|
||||||
|
Examples:
|
||||||
|
- "What are the main themes in this research?"
|
||||||
|
- "How do these concepts relate to each other?"
|
||||||
|
- "Find information about machine learning algorithms"
|
||||||
|
- "What functions handle user authentication?"
|
||||||
|
|
||||||
|
query_type: SearchType enum specifying the search mode.
|
||||||
|
Defaults to GRAPH_COMPLETION for conversational AI responses.
|
||||||
|
|
||||||
|
user: User context for data access permissions. Uses default if None.
|
||||||
|
|
||||||
|
datasets: Dataset name(s) to search within. Searches all accessible if None.
|
||||||
|
- Single dataset: "research_papers"
|
||||||
|
- Multiple datasets: ["docs", "reports", "analysis"]
|
||||||
|
- None: Search across all user datasets
|
||||||
|
|
||||||
|
dataset_ids: Alternative to datasets - use specific UUID identifiers.
|
||||||
|
|
||||||
|
system_prompt_path: Custom system prompt file for LLM-based search types.
|
||||||
|
Defaults to "answer_simple_question.txt".
|
||||||
|
|
||||||
|
top_k: Maximum number of results to return (1-N).
|
||||||
|
Higher values provide more comprehensive but potentially noisy results.
|
||||||
|
|
||||||
|
node_type: Filter results to specific entity types (for advanced filtering).
|
||||||
|
|
||||||
|
node_name: Filter results to specific named entities (for targeted search).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
list: Search results in format determined by query_type:
|
||||||
|
|
||||||
|
**GRAPH_COMPLETION/RAG_COMPLETION**:
|
||||||
|
[List of conversational AI response strings]
|
||||||
|
|
||||||
|
**INSIGHTS**:
|
||||||
|
[List of formatted relationship descriptions and entity connections]
|
||||||
|
|
||||||
|
**CHUNKS**:
|
||||||
|
[List of relevant text passages with source metadata]
|
||||||
|
|
||||||
|
**SUMMARIES**:
|
||||||
|
[List of hierarchical summaries from general to specific]
|
||||||
|
|
||||||
|
**CODE**:
|
||||||
|
[List of structured code information with context]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Performance & Optimization:
|
||||||
|
- **GRAPH_COMPLETION**: Slower but most intelligent, uses LLM + graph context
|
||||||
|
- **RAG_COMPLETION**: Medium speed, uses LLM + document chunks (no graph traversal)
|
||||||
|
- **INSIGHTS**: Fast, returns structured relationships without LLM processing
|
||||||
|
- **CHUNKS**: Fastest, pure vector similarity search without LLM
|
||||||
|
- **SUMMARIES**: Fast, returns pre-computed summaries
|
||||||
|
- **CODE**: Medium speed, specialized for code understanding
|
||||||
|
- **top_k**: Start with 10, increase for comprehensive analysis (max 100)
|
||||||
|
- **datasets**: Specify datasets to improve speed and relevance
|
||||||
|
|
||||||
|
Next Steps After Search:
|
||||||
|
- Use results for further analysis or application integration
|
||||||
|
- Combine different search types for comprehensive understanding
|
||||||
|
- Export insights for reporting or downstream processing
|
||||||
|
- Iterate with refined queries based on initial results
|
||||||
|
|
||||||
|
Environment Variables:
|
||||||
|
Required for LLM-based search types (GRAPH_COMPLETION, RAG_COMPLETION):
|
||||||
|
- LLM_API_KEY: API key for your LLM provider
|
||||||
|
|
||||||
|
Optional:
|
||||||
|
- LLM_PROVIDER, LLM_MODEL: Configure LLM for search responses
|
||||||
|
- VECTOR_DB_PROVIDER: Must match what was used during cognify
|
||||||
|
- GRAPH_DATABASE_PROVIDER: Must match what was used during cognify
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
DatasetNotFoundError: If specified datasets don't exist or aren't accessible
|
||||||
|
PermissionDeniedError: If user lacks read access to requested datasets
|
||||||
|
NoDataError: If no relevant data found for the search query
|
||||||
|
InvalidValueError: If LLM_API_KEY is not set (for LLM-based search types)
|
||||||
|
ValueError: If query_text is empty or search parameters are invalid
|
||||||
|
CollectionNotFoundError: If vector collection not found (data not processed)
|
||||||
|
"""
|
||||||
# We use lists from now on for datasets
|
# We use lists from now on for datasets
|
||||||
if isinstance(datasets, UUID) or isinstance(datasets, str):
|
if isinstance(datasets, UUID) or isinstance(datasets, str):
|
||||||
datasets = [datasets]
|
datasets = [datasets]
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,5 @@
|
||||||
import litellm
|
import litellm
|
||||||
|
import logging
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from typing import Type, Optional
|
from typing import Type, Optional
|
||||||
from litellm import acompletion, JSONSchemaValidationError
|
from litellm import acompletion, JSONSchemaValidationError
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,7 @@
|
||||||
from cognee.shared.logging_utils import get_logger
|
|
||||||
import litellm
|
import litellm
|
||||||
|
|
||||||
from cognee.infrastructure.databases.vector import get_vector_engine
|
|
||||||
from cognee.infrastructure.llm.get_llm_client import get_llm_client
|
from cognee.infrastructure.llm.get_llm_client import get_llm_client
|
||||||
|
from cognee.shared.logging_utils import get_logger
|
||||||
|
|
||||||
logger = get_logger()
|
logger = get_logger()
|
||||||
|
|
||||||
|
|
@ -22,6 +21,9 @@ def get_max_chunk_tokens():
|
||||||
the smaller value of the embedding engine's max tokens and half of the LLM's
|
the smaller value of the embedding engine's max tokens and half of the LLM's
|
||||||
maximum tokens.
|
maximum tokens.
|
||||||
"""
|
"""
|
||||||
|
# NOTE: Import must be done in function to avoid circular import issue
|
||||||
|
from cognee.infrastructure.databases.vector import get_vector_engine
|
||||||
|
|
||||||
# Calculate max chunk size based on the following formula
|
# Calculate max chunk size based on the following formula
|
||||||
embedding_engine = get_vector_engine().embedding_engine
|
embedding_engine = get_vector_engine().embedding_engine
|
||||||
llm_client = get_llm_client()
|
llm_client = get_llm_client()
|
||||||
|
|
@ -93,6 +95,9 @@ async def test_embedding_connection():
|
||||||
the exception if the connection to the embedding handler cannot be established.
|
the exception if the connection to the embedding handler cannot be established.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
|
# NOTE: Vector engine import must be done in function to avoid circular import issue
|
||||||
|
from cognee.infrastructure.databases.vector import get_vector_engine
|
||||||
|
|
||||||
await get_vector_engine().embedding_engine.embed_text("test")
|
await get_vector_engine().embedding_engine.embed_text("test")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(e)
|
logger.error(e)
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,6 @@
|
||||||
from typing import IO, Optional
|
from typing import IO, Optional
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
import os
|
||||||
from cognee.api.v1.add.config import get_s3_config
|
from cognee.api.v1.add.config import get_s3_config
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -24,8 +26,16 @@ def open_data_file(
|
||||||
else:
|
else:
|
||||||
return fs.open(file_path, mode=mode, encoding=encoding, **kwargs)
|
return fs.open(file_path, mode=mode, encoding=encoding, **kwargs)
|
||||||
elif file_path.startswith("file://"):
|
elif file_path.startswith("file://"):
|
||||||
# Handle local file URLs by stripping the file:// prefix
|
# Handle local file URLs by properly parsing the URI
|
||||||
file_path = file_path.replace("file://", "", 1)
|
parsed_url = urlparse(file_path)
|
||||||
return open(file_path, mode=mode, encoding=encoding, **kwargs)
|
# On Windows, urlparse handles drive letters correctly
|
||||||
|
# Convert the path component to a proper file path
|
||||||
|
if os.name == "nt": # Windows
|
||||||
|
# Remove leading slash from Windows paths like /C:/Users/...
|
||||||
|
local_path = parsed_url.path.lstrip("/")
|
||||||
|
else: # Unix-like systems
|
||||||
|
local_path = parsed_url.path
|
||||||
|
|
||||||
|
return open(local_path, mode=mode, encoding=encoding, **kwargs)
|
||||||
else:
|
else:
|
||||||
return open(file_path, mode=mode, encoding=encoding, **kwargs)
|
return open(file_path, mode=mode, encoding=encoding, **kwargs)
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,23 @@ import importlib.metadata
|
||||||
from cognee import __version__ as cognee_version
|
from cognee import __version__ as cognee_version
|
||||||
from typing import Protocol
|
from typing import Protocol
|
||||||
|
|
||||||
|
|
||||||
|
# Configure external library logging
|
||||||
|
def configure_external_library_logging():
|
||||||
|
"""Configure logging for external libraries to reduce verbosity"""
|
||||||
|
# Configure LiteLLM logging to reduce verbosity
|
||||||
|
try:
|
||||||
|
import litellm
|
||||||
|
|
||||||
|
litellm.set_verbose = False
|
||||||
|
|
||||||
|
# Suppress LiteLLM ERROR logging using standard logging
|
||||||
|
logging.getLogger("litellm").setLevel(logging.CRITICAL)
|
||||||
|
except ImportError:
|
||||||
|
# LiteLLM not available, skip configuration
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
# Export common log levels
|
# Export common log levels
|
||||||
DEBUG = logging.DEBUG
|
DEBUG = logging.DEBUG
|
||||||
INFO = logging.INFO
|
INFO = logging.INFO
|
||||||
|
|
@ -148,6 +165,44 @@ def get_logger(name=None, level=None) -> LoggerInterface:
|
||||||
return logger
|
return logger
|
||||||
|
|
||||||
|
|
||||||
|
def log_database_configuration(logger):
|
||||||
|
"""Log the current database configuration for all database types"""
|
||||||
|
# NOTE: Has to be imported at runtime to avoid circular import
|
||||||
|
from cognee.infrastructure.databases.relational.config import get_relational_config
|
||||||
|
from cognee.infrastructure.databases.vector.config import get_vectordb_config
|
||||||
|
from cognee.infrastructure.databases.graph.config import get_graph_config
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Log relational database configuration
|
||||||
|
relational_config = get_relational_config()
|
||||||
|
logger.info(f"Relational database: {relational_config.db_provider}")
|
||||||
|
if relational_config.db_provider == "postgres":
|
||||||
|
logger.info(f"Postgres host: {relational_config.db_host}:{relational_config.db_port}")
|
||||||
|
logger.info(f"Postgres database: {relational_config.db_name}")
|
||||||
|
elif relational_config.db_provider == "sqlite":
|
||||||
|
logger.info(f"SQLite path: {relational_config.db_path}")
|
||||||
|
logger.info(f"SQLite database: {relational_config.db_name}")
|
||||||
|
|
||||||
|
# Log vector database configuration
|
||||||
|
vector_config = get_vectordb_config()
|
||||||
|
logger.info(f"Vector database: {vector_config.vector_db_provider}")
|
||||||
|
if vector_config.vector_db_provider == "lancedb":
|
||||||
|
logger.info(f"Vector database path: {vector_config.vector_db_url}")
|
||||||
|
else:
|
||||||
|
logger.info(f"Vector database URL: {vector_config.vector_db_url}")
|
||||||
|
|
||||||
|
# Log graph database configuration
|
||||||
|
graph_config = get_graph_config()
|
||||||
|
logger.info(f"Graph database: {graph_config.graph_database_provider}")
|
||||||
|
if graph_config.graph_database_provider == "kuzu":
|
||||||
|
logger.info(f"Graph database path: {graph_config.graph_file_path}")
|
||||||
|
else:
|
||||||
|
logger.info(f"Graph database URL: {graph_config.graph_database_url}")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Could not retrieve database configuration: {str(e)}")
|
||||||
|
|
||||||
|
|
||||||
def cleanup_old_logs(logs_dir, max_files):
|
def cleanup_old_logs(logs_dir, max_files):
|
||||||
"""
|
"""
|
||||||
Removes old log files, keeping only the most recent ones.
|
Removes old log files, keeping only the most recent ones.
|
||||||
|
|
@ -193,6 +248,9 @@ def setup_logging(log_level=None, name=None):
|
||||||
|
|
||||||
log_level = log_level if log_level else log_levels[os.getenv("LOG_LEVEL", "INFO")]
|
log_level = log_level if log_level else log_levels[os.getenv("LOG_LEVEL", "INFO")]
|
||||||
|
|
||||||
|
# Configure external library logging early to suppress verbose output
|
||||||
|
configure_external_library_logging()
|
||||||
|
|
||||||
def exception_handler(logger, method_name, event_dict):
|
def exception_handler(logger, method_name, event_dict):
|
||||||
"""Custom processor to handle uncaught exceptions."""
|
"""Custom processor to handle uncaught exceptions."""
|
||||||
# Check if there's an exc_info that needs to be processed
|
# Check if there's an exc_info that needs to be processed
|
||||||
|
|
@ -339,6 +397,9 @@ def setup_logging(log_level=None, name=None):
|
||||||
|
|
||||||
logger.info("Want to learn more? Visit the Cognee documentation: https://docs.cognee.ai")
|
logger.info("Want to learn more? Visit the Cognee documentation: https://docs.cognee.ai")
|
||||||
|
|
||||||
|
# Log database configuration
|
||||||
|
log_database_configuration(logger)
|
||||||
|
|
||||||
# Return the configured logger
|
# Return the configured logger
|
||||||
return logger
|
return logger
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
import pytest
|
import pytest
|
||||||
|
from pathlib import Path
|
||||||
from cognee.modules.data.processing.document_types.open_data_file import open_data_file
|
from cognee.modules.data.processing.document_types.open_data_file import open_data_file
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -29,7 +30,8 @@ class TestOpenDataFile:
|
||||||
temp_file_path = f.name
|
temp_file_path = f.name
|
||||||
|
|
||||||
try:
|
try:
|
||||||
file_url = f"file://{temp_file_path}"
|
# Use pathlib.Path.as_uri() for proper cross-platform file URL creation
|
||||||
|
file_url = Path(temp_file_path).as_uri()
|
||||||
with open_data_file(file_url, mode="r") as f:
|
with open_data_file(file_url, mode="r") as f:
|
||||||
content = f.read()
|
content = f.read()
|
||||||
assert content == test_content
|
assert content == test_content
|
||||||
|
|
@ -44,7 +46,8 @@ class TestOpenDataFile:
|
||||||
temp_file_path = f.name
|
temp_file_path = f.name
|
||||||
|
|
||||||
try:
|
try:
|
||||||
file_url = f"file://{temp_file_path}"
|
# Use pathlib.Path.as_uri() for proper cross-platform file URL creation
|
||||||
|
file_url = Path(temp_file_path).as_uri()
|
||||||
with open_data_file(file_url, mode="rb") as f:
|
with open_data_file(file_url, mode="rb") as f:
|
||||||
content = f.read()
|
content = f.read()
|
||||||
assert content == test_content.encode()
|
assert content == test_content.encode()
|
||||||
|
|
@ -61,7 +64,8 @@ class TestOpenDataFile:
|
||||||
temp_file_path = f.name
|
temp_file_path = f.name
|
||||||
|
|
||||||
try:
|
try:
|
||||||
file_url = f"file://{temp_file_path}"
|
# Use pathlib.Path.as_uri() for proper cross-platform file URL creation
|
||||||
|
file_url = Path(temp_file_path).as_uri()
|
||||||
with open_data_file(file_url, mode="r", encoding="utf-8") as f:
|
with open_data_file(file_url, mode="r", encoding="utf-8") as f:
|
||||||
content = f.read()
|
content = f.read()
|
||||||
assert content == test_content
|
assert content == test_content
|
||||||
|
|
@ -84,7 +88,9 @@ class TestOpenDataFile:
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Even if someone accidentally adds multiple file:// prefixes
|
# Even if someone accidentally adds multiple file:// prefixes
|
||||||
file_url = f"file://file://{temp_file_path}"
|
# Use proper file URL creation first
|
||||||
|
proper_file_url = Path(temp_file_path).as_uri()
|
||||||
|
file_url = f"file://{proper_file_url}"
|
||||||
with open_data_file(file_url, mode="r") as f:
|
with open_data_file(file_url, mode="r") as f:
|
||||||
content = f.read()
|
content = f.read()
|
||||||
# This should work because we only replace the first occurrence
|
# This should work because we only replace the first occurrence
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue