Merge branch 'main' of github.com:chinu0609/cognee
This commit is contained in:
commit
ce4a5c8311
39 changed files with 5507 additions and 4349 deletions
46
alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py
Normal file
46
alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py
Normal file
|
|
@ -0,0 +1,46 @@
|
||||||
|
"""add_last_accessed_to_data
|
||||||
|
|
||||||
|
Revision ID: e1ec1dcb50b6
|
||||||
|
Revises: 211ab850ef3d
|
||||||
|
Create Date: 2025-11-04 21:45:52.642322
|
||||||
|
|
||||||
|
"""
|
||||||
|
from typing import Sequence, Union
|
||||||
|
|
||||||
|
from alembic import op
|
||||||
|
import sqlalchemy as sa
|
||||||
|
|
||||||
|
|
||||||
|
# revision identifiers, used by Alembic.
|
||||||
|
revision: str = 'e1ec1dcb50b6'
|
||||||
|
down_revision: Union[str, None] = '211ab850ef3d'
|
||||||
|
branch_labels: Union[str, Sequence[str], None] = None
|
||||||
|
depends_on: Union[str, Sequence[str], None] = None
|
||||||
|
|
||||||
|
def _get_column(inspector, table, name, schema=None):
|
||||||
|
for col in inspector.get_columns(table, schema=schema):
|
||||||
|
if col["name"] == name:
|
||||||
|
return col
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Add the nullable ``last_accessed`` timestamp column to ``data``.

    The migration is idempotent: the column is created only when it does
    not already exist, and existing rows are then backfilled from
    ``created_at`` so they do not look "never accessed" after upgrading.
    """
    connection = op.get_bind()
    inspector = sa.inspect(connection)

    if _get_column(inspector, "data", "last_accessed") is None:
        op.add_column(
            "data",
            sa.Column("last_accessed", sa.DateTime(timezone=True), nullable=True),
        )
        # Optionally initialize with created_at values for existing records.
        op.execute("UPDATE data SET last_accessed = created_at")
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Drop the ``last_accessed`` column from ``data`` if it is present.

    Mirrors :func:`upgrade`: the drop only runs when the column actually
    exists, so the migration can be reversed safely on any schema state.
    """
    connection = op.get_bind()
    inspector = sa.inspect(connection)

    if _get_column(inspector, "data", "last_accessed") is not None:
        op.drop_column("data", "last_accessed")
|
||||||
|
|
@ -110,6 +110,47 @@ If you'd rather run cognee-mcp in a container, you have two options:
|
||||||
# For stdio transport (default)
|
# For stdio transport (default)
|
||||||
docker run -e TRANSPORT_MODE=stdio --env-file ./.env --rm -it cognee/cognee-mcp:main
|
docker run -e TRANSPORT_MODE=stdio --env-file ./.env --rm -it cognee/cognee-mcp:main
|
||||||
```
|
```
|
||||||
|
|
||||||
|
**Installing optional dependencies at runtime:**
|
||||||
|
|
||||||
|
You can install optional dependencies when running the container by setting the `EXTRAS` environment variable:
|
||||||
|
```bash
|
||||||
|
# Install a single optional dependency group at runtime
|
||||||
|
docker run \
|
||||||
|
-e TRANSPORT_MODE=http \
|
||||||
|
-e EXTRAS=aws \
|
||||||
|
--env-file ./.env \
|
||||||
|
-p 8000:8000 \
|
||||||
|
--rm -it cognee/cognee-mcp:main
|
||||||
|
|
||||||
|
# Install multiple optional dependency groups at runtime (comma-separated)
|
||||||
|
docker run \
|
||||||
|
-e TRANSPORT_MODE=sse \
|
||||||
|
-e EXTRAS=aws,postgres,neo4j \
|
||||||
|
--env-file ./.env \
|
||||||
|
-p 8000:8000 \
|
||||||
|
--rm -it cognee/cognee-mcp:main
|
||||||
|
```
|
||||||
|
|
||||||
|
**Available optional dependency groups:**
|
||||||
|
- `aws` - S3 storage support
|
||||||
|
- `postgres` / `postgres-binary` - PostgreSQL database support
|
||||||
|
- `neo4j` - Neo4j graph database support
|
||||||
|
- `neptune` - AWS Neptune support
|
||||||
|
- `chromadb` - ChromaDB vector store support
|
||||||
|
- `scraping` - Web scraping capabilities
|
||||||
|
- `distributed` - Modal distributed execution
|
||||||
|
- `langchain` - LangChain integration
|
||||||
|
- `llama-index` - LlamaIndex integration
|
||||||
|
- `anthropic` - Anthropic models
|
||||||
|
- `groq` - Groq models
|
||||||
|
- `mistral` - Mistral models
|
||||||
|
- `ollama` / `huggingface` - Local model support
|
||||||
|
- `docs` - Document processing
|
||||||
|
- `codegraph` - Code analysis
|
||||||
|
- `monitoring` - Sentry & Langfuse monitoring
|
||||||
|
- `redis` - Redis support
|
||||||
|
- And more (see [pyproject.toml](https://github.com/topoteretes/cognee/blob/main/pyproject.toml) for full list)
|
||||||
2. **Pull from Docker Hub** (no build required):
|
2. **Pull from Docker Hub** (no build required):
|
||||||
```bash
|
```bash
|
||||||
# With HTTP transport (recommended for web deployments)
|
# With HTTP transport (recommended for web deployments)
|
||||||
|
|
@ -119,6 +160,17 @@ If you'd rather run cognee-mcp in a container, you have two options:
|
||||||
# With stdio transport (default)
|
# With stdio transport (default)
|
||||||
docker run -e TRANSPORT_MODE=stdio --env-file ./.env --rm -it cognee/cognee-mcp:main
|
docker run -e TRANSPORT_MODE=stdio --env-file ./.env --rm -it cognee/cognee-mcp:main
|
||||||
```
|
```
|
||||||
|
|
||||||
|
**With runtime installation of optional dependencies:**
|
||||||
|
```bash
|
||||||
|
# Install optional dependencies from Docker Hub image
|
||||||
|
docker run \
|
||||||
|
-e TRANSPORT_MODE=http \
|
||||||
|
-e EXTRAS=aws,postgres \
|
||||||
|
--env-file ./.env \
|
||||||
|
-p 8000:8000 \
|
||||||
|
--rm -it cognee/cognee-mcp:main
|
||||||
|
```
|
||||||
|
|
||||||
### **Important: Docker vs Direct Usage**
|
### **Important: Docker vs Direct Usage**
|
||||||
**Docker uses environment variables**, not command line arguments:
|
**Docker uses environment variables**, not command line arguments:
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,42 @@ set -e # Exit on error
|
||||||
echo "Debug mode: $DEBUG"
|
echo "Debug mode: $DEBUG"
|
||||||
echo "Environment: $ENVIRONMENT"
|
echo "Environment: $ENVIRONMENT"
|
||||||
|
|
||||||
|
# Install optional dependencies if EXTRAS is set
|
||||||
|
if [ -n "$EXTRAS" ]; then
|
||||||
|
echo "Installing optional dependencies: $EXTRAS"
|
||||||
|
|
||||||
|
# Get the cognee version that's currently installed
|
||||||
|
COGNEE_VERSION=$(uv pip show cognee | grep "Version:" | awk '{print $2}')
|
||||||
|
echo "Current cognee version: $COGNEE_VERSION"
|
||||||
|
|
||||||
|
# Build the extras list for cognee
|
||||||
|
IFS=',' read -ra EXTRA_ARRAY <<< "$EXTRAS"
|
||||||
|
# Combine base extras from pyproject.toml with requested extras
|
||||||
|
ALL_EXTRAS=""
|
||||||
|
for extra in "${EXTRA_ARRAY[@]}"; do
|
||||||
|
# Trim whitespace
|
||||||
|
extra=$(echo "$extra" | xargs)
|
||||||
|
# Add to extras list if not already present
|
||||||
|
if [[ ! "$ALL_EXTRAS" =~ (^|,)"$extra"(,|$) ]]; then
|
||||||
|
if [ -z "$ALL_EXTRAS" ]; then
|
||||||
|
ALL_EXTRAS="$extra"
|
||||||
|
else
|
||||||
|
ALL_EXTRAS="$ALL_EXTRAS,$extra"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "Installing cognee with extras: $ALL_EXTRAS"
|
||||||
|
echo "Running: uv pip install 'cognee[$ALL_EXTRAS]==$COGNEE_VERSION'"
|
||||||
|
uv pip install "cognee[$ALL_EXTRAS]==$COGNEE_VERSION"
|
||||||
|
|
||||||
|
# Verify installation
|
||||||
|
echo ""
|
||||||
|
echo "✓ Optional dependencies installation completed"
|
||||||
|
else
|
||||||
|
echo "No optional dependencies specified"
|
||||||
|
fi
|
||||||
|
|
||||||
# Set default transport mode if not specified
|
# Set default transport mode if not specified
|
||||||
TRANSPORT_MODE=${TRANSPORT_MODE:-"stdio"}
|
TRANSPORT_MODE=${TRANSPORT_MODE:-"stdio"}
|
||||||
echo "Transport mode: $TRANSPORT_MODE"
|
echo "Transport mode: $TRANSPORT_MODE"
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,5 @@
|
||||||
import os
|
import os
|
||||||
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
from cognee.root_dir import get_absolute_path, ensure_absolute_path
|
from cognee.root_dir import get_absolute_path, ensure_absolute_path
|
||||||
|
|
@ -11,6 +12,9 @@ class BaseConfig(BaseSettings):
|
||||||
data_root_directory: str = get_absolute_path(".data_storage")
|
data_root_directory: str = get_absolute_path(".data_storage")
|
||||||
system_root_directory: str = get_absolute_path(".cognee_system")
|
system_root_directory: str = get_absolute_path(".cognee_system")
|
||||||
cache_root_directory: str = get_absolute_path(".cognee_cache")
|
cache_root_directory: str = get_absolute_path(".cognee_cache")
|
||||||
|
logs_root_directory: str = os.getenv(
|
||||||
|
"COGNEE_LOGS_DIR", str(os.path.join(os.path.dirname(os.path.dirname(__file__)), "logs"))
|
||||||
|
)
|
||||||
monitoring_tool: object = Observer.NONE
|
monitoring_tool: object = Observer.NONE
|
||||||
|
|
||||||
@pydantic.model_validator(mode="after")
|
@pydantic.model_validator(mode="after")
|
||||||
|
|
@ -30,6 +34,8 @@ class BaseConfig(BaseSettings):
|
||||||
# Require absolute paths for root directories
|
# Require absolute paths for root directories
|
||||||
self.data_root_directory = ensure_absolute_path(self.data_root_directory)
|
self.data_root_directory = ensure_absolute_path(self.data_root_directory)
|
||||||
self.system_root_directory = ensure_absolute_path(self.system_root_directory)
|
self.system_root_directory = ensure_absolute_path(self.system_root_directory)
|
||||||
|
self.logs_root_directory = ensure_absolute_path(self.logs_root_directory)
|
||||||
|
|
||||||
# Set monitoring tool based on available keys
|
# Set monitoring tool based on available keys
|
||||||
if self.langfuse_public_key and self.langfuse_secret_key:
|
if self.langfuse_public_key and self.langfuse_secret_key:
|
||||||
self.monitoring_tool = Observer.LANGFUSE
|
self.monitoring_tool = Observer.LANGFUSE
|
||||||
|
|
@ -49,6 +55,7 @@ class BaseConfig(BaseSettings):
|
||||||
"system_root_directory": self.system_root_directory,
|
"system_root_directory": self.system_root_directory,
|
||||||
"monitoring_tool": self.monitoring_tool,
|
"monitoring_tool": self.monitoring_tool,
|
||||||
"cache_root_directory": self.cache_root_directory,
|
"cache_root_directory": self.cache_root_directory,
|
||||||
|
"logs_root_directory": self.logs_root_directory,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -47,7 +47,7 @@ def create_vector_engine(
|
||||||
embedding_engine=embedding_engine,
|
embedding_engine=embedding_engine,
|
||||||
)
|
)
|
||||||
|
|
||||||
if vector_db_provider == "pgvector":
|
if vector_db_provider.lower() == "pgvector":
|
||||||
from cognee.infrastructure.databases.relational import get_relational_config
|
from cognee.infrastructure.databases.relational import get_relational_config
|
||||||
|
|
||||||
# Get configuration for postgres database
|
# Get configuration for postgres database
|
||||||
|
|
@ -78,7 +78,7 @@ def create_vector_engine(
|
||||||
embedding_engine,
|
embedding_engine,
|
||||||
)
|
)
|
||||||
|
|
||||||
elif vector_db_provider == "chromadb":
|
elif vector_db_provider.lower() == "chromadb":
|
||||||
try:
|
try:
|
||||||
import chromadb
|
import chromadb
|
||||||
except ImportError:
|
except ImportError:
|
||||||
|
|
@ -94,7 +94,7 @@ def create_vector_engine(
|
||||||
embedding_engine=embedding_engine,
|
embedding_engine=embedding_engine,
|
||||||
)
|
)
|
||||||
|
|
||||||
elif vector_db_provider == "neptune_analytics":
|
elif vector_db_provider.lower() == "neptune_analytics":
|
||||||
try:
|
try:
|
||||||
from langchain_aws import NeptuneAnalyticsGraph
|
from langchain_aws import NeptuneAnalyticsGraph
|
||||||
except ImportError:
|
except ImportError:
|
||||||
|
|
@ -122,7 +122,7 @@ def create_vector_engine(
|
||||||
embedding_engine=embedding_engine,
|
embedding_engine=embedding_engine,
|
||||||
)
|
)
|
||||||
|
|
||||||
else:
|
elif vector_db_provider.lower() == "lancedb":
|
||||||
from .lancedb.LanceDBAdapter import LanceDBAdapter
|
from .lancedb.LanceDBAdapter import LanceDBAdapter
|
||||||
|
|
||||||
return LanceDBAdapter(
|
return LanceDBAdapter(
|
||||||
|
|
@ -130,3 +130,9 @@ def create_vector_engine(
|
||||||
api_key=vector_db_key,
|
api_key=vector_db_key,
|
||||||
embedding_engine=embedding_engine,
|
embedding_engine=embedding_engine,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
else:
|
||||||
|
raise EnvironmentError(
|
||||||
|
f"Unsupported graph database provider: {vector_db_provider}. "
|
||||||
|
f"Supported providers are: {', '.join(list(supported_databases.keys()) + ['LanceDB', 'PGVector', 'neptune_analytics', 'ChromaDB'])}"
|
||||||
|
)
|
||||||
|
|
|
||||||
|
|
@ -43,6 +43,9 @@ class DataPoint(BaseModel):
|
||||||
updated_at: int = Field(
|
updated_at: int = Field(
|
||||||
default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000)
|
default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000)
|
||||||
)
|
)
|
||||||
|
last_accessed_at: int = Field(
|
||||||
|
default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000)
|
||||||
|
)
|
||||||
ontology_valid: bool = False
|
ontology_valid: bool = False
|
||||||
version: int = 1 # Default version
|
version: int = 1 # Default version
|
||||||
topological_rank: Optional[int] = 0
|
topological_rank: Optional[int] = 0
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
import io
|
import io
|
||||||
import os.path
|
import os.path
|
||||||
from typing import BinaryIO, TypedDict
|
from typing import BinaryIO, TypedDict, Optional
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from cognee.shared.logging_utils import get_logger
|
from cognee.shared.logging_utils import get_logger
|
||||||
|
|
@ -27,7 +27,7 @@ class FileMetadata(TypedDict):
|
||||||
file_size: int
|
file_size: int
|
||||||
|
|
||||||
|
|
||||||
async def get_file_metadata(file: BinaryIO) -> FileMetadata:
|
async def get_file_metadata(file: BinaryIO, name: Optional[str] = None) -> FileMetadata:
|
||||||
"""
|
"""
|
||||||
Retrieve metadata from a file object.
|
Retrieve metadata from a file object.
|
||||||
|
|
||||||
|
|
@ -53,7 +53,7 @@ async def get_file_metadata(file: BinaryIO) -> FileMetadata:
|
||||||
except io.UnsupportedOperation as error:
|
except io.UnsupportedOperation as error:
|
||||||
logger.error(f"Error retrieving content hash for file: {file.name} \n{str(error)}\n\n")
|
logger.error(f"Error retrieving content hash for file: {file.name} \n{str(error)}\n\n")
|
||||||
|
|
||||||
file_type = guess_file_type(file)
|
file_type = guess_file_type(file, name)
|
||||||
|
|
||||||
file_path = getattr(file, "name", None) or getattr(file, "full_name", None)
|
file_path = getattr(file, "name", None) or getattr(file, "full_name", None)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,9 @@
|
||||||
from typing import BinaryIO
|
import io
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import BinaryIO, Optional, Any
|
||||||
import filetype
|
import filetype
|
||||||
from .is_text_content import is_text_content
|
from tempfile import SpooledTemporaryFile
|
||||||
|
from filetype.types.base import Type
|
||||||
|
|
||||||
|
|
||||||
class FileTypeException(Exception):
|
class FileTypeException(Exception):
|
||||||
|
|
@ -22,90 +25,7 @@ class FileTypeException(Exception):
|
||||||
self.message = message
|
self.message = message
|
||||||
|
|
||||||
|
|
||||||
class TxtFileType(filetype.Type):
|
def guess_file_type(file: BinaryIO, name: Optional[str] = None) -> filetype.Type:
|
||||||
"""
|
|
||||||
Represents a text file type with specific MIME and extension properties.
|
|
||||||
|
|
||||||
Public methods:
|
|
||||||
- match: Determines whether a given buffer matches the text file type.
|
|
||||||
"""
|
|
||||||
|
|
||||||
MIME = "text/plain"
|
|
||||||
EXTENSION = "txt"
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
super(TxtFileType, self).__init__(mime=TxtFileType.MIME, extension=TxtFileType.EXTENSION)
|
|
||||||
|
|
||||||
def match(self, buf):
|
|
||||||
"""
|
|
||||||
Determine if the given buffer contains text content.
|
|
||||||
|
|
||||||
Parameters:
|
|
||||||
-----------
|
|
||||||
|
|
||||||
- buf: The buffer to check for text content.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
--------
|
|
||||||
|
|
||||||
Returns True if the buffer is identified as text content, otherwise False.
|
|
||||||
"""
|
|
||||||
return is_text_content(buf)
|
|
||||||
|
|
||||||
|
|
||||||
txt_file_type = TxtFileType()
|
|
||||||
|
|
||||||
filetype.add_type(txt_file_type)
|
|
||||||
|
|
||||||
|
|
||||||
class CustomPdfMatcher(filetype.Type):
|
|
||||||
"""
|
|
||||||
Match PDF file types based on MIME type and extension.
|
|
||||||
|
|
||||||
Public methods:
|
|
||||||
- match
|
|
||||||
|
|
||||||
Instance variables:
|
|
||||||
- MIME: The MIME type of the PDF.
|
|
||||||
- EXTENSION: The file extension of the PDF.
|
|
||||||
"""
|
|
||||||
|
|
||||||
MIME = "application/pdf"
|
|
||||||
EXTENSION = "pdf"
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
super(CustomPdfMatcher, self).__init__(
|
|
||||||
mime=CustomPdfMatcher.MIME, extension=CustomPdfMatcher.EXTENSION
|
|
||||||
)
|
|
||||||
|
|
||||||
def match(self, buf):
|
|
||||||
"""
|
|
||||||
Determine if the provided buffer is a PDF file.
|
|
||||||
|
|
||||||
This method checks for the presence of the PDF signature in the buffer.
|
|
||||||
|
|
||||||
Raises:
|
|
||||||
- TypeError: If the buffer is not of bytes type.
|
|
||||||
|
|
||||||
Parameters:
|
|
||||||
-----------
|
|
||||||
|
|
||||||
- buf: The buffer containing the data to be checked.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
--------
|
|
||||||
|
|
||||||
Returns True if the buffer contains a PDF signature, otherwise returns False.
|
|
||||||
"""
|
|
||||||
return b"PDF-" in buf
|
|
||||||
|
|
||||||
|
|
||||||
custom_pdf_matcher = CustomPdfMatcher()
|
|
||||||
|
|
||||||
filetype.add_type(custom_pdf_matcher)
|
|
||||||
|
|
||||||
|
|
||||||
def guess_file_type(file: BinaryIO) -> filetype.Type:
|
|
||||||
"""
|
"""
|
||||||
Guess the file type from the given binary file stream.
|
Guess the file type from the given binary file stream.
|
||||||
|
|
||||||
|
|
@ -122,12 +42,23 @@ def guess_file_type(file: BinaryIO) -> filetype.Type:
|
||||||
|
|
||||||
- filetype.Type: The guessed file type, represented as filetype.Type.
|
- filetype.Type: The guessed file type, represented as filetype.Type.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# Note: If file has .txt or .text extension, consider it a plain text file as filetype.guess may not detect it properly
|
||||||
|
# as it contains no magic number encoding
|
||||||
|
ext = None
|
||||||
|
if isinstance(file, str):
|
||||||
|
ext = Path(file).suffix
|
||||||
|
elif name is not None:
|
||||||
|
ext = Path(name).suffix
|
||||||
|
|
||||||
|
if ext in [".txt", ".text"]:
|
||||||
|
file_type = Type("text/plain", "txt")
|
||||||
|
return file_type
|
||||||
|
|
||||||
file_type = filetype.guess(file)
|
file_type = filetype.guess(file)
|
||||||
|
|
||||||
# If file type could not be determined consider it a plain text file as they don't have magic number encoding
|
# If file type could not be determined consider it a plain text file as they don't have magic number encoding
|
||||||
if file_type is None:
|
if file_type is None:
|
||||||
from filetype.types.base import Type
|
|
||||||
|
|
||||||
file_type = Type("text/plain", "txt")
|
file_type = Type("text/plain", "txt")
|
||||||
|
|
||||||
if file_type is None:
|
if file_type is None:
|
||||||
|
|
|
||||||
|
|
@ -1,15 +1,13 @@
|
||||||
For the purposes of identifying timestamps in a query, you are tasked with extracting relevant timestamps from the query.
|
You are tasked with identifying relevant time periods where the answer to a given query should be searched.
|
||||||
## Timestamp requirements
|
Current date is: `{{ time_now }}`. Determine relevant period(s) and return structured intervals.
|
||||||
- If the query contains an interval, extract both starts_at and ends_at properties
|
|
||||||
- If the query contains an instantaneous timestamp, starts_at and ends_at should be the same
|
Extraction rules:
|
||||||
- If the query is open-ended (before 2009 or after 2009), the corresponding undefined end of the interval should be None
|
|
||||||
-For example: "before 2009" -- starts_at: None, ends_at: 2009 or "after 2009" -- starts_at: 2009, ends_at: None
|
1. Query without specific timestamp: use the time period with starts_at set to None and ends_at set to now.
|
||||||
- Always put the date that comes first in time as starts_at and the timestamp that comes second in time as ends_at
|
2. Explicit time intervals: If the query specifies a range (e.g., from 2010 to 2020, between January and March 2023), extract both start and end dates. Always assign the earlier date to starts_at and the later date to ends_at.
|
||||||
- If starts_at or ends_at cannot be extracted both of them has to be None
|
3. Single timestamp: If the query refers to one specific moment (e.g., in 2015, on March 5, 2022), set starts_at and ends_at to that same timestamp.
|
||||||
## Output Format
|
4. Open-ended time references: For phrases such as "before X" or "after X", represent the unspecified side as None. For example: before 2009 → starts_at: None, ends_at: 2009; after 2009 → starts_at: 2009, ends_at: None.
|
||||||
Your reply should be a JSON: list of dictionaries with the following structure:
|
5. Current-time references ("now", "current", "today"): If the query explicitly refers to the present, set both starts_at and ends_at to now (the ingestion timestamp).
|
||||||
```python
|
6. "Who is" and "Who was" questions: These imply a general identity or biographical inquiry without a specific temporal scope. Set both starts_at and ends_at to None.
|
||||||
class QueryInterval(BaseModel):
|
7. Ordering rule: Always ensure the earlier date is assigned to starts_at and the later date to ends_at.
|
||||||
starts_at: Optional[Timestamp] = None
|
8. No temporal information: If no valid or inferable time reference is found, set both starts_at and ends_at to None.
|
||||||
ends_at: Optional[Timestamp] = None
|
|
||||||
```
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
import filetype
|
import filetype
|
||||||
from typing import Dict, List, Optional, Any
|
from typing import Dict, List, Optional, Any
|
||||||
from .LoaderInterface import LoaderInterface
|
from .LoaderInterface import LoaderInterface
|
||||||
|
from cognee.infrastructure.files.utils.guess_file_type import guess_file_type
|
||||||
from cognee.shared.logging_utils import get_logger
|
from cognee.shared.logging_utils import get_logger
|
||||||
|
|
||||||
logger = get_logger(__name__)
|
logger = get_logger(__name__)
|
||||||
|
|
@ -80,7 +81,7 @@ class LoaderEngine:
|
||||||
"""
|
"""
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
file_info = filetype.guess(file_path)
|
file_info = guess_file_type(file_path)
|
||||||
|
|
||||||
path_extension = Path(file_path).suffix.lstrip(".")
|
path_extension = Path(file_path).suffix.lstrip(".")
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -42,6 +42,7 @@ class AudioLoader(LoaderInterface):
|
||||||
"audio/wav",
|
"audio/wav",
|
||||||
"audio/amr",
|
"audio/amr",
|
||||||
"audio/aiff",
|
"audio/aiff",
|
||||||
|
"audio/x-wav",
|
||||||
]
|
]
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,7 @@
|
||||||
from typing import List, Union
|
from typing import List, Union
|
||||||
|
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
from datetime import datetime, timezone
|
||||||
from cognee.infrastructure.engine import DataPoint
|
from cognee.infrastructure.engine import DataPoint
|
||||||
from cognee.modules.data.processing.document_types import Document
|
from cognee.modules.data.processing.document_types import Document
|
||||||
from cognee.modules.engine.models import Entity
|
from cognee.modules.engine.models import Entity
|
||||||
|
|
@ -22,6 +24,7 @@ class DocumentChunk(DataPoint):
|
||||||
- cut_type: The type of cut that defined this chunk.
|
- cut_type: The type of cut that defined this chunk.
|
||||||
- is_part_of: The document to which this chunk belongs.
|
- is_part_of: The document to which this chunk belongs.
|
||||||
- contains: A list of entities or events contained within the chunk (default is None).
|
- contains: A list of entities or events contained within the chunk (default is None).
|
||||||
|
- last_accessed_at: The timestamp of the last time the chunk was accessed.
|
||||||
- metadata: A dictionary to hold meta information related to the chunk, including index
|
- metadata: A dictionary to hold meta information related to the chunk, including index
|
||||||
fields.
|
fields.
|
||||||
"""
|
"""
|
||||||
|
|
@ -32,5 +35,4 @@ class DocumentChunk(DataPoint):
|
||||||
cut_type: str
|
cut_type: str
|
||||||
is_part_of: Document
|
is_part_of: Document
|
||||||
contains: List[Union[Entity, Event]] = None
|
contains: List[Union[Entity, Event]] = None
|
||||||
|
|
||||||
metadata: dict = {"index_fields": ["text"]}
|
metadata: dict = {"index_fields": ["text"]}
|
||||||
|
|
|
||||||
|
|
@ -36,6 +36,7 @@ class Data(Base):
|
||||||
data_size = Column(Integer, nullable=True) # File size in bytes
|
data_size = Column(Integer, nullable=True) # File size in bytes
|
||||||
created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
|
created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
|
||||||
updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc))
|
updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc))
|
||||||
|
last_accessed = Column(DateTime(timezone=True), nullable=True)
|
||||||
|
|
||||||
datasets = relationship(
|
datasets = relationship(
|
||||||
"Dataset",
|
"Dataset",
|
||||||
|
|
|
||||||
|
|
@ -1,11 +1,11 @@
|
||||||
from cognee.infrastructure.engine import DataPoint
|
from cognee.infrastructure.engine import DataPoint
|
||||||
from cognee.modules.engine.models.EntityType import EntityType
|
from cognee.modules.engine.models.EntityType import EntityType
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
class Entity(DataPoint):
|
class Entity(DataPoint):
|
||||||
name: str
|
name: str
|
||||||
is_a: Optional[EntityType] = None
|
is_a: Optional[EntityType] = None
|
||||||
description: str
|
description: str
|
||||||
|
|
||||||
metadata: dict = {"index_fields": ["name"]}
|
metadata: dict = {"index_fields": ["name"]}
|
||||||
|
|
|
||||||
|
|
@ -5,3 +5,4 @@ from .retrieve_existing_edges import retrieve_existing_edges
|
||||||
from .convert_node_to_data_point import convert_node_to_data_point
|
from .convert_node_to_data_point import convert_node_to_data_point
|
||||||
from .deduplicate_nodes_and_edges import deduplicate_nodes_and_edges
|
from .deduplicate_nodes_and_edges import deduplicate_nodes_and_edges
|
||||||
from .resolve_edges_to_text import resolve_edges_to_text
|
from .resolve_edges_to_text import resolve_edges_to_text
|
||||||
|
from .get_entity_nodes_from_triplets import get_entity_nodes_from_triplets
|
||||||
|
|
|
||||||
13
cognee/modules/graph/utils/get_entity_nodes_from_triplets.py
Normal file
13
cognee/modules/graph/utils/get_entity_nodes_from_triplets.py
Normal file
|
|
@ -0,0 +1,13 @@
|
||||||
|
|
||||||
|
def get_entity_nodes_from_triplets(triplets):
    """Collect unique entity node ids from a sequence of triplets.

    Each triplet may expose ``node1`` and ``node2`` attributes; every
    distinct, truthy node is emitted exactly once, in first-seen order,
    as a dict of the form ``{"id": "<node id as string>"}``.

    Args:
        triplets: Iterable of triplet objects (attributes are optional).

    Returns:
        List of ``{"id": str}`` dicts, deduplicated by node id.
    """
    seen_ids = set()
    entity_nodes = []
    for triplet in triplets:
        for attr in ("node1", "node2"):
            node = getattr(triplet, attr, None)
            if node and node.id not in seen_ids:
                seen_ids.add(node.id)
                entity_nodes.append({"id": str(node.id)})
    return entity_nodes
|
||||||
|
|
@ -30,7 +30,7 @@ class BinaryData(IngestionData):
|
||||||
|
|
||||||
async def ensure_metadata(self):
|
async def ensure_metadata(self):
|
||||||
if self.metadata is None:
|
if self.metadata is None:
|
||||||
self.metadata = await get_file_metadata(self.data)
|
self.metadata = await get_file_metadata(self.data, name=self.name)
|
||||||
|
|
||||||
if self.metadata["name"] is None:
|
if self.metadata["name"] is None:
|
||||||
self.metadata["name"] = self.name
|
self.metadata["name"] = self.name
|
||||||
|
|
|
||||||
|
|
@ -21,7 +21,8 @@ def get_ontology_resolver_from_env(
|
||||||
Supported value: "rdflib".
|
Supported value: "rdflib".
|
||||||
matching_strategy (str): The matching strategy to apply.
|
matching_strategy (str): The matching strategy to apply.
|
||||||
Supported value: "fuzzy".
|
Supported value: "fuzzy".
|
||||||
ontology_file_path (str): Path to the ontology file required for the resolver.
|
ontology_file_path (str): Path to the ontology file(s) required for the resolver.
|
||||||
|
Can be a single path or comma-separated paths for multiple files.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
BaseOntologyResolver: An instance of the requested ontology resolver.
|
BaseOntologyResolver: An instance of the requested ontology resolver.
|
||||||
|
|
@ -31,8 +32,13 @@ def get_ontology_resolver_from_env(
|
||||||
or if required parameters are missing.
|
or if required parameters are missing.
|
||||||
"""
|
"""
|
||||||
if ontology_resolver == "rdflib" and matching_strategy == "fuzzy" and ontology_file_path:
|
if ontology_resolver == "rdflib" and matching_strategy == "fuzzy" and ontology_file_path:
|
||||||
|
if "," in ontology_file_path:
|
||||||
|
file_paths = [path.strip() for path in ontology_file_path.split(",")]
|
||||||
|
else:
|
||||||
|
file_paths = ontology_file_path
|
||||||
|
|
||||||
return RDFLibOntologyResolver(
|
return RDFLibOntologyResolver(
|
||||||
matching_strategy=FuzzyMatchingStrategy(), ontology_file=ontology_file_path
|
matching_strategy=FuzzyMatchingStrategy(), ontology_file=file_paths
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
raise EnvironmentError(
|
raise EnvironmentError(
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@ import os
|
||||||
import difflib
|
import difflib
|
||||||
from cognee.shared.logging_utils import get_logger
|
from cognee.shared.logging_utils import get_logger
|
||||||
from collections import deque
|
from collections import deque
|
||||||
from typing import List, Tuple, Dict, Optional, Any
|
from typing import List, Tuple, Dict, Optional, Any, Union
|
||||||
from rdflib import Graph, URIRef, RDF, RDFS, OWL
|
from rdflib import Graph, URIRef, RDF, RDFS, OWL
|
||||||
|
|
||||||
from cognee.modules.ontology.exceptions import (
|
from cognee.modules.ontology.exceptions import (
|
||||||
|
|
@ -26,22 +26,50 @@ class RDFLibOntologyResolver(BaseOntologyResolver):
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
ontology_file: Optional[str] = None,
|
ontology_file: Optional[Union[str, List[str]]] = None,
|
||||||
matching_strategy: Optional[MatchingStrategy] = None,
|
matching_strategy: Optional[MatchingStrategy] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
super().__init__(matching_strategy)
|
super().__init__(matching_strategy)
|
||||||
self.ontology_file = ontology_file
|
self.ontology_file = ontology_file
|
||||||
try:
|
try:
|
||||||
if ontology_file and os.path.exists(ontology_file):
|
files_to_load = []
|
||||||
|
if ontology_file is not None:
|
||||||
|
if isinstance(ontology_file, str):
|
||||||
|
files_to_load = [ontology_file]
|
||||||
|
elif isinstance(ontology_file, list):
|
||||||
|
files_to_load = ontology_file
|
||||||
|
else:
|
||||||
|
raise ValueError(
|
||||||
|
f"ontology_file must be a string, list of strings, or None. Got: {type(ontology_file)}"
|
||||||
|
)
|
||||||
|
|
||||||
|
if files_to_load:
|
||||||
self.graph = Graph()
|
self.graph = Graph()
|
||||||
self.graph.parse(ontology_file)
|
loaded_files = []
|
||||||
logger.info("Ontology loaded successfully from file: %s", ontology_file)
|
for file_path in files_to_load:
|
||||||
|
if os.path.exists(file_path):
|
||||||
|
self.graph.parse(file_path)
|
||||||
|
loaded_files.append(file_path)
|
||||||
|
logger.info("Ontology loaded successfully from file: %s", file_path)
|
||||||
|
else:
|
||||||
|
logger.warning(
|
||||||
|
"Ontology file '%s' not found. Skipping this file.",
|
||||||
|
file_path,
|
||||||
|
)
|
||||||
|
|
||||||
|
if not loaded_files:
|
||||||
|
logger.info(
|
||||||
|
"No valid ontology files found. No owl ontology will be attached to the graph."
|
||||||
|
)
|
||||||
|
self.graph = None
|
||||||
|
else:
|
||||||
|
logger.info("Total ontology files loaded: %d", len(loaded_files))
|
||||||
else:
|
else:
|
||||||
logger.info(
|
logger.info(
|
||||||
"Ontology file '%s' not found. No owl ontology will be attached to the graph.",
|
"No ontology file provided. No owl ontology will be attached to the graph."
|
||||||
ontology_file,
|
|
||||||
)
|
)
|
||||||
self.graph = None
|
self.graph = None
|
||||||
|
|
||||||
self.build_lookup()
|
self.build_lookup()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error("Failed to load ontology", exc_info=e)
|
logger.error("Failed to load ontology", exc_info=e)
|
||||||
|
|
|
||||||
|
|
@ -27,6 +27,7 @@ async def handle_task(
|
||||||
additional_properties={
|
additional_properties={
|
||||||
"task_name": running_task.executable.__name__,
|
"task_name": running_task.executable.__name__,
|
||||||
"cognee_version": cognee_version,
|
"cognee_version": cognee_version,
|
||||||
|
"tenant_id": str(user.tenant_id) if user.tenant_id else "Single User Tenant",
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -49,6 +50,7 @@ async def handle_task(
|
||||||
additional_properties={
|
additional_properties={
|
||||||
"task_name": running_task.executable.__name__,
|
"task_name": running_task.executable.__name__,
|
||||||
"cognee_version": cognee_version,
|
"cognee_version": cognee_version,
|
||||||
|
"tenant_id": str(user.tenant_id) if user.tenant_id else "Single User Tenant",
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
except Exception as error:
|
except Exception as error:
|
||||||
|
|
@ -62,6 +64,7 @@ async def handle_task(
|
||||||
additional_properties={
|
additional_properties={
|
||||||
"task_name": running_task.executable.__name__,
|
"task_name": running_task.executable.__name__,
|
||||||
"cognee_version": cognee_version,
|
"cognee_version": cognee_version,
|
||||||
|
"tenant_id": str(user.tenant_id) if user.tenant_id else "Single User Tenant",
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
raise error
|
raise error
|
||||||
|
|
|
||||||
|
|
@ -28,6 +28,7 @@ async def run_tasks_with_telemetry(
|
||||||
additional_properties={
|
additional_properties={
|
||||||
"pipeline_name": str(pipeline_name),
|
"pipeline_name": str(pipeline_name),
|
||||||
"cognee_version": cognee_version,
|
"cognee_version": cognee_version,
|
||||||
|
"tenant_id": str(user.tenant_id) if user.tenant_id else "Single User Tenant",
|
||||||
}
|
}
|
||||||
| config,
|
| config,
|
||||||
)
|
)
|
||||||
|
|
@ -42,6 +43,7 @@ async def run_tasks_with_telemetry(
|
||||||
additional_properties={
|
additional_properties={
|
||||||
"pipeline_name": str(pipeline_name),
|
"pipeline_name": str(pipeline_name),
|
||||||
"cognee_version": cognee_version,
|
"cognee_version": cognee_version,
|
||||||
|
"tenant_id": str(user.tenant_id) if user.tenant_id else "Single User Tenant",
|
||||||
}
|
}
|
||||||
| config,
|
| config,
|
||||||
)
|
)
|
||||||
|
|
@ -58,6 +60,7 @@ async def run_tasks_with_telemetry(
|
||||||
additional_properties={
|
additional_properties={
|
||||||
"pipeline_name": str(pipeline_name),
|
"pipeline_name": str(pipeline_name),
|
||||||
"cognee_version": cognee_version,
|
"cognee_version": cognee_version,
|
||||||
|
"tenant_id": str(user.tenant_id) if user.tenant_id else "Single User Tenant",
|
||||||
}
|
}
|
||||||
| config,
|
| config,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,11 @@
|
||||||
from typing import Any, Optional
|
from typing import Any, Optional
|
||||||
|
from cognee.modules.retrieval.utils.access_tracking import update_node_access_timestamps
|
||||||
from cognee.shared.logging_utils import get_logger
|
from cognee.shared.logging_utils import get_logger
|
||||||
from cognee.infrastructure.databases.vector import get_vector_engine
|
from cognee.infrastructure.databases.vector import get_vector_engine
|
||||||
from cognee.modules.retrieval.base_retriever import BaseRetriever
|
from cognee.modules.retrieval.base_retriever import BaseRetriever
|
||||||
from cognee.modules.retrieval.exceptions.exceptions import NoDataError
|
from cognee.modules.retrieval.exceptions.exceptions import NoDataError
|
||||||
from cognee.infrastructure.databases.vector.exceptions.exceptions import CollectionNotFoundError
|
from cognee.infrastructure.databases.vector.exceptions.exceptions import CollectionNotFoundError
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
logger = get_logger("ChunksRetriever")
|
logger = get_logger("ChunksRetriever")
|
||||||
|
|
||||||
|
|
@ -27,21 +28,16 @@ class ChunksRetriever(BaseRetriever):
|
||||||
):
|
):
|
||||||
self.top_k = top_k
|
self.top_k = top_k
|
||||||
|
|
||||||
async def get_context(self, query: str) -> Any:
|
async def get_context(self, query: str) -> Any:
|
||||||
"""
|
"""
|
||||||
Retrieves document chunks context based on the query.
|
Retrieves document chunks context based on the query.
|
||||||
|
|
||||||
Searches for document chunks relevant to the specified query using a vector engine.
|
Searches for document chunks relevant to the specified query using a vector engine.
|
||||||
Raises a NoDataError if no data is found in the system.
|
Raises a NoDataError if no data is found in the system.
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
-----------
|
-----------
|
||||||
|
|
||||||
- query (str): The query string to search for relevant document chunks.
|
- query (str): The query string to search for relevant document chunks.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
--------
|
--------
|
||||||
|
|
||||||
- Any: A list of document chunk payloads retrieved from the search.
|
- Any: A list of document chunk payloads retrieved from the search.
|
||||||
"""
|
"""
|
||||||
logger.info(
|
logger.info(
|
||||||
|
|
@ -53,13 +49,14 @@ class ChunksRetriever(BaseRetriever):
|
||||||
try:
|
try:
|
||||||
found_chunks = await vector_engine.search("DocumentChunk_text", query, limit=self.top_k)
|
found_chunks = await vector_engine.search("DocumentChunk_text", query, limit=self.top_k)
|
||||||
logger.info(f"Found {len(found_chunks)} chunks from vector search")
|
logger.info(f"Found {len(found_chunks)} chunks from vector search")
|
||||||
|
await update_node_access_timestamps(found_chunks)
|
||||||
|
|
||||||
except CollectionNotFoundError as error:
|
except CollectionNotFoundError as error:
|
||||||
logger.error("DocumentChunk_text collection not found in vector database")
|
logger.error("DocumentChunk_text collection not found in vector database")
|
||||||
raise NoDataError("No data found in the system, please add data first.") from error
|
raise NoDataError("No data found in the system, please add data first.") from error
|
||||||
|
|
||||||
chunk_payloads = [result.payload for result in found_chunks]
|
chunk_payloads = [result.payload for result in found_chunks]
|
||||||
logger.info(f"Returning {len(chunk_payloads)} chunk payloads")
|
logger.info(f"Returning {len(chunk_payloads)} chunk payloads")
|
||||||
return chunk_payloads
|
|
||||||
|
|
||||||
async def get_completion(
|
async def get_completion(
|
||||||
self, query: str, context: Optional[Any] = None, session_id: Optional[str] = None
|
self, query: str, context: Optional[Any] = None, session_id: Optional[str] = None
|
||||||
|
|
|
||||||
|
|
@ -8,6 +8,7 @@ from cognee.modules.retrieval.utils.session_cache import (
|
||||||
save_conversation_history,
|
save_conversation_history,
|
||||||
get_conversation_history,
|
get_conversation_history,
|
||||||
)
|
)
|
||||||
|
from cognee.modules.retrieval.utils.access_tracking import update_node_access_timestamps
|
||||||
from cognee.modules.retrieval.base_retriever import BaseRetriever
|
from cognee.modules.retrieval.base_retriever import BaseRetriever
|
||||||
from cognee.modules.retrieval.exceptions.exceptions import NoDataError
|
from cognee.modules.retrieval.exceptions.exceptions import NoDataError
|
||||||
from cognee.infrastructure.databases.vector.exceptions import CollectionNotFoundError
|
from cognee.infrastructure.databases.vector.exceptions import CollectionNotFoundError
|
||||||
|
|
@ -65,7 +66,7 @@ class CompletionRetriever(BaseRetriever):
|
||||||
|
|
||||||
if len(found_chunks) == 0:
|
if len(found_chunks) == 0:
|
||||||
return ""
|
return ""
|
||||||
|
await update_node_access_timestamps(found_chunks)
|
||||||
# Combine all chunks text returned from vector search (number of chunks is determined by top_k
|
# Combine all chunks text returned from vector search (number of chunks is determined by top_k
|
||||||
chunks_payload = [found_chunk.payload["text"] for found_chunk in found_chunks]
|
chunks_payload = [found_chunk.payload["text"] for found_chunk in found_chunks]
|
||||||
combined_context = "\n".join(chunks_payload)
|
combined_context = "\n".join(chunks_payload)
|
||||||
|
|
|
||||||
|
|
@ -16,11 +16,13 @@ from cognee.modules.retrieval.utils.session_cache import (
|
||||||
)
|
)
|
||||||
from cognee.shared.logging_utils import get_logger
|
from cognee.shared.logging_utils import get_logger
|
||||||
from cognee.modules.retrieval.utils.extract_uuid_from_node import extract_uuid_from_node
|
from cognee.modules.retrieval.utils.extract_uuid_from_node import extract_uuid_from_node
|
||||||
|
from cognee.modules.retrieval.utils.access_tracking import update_node_access_timestamps
|
||||||
from cognee.modules.retrieval.utils.models import CogneeUserInteraction
|
from cognee.modules.retrieval.utils.models import CogneeUserInteraction
|
||||||
from cognee.modules.engine.models.node_set import NodeSet
|
from cognee.modules.engine.models.node_set import NodeSet
|
||||||
from cognee.infrastructure.databases.graph import get_graph_engine
|
from cognee.infrastructure.databases.graph import get_graph_engine
|
||||||
from cognee.context_global_variables import session_user
|
from cognee.context_global_variables import session_user
|
||||||
from cognee.infrastructure.databases.cache.config import CacheConfig
|
from cognee.infrastructure.databases.cache.config import CacheConfig
|
||||||
|
from cognee.modules.graph.utils import get_entity_nodes_from_triplets
|
||||||
|
|
||||||
logger = get_logger("GraphCompletionRetriever")
|
logger = get_logger("GraphCompletionRetriever")
|
||||||
|
|
||||||
|
|
@ -139,6 +141,9 @@ class GraphCompletionRetriever(BaseGraphRetriever):
|
||||||
|
|
||||||
# context = await self.resolve_edges_to_text(triplets)
|
# context = await self.resolve_edges_to_text(triplets)
|
||||||
|
|
||||||
|
entity_nodes = get_entity_nodes_from_triplets(triplets)
|
||||||
|
|
||||||
|
await update_node_access_timestamps(entity_nodes)
|
||||||
return triplets
|
return triplets
|
||||||
|
|
||||||
async def get_completion(
|
async def get_completion(
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,7 @@ from cognee.shared.logging_utils import get_logger
|
||||||
from cognee.infrastructure.databases.vector import get_vector_engine
|
from cognee.infrastructure.databases.vector import get_vector_engine
|
||||||
from cognee.modules.retrieval.base_retriever import BaseRetriever
|
from cognee.modules.retrieval.base_retriever import BaseRetriever
|
||||||
from cognee.modules.retrieval.exceptions.exceptions import NoDataError
|
from cognee.modules.retrieval.exceptions.exceptions import NoDataError
|
||||||
|
from cognee.modules.retrieval.utils.access_tracking import update_node_access_timestamps
|
||||||
from cognee.infrastructure.databases.vector.exceptions.exceptions import CollectionNotFoundError
|
from cognee.infrastructure.databases.vector.exceptions.exceptions import CollectionNotFoundError
|
||||||
|
|
||||||
logger = get_logger("SummariesRetriever")
|
logger = get_logger("SummariesRetriever")
|
||||||
|
|
@ -54,6 +55,9 @@ class SummariesRetriever(BaseRetriever):
|
||||||
"TextSummary_text", query, limit=self.top_k
|
"TextSummary_text", query, limit=self.top_k
|
||||||
)
|
)
|
||||||
logger.info(f"Found {len(summaries_results)} summaries from vector search")
|
logger.info(f"Found {len(summaries_results)} summaries from vector search")
|
||||||
|
|
||||||
|
await update_node_access_timestamps(summaries_results)
|
||||||
|
|
||||||
except CollectionNotFoundError as error:
|
except CollectionNotFoundError as error:
|
||||||
logger.error("TextSummary_text collection not found in vector database")
|
logger.error("TextSummary_text collection not found in vector database")
|
||||||
raise NoDataError("No data found in the system, please add data first.") from error
|
raise NoDataError("No data found in the system, please add data first.") from error
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
import os
|
import os
|
||||||
import asyncio
|
import asyncio
|
||||||
from typing import Any, Optional, List, Type
|
from typing import Any, Optional, List, Type
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
from operator import itemgetter
|
from operator import itemgetter
|
||||||
from cognee.infrastructure.databases.vector import get_vector_engine
|
from cognee.infrastructure.databases.vector import get_vector_engine
|
||||||
|
|
@ -79,7 +79,11 @@ class TemporalRetriever(GraphCompletionRetriever):
|
||||||
else:
|
else:
|
||||||
base_directory = None
|
base_directory = None
|
||||||
|
|
||||||
system_prompt = render_prompt(prompt_path, {}, base_directory=base_directory)
|
time_now = datetime.now().strftime("%d-%m-%Y")
|
||||||
|
|
||||||
|
system_prompt = render_prompt(
|
||||||
|
prompt_path, {"time_now": time_now}, base_directory=base_directory
|
||||||
|
)
|
||||||
|
|
||||||
interval = await LLMGateway.acreate_structured_output(query, system_prompt, QueryInterval)
|
interval = await LLMGateway.acreate_structured_output(query, system_prompt, QueryInterval)
|
||||||
|
|
||||||
|
|
@ -108,8 +112,6 @@ class TemporalRetriever(GraphCompletionRetriever):
|
||||||
|
|
||||||
graph_engine = await get_graph_engine()
|
graph_engine = await get_graph_engine()
|
||||||
|
|
||||||
triplets = []
|
|
||||||
|
|
||||||
if time_from and time_to:
|
if time_from and time_to:
|
||||||
ids = await graph_engine.collect_time_ids(time_from=time_from, time_to=time_to)
|
ids = await graph_engine.collect_time_ids(time_from=time_from, time_to=time_to)
|
||||||
elif time_from:
|
elif time_from:
|
||||||
|
|
|
||||||
95
cognee/modules/retrieval/utils/access_tracking.py
Normal file
95
cognee/modules/retrieval/utils/access_tracking.py
Normal file
|
|
@ -0,0 +1,95 @@
|
||||||
|
"""Utilities for tracking data access in retrievers."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import List, Any
|
||||||
|
from uuid import UUID
|
||||||
|
|
||||||
|
from cognee.infrastructure.databases.graph import get_graph_engine
|
||||||
|
from cognee.infrastructure.databases.relational import get_relational_engine
|
||||||
|
from cognee.modules.data.models import Data
|
||||||
|
from cognee.shared.logging_utils import get_logger
|
||||||
|
from sqlalchemy import update
|
||||||
|
|
||||||
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
async def update_node_access_timestamps(items: List[Any]):
|
||||||
|
"""
|
||||||
|
Update last_accessed_at for nodes in graph database and corresponding Data records in SQL.
|
||||||
|
|
||||||
|
This function:
|
||||||
|
1. Updates last_accessed_at in the graph database nodes (in properties JSON)
|
||||||
|
2. Traverses to find origin TextDocument nodes (without hardcoded relationship names)
|
||||||
|
3. Updates last_accessed in the SQL Data table for those documents
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
items : List[Any]
|
||||||
|
List of items with payload containing 'id' field (from vector search results)
|
||||||
|
"""
|
||||||
|
if not items:
|
||||||
|
return
|
||||||
|
|
||||||
|
graph_engine = await get_graph_engine()
|
||||||
|
timestamp_ms = int(datetime.now(timezone.utc).timestamp() * 1000)
|
||||||
|
timestamp_dt = datetime.now(timezone.utc)
|
||||||
|
|
||||||
|
# Extract node IDs
|
||||||
|
node_ids = []
|
||||||
|
for item in items:
|
||||||
|
item_id = item.payload.get("id") if hasattr(item, 'payload') else item.get("id")
|
||||||
|
if item_id:
|
||||||
|
node_ids.append(str(item_id))
|
||||||
|
|
||||||
|
if not node_ids:
|
||||||
|
return
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Step 1: Batch update graph nodes
|
||||||
|
for node_id in node_ids:
|
||||||
|
result = await graph_engine.query(
|
||||||
|
"MATCH (n:Node {id: $id}) RETURN n.properties",
|
||||||
|
{"id": node_id}
|
||||||
|
)
|
||||||
|
|
||||||
|
if result and result[0]:
|
||||||
|
props = json.loads(result[0][0]) if result[0][0] else {}
|
||||||
|
props["last_accessed_at"] = timestamp_ms
|
||||||
|
|
||||||
|
await graph_engine.query(
|
||||||
|
"MATCH (n:Node {id: $id}) SET n.properties = $props",
|
||||||
|
{"id": node_id, "props": json.dumps(props)}
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.debug(f"Updated access timestamps for {len(node_ids)} graph nodes")
|
||||||
|
|
||||||
|
# Step 2: Find origin TextDocument nodes (without hardcoded relationship names)
|
||||||
|
origin_query = """
|
||||||
|
UNWIND $node_ids AS node_id
|
||||||
|
MATCH (chunk:Node {id: node_id})-[e:EDGE]-(doc:Node)
|
||||||
|
WHERE chunk.type = 'DocumentChunk' AND doc.type IN ['TextDocument', 'Document']
|
||||||
|
RETURN DISTINCT doc.id
|
||||||
|
"""
|
||||||
|
|
||||||
|
result = await graph_engine.query(origin_query, {"node_ids": node_ids})
|
||||||
|
|
||||||
|
# Extract and deduplicate document IDs
|
||||||
|
doc_ids = list(set([row[0] for row in result if row and row[0]])) if result else []
|
||||||
|
|
||||||
|
# Step 3: Update SQL Data table
|
||||||
|
if doc_ids:
|
||||||
|
db_engine = get_relational_engine()
|
||||||
|
async with db_engine.get_async_session() as session:
|
||||||
|
stmt = update(Data).where(
|
||||||
|
Data.id.in_([UUID(doc_id) for doc_id in doc_ids])
|
||||||
|
).values(last_accessed=timestamp_dt)
|
||||||
|
|
||||||
|
await session.execute(stmt)
|
||||||
|
await session.commit()
|
||||||
|
|
||||||
|
logger.debug(f"Updated last_accessed for {len(doc_ids)} Data records in SQL")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to update timestamps: {e}")
|
||||||
|
raise
|
||||||
|
|
@ -67,7 +67,10 @@ async def search(
|
||||||
send_telemetry(
|
send_telemetry(
|
||||||
"cognee.search EXECUTION STARTED",
|
"cognee.search EXECUTION STARTED",
|
||||||
user.id,
|
user.id,
|
||||||
additional_properties={"cognee_version": cognee_version},
|
additional_properties={
|
||||||
|
"cognee_version": cognee_version,
|
||||||
|
"tenant_id": str(user.tenant_id) if user.tenant_id else "Single User Tenant",
|
||||||
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
# Use search function filtered by permissions if access control is enabled
|
# Use search function filtered by permissions if access control is enabled
|
||||||
|
|
@ -108,7 +111,10 @@ async def search(
|
||||||
send_telemetry(
|
send_telemetry(
|
||||||
"cognee.search EXECUTION COMPLETED",
|
"cognee.search EXECUTION COMPLETED",
|
||||||
user.id,
|
user.id,
|
||||||
additional_properties={"cognee_version": cognee_version},
|
additional_properties={
|
||||||
|
"cognee_version": cognee_version,
|
||||||
|
"tenant_id": str(user.tenant_id) if user.tenant_id else "Single User Tenant",
|
||||||
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
await log_result(
|
await log_result(
|
||||||
|
|
|
||||||
|
|
@ -16,17 +16,17 @@ async def cognee_network_visualization(graph_data, destination_file_path: str =
|
||||||
|
|
||||||
nodes_list = []
|
nodes_list = []
|
||||||
color_map = {
|
color_map = {
|
||||||
"Entity": "#f47710",
|
"Entity": "#5C10F4",
|
||||||
"EntityType": "#6510f4",
|
"EntityType": "#A550FF",
|
||||||
"DocumentChunk": "#801212",
|
"DocumentChunk": "#0DFF00",
|
||||||
"TextSummary": "#1077f4",
|
"TextSummary": "#5C10F4",
|
||||||
"TableRow": "#f47710",
|
"TableRow": "#A550FF",
|
||||||
"TableType": "#6510f4",
|
"TableType": "#5C10F4",
|
||||||
"ColumnValue": "#13613a",
|
"ColumnValue": "#757470",
|
||||||
"SchemaTable": "#f47710",
|
"SchemaTable": "#A550FF",
|
||||||
"DatabaseSchema": "#6510f4",
|
"DatabaseSchema": "#5C10F4",
|
||||||
"SchemaRelationship": "#13613a",
|
"SchemaRelationship": "#323332",
|
||||||
"default": "#D3D3D3",
|
"default": "#D8D8D8",
|
||||||
}
|
}
|
||||||
|
|
||||||
for node_id, node_info in nodes_data:
|
for node_id, node_info in nodes_data:
|
||||||
|
|
@ -98,16 +98,19 @@ async def cognee_network_visualization(graph_data, destination_file_path: str =
|
||||||
<head>
|
<head>
|
||||||
<meta charset="utf-8">
|
<meta charset="utf-8">
|
||||||
<script src="https://d3js.org/d3.v5.min.js"></script>
|
<script src="https://d3js.org/d3.v5.min.js"></script>
|
||||||
|
<script src="https://d3js.org/d3-contour.v1.min.js"></script>
|
||||||
<style>
|
<style>
|
||||||
body, html { margin: 0; padding: 0; width: 100%; height: 100%; overflow: hidden; background: linear-gradient(90deg, #101010, #1a1a2e); color: white; font-family: 'Inter', sans-serif; }
|
body, html { margin: 0; padding: 0; width: 100%; height: 100%; overflow: hidden; background: linear-gradient(90deg, #101010, #1a1a2e); color: white; font-family: 'Inter', sans-serif; }
|
||||||
|
|
||||||
svg { width: 100vw; height: 100vh; display: block; }
|
svg { width: 100vw; height: 100vh; display: block; }
|
||||||
.links line { stroke: rgba(255, 255, 255, 0.4); stroke-width: 2px; }
|
.links line { stroke: rgba(160, 160, 160, 0.25); stroke-width: 1.5px; stroke-linecap: round; }
|
||||||
.links line.weighted { stroke: rgba(255, 215, 0, 0.7); }
|
.links line.weighted { stroke: rgba(255, 215, 0, 0.4); }
|
||||||
.links line.multi-weighted { stroke: rgba(0, 255, 127, 0.8); }
|
.links line.multi-weighted { stroke: rgba(0, 255, 127, 0.45); }
|
||||||
.nodes circle { stroke: white; stroke-width: 0.5px; filter: drop-shadow(0 0 5px rgba(255,255,255,0.3)); }
|
.nodes circle { stroke: white; stroke-width: 0.5px; }
|
||||||
.node-label { font-size: 5px; font-weight: bold; fill: white; text-anchor: middle; dominant-baseline: middle; font-family: 'Inter', sans-serif; pointer-events: none; }
|
.node-label { font-size: 5px; font-weight: bold; fill: #F4F4F4; text-anchor: middle; dominant-baseline: middle; font-family: 'Inter', sans-serif; pointer-events: none; }
|
||||||
.edge-label { font-size: 3px; fill: rgba(255, 255, 255, 0.7); text-anchor: middle; dominant-baseline: middle; font-family: 'Inter', sans-serif; pointer-events: none; }
|
.edge-label { font-size: 3px; fill: #F4F4F4; text-anchor: middle; dominant-baseline: middle; font-family: 'Inter', sans-serif; pointer-events: none; paint-order: stroke; stroke: rgba(50,51,50,0.75); stroke-width: 1px; }
|
||||||
|
|
||||||
|
.density path { mix-blend-mode: screen; }
|
||||||
|
|
||||||
.tooltip {
|
.tooltip {
|
||||||
position: absolute;
|
position: absolute;
|
||||||
|
|
@ -125,11 +128,32 @@ async def cognee_network_visualization(graph_data, destination_file_path: str =
|
||||||
max-width: 300px;
|
max-width: 300px;
|
||||||
word-wrap: break-word;
|
word-wrap: break-word;
|
||||||
}
|
}
|
||||||
|
#info-panel {
|
||||||
|
position: fixed;
|
||||||
|
left: 12px;
|
||||||
|
top: 12px;
|
||||||
|
width: 340px;
|
||||||
|
max-height: calc(100vh - 24px);
|
||||||
|
overflow: auto;
|
||||||
|
background: rgba(50, 51, 50, 0.7);
|
||||||
|
backdrop-filter: blur(6px);
|
||||||
|
border: 1px solid rgba(216, 216, 216, 0.35);
|
||||||
|
border-radius: 8px;
|
||||||
|
color: #F4F4F4;
|
||||||
|
padding: 12px 14px;
|
||||||
|
z-index: 1100;
|
||||||
|
}
|
||||||
|
#info-panel h3 { margin: 0 0 8px 0; font-size: 14px; color: #F4F4F4; }
|
||||||
|
#info-panel .kv { font-size: 12px; line-height: 1.4; }
|
||||||
|
#info-panel .kv .k { color: #D8D8D8; }
|
||||||
|
#info-panel .kv .v { color: #F4F4F4; }
|
||||||
|
#info-panel .placeholder { opacity: 0.7; font-size: 12px; }
|
||||||
</style>
|
</style>
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
<svg></svg>
|
<svg></svg>
|
||||||
<div class="tooltip" id="tooltip"></div>
|
<div class="tooltip" id="tooltip"></div>
|
||||||
|
<div id="info-panel"><div class="placeholder">Hover a node or edge to inspect details</div></div>
|
||||||
<script>
|
<script>
|
||||||
var nodes = {nodes};
|
var nodes = {nodes};
|
||||||
var links = {links};
|
var links = {links};
|
||||||
|
|
@ -140,19 +164,141 @@ async def cognee_network_visualization(graph_data, destination_file_path: str =
|
||||||
|
|
||||||
var container = svg.append("g");
|
var container = svg.append("g");
|
||||||
var tooltip = d3.select("#tooltip");
|
var tooltip = d3.select("#tooltip");
|
||||||
|
var infoPanel = d3.select('#info-panel');
|
||||||
|
|
||||||
|
function renderInfo(title, entries){
|
||||||
|
function esc(s){ return String(s).replace(/&/g,'&').replace(/</g,'<').replace(/>/g,'>'); }
|
||||||
|
var html = '<h3>' + esc(title) + '</h3>';
|
||||||
|
html += '<div class="kv">';
|
||||||
|
entries.forEach(function(e){
|
||||||
|
html += '<div><span class="k">' + esc(e.k) + ':</span> <span class="v">' + esc(e.v) + '</span></div>';
|
||||||
|
});
|
||||||
|
html += '</div>';
|
||||||
|
infoPanel.html(html);
|
||||||
|
}
|
||||||
|
function pickDescription(obj){
|
||||||
|
if (!obj) return null;
|
||||||
|
var keys = ['description','summary','text','content'];
|
||||||
|
for (var i=0; i<keys.length; i++){
|
||||||
|
var v = obj[keys[i]];
|
||||||
|
if (typeof v === 'string' && v.trim()) return v.trim();
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
function truncate(s, n){ if (!s) return s; return s.length > n ? (s.slice(0, n) + '…') : s; }
|
||||||
|
function renderNodeInfo(n){
|
||||||
|
var entries = [];
|
||||||
|
if (n.name) entries.push({k:'Name', v: n.name});
|
||||||
|
if (n.type) entries.push({k:'Type', v: n.type});
|
||||||
|
if (n.id) entries.push({k:'ID', v: n.id});
|
||||||
|
var desc = pickDescription(n) || pickDescription(n.properties);
|
||||||
|
if (desc) entries.push({k:'Description', v: truncate(desc.replace(/\s+/g,' ').trim(), 280)});
|
||||||
|
if (n.properties) {
|
||||||
|
Object.keys(n.properties).slice(0, 12).forEach(function(key){
|
||||||
|
var v = n.properties[key];
|
||||||
|
if (v !== undefined && v !== null && typeof v !== 'object') entries.push({k: key, v: String(v)});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
renderInfo(n.name || 'Node', entries);
|
||||||
|
}
|
||||||
|
function renderEdgeInfo(e){
|
||||||
|
var entries = [];
|
||||||
|
if (e.relation) entries.push({k:'Relation', v: e.relation});
|
||||||
|
if (e.weight !== undefined && e.weight !== null) entries.push({k:'Weight', v: e.weight});
|
||||||
|
if (e.all_weights && Object.keys(e.all_weights).length){
|
||||||
|
Object.keys(e.all_weights).slice(0, 8).forEach(function(k){ entries.push({k: 'w.'+k, v: e.all_weights[k]}); });
|
||||||
|
}
|
||||||
|
if (e.relationship_type) entries.push({k:'Type', v: e.relationship_type});
|
||||||
|
var edesc = pickDescription(e.edge_info);
|
||||||
|
if (edesc) entries.push({k:'Description', v: truncate(edesc.replace(/\s+/g,' ').trim(), 280)});
|
||||||
|
renderInfo('Edge', entries);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Basic runtime diagnostics
|
||||||
|
console.log('[Cognee Visualization] nodes:', nodes ? nodes.length : 0, 'links:', links ? links.length : 0);
|
||||||
|
window.addEventListener('error', function(e){
|
||||||
|
try {
|
||||||
|
tooltip.html('<strong>Error:</strong> ' + e.message)
|
||||||
|
.style('left', '12px')
|
||||||
|
.style('top', '12px')
|
||||||
|
.style('opacity', 1);
|
||||||
|
} catch(_) {}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Normalize node IDs and link endpoints for robustness
|
||||||
|
function resolveId(d){ return (d && (d.id || d.node_id || d.uuid || d.external_id || d.name)) || undefined; }
|
||||||
|
if (Array.isArray(nodes)) {
|
||||||
|
nodes.forEach(function(n){ var id = resolveId(n); if (id !== undefined) n.id = id; });
|
||||||
|
}
|
||||||
|
if (Array.isArray(links)) {
|
||||||
|
links.forEach(function(l){
|
||||||
|
if (typeof l.source === 'object') l.source = resolveId(l.source);
|
||||||
|
if (typeof l.target === 'object') l.target = resolveId(l.target);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!nodes || nodes.length === 0) {
|
||||||
|
container.append('text')
|
||||||
|
.attr('x', width / 2)
|
||||||
|
.attr('y', height / 2)
|
||||||
|
.attr('fill', '#fff')
|
||||||
|
.attr('font-size', 14)
|
||||||
|
.attr('text-anchor', 'middle')
|
||||||
|
.text('No graph data available');
|
||||||
|
}
|
||||||
|
|
||||||
|
// Visual defs - reusable glow
|
||||||
|
var defs = svg.append("defs");
|
||||||
|
var glow = defs.append("filter").attr("id", "glow")
|
||||||
|
.attr("x", "-30%")
|
||||||
|
.attr("y", "-30%")
|
||||||
|
.attr("width", "160%")
|
||||||
|
.attr("height", "160%");
|
||||||
|
glow.append("feGaussianBlur").attr("stdDeviation", 8).attr("result", "coloredBlur");
|
||||||
|
var feMerge = glow.append("feMerge");
|
||||||
|
feMerge.append("feMergeNode").attr("in", "coloredBlur");
|
||||||
|
feMerge.append("feMergeNode").attr("in", "SourceGraphic");
|
||||||
|
|
||||||
|
// Stronger glow for hovered adjacency
|
||||||
|
var glowStrong = defs.append("filter").attr("id", "glow-strong")
|
||||||
|
.attr("x", "-40%")
|
||||||
|
.attr("y", "-40%")
|
||||||
|
.attr("width", "180%")
|
||||||
|
.attr("height", "180%");
|
||||||
|
glowStrong.append("feGaussianBlur").attr("stdDeviation", 14).attr("result", "coloredBlur");
|
||||||
|
var feMerge2 = glowStrong.append("feMerge");
|
||||||
|
feMerge2.append("feMergeNode").attr("in", "coloredBlur");
|
||||||
|
feMerge2.append("feMergeNode").attr("in", "SourceGraphic");
|
||||||
|
|
||||||
|
var currentTransform = d3.zoomIdentity;
|
||||||
|
var densityZoomTimer = null;
|
||||||
|
var isInteracting = false;
|
||||||
|
var labelBaseSize = 10;
|
||||||
|
function getGroupKey(d){ return d && (d.type || d.category || d.group || d.color) || 'default'; }
|
||||||
|
|
||||||
var simulation = d3.forceSimulation(nodes)
|
var simulation = d3.forceSimulation(nodes)
|
||||||
.force("link", d3.forceLink(links).id(d => d.id).strength(0.1))
|
.force("link", d3.forceLink(links).id(function(d){ return d.id; }).distance(100).strength(0.2))
|
||||||
.force("charge", d3.forceManyBody().strength(-275))
|
.force("charge", d3.forceManyBody().strength(-180))
|
||||||
|
.force("collide", d3.forceCollide().radius(16).iterations(2))
|
||||||
.force("center", d3.forceCenter(width / 2, height / 2))
|
.force("center", d3.forceCenter(width / 2, height / 2))
|
||||||
.force("x", d3.forceX().strength(0.1).x(width / 2))
|
.force("x", d3.forceX().strength(0.06).x(width / 2))
|
||||||
.force("y", d3.forceY().strength(0.1).y(height / 2));
|
.force("y", d3.forceY().strength(0.06).y(height / 2))
|
||||||
|
.alphaDecay(0.06)
|
||||||
|
.velocityDecay(0.6);
|
||||||
|
|
||||||
|
// Density layer (sibling of container to avoid double transforms)
|
||||||
|
var densityLayer = svg.append("g")
|
||||||
|
.attr("class", "density")
|
||||||
|
.style("pointer-events", "none");
|
||||||
|
if (densityLayer.lower) densityLayer.lower();
|
||||||
|
|
||||||
var link = container.append("g")
|
var link = container.append("g")
|
||||||
.attr("class", "links")
|
.attr("class", "links")
|
||||||
.selectAll("line")
|
.selectAll("line")
|
||||||
.data(links)
|
.data(links)
|
||||||
.enter().append("line")
|
.enter().append("line")
|
||||||
|
.style("opacity", 0)
|
||||||
|
.style("pointer-events", "none")
|
||||||
.attr("stroke-width", d => {
|
.attr("stroke-width", d => {
|
||||||
if (d.weight) return Math.max(2, d.weight * 5);
|
if (d.weight) return Math.max(2, d.weight * 5);
|
||||||
if (d.all_weights && Object.keys(d.all_weights).length > 0) {
|
if (d.all_weights && Object.keys(d.all_weights).length > 0) {
|
||||||
|
|
@ -168,6 +314,7 @@ async def cognee_network_visualization(graph_data, destination_file_path: str =
|
||||||
})
|
})
|
||||||
.on("mouseover", function(d) {
|
.on("mouseover", function(d) {
|
||||||
// Create tooltip content for edge
|
// Create tooltip content for edge
|
||||||
|
renderEdgeInfo(d);
|
||||||
var content = "<strong>Edge Information</strong><br/>";
|
var content = "<strong>Edge Information</strong><br/>";
|
||||||
content += "Relationship: " + d.relation + "<br/>";
|
content += "Relationship: " + d.relation + "<br/>";
|
||||||
|
|
||||||
|
|
@ -212,6 +359,7 @@ async def cognee_network_visualization(graph_data, destination_file_path: str =
|
||||||
.data(links)
|
.data(links)
|
||||||
.enter().append("text")
|
.enter().append("text")
|
||||||
.attr("class", "edge-label")
|
.attr("class", "edge-label")
|
||||||
|
.style("opacity", 0)
|
||||||
.text(d => {
|
.text(d => {
|
||||||
var label = d.relation;
|
var label = d.relation;
|
||||||
if (d.all_weights && Object.keys(d.all_weights).length > 1) {
|
if (d.all_weights && Object.keys(d.all_weights).length > 1) {
|
||||||
|
|
@ -232,21 +380,225 @@ async def cognee_network_visualization(graph_data, destination_file_path: str =
|
||||||
.data(nodes)
|
.data(nodes)
|
||||||
.enter().append("g");
|
.enter().append("g");
|
||||||
|
|
||||||
|
// Color fallback by type when d.color is missing
|
||||||
|
var colorByType = {
|
||||||
|
"Entity": "#5C10F4",
|
||||||
|
"EntityType": "#A550FF",
|
||||||
|
"DocumentChunk": "#0DFF00",
|
||||||
|
"TextSummary": "#5C10F4",
|
||||||
|
"TableRow": "#A550FF",
|
||||||
|
"TableType": "#5C10F4",
|
||||||
|
"ColumnValue": "#757470",
|
||||||
|
"SchemaTable": "#A550FF",
|
||||||
|
"DatabaseSchema": "#5C10F4",
|
||||||
|
"SchemaRelationship": "#323332"
|
||||||
|
};
|
||||||
|
|
||||||
var node = nodeGroup.append("circle")
|
var node = nodeGroup.append("circle")
|
||||||
.attr("r", 13)
|
.attr("r", 13)
|
||||||
.attr("fill", d => d.color)
|
.attr("fill", function(d){ return d.color || colorByType[d.type] || "#D3D3D3"; })
|
||||||
|
.style("filter", "url(#glow)")
|
||||||
|
.attr("shape-rendering", "geometricPrecision")
|
||||||
.call(d3.drag()
|
.call(d3.drag()
|
||||||
.on("start", dragstarted)
|
.on("start", dragstarted)
|
||||||
.on("drag", dragged)
|
.on("drag", function(d){ dragged(d); updateDensity(); showAdjacency(d); })
|
||||||
.on("end", dragended));
|
.on("end", dragended));
|
||||||
|
|
||||||
nodeGroup.append("text")
|
// Show links only for hovered node adjacency
|
||||||
|
function isAdjacent(linkDatum, nodeId) {
|
||||||
|
var sid = linkDatum && linkDatum.source && (linkDatum.source.id || linkDatum.source);
|
||||||
|
var tid = linkDatum && linkDatum.target && (linkDatum.target.id || linkDatum.target);
|
||||||
|
return sid === nodeId || tid === nodeId;
|
||||||
|
}
|
||||||
|
|
||||||
|
function showAdjacency(d) {
|
||||||
|
var nodeId = d && (d.id || d.node_id || d.uuid || d.external_id || d.name);
|
||||||
|
if (!nodeId) return;
|
||||||
|
// Build neighbor set
|
||||||
|
var neighborIds = {};
|
||||||
|
neighborIds[nodeId] = true;
|
||||||
|
for (var i = 0; i < links.length; i++) {
|
||||||
|
var l = links[i];
|
||||||
|
var sid = l && l.source && (l.source.id || l.source);
|
||||||
|
var tid = l && l.target && (l.target.id || l.target);
|
||||||
|
if (sid === nodeId) neighborIds[tid] = true;
|
||||||
|
if (tid === nodeId) neighborIds[sid] = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
link
|
||||||
|
.style("opacity", function(l){ return isAdjacent(l, nodeId) ? 0.95 : 0; })
|
||||||
|
.style("stroke", function(l){ return isAdjacent(l, nodeId) ? "rgba(255,255,255,0.95)" : null; })
|
||||||
|
.style("stroke-width", function(l){ return isAdjacent(l, nodeId) ? 2.5 : 1.5; });
|
||||||
|
edgeLabels.style("opacity", function(l){ return isAdjacent(l, nodeId) ? 1 : 0; });
|
||||||
|
densityLayer.style("opacity", 0.35);
|
||||||
|
|
||||||
|
// Highlight neighbor nodes and dim others
|
||||||
|
node
|
||||||
|
.style("opacity", function(n){ return neighborIds[n.id] ? 1 : 0.25; })
|
||||||
|
.style("filter", function(n){ return neighborIds[n.id] ? "url(#glow-strong)" : "url(#glow)"; })
|
||||||
|
.attr("r", function(n){ return neighborIds[n.id] ? 15 : 13; });
|
||||||
|
// Raise highlighted nodes
|
||||||
|
node.filter(function(n){ return neighborIds[n.id]; }).raise();
|
||||||
|
// Neighbor labels brighter
|
||||||
|
nodeGroup.select("text")
|
||||||
|
.style("opacity", function(n){ return neighborIds[n.id] ? 1 : 0.2; })
|
||||||
|
.style("font-size", function(n){
|
||||||
|
var size = neighborIds[n.id] ? Math.min(22, labelBaseSize * 1.25) : labelBaseSize;
|
||||||
|
return size + "px";
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
function clearAdjacency() {
|
||||||
|
link.style("opacity", 0)
|
||||||
|
.style("stroke", null)
|
||||||
|
.style("stroke-width", 1.5);
|
||||||
|
edgeLabels.style("opacity", 0);
|
||||||
|
densityLayer.style("opacity", 1);
|
||||||
|
node
|
||||||
|
.style("opacity", 1)
|
||||||
|
.style("filter", "url(#glow)")
|
||||||
|
.attr("r", 13);
|
||||||
|
nodeGroup.select("text")
|
||||||
|
.style("opacity", 1)
|
||||||
|
.style("font-size", labelBaseSize + "px");
|
||||||
|
}
|
||||||
|
|
||||||
|
node.on("mouseover", function(d){ showAdjacency(d); })
|
||||||
|
.on("mouseout", function(){ clearAdjacency(); });
|
||||||
|
node.on("mouseover", function(d){ renderNodeInfo(d); tooltip.style('opacity', 0); });
|
||||||
|
// Also bind on the group so labels trigger adjacency too
|
||||||
|
nodeGroup.on("mouseover", function(d){ showAdjacency(d); })
|
||||||
|
.on("mouseout", function(){ clearAdjacency(); });
|
||||||
|
|
||||||
|
// Density always on; no hover gating
|
||||||
|
|
||||||
|
// Add labels sparsely to reduce clutter (every ~50th node), and truncate long text
|
||||||
|
nodeGroup
|
||||||
|
.filter(function(d, i){ return i % 14 === 0; })
|
||||||
|
.append("text")
|
||||||
.attr("class", "node-label")
|
.attr("class", "node-label")
|
||||||
.attr("dy", 4)
|
.attr("dy", 4)
|
||||||
.attr("text-anchor", "middle")
|
.attr("text-anchor", "middle")
|
||||||
.text(d => d.name);
|
.text(function(d){
|
||||||
|
var s = d && d.name ? String(d.name) : '';
|
||||||
|
return s.length > 40 ? (s.slice(0, 40) + "…") : s;
|
||||||
|
})
|
||||||
|
.style("font-size", labelBaseSize + "px");
|
||||||
|
|
||||||
node.append("title").text(d => JSON.stringify(d));
|
function applyLabelSize() {
|
||||||
|
var k = (currentTransform && currentTransform.k) || 1;
|
||||||
|
// Keep labels readable across zoom levels and hide when too small
|
||||||
|
labelBaseSize = Math.max(7, Math.min(18, 10 / Math.sqrt(k)));
|
||||||
|
nodeGroup.select("text")
|
||||||
|
.style("font-size", labelBaseSize + "px")
|
||||||
|
.style("display", (k < 0.35 ? "none" : null));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// Density cloud computation (throttled)
|
||||||
|
var densityTick = 0;
|
||||||
|
var geoPath = d3.geoPath().projection(null);
|
||||||
|
var MAX_POINTS_PER_GROUP = 400;
|
||||||
|
function updateDensity() {
|
||||||
|
try {
|
||||||
|
if (isInteracting) return; // skip during interaction for smoother UX
|
||||||
|
if (typeof d3 === 'undefined' || typeof d3.contourDensity !== 'function') {
|
||||||
|
return; // d3-contour not available; skip gracefully
|
||||||
|
}
|
||||||
|
if (!nodes || nodes.length === 0) return;
|
||||||
|
var usable = nodes.filter(function(d){ return d && typeof d.x === 'number' && isFinite(d.x) && typeof d.y === 'number' && isFinite(d.y); });
|
||||||
|
if (usable.length < 3) return; // not enough positioned points yet
|
||||||
|
|
||||||
|
var t = currentTransform || d3.zoomIdentity;
|
||||||
|
if (t.k && t.k < 0.08) {
|
||||||
|
// Skip density at extreme zoom-out to avoid numerical instability/perf issues
|
||||||
|
densityLayer.selectAll('*').remove();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
function hexToRgb(hex){
|
||||||
|
if (!hex) return {r: 0, g: 200, b: 255};
|
||||||
|
var c = hex.replace('#','');
|
||||||
|
if (c.length === 3) c = c.split('').map(function(x){ return x+x; }).join('');
|
||||||
|
var num = parseInt(c, 16);
|
||||||
|
return { r: (num >> 16) & 255, g: (num >> 8) & 255, b: num & 255 };
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build groups across all nodes
|
||||||
|
var groups = {};
|
||||||
|
for (var i = 0; i < usable.length; i++) {
|
||||||
|
var k = getGroupKey(usable[i]);
|
||||||
|
if (!groups[k]) groups[k] = [];
|
||||||
|
groups[k].push(usable[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
densityLayer.selectAll('*').remove();
|
||||||
|
|
||||||
|
Object.keys(groups).forEach(function(key){
|
||||||
|
var arr = groups[key];
|
||||||
|
if (!arr || arr.length < 3) return;
|
||||||
|
|
||||||
|
// Transform positions into screen space and sample to cap cost
|
||||||
|
var arrT = [];
|
||||||
|
var step = Math.max(1, Math.floor(arr.length / MAX_POINTS_PER_GROUP));
|
||||||
|
for (var j = 0; j < arr.length; j += step) {
|
||||||
|
var nx = t.applyX(arr[j].x);
|
||||||
|
var ny = t.applyY(arr[j].y);
|
||||||
|
if (isFinite(nx) && isFinite(ny)) {
|
||||||
|
arrT.push({ x: nx, y: ny, type: arr[j].type, color: arr[j].color });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (arrT.length < 3) return;
|
||||||
|
|
||||||
|
// Compute adaptive bandwidth based on group spread
|
||||||
|
var cx = 0, cy = 0;
|
||||||
|
for (var k = 0; k < arrT.length; k++){ cx += arrT[k].x; cy += arrT[k].y; }
|
||||||
|
cx /= arrT.length; cy /= arrT.length;
|
||||||
|
var sumR = 0;
|
||||||
|
for (var k2 = 0; k2 < arrT.length; k2++){
|
||||||
|
var dx = arrT[k2].x - cx, dy = arrT[k2].y - cy;
|
||||||
|
sumR += Math.sqrt(dx*dx + dy*dy);
|
||||||
|
}
|
||||||
|
var avgR = sumR / arrT.length;
|
||||||
|
var dynamicBandwidth = Math.max(12, Math.min(80, avgR));
|
||||||
|
var densityBandwidth = dynamicBandwidth / (t.k || 1);
|
||||||
|
|
||||||
|
var contours = d3.contourDensity()
|
||||||
|
.x(function(d){ return d.x; })
|
||||||
|
.y(function(d){ return d.y; })
|
||||||
|
.size([width, height])
|
||||||
|
.bandwidth(densityBandwidth)
|
||||||
|
.thresholds(8)
|
||||||
|
(arrT);
|
||||||
|
|
||||||
|
if (!contours || contours.length === 0) return;
|
||||||
|
var maxVal = d3.max(contours, function(d){ return d.value; }) || 1;
|
||||||
|
|
||||||
|
// Use the first node color in the group or fallback neon palette
|
||||||
|
var baseColor = (arr.find(function(d){ return d && d.color; }) || {}).color || '#00c8ff';
|
||||||
|
var rgb = hexToRgb(baseColor);
|
||||||
|
|
||||||
|
var g = densityLayer.append('g').attr('data-group', key);
|
||||||
|
g.selectAll('path')
|
||||||
|
.data(contours)
|
||||||
|
.enter()
|
||||||
|
.append('path')
|
||||||
|
.attr('d', geoPath)
|
||||||
|
.attr('fill', 'rgb(' + rgb.r + ',' + rgb.g + ',' + rgb.b + ')')
|
||||||
|
.attr('stroke', 'none')
|
||||||
|
.style('opacity', function(d){
|
||||||
|
var v = maxVal ? (d.value / maxVal) : 0;
|
||||||
|
var alpha = Math.pow(Math.max(0, Math.min(1, v)), 1.6); // accentuate clusters
|
||||||
|
return 0.65 * alpha; // up to 0.65 opacity at peak density
|
||||||
|
})
|
||||||
|
.style('filter', 'blur(2px)');
|
||||||
|
});
|
||||||
|
} catch (e) {
|
||||||
|
// Reduce impact of any runtime errors during zoom
|
||||||
|
console.warn('Density update failed:', e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
simulation.on("tick", function() {
|
simulation.on("tick", function() {
|
||||||
link.attr("x1", d => d.source.x)
|
link.attr("x1", d => d.source.x)
|
||||||
|
|
@ -266,16 +618,29 @@ async def cognee_network_visualization(graph_data, destination_file_path: str =
|
||||||
.attr("y", d => d.y)
|
.attr("y", d => d.y)
|
||||||
.attr("dy", 4)
|
.attr("dy", 4)
|
||||||
.attr("text-anchor", "middle");
|
.attr("text-anchor", "middle");
|
||||||
|
|
||||||
|
densityTick += 1;
|
||||||
|
if (densityTick % 24 === 0) updateDensity();
|
||||||
});
|
});
|
||||||
|
|
||||||
svg.call(d3.zoom().on("zoom", function() {
|
var zoomBehavior = d3.zoom()
|
||||||
container.attr("transform", d3.event.transform);
|
.on("start", function(){ isInteracting = true; densityLayer.style("opacity", 0.2); })
|
||||||
}));
|
.on("zoom", function(){
|
||||||
|
currentTransform = d3.event.transform;
|
||||||
|
container.attr("transform", currentTransform);
|
||||||
|
})
|
||||||
|
.on("end", function(){
|
||||||
|
if (densityZoomTimer) clearTimeout(densityZoomTimer);
|
||||||
|
densityZoomTimer = setTimeout(function(){ isInteracting = false; densityLayer.style("opacity", 1); updateDensity(); }, 140);
|
||||||
|
});
|
||||||
|
svg.call(zoomBehavior);
|
||||||
|
|
||||||
function dragstarted(d) {
|
function dragstarted(d) {
|
||||||
if (!d3.event.active) simulation.alphaTarget(0.3).restart();
|
if (!d3.event.active) simulation.alphaTarget(0.3).restart();
|
||||||
d.fx = d.x;
|
d.fx = d.x;
|
||||||
d.fy = d.y;
|
d.fy = d.y;
|
||||||
|
isInteracting = true;
|
||||||
|
densityLayer.style("opacity", 0.2);
|
||||||
}
|
}
|
||||||
|
|
||||||
function dragged(d) {
|
function dragged(d) {
|
||||||
|
|
@ -287,6 +652,8 @@ async def cognee_network_visualization(graph_data, destination_file_path: str =
|
||||||
if (!d3.event.active) simulation.alphaTarget(0);
|
if (!d3.event.active) simulation.alphaTarget(0);
|
||||||
d.fx = null;
|
d.fx = null;
|
||||||
d.fy = null;
|
d.fy = null;
|
||||||
|
if (densityZoomTimer) clearTimeout(densityZoomTimer);
|
||||||
|
densityZoomTimer = setTimeout(function(){ isInteracting = false; densityLayer.style("opacity", 1); updateDensity(); }, 140);
|
||||||
}
|
}
|
||||||
|
|
||||||
window.addEventListener("resize", function() {
|
window.addEventListener("resize", function() {
|
||||||
|
|
@ -295,7 +662,13 @@ async def cognee_network_visualization(graph_data, destination_file_path: str =
|
||||||
svg.attr("width", width).attr("height", height);
|
svg.attr("width", width).attr("height", height);
|
||||||
simulation.force("center", d3.forceCenter(width / 2, height / 2));
|
simulation.force("center", d3.forceCenter(width / 2, height / 2));
|
||||||
simulation.alpha(1).restart();
|
simulation.alpha(1).restart();
|
||||||
|
updateDensity();
|
||||||
|
applyLabelSize();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Initial density draw
|
||||||
|
updateDensity();
|
||||||
|
applyLabelSize();
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
<svg style="position: fixed; bottom: 10px; right: 10px; width: 150px; height: auto; z-index: 9999;" viewBox="0 0 158 44" fill="none" xmlns="http://www.w3.org/2000/svg">
|
<svg style="position: fixed; bottom: 10px; right: 10px; width: 150px; height: auto; z-index: 9999;" viewBox="0 0 158 44" fill="none" xmlns="http://www.w3.org/2000/svg">
|
||||||
|
|
@ -305,8 +678,12 @@ async def cognee_network_visualization(graph_data, destination_file_path: str =
|
||||||
</html>
|
</html>
|
||||||
"""
|
"""
|
||||||
|
|
||||||
html_content = html_template.replace("{nodes}", json.dumps(nodes_list))
|
# Safely embed JSON inside <script> by escaping </ to avoid prematurely closing the tag
|
||||||
html_content = html_content.replace("{links}", json.dumps(links_list))
|
def _safe_json_embed(obj):
|
||||||
|
return json.dumps(obj).replace("</", "<\\/")
|
||||||
|
|
||||||
|
html_content = html_template.replace("{nodes}", _safe_json_embed(nodes_list))
|
||||||
|
html_content = html_content.replace("{links}", _safe_json_embed(links_list))
|
||||||
|
|
||||||
if not destination_file_path:
|
if not destination_file_path:
|
||||||
home_dir = os.path.expanduser("~")
|
home_dir = os.path.expanduser("~")
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import logging
|
import logging
|
||||||
|
import tempfile
|
||||||
import structlog
|
import structlog
|
||||||
import traceback
|
import traceback
|
||||||
import platform
|
import platform
|
||||||
|
|
@ -76,9 +77,38 @@ log_levels = {
|
||||||
# Track if structlog logging has been configured
|
# Track if structlog logging has been configured
|
||||||
_is_structlog_configured = False
|
_is_structlog_configured = False
|
||||||
|
|
||||||
# Path to logs directory
|
|
||||||
LOGS_DIR = Path(os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "logs"))
|
def resolve_logs_dir():
|
||||||
LOGS_DIR.mkdir(exist_ok=True) # Create logs dir if it doesn't exist
|
"""Resolve a writable logs directory.
|
||||||
|
|
||||||
|
Priority:
|
||||||
|
1) BaseConfig.logs_root_directory (respects COGNEE_LOGS_DIR)
|
||||||
|
2) /tmp/cognee_logs (default, best-effort create)
|
||||||
|
|
||||||
|
Returns a Path or None if none are writable/creatable.
|
||||||
|
"""
|
||||||
|
from cognee.base_config import get_base_config
|
||||||
|
|
||||||
|
base_config = get_base_config()
|
||||||
|
logs_root_directory = Path(base_config.logs_root_directory)
|
||||||
|
|
||||||
|
try:
|
||||||
|
logs_root_directory.mkdir(parents=True, exist_ok=True)
|
||||||
|
if os.access(logs_root_directory, os.W_OK):
|
||||||
|
return logs_root_directory
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
tmp_log_path = Path(os.path.join("/tmp", "cognee_logs"))
|
||||||
|
tmp_log_path.mkdir(parents=True, exist_ok=True)
|
||||||
|
if os.access(tmp_log_path, os.W_OK):
|
||||||
|
return tmp_log_path
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
# Maximum number of log files to keep
|
# Maximum number of log files to keep
|
||||||
MAX_LOG_FILES = 10
|
MAX_LOG_FILES = 10
|
||||||
|
|
@ -430,28 +460,38 @@ def setup_logging(log_level=None, name=None):
|
||||||
stream_handler.setFormatter(console_formatter)
|
stream_handler.setFormatter(console_formatter)
|
||||||
stream_handler.setLevel(log_level)
|
stream_handler.setLevel(log_level)
|
||||||
|
|
||||||
|
root_logger = logging.getLogger()
|
||||||
|
if root_logger.hasHandlers():
|
||||||
|
root_logger.handlers.clear()
|
||||||
|
root_logger.addHandler(stream_handler)
|
||||||
|
|
||||||
|
# Note: root logger needs to be set at NOTSET to allow all messages through and specific stream and file handlers
|
||||||
|
# can define their own levels.
|
||||||
|
root_logger.setLevel(logging.NOTSET)
|
||||||
|
|
||||||
|
# Resolve logs directory with env and safe fallbacks
|
||||||
|
logs_dir = resolve_logs_dir()
|
||||||
|
|
||||||
# Check if we already have a log file path from the environment
|
# Check if we already have a log file path from the environment
|
||||||
# NOTE: environment variable must be used here as it allows us to
|
# NOTE: environment variable must be used here as it allows us to
|
||||||
# log to a single file with a name based on a timestamp in a multiprocess setting.
|
# log to a single file with a name based on a timestamp in a multiprocess setting.
|
||||||
# Without it, we would have a separate log file for every process.
|
# Without it, we would have a separate log file for every process.
|
||||||
log_file_path = os.environ.get("LOG_FILE_NAME")
|
log_file_path = os.environ.get("LOG_FILE_NAME")
|
||||||
if not log_file_path:
|
if not log_file_path and logs_dir is not None:
|
||||||
# Create a new log file name with the cognee start time
|
# Create a new log file name with the cognee start time
|
||||||
start_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
|
start_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
|
||||||
log_file_path = os.path.join(LOGS_DIR, f"{start_time}.log")
|
log_file_path = str((logs_dir / f"{start_time}.log").resolve())
|
||||||
os.environ["LOG_FILE_NAME"] = log_file_path
|
os.environ["LOG_FILE_NAME"] = log_file_path
|
||||||
|
|
||||||
# Create a file handler that uses our custom PlainFileHandler
|
try:
|
||||||
file_handler = PlainFileHandler(log_file_path, encoding="utf-8")
|
# Create a file handler that uses our custom PlainFileHandler
|
||||||
file_handler.setLevel(DEBUG)
|
file_handler = PlainFileHandler(log_file_path, encoding="utf-8")
|
||||||
|
file_handler.setLevel(DEBUG)
|
||||||
# Configure root logger
|
root_logger.addHandler(file_handler)
|
||||||
root_logger = logging.getLogger()
|
except Exception as e:
|
||||||
if root_logger.hasHandlers():
|
# Note: Exceptions happen in case of read only file systems or log file path poiting to location where it does
|
||||||
root_logger.handlers.clear()
|
# not have write permission. Logging to file is not mandatory so we just log a warning to console.
|
||||||
root_logger.addHandler(stream_handler)
|
root_logger.warning(f"Warning: Could not create log file handler at {log_file_path}: {e}")
|
||||||
root_logger.addHandler(file_handler)
|
|
||||||
root_logger.setLevel(log_level)
|
|
||||||
|
|
||||||
if log_level > logging.DEBUG:
|
if log_level > logging.DEBUG:
|
||||||
import warnings
|
import warnings
|
||||||
|
|
@ -466,7 +506,8 @@ def setup_logging(log_level=None, name=None):
|
||||||
)
|
)
|
||||||
|
|
||||||
# Clean up old log files, keeping only the most recent ones
|
# Clean up old log files, keeping only the most recent ones
|
||||||
cleanup_old_logs(LOGS_DIR, MAX_LOG_FILES)
|
if logs_dir is not None:
|
||||||
|
cleanup_old_logs(logs_dir, MAX_LOG_FILES)
|
||||||
|
|
||||||
# Mark logging as configured
|
# Mark logging as configured
|
||||||
_is_structlog_configured = True
|
_is_structlog_configured = True
|
||||||
|
|
@ -490,6 +531,10 @@ def setup_logging(log_level=None, name=None):
|
||||||
|
|
||||||
# Get a configured logger and log system information
|
# Get a configured logger and log system information
|
||||||
logger = structlog.get_logger(name if name else __name__)
|
logger = structlog.get_logger(name if name else __name__)
|
||||||
|
|
||||||
|
if logs_dir is not None:
|
||||||
|
logger.info(f"Log file created at: {log_file_path}", log_file=log_file_path)
|
||||||
|
|
||||||
# Detailed initialization for regular usage
|
# Detailed initialization for regular usage
|
||||||
logger.info(
|
logger.info(
|
||||||
"Logging initialized",
|
"Logging initialized",
|
||||||
|
|
|
||||||
|
|
@ -11,8 +11,10 @@ import pathlib
|
||||||
from uuid import uuid4, uuid5, NAMESPACE_OID
|
from uuid import uuid4, uuid5, NAMESPACE_OID
|
||||||
|
|
||||||
from cognee.base_config import get_base_config
|
from cognee.base_config import get_base_config
|
||||||
|
from cognee.shared.logging_utils import get_logger
|
||||||
from cognee.infrastructure.databases.graph import get_graph_engine
|
from cognee.infrastructure.databases.graph import get_graph_engine
|
||||||
|
|
||||||
|
logger = get_logger()
|
||||||
|
|
||||||
# Analytics Proxy Url, currently hosted by Vercel
|
# Analytics Proxy Url, currently hosted by Vercel
|
||||||
proxy_url = "https://test.prometh.ai"
|
proxy_url = "https://test.prometh.ai"
|
||||||
|
|
@ -38,16 +40,21 @@ def get_anonymous_id():
|
||||||
|
|
||||||
home_dir = str(pathlib.Path(pathlib.Path(__file__).parent.parent.parent.resolve()))
|
home_dir = str(pathlib.Path(pathlib.Path(__file__).parent.parent.parent.resolve()))
|
||||||
|
|
||||||
if not os.path.isdir(home_dir):
|
try:
|
||||||
os.makedirs(home_dir, exist_ok=True)
|
if not os.path.isdir(home_dir):
|
||||||
anonymous_id_file = os.path.join(home_dir, ".anon_id")
|
os.makedirs(home_dir, exist_ok=True)
|
||||||
if not os.path.isfile(anonymous_id_file):
|
anonymous_id_file = os.path.join(home_dir, ".anon_id")
|
||||||
anonymous_id = str(uuid4())
|
if not os.path.isfile(anonymous_id_file):
|
||||||
with open(anonymous_id_file, "w", encoding="utf-8") as f:
|
anonymous_id = str(uuid4())
|
||||||
f.write(anonymous_id)
|
with open(anonymous_id_file, "w", encoding="utf-8") as f:
|
||||||
else:
|
f.write(anonymous_id)
|
||||||
with open(anonymous_id_file, "r", encoding="utf-8") as f:
|
else:
|
||||||
anonymous_id = f.read()
|
with open(anonymous_id_file, "r", encoding="utf-8") as f:
|
||||||
|
anonymous_id = f.read()
|
||||||
|
except Exception as e:
|
||||||
|
# In case of read-only filesystem or other issues
|
||||||
|
logger.warning("Could not create or read anonymous id file: %s", e)
|
||||||
|
return "unknown-anonymous-id"
|
||||||
return anonymous_id
|
return anonymous_id
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
336
cognee/tasks/cleanup/cleanup_unused_data.py
Normal file
336
cognee/tasks/cleanup/cleanup_unused_data.py
Normal file
|
|
@ -0,0 +1,336 @@
|
||||||
|
"""
|
||||||
|
Task for automatically deleting unused data from the memify pipeline.
|
||||||
|
|
||||||
|
This task identifies and removes data (chunks, entities, summaries) that hasn't
|
||||||
|
been accessed by retrievers for a specified period, helping maintain system
|
||||||
|
efficiency and storage optimization.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
from datetime import datetime, timezone, timedelta
|
||||||
|
from typing import Optional, Dict, Any
|
||||||
|
from uuid import UUID
|
||||||
|
|
||||||
|
from cognee.infrastructure.databases.graph import get_graph_engine
|
||||||
|
from cognee.infrastructure.databases.vector import get_vector_engine
|
||||||
|
from cognee.infrastructure.databases.relational import get_relational_engine
|
||||||
|
from cognee.modules.data.models import Data, DatasetData
|
||||||
|
from cognee.shared.logging_utils import get_logger
|
||||||
|
from sqlalchemy import select, or_
|
||||||
|
import cognee
|
||||||
|
|
||||||
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
async def cleanup_unused_data(
|
||||||
|
days_threshold: Optional[int],
|
||||||
|
dry_run: bool = True,
|
||||||
|
user_id: Optional[UUID] = None,
|
||||||
|
text_doc: bool = False
|
||||||
|
) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Identify and remove unused data from the memify pipeline.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
days_threshold : int
|
||||||
|
days since last access to consider data unused
|
||||||
|
dry_run : bool
|
||||||
|
If True, only report what would be deleted without actually deleting (default: True)
|
||||||
|
user_id : UUID, optional
|
||||||
|
Limit cleanup to specific user's data (default: None)
|
||||||
|
text_doc : bool
|
||||||
|
If True, use SQL-based filtering to find unused TextDocuments and call cognee.delete()
|
||||||
|
for proper whole-document deletion (default: False)
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Dict[str, Any]
|
||||||
|
Cleanup results with status, counts, and timestamp
|
||||||
|
"""
|
||||||
|
logger.info(
|
||||||
|
"Starting cleanup task",
|
||||||
|
days_threshold=days_threshold,
|
||||||
|
dry_run=dry_run,
|
||||||
|
user_id=str(user_id) if user_id else None,
|
||||||
|
text_doc=text_doc
|
||||||
|
)
|
||||||
|
|
||||||
|
# Calculate cutoff timestamp
|
||||||
|
cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_threshold)
|
||||||
|
|
||||||
|
if text_doc:
|
||||||
|
# SQL-based approach: Find unused TextDocuments and use cognee.delete()
|
||||||
|
return await _cleanup_via_sql(cutoff_date, dry_run, user_id)
|
||||||
|
else:
|
||||||
|
# Graph-based approach: Find unused nodes directly from graph
|
||||||
|
cutoff_timestamp_ms = int(cutoff_date.timestamp() * 1000)
|
||||||
|
logger.debug(f"Cutoff timestamp: {cutoff_date.isoformat()} ({cutoff_timestamp_ms}ms)")
|
||||||
|
|
||||||
|
# Find unused nodes
|
||||||
|
unused_nodes = await _find_unused_nodes(cutoff_timestamp_ms, user_id)
|
||||||
|
|
||||||
|
total_unused = sum(len(nodes) for nodes in unused_nodes.values())
|
||||||
|
logger.info(f"Found {total_unused} unused nodes", unused_nodes={k: len(v) for k, v in unused_nodes.items()})
|
||||||
|
|
||||||
|
if dry_run:
|
||||||
|
return {
|
||||||
|
"status": "dry_run",
|
||||||
|
"unused_count": total_unused,
|
||||||
|
"deleted_count": {
|
||||||
|
"data_items": 0,
|
||||||
|
"chunks": 0,
|
||||||
|
"entities": 0,
|
||||||
|
"summaries": 0,
|
||||||
|
"associations": 0
|
||||||
|
},
|
||||||
|
"cleanup_date": datetime.now(timezone.utc).isoformat(),
|
||||||
|
"preview": {
|
||||||
|
"chunks": len(unused_nodes["DocumentChunk"]),
|
||||||
|
"entities": len(unused_nodes["Entity"]),
|
||||||
|
"summaries": len(unused_nodes["TextSummary"])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Delete unused nodes
|
||||||
|
deleted_counts = await _delete_unused_nodes(unused_nodes)
|
||||||
|
|
||||||
|
logger.info("Cleanup completed", deleted_counts=deleted_counts)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"status": "completed",
|
||||||
|
"unused_count": total_unused,
|
||||||
|
"deleted_count": {
|
||||||
|
"data_items": 0,
|
||||||
|
"chunks": deleted_counts["DocumentChunk"],
|
||||||
|
"entities": deleted_counts["Entity"],
|
||||||
|
"summaries": deleted_counts["TextSummary"],
|
||||||
|
"associations": deleted_counts["associations"]
|
||||||
|
},
|
||||||
|
"cleanup_date": datetime.now(timezone.utc).isoformat()
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async def _cleanup_via_sql(
|
||||||
|
cutoff_date: datetime,
|
||||||
|
dry_run: bool,
|
||||||
|
user_id: Optional[UUID] = None
|
||||||
|
) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
SQL-based cleanup: Query Data table for unused documents and use cognee.delete().
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
cutoff_date : datetime
|
||||||
|
Cutoff date for last_accessed filtering
|
||||||
|
dry_run : bool
|
||||||
|
If True, only report what would be deleted
|
||||||
|
user_id : UUID, optional
|
||||||
|
Filter by user ID if provided
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Dict[str, Any]
|
||||||
|
Cleanup results
|
||||||
|
"""
|
||||||
|
db_engine = get_relational_engine()
|
||||||
|
|
||||||
|
async with db_engine.get_async_session() as session:
|
||||||
|
# Query for Data records with old last_accessed timestamps
|
||||||
|
query = select(Data, DatasetData).join(
|
||||||
|
DatasetData, Data.id == DatasetData.data_id
|
||||||
|
).where(
|
||||||
|
or_(
|
||||||
|
Data.last_accessed < cutoff_date,
|
||||||
|
Data.last_accessed.is_(None)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
if user_id:
|
||||||
|
from cognee.modules.data.models import Dataset
|
||||||
|
query = query.join(Dataset, DatasetData.dataset_id == Dataset.id).where(
|
||||||
|
Dataset.owner_id == user_id
|
||||||
|
)
|
||||||
|
|
||||||
|
result = await session.execute(query)
|
||||||
|
unused_data = result.all()
|
||||||
|
|
||||||
|
logger.info(f"Found {len(unused_data)} unused documents in SQL")
|
||||||
|
|
||||||
|
if dry_run:
|
||||||
|
return {
|
||||||
|
"status": "dry_run",
|
||||||
|
"unused_count": len(unused_data),
|
||||||
|
"deleted_count": {
|
||||||
|
"data_items": 0,
|
||||||
|
"documents": 0
|
||||||
|
},
|
||||||
|
"cleanup_date": datetime.now(timezone.utc).isoformat(),
|
||||||
|
"preview": {
|
||||||
|
"documents": len(unused_data)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Delete each document using cognee.delete()
|
||||||
|
deleted_count = 0
|
||||||
|
from cognee.modules.users.methods import get_default_user
|
||||||
|
user = await get_default_user() if user_id is None else None
|
||||||
|
|
||||||
|
for data, dataset_data in unused_data:
|
||||||
|
try:
|
||||||
|
await cognee.delete(
|
||||||
|
data_id=data.id,
|
||||||
|
dataset_id=dataset_data.dataset_id,
|
||||||
|
mode="hard", # Use hard mode to also remove orphaned entities
|
||||||
|
user=user
|
||||||
|
)
|
||||||
|
deleted_count += 1
|
||||||
|
logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to delete document {data.id}: {e}")
|
||||||
|
|
||||||
|
logger.info("Cleanup completed", deleted_count=deleted_count)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"status": "completed",
|
||||||
|
"unused_count": len(unused_data),
|
||||||
|
"deleted_count": {
|
||||||
|
"data_items": deleted_count,
|
||||||
|
"documents": deleted_count
|
||||||
|
},
|
||||||
|
"cleanup_date": datetime.now(timezone.utc).isoformat()
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async def _find_unused_nodes(
|
||||||
|
cutoff_timestamp_ms: int,
|
||||||
|
user_id: Optional[UUID] = None
|
||||||
|
) -> Dict[str, list]:
|
||||||
|
"""
|
||||||
|
Query Kuzu for nodes with old last_accessed_at timestamps.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
cutoff_timestamp_ms : int
|
||||||
|
Cutoff timestamp in milliseconds since epoch
|
||||||
|
user_id : UUID, optional
|
||||||
|
Filter by user ID if provided
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Dict[str, list]
|
||||||
|
Dictionary mapping node types to lists of unused node IDs
|
||||||
|
"""
|
||||||
|
graph_engine = await get_graph_engine()
|
||||||
|
|
||||||
|
# Query all nodes with their properties
|
||||||
|
query = "MATCH (n:Node) RETURN n.id, n.type, n.properties"
|
||||||
|
results = await graph_engine.query(query)
|
||||||
|
|
||||||
|
unused_nodes = {
|
||||||
|
"DocumentChunk": [],
|
||||||
|
"Entity": [],
|
||||||
|
"TextSummary": []
|
||||||
|
}
|
||||||
|
|
||||||
|
for node_id, node_type, props_json in results:
|
||||||
|
# Only process tracked node types
|
||||||
|
if node_type not in unused_nodes:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Parse properties JSON
|
||||||
|
if props_json:
|
||||||
|
try:
|
||||||
|
props = json.loads(props_json)
|
||||||
|
last_accessed = props.get("last_accessed_at")
|
||||||
|
|
||||||
|
# Check if node is unused (never accessed or accessed before cutoff)
|
||||||
|
if last_accessed is None or last_accessed < cutoff_timestamp_ms:
|
||||||
|
unused_nodes[node_type].append(node_id)
|
||||||
|
logger.debug(
|
||||||
|
f"Found unused {node_type}",
|
||||||
|
node_id=node_id,
|
||||||
|
last_accessed=last_accessed
|
||||||
|
)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
logger.warning(f"Failed to parse properties for node {node_id}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
return unused_nodes
|
||||||
|
|
||||||
|
|
||||||
|
async def _delete_unused_nodes(unused_nodes: Dict[str, list]) -> Dict[str, int]:
    """
    Delete unused nodes from graph and vector databases.

    Edges attached to the nodes are removed implicitly by the graph engine
    (DETACH DELETE semantics); they are counted beforehand so the result can
    report how many associations were dropped.

    Parameters
    ----------
    unused_nodes : Dict[str, list]
        Dictionary mapping node types to lists of node IDs to delete.
        Missing type keys are treated as empty lists.

    Returns
    -------
    Dict[str, int]
        Count of deleted items by type, plus an "associations" entry for
        the number of edges removed alongside the nodes.
    """
    graph_engine = await get_graph_engine()
    vector_engine = get_vector_engine()

    deleted_counts = {
        "DocumentChunk": 0,
        "Entity": 0,
        "TextSummary": 0,
        "associations": 0,
    }

    # Count edges connected to the doomed nodes before they disappear.
    # Only the ID lists are needed here, so iterate the values directly.
    for node_ids in unused_nodes.values():
        for node_id in node_ids:
            result = await graph_engine.query(
                "MATCH (n:Node {id: $id})-[r:EDGE]-() RETURN count(r)",
                {"id": node_id}
            )
            if result and len(result) > 0:
                deleted_counts["associations"] += result[0][0]

    # Delete from graph database (uses DETACH DELETE, so edges are automatically removed)
    for node_type, node_ids in unused_nodes.items():
        if not node_ids:
            continue

        logger.info(f"Deleting {len(node_ids)} {node_type} nodes from graph database")

        # Delete nodes in batches
        await graph_engine.delete_nodes(node_ids)
        deleted_counts[node_type] = len(node_ids)

    # Vector collections holding embeddings for each tracked node type.
    vector_collections = {
        "DocumentChunk": "DocumentChunk_text",
        "Entity": "Entity_name",
        "TextSummary": "TextSummary_text",
    }

    for node_type, collection_name in vector_collections.items():
        # .get() so a partial input dict cannot raise KeyError here.
        node_ids = unused_nodes.get(node_type, [])
        if not node_ids:
            continue

        logger.info(f"Deleting {len(node_ids)} {node_type} embeddings from vector database")

        try:
            # Delete from vector collection; skip silently if it doesn't exist.
            if await vector_engine.has_collection(collection_name):
                for node_id in node_ids:
                    try:
                        await vector_engine.delete(collection_name, {"id": str(node_id)})
                    except Exception as e:
                        logger.warning(f"Failed to delete {node_id} from {collection_name}: {e}")
        except Exception as e:
            logger.error(f"Error deleting from vector collection {collection_name}: {e}")

    return deleted_counts
|
||||||
|
|
@ -239,7 +239,7 @@ async def complete_database_ingestion(schema, migrate_column_data):
|
||||||
id=uuid5(NAMESPACE_OID, name=column_node_id),
|
id=uuid5(NAMESPACE_OID, name=column_node_id),
|
||||||
name=column_node_id,
|
name=column_node_id,
|
||||||
properties=f"{key} {value} {table_name}",
|
properties=f"{key} {value} {table_name}",
|
||||||
description=f"column from relational database table={table_name}. Column name={key} and value={value}. The value of the column is related to the following row with this id: {row_node.id}. This column has the following ID: {column_node_id}",
|
description=f"column from relational database table={table_name}. Column name={key} and value={value}. This column has the following ID: {column_node_id}",
|
||||||
)
|
)
|
||||||
node_mapping[column_node_id] = column_node
|
node_mapping[column_node_id] = column_node
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,7 @@
|
||||||
from typing import Union
|
|
||||||
|
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
from typing import Union
|
||||||
|
from datetime import datetime, timezone
|
||||||
from cognee.infrastructure.engine import DataPoint
|
from cognee.infrastructure.engine import DataPoint
|
||||||
from cognee.modules.chunking.models import DocumentChunk
|
from cognee.modules.chunking.models import DocumentChunk
|
||||||
from cognee.shared.CodeGraphEntities import CodeFile, CodePart
|
from cognee.shared.CodeGraphEntities import CodeFile, CodePart
|
||||||
|
|
@ -17,7 +19,6 @@ class TextSummary(DataPoint):
|
||||||
|
|
||||||
text: str
|
text: str
|
||||||
made_from: DocumentChunk
|
made_from: DocumentChunk
|
||||||
|
|
||||||
metadata: dict = {"index_fields": ["text"]}
|
metadata: dict = {"index_fields": ["text"]}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -489,3 +489,154 @@ def test_get_ontology_resolver_from_env_resolver_functionality():
|
||||||
assert nodes == []
|
assert nodes == []
|
||||||
assert relationships == []
|
assert relationships == []
|
||||||
assert start_node is None
|
assert start_node is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_multifile_ontology_loading_success():
    """Test successful loading of multiple ontology files."""
    import os
    import tempfile

    cars = Namespace("http://example.org/cars#")
    tech = Namespace("http://example.org/tech#")

    # Build two independent ontologies: vehicles and tech companies.
    cars_graph = Graph()
    for triple in (
        (cars.Vehicle, RDF.type, OWL.Class),
        (cars.Car, RDF.type, OWL.Class),
        (cars.Car, RDFS.subClassOf, cars.Vehicle),
        (cars.Audi, RDF.type, cars.Car),
        (cars.BMW, RDF.type, cars.Car),
    ):
        cars_graph.add(triple)

    tech_graph = Graph()
    for triple in (
        (tech.Company, RDF.type, OWL.Class),
        (tech.TechCompany, RDF.type, OWL.Class),
        (tech.TechCompany, RDFS.subClassOf, tech.Company),
        (tech.Apple, RDF.type, tech.TechCompany),
        (tech.Google, RDF.type, tech.TechCompany),
    ):
        tech_graph.add(triple)

    # Serialize each graph to its own temp OWL file.
    file_paths = []
    for graph in (cars_graph, tech_graph):
        with tempfile.NamedTemporaryFile(mode="w", suffix=".owl", delete=False) as handle:
            graph.serialize(handle.name, format="xml")
            file_paths.append(handle.name)

    try:
        resolver = RDFLibOntologyResolver(ontology_file=list(file_paths))

        assert resolver.graph is not None

        # Classes and individuals from both files must be merged in.
        for class_name in ("car", "vehicle", "company", "techcompany"):
            assert class_name in resolver.lookup["classes"]
        for individual in ("audi", "bmw", "apple", "google"):
            assert individual in resolver.lookup["individuals"]

        assert resolver.find_closest_match("Audi", "individuals") == "audi"
        assert resolver.find_closest_match("Google", "individuals") == "google"

    finally:
        for path in file_paths:
            os.unlink(path)
|
||||||
|
|
||||||
|
|
||||||
|
def test_multifile_ontology_with_missing_files():
    """Test loading multiple ontology files where some don't exist."""
    import os
    import tempfile

    ns = Namespace("http://example.org/test#")
    graph = Graph()
    graph.add((ns.Car, RDF.type, OWL.Class))
    graph.add((ns.Audi, RDF.type, ns.Car))

    with tempfile.NamedTemporaryFile(mode="w", suffix=".owl", delete=False) as handle:
        graph.serialize(handle.name, format="xml")
        valid_path = handle.name

    try:
        # Missing files should be skipped; the valid one still loads.
        resolver = RDFLibOntologyResolver(
            ontology_file=["nonexistent_file_1.owl", valid_path, "nonexistent_file_2.owl"]
        )

        assert resolver.graph is not None
        assert "car" in resolver.lookup["classes"]
        assert "audi" in resolver.lookup["individuals"]

        assert resolver.find_closest_match("Audi", "individuals") == "audi"

    finally:
        os.unlink(valid_path)
|
||||||
|
|
||||||
|
|
||||||
|
def test_multifile_ontology_all_files_missing():
    """Test loading multiple ontology files where all files are missing."""
    missing_files = [f"nonexistent_file_{i}.owl" for i in (1, 2, 3)]
    resolver = RDFLibOntologyResolver(ontology_file=missing_files)

    # Nothing could be loaded: no graph and empty lookups.
    assert resolver.graph is None
    assert resolver.lookup["classes"] == {}
    assert resolver.lookup["individuals"] == {}
|
||||||
|
|
||||||
|
|
||||||
|
def test_multifile_ontology_with_overlapping_entities():
    """Test loading multiple ontology files with overlapping/related entities."""
    import os
    import tempfile

    ns = Namespace("http://example.org/automotive#")

    # File 1 declares the base hierarchy; file 2 extends it in the same namespace.
    base_graph = Graph()
    for triple in (
        (ns.Vehicle, RDF.type, OWL.Class),
        (ns.Car, RDF.type, OWL.Class),
        (ns.Car, RDFS.subClassOf, ns.Vehicle),
    ):
        base_graph.add(triple)

    extension_graph = Graph()
    for triple in (
        (ns.LuxuryCar, RDF.type, OWL.Class),
        (ns.LuxuryCar, RDFS.subClassOf, ns.Car),
        (ns.Mercedes, RDF.type, ns.LuxuryCar),
        (ns.BMW, RDF.type, ns.LuxuryCar),
    ):
        extension_graph.add(triple)

    file_paths = []
    for graph in (base_graph, extension_graph):
        with tempfile.NamedTemporaryFile(mode="w", suffix=".owl", delete=False) as handle:
            graph.serialize(handle.name, format="xml")
            file_paths.append(handle.name)

    try:
        resolver = RDFLibOntologyResolver(ontology_file=list(file_paths))

        for class_name in ("vehicle", "car", "luxurycar"):
            assert class_name in resolver.lookup["classes"]

        assert "mercedes" in resolver.lookup["individuals"]
        assert "bmw" in resolver.lookup["individuals"]

        # The subgraph must traverse the hierarchy across both files.
        nodes, relationships, start_node = resolver.get_subgraph("Mercedes", "individuals")

        uri_labels = {resolver._uri_to_key(node.uri) for node in nodes}
        for label in ("mercedes", "luxurycar", "car", "vehicle"):
            assert label in uri_labels

    finally:
        for path in file_paths:
            os.unlink(path)
|
||||||
|
|
|
||||||
|
|
@ -77,6 +77,7 @@ async def main():
|
||||||
"What happened between 2000 and 2006?",
|
"What happened between 2000 and 2006?",
|
||||||
"What happened between 1903 and 1995, I am interested in the Selected Works of Arnulf Øverland Ole Peter Arnulf Øverland?",
|
"What happened between 1903 and 1995, I am interested in the Selected Works of Arnulf Øverland Ole Peter Arnulf Øverland?",
|
||||||
"Who is Attaphol Buspakom Attaphol Buspakom?",
|
"Who is Attaphol Buspakom Attaphol Buspakom?",
|
||||||
|
"Who was Arnulf Øverland?",
|
||||||
]
|
]
|
||||||
|
|
||||||
for query_text in queries:
|
for query_text in queries:
|
||||||
|
|
|
||||||
6
poetry.lock
generated
6
poetry.lock
generated
|
|
@ -1,4 +1,4 @@
|
||||||
# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand.
|
# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand.
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "accelerate"
|
name = "accelerate"
|
||||||
|
|
@ -2543,7 +2543,6 @@ files = [
|
||||||
{file = "fastuuid-0.12.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b9b31dd488d0778c36f8279b306dc92a42f16904cba54acca71e107d65b60b0c"},
|
{file = "fastuuid-0.12.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b9b31dd488d0778c36f8279b306dc92a42f16904cba54acca71e107d65b60b0c"},
|
||||||
{file = "fastuuid-0.12.0-cp313-cp313-manylinux_2_34_x86_64.whl", hash = "sha256:b19361ee649365eefc717ec08005972d3d1eb9ee39908022d98e3bfa9da59e37"},
|
{file = "fastuuid-0.12.0-cp313-cp313-manylinux_2_34_x86_64.whl", hash = "sha256:b19361ee649365eefc717ec08005972d3d1eb9ee39908022d98e3bfa9da59e37"},
|
||||||
{file = "fastuuid-0.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:8fc66b11423e6f3e1937385f655bedd67aebe56a3dcec0cb835351cfe7d358c9"},
|
{file = "fastuuid-0.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:8fc66b11423e6f3e1937385f655bedd67aebe56a3dcec0cb835351cfe7d358c9"},
|
||||||
{file = "fastuuid-0.12.0-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:2925f67b88d47cb16aa3eb1ab20fdcf21b94d74490e0818c91ea41434b987493"},
|
|
||||||
{file = "fastuuid-0.12.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7b15c54d300279ab20a9cc0579ada9c9f80d1bc92997fc61fb7bf3103d7cb26b"},
|
{file = "fastuuid-0.12.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7b15c54d300279ab20a9cc0579ada9c9f80d1bc92997fc61fb7bf3103d7cb26b"},
|
||||||
{file = "fastuuid-0.12.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:458f1bc3ebbd76fdb89ad83e6b81ccd3b2a99fa6707cd3650b27606745cfb170"},
|
{file = "fastuuid-0.12.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:458f1bc3ebbd76fdb89ad83e6b81ccd3b2a99fa6707cd3650b27606745cfb170"},
|
||||||
{file = "fastuuid-0.12.0-cp38-cp38-manylinux_2_34_x86_64.whl", hash = "sha256:a8f0f83fbba6dc44271a11b22e15838641b8c45612cdf541b4822a5930f6893c"},
|
{file = "fastuuid-0.12.0-cp38-cp38-manylinux_2_34_x86_64.whl", hash = "sha256:a8f0f83fbba6dc44271a11b22e15838641b8c45612cdf541b4822a5930f6893c"},
|
||||||
|
|
@ -4170,8 +4169,6 @@ groups = ["main"]
|
||||||
markers = "extra == \"dlt\""
|
markers = "extra == \"dlt\""
|
||||||
files = [
|
files = [
|
||||||
{file = "jsonpath-ng-1.7.0.tar.gz", hash = "sha256:f6f5f7fd4e5ff79c785f1573b394043b39849fb2bb47bcead935d12b00beab3c"},
|
{file = "jsonpath-ng-1.7.0.tar.gz", hash = "sha256:f6f5f7fd4e5ff79c785f1573b394043b39849fb2bb47bcead935d12b00beab3c"},
|
||||||
{file = "jsonpath_ng-1.7.0-py2-none-any.whl", hash = "sha256:898c93fc173f0c336784a3fa63d7434297544b7198124a68f9a3ef9597b0ae6e"},
|
|
||||||
{file = "jsonpath_ng-1.7.0-py3-none-any.whl", hash = "sha256:f3d7f9e848cba1b6da28c55b1c26ff915dc9e0b1ba7e752a53d6da8d5cbd00b6"},
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
|
|
@ -8593,7 +8590,6 @@ files = [
|
||||||
{file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:bb89f0a835bcfc1d42ccd5f41f04870c1b936d8507c6df12b7737febc40f0909"},
|
{file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:bb89f0a835bcfc1d42ccd5f41f04870c1b936d8507c6df12b7737febc40f0909"},
|
||||||
{file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f0c2d907a1e102526dd2986df638343388b94c33860ff3bbe1384130828714b1"},
|
{file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f0c2d907a1e102526dd2986df638343388b94c33860ff3bbe1384130828714b1"},
|
||||||
{file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f8157bed2f51db683f31306aa497311b560f2265998122abe1dce6428bd86567"},
|
{file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f8157bed2f51db683f31306aa497311b560f2265998122abe1dce6428bd86567"},
|
||||||
{file = "psycopg2_binary-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:27422aa5f11fbcd9b18da48373eb67081243662f9b46e6fd07c3eb46e4535142"},
|
|
||||||
{file = "psycopg2_binary-2.9.10-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:eb09aa7f9cecb45027683bb55aebaaf45a0df8bf6de68801a6afdc7947bb09d4"},
|
{file = "psycopg2_binary-2.9.10-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:eb09aa7f9cecb45027683bb55aebaaf45a0df8bf6de68801a6afdc7947bb09d4"},
|
||||||
{file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b73d6d7f0ccdad7bc43e6d34273f70d587ef62f824d7261c4ae9b8b1b6af90e8"},
|
{file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b73d6d7f0ccdad7bc43e6d34273f70d587ef62f824d7261c4ae9b8b1b6af90e8"},
|
||||||
{file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce5ab4bf46a211a8e924d307c1b1fcda82368586a19d0a24f8ae166f5c784864"},
|
{file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce5ab4bf46a211a8e924d307c1b1fcda82368586a19d0a24f8ae166f5c784864"},
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
[project]
|
[project]
|
||||||
name = "cognee"
|
name = "cognee"
|
||||||
|
|
||||||
version = "0.3.7"
|
version = "0.3.9"
|
||||||
description = "Cognee - is a library for enriching LLM context with a semantic layer for better understanding and reasoning."
|
description = "Cognee - is a library for enriching LLM context with a semantic layer for better understanding and reasoning."
|
||||||
authors = [
|
authors = [
|
||||||
{ name = "Vasilije Markovic" },
|
{ name = "Vasilije Markovic" },
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue