Merge branch 'dev' into limit_major_dep_version_change

This commit is contained in:
Vasilije 2025-07-13 20:29:10 +02:00 committed by GitHub
commit a9b1be8b3e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 310 additions and 222 deletions

View file

@ -21,6 +21,11 @@ runs:
run: | run: |
python -m pip install --upgrade pip python -m pip install --upgrade pip
pip install poetry pip install poetry
- name: Rebuild Poetry lock file
shell: bash
run: poetry lock
- name: Install dependencies - name: Install dependencies
shell: bash shell: bash
run: poetry install --no-interaction -E api -E docs -E evals -E gemini -E codegraph -E ollama -E dev -E neo4j run: poetry install --no-interaction -E api -E docs -E evals -E gemini -E codegraph -E ollama -E dev -E neo4j

View file

@ -1,3 +1,4 @@
import os
from uuid import UUID from uuid import UUID
from fastapi import Form, UploadFile, Depends from fastapi import Form, UploadFile, Depends
@ -31,8 +32,11 @@ def get_add_router() -> APIRouter:
raise ValueError("Either datasetId or datasetName must be provided.") raise ValueError("Either datasetId or datasetName must be provided.")
try: try:
# TODO: Add check if HTTP Requests are enabled before allowing requests and git clone if (
if isinstance(data, str) and data.startswith("http"): isinstance(data, str)
and data.startswith("http")
and (os.getenv("ALLOW_HTTP_REQUESTS", "true").lower() == "true")
):
if "github" in data: if "github" in data:
# Perform git clone if the URL is from GitHub # Perform git clone if the URL is from GitHub
repo_name = data.split("/")[-1].replace(".git", "") repo_name = data.split("/")[-1].replace(".git", "")

View file

@ -10,6 +10,7 @@ from fastapi.responses import JSONResponse, FileResponse
from cognee.api.DTO import InDTO, OutDTO from cognee.api.DTO import InDTO, OutDTO
from cognee.infrastructure.databases.relational import get_relational_engine from cognee.infrastructure.databases.relational import get_relational_engine
from cognee.modules.data.methods import get_authorized_existing_datasets
from cognee.modules.data.methods import create_dataset, get_datasets_by_name from cognee.modules.data.methods import create_dataset, get_datasets_by_name
from cognee.shared.logging_utils import get_logger from cognee.shared.logging_utils import get_logger
from cognee.api.v1.delete.exceptions import DataNotFoundError, DatasetNotFoundError from cognee.api.v1.delete.exceptions import DataNotFoundError, DatasetNotFoundError
@ -177,7 +178,8 @@ def get_datasets_router() -> APIRouter:
async def get_dataset_data(dataset_id: UUID, user: User = Depends(get_authenticated_user)): async def get_dataset_data(dataset_id: UUID, user: User = Depends(get_authenticated_user)):
from cognee.modules.data.methods import get_dataset_data, get_dataset from cognee.modules.data.methods import get_dataset_data, get_dataset
dataset = await get_dataset(user.id, dataset_id) # Verify user has permission to read dataset
dataset = await get_authorized_existing_datasets([dataset_id], "read", user)
if dataset is None: if dataset is None:
return JSONResponse( return JSONResponse(
@ -185,7 +187,7 @@ def get_datasets_router() -> APIRouter:
content=ErrorResponseDTO(f"Dataset ({str(dataset_id)}) not found."), content=ErrorResponseDTO(f"Dataset ({str(dataset_id)}) not found."),
) )
dataset_data = await get_dataset_data(dataset_id=dataset.id) dataset_data = await get_dataset_data(dataset_id=dataset[0].id)
if dataset_data is None: if dataset_data is None:
return [] return []
@ -200,6 +202,9 @@ def get_datasets_router() -> APIRouter:
from cognee.api.v1.datasets.datasets import datasets as cognee_datasets from cognee.api.v1.datasets.datasets import datasets as cognee_datasets
try: try:
# Verify user has permission to read dataset
await get_authorized_existing_datasets(datasets, "read", user)
datasets_statuses = await cognee_datasets.get_status(datasets) datasets_statuses = await cognee_datasets.get_status(datasets)
return datasets_statuses return datasets_statuses
@ -211,16 +216,17 @@ def get_datasets_router() -> APIRouter:
dataset_id: UUID, data_id: UUID, user: User = Depends(get_authenticated_user) dataset_id: UUID, data_id: UUID, user: User = Depends(get_authenticated_user)
): ):
from cognee.modules.data.methods import get_data from cognee.modules.data.methods import get_data
from cognee.modules.data.methods import get_dataset, get_dataset_data from cognee.modules.data.methods import get_dataset_data
dataset = await get_dataset(user.id, dataset_id) # Verify user has permission to read dataset
dataset = await get_authorized_existing_datasets([dataset_id], "read", user)
if dataset is None: if dataset is None:
return JSONResponse( return JSONResponse(
status_code=404, content={"detail": f"Dataset ({dataset_id}) not found."} status_code=404, content={"detail": f"Dataset ({dataset_id}) not found."}
) )
dataset_data = await get_dataset_data(dataset.id) dataset_data = await get_dataset_data(dataset[0].id)
if dataset_data is None: if dataset_data is None:
raise DataNotFoundError(message=f"No data found in dataset ({dataset_id}).") raise DataNotFoundError(message=f"No data found in dataset ({dataset_id}).")

View file

@ -1,3 +1,4 @@
import os
from fastapi import Form, UploadFile, Depends from fastapi import Form, UploadFile, Depends
from fastapi.responses import JSONResponse from fastapi.responses import JSONResponse
from fastapi import APIRouter from fastapi import APIRouter
@ -37,8 +38,9 @@ def get_delete_router() -> APIRouter:
# Handle each file in the list # Handle each file in the list
results = [] results = []
for file in data: for file in data:
# TODO: Add check if HTTP Requests are enabled before allowing requests and git clone if file.filename.startswith("http") and (
if file.filename.startswith("http"): os.getenv("ALLOW_HTTP_REQUESTS", "true").lower() == "true"
):
if "github" in file.filename: if "github" in file.filename:
# For GitHub repos, we need to get the content hash of each file # For GitHub repos, we need to get the content hash of each file
repo_name = file.filename.split("/")[-1].replace(".git", "") repo_name = file.filename.split("/")[-1].replace(".git", "")

View file

@ -1,6 +1,12 @@
from fastapi import APIRouter from fastapi import APIRouter, Depends
from fastapi.responses import HTMLResponse, JSONResponse from fastapi.responses import HTMLResponse, JSONResponse
from uuid import UUID
from cognee.shared.logging_utils import get_logger from cognee.shared.logging_utils import get_logger
from cognee.modules.users.methods import get_authenticated_user
from cognee.modules.data.methods import get_authorized_existing_datasets
from cognee.modules.users.models import User
from cognee.context_global_variables import set_database_global_context_variables
logger = get_logger() logger = get_logger()
@ -9,11 +15,17 @@ def get_visualize_router() -> APIRouter:
router = APIRouter() router = APIRouter()
@router.get("", response_model=None) @router.get("", response_model=None)
async def visualize(): async def visualize(dataset_id: UUID, user: User = Depends(get_authenticated_user)):
"""This endpoint is responsible for adding data to the graph.""" """This endpoint is responsible for adding data to the graph."""
from cognee.api.v1.visualize import visualize_graph from cognee.api.v1.visualize import visualize_graph
try: try:
# Verify user has permission to read dataset
dataset = await get_authorized_existing_datasets([dataset_id], "read", user)
# Will only be used if ENABLE_BACKEND_ACCESS_CONTROL is set to True
await set_database_global_context_variables(dataset[0].id, dataset[0].owner_id)
html_visualization = await visualize_graph() html_visualization = await visualize_graph()
return HTMLResponse(html_visualization) return HTMLResponse(html_visualization)

View file

@ -11,235 +11,292 @@ from cognee.shared.data_models import KnowledgeGraph
from cognee.modules.ontology.rdf_xml.OntologyResolver import OntologyResolver from cognee.modules.ontology.rdf_xml.OntologyResolver import OntologyResolver
def _create_node_key(node_id: str, category: str) -> str:
"""Create a standardized node key"""
return f"{node_id}_{category}"
def _create_edge_key(source_id: str, target_id: str, relationship_name: str) -> str:
"""Create a standardized edge key"""
return f"{source_id}_{target_id}_{relationship_name}"
def _process_ontology_nodes(
    ontology_nodes: list,
    data_chunk: DocumentChunk,
    added_nodes_map: dict,
    added_ontology_nodes_map: dict
) -> None:
    """Store ontology-derived nodes in ``added_ontology_nodes_map``, skipping duplicates.

    Args:
        ontology_nodes: Nodes returned by the ontology resolver; each exposes a
            ``name`` and a ``category`` ("classes" or "individuals"). Any other
            category is silently ignored.
        data_chunk: Chunk whose ``belongs_to_set`` is copied onto new Entity nodes.
        added_nodes_map: Nodes already created from the extracted graphs — checked
            only so an ontology node never shadows an existing graph node.
        added_ontology_nodes_map: Output map, keyed via ``_create_node_key``.
    """
    for ontology_node in ontology_nodes:
        ont_node_id = generate_node_id(ontology_node.name)
        ont_node_name = generate_node_name(ontology_node.name)

        if ontology_node.category == "classes":
            # Ontology classes are stored as EntityType nodes under a "_type" key.
            ont_node_key = _create_node_key(ont_node_id, "type")
            if ont_node_key not in added_nodes_map and ont_node_key not in added_ontology_nodes_map:
                added_ontology_nodes_map[ont_node_key] = EntityType(
                    id=ont_node_id,
                    name=ont_node_name,
                    description=ont_node_name,
                    ontology_valid=True,
                )
        elif ontology_node.category == "individuals":
            # Ontology individuals are stored as Entity nodes under an "_entity" key.
            ont_node_key = _create_node_key(ont_node_id, "entity")
            if ont_node_key not in added_nodes_map and ont_node_key not in added_ontology_nodes_map:
                added_ontology_nodes_map[ont_node_key] = Entity(
                    id=ont_node_id,
                    name=ont_node_name,
                    description=ont_node_name,
                    ontology_valid=True,
                    belongs_to_set=data_chunk.belongs_to_set,
                )
def _process_ontology_edges(
    ontology_edges: list,
    existing_edges_map: dict,
    ontology_relationships: list
) -> None:
    """Append ontology-derived edges to ``ontology_relationships``, skipping ones already seen."""
    for subj, predicate, obj in ontology_edges:
        src_id = generate_node_id(subj)
        dst_id = generate_node_id(obj)
        rel_name = generate_edge_name(predicate)
        key = _create_edge_key(src_id, dst_id, rel_name)

        if key in existing_edges_map:
            # Edge was already recorded in an earlier pass — keep it unique.
            continue

        existing_edges_map[key] = True
        ontology_relationships.append(
            (
                src_id,
                dst_id,
                rel_name,
                {
                    "relationship_name": rel_name,
                    "source_node_id": src_id,
                    "target_node_id": dst_id,
                    "ontology_valid": True,
                },
            )
        )
def _create_type_node(
    node_type: str,
    ontology_resolver: OntologyResolver,
    added_nodes_map: dict,
    added_ontology_nodes_map: dict,
    name_mapping: dict,
    key_mapping: dict,
    data_chunk: DocumentChunk,
    existing_edges_map: dict,
    ontology_relationships: list
) -> EntityType:
    """Create — or return the already-created — EntityType node for ``node_type``.

    The type name is matched against the ontology's "classes"; when the resolver
    finds a closest class, the node is renamed to that canonical class name and
    flagged ``ontology_valid``. The resolver's subgraph nodes/edges are recorded
    via the ``_process_ontology_*`` helpers as a side effect.

    Mutates: added_nodes_map, added_ontology_nodes_map, name_mapping,
    key_mapping, existing_edges_map, ontology_relationships.
    """
    node_id = generate_node_id(node_type)
    node_name = generate_node_name(node_type)
    type_node_key = _create_node_key(node_id, "type")

    # Already created earlier — either stored under this key directly, or under
    # the remapped key if ontology matching renamed it on a previous call.
    if type_node_key in added_nodes_map or type_node_key in key_mapping:
        return added_nodes_map.get(type_node_key) or added_nodes_map.get(key_mapping.get(type_node_key))

    # Get ontology validation
    ontology_nodes, ontology_edges, closest_class = ontology_resolver.get_subgraph(
        node_name=node_name, node_type="classes"
    )
    ontology_validated = bool(closest_class)

    if ontology_validated:
        # Rename to the canonical ontology class; remember the name remap (used
        # later when resolving edge endpoints) and the key remap (used for lookups).
        old_key = type_node_key
        node_id = generate_node_id(closest_class.name)
        type_node_key = _create_node_key(node_id, "type")
        new_node_name = generate_node_name(closest_class.name)
        name_mapping[node_name] = closest_class.name
        key_mapping[old_key] = type_node_key
        node_name = new_node_name

    type_node = EntityType(
        id=node_id,
        name=node_name,
        type=node_name,
        description=node_name,
        ontology_valid=ontology_validated,
    )
    added_nodes_map[type_node_key] = type_node

    # Process ontology nodes and edges
    _process_ontology_nodes(ontology_nodes, data_chunk, added_nodes_map, added_ontology_nodes_map)
    _process_ontology_edges(ontology_edges, existing_edges_map, ontology_relationships)

    return type_node
def _create_entity_node(
    node_id: str,
    node_name: str,
    node_description: str,
    type_node: EntityType,
    ontology_resolver: OntologyResolver,
    added_nodes_map: dict,
    added_ontology_nodes_map: dict,
    name_mapping: dict,
    key_mapping: dict,
    data_chunk: DocumentChunk,
    existing_edges_map: dict,
    ontology_relationships: list
) -> Entity:
    """Create — or return the already-created — Entity node for a graph node.

    Mirrors ``_create_type_node`` but matches against the ontology's
    "individuals": on a match the entity is renamed to the canonical ontology
    individual and flagged ``ontology_valid``. The resolver's subgraph
    nodes/edges are recorded via the ``_process_ontology_*`` helpers.
    """
    generated_node_id = generate_node_id(node_id)
    generated_node_name = generate_node_name(node_name)
    entity_node_key = _create_node_key(generated_node_id, "entity")

    # Already created earlier — directly, or under a remapped ontology key.
    if entity_node_key in added_nodes_map or entity_node_key in key_mapping:
        return added_nodes_map.get(entity_node_key) or added_nodes_map.get(key_mapping.get(entity_node_key))

    # Get ontology validation
    ontology_nodes, ontology_edges, start_ent_ont = ontology_resolver.get_subgraph(
        node_name=generated_node_name, node_type="individuals"
    )
    ontology_validated = bool(start_ent_ont)

    if ontology_validated:
        # Rename to the canonical ontology individual; record name and key remaps.
        old_key = entity_node_key
        generated_node_id = generate_node_id(start_ent_ont.name)
        entity_node_key = _create_node_key(generated_node_id, "entity")
        new_node_name = generate_node_name(start_ent_ont.name)
        name_mapping[generated_node_name] = start_ent_ont.name
        key_mapping[old_key] = entity_node_key
        generated_node_name = new_node_name

    entity_node = Entity(
        id=generated_node_id,
        name=generated_node_name,
        is_a=type_node,
        description=node_description,
        ontology_valid=ontology_validated,
        belongs_to_set=data_chunk.belongs_to_set,
    )
    added_nodes_map[entity_node_key] = entity_node

    # Process ontology nodes and edges
    _process_ontology_nodes(ontology_nodes, data_chunk, added_nodes_map, added_ontology_nodes_map)
    _process_ontology_edges(ontology_edges, existing_edges_map, ontology_relationships)

    return entity_node
def _process_graph_nodes(
    data_chunk: DocumentChunk,
    graph: KnowledgeGraph,
    ontology_resolver: OntologyResolver,
    added_nodes_map: dict,
    added_ontology_nodes_map: dict,
    name_mapping: dict,
    key_mapping: dict,
    existing_edges_map: dict,
    ontology_relationships: list
) -> None:
    """Create type and entity nodes for every node in ``graph`` and attach
    each entity to ``data_chunk.contains``.

    The shared maps/lists are mutated in place by the ``_create_*`` helpers;
    they accumulate state across chunks so nodes are deduplicated globally.
    """
    for node in graph.nodes:
        # Create type node
        type_node = _create_type_node(
            node.type, ontology_resolver, added_nodes_map, added_ontology_nodes_map,
            name_mapping, key_mapping, data_chunk, existing_edges_map, ontology_relationships
        )

        # Create entity node
        entity_node = _create_entity_node(
            node.id, node.name, node.description, type_node, ontology_resolver,
            added_nodes_map, added_ontology_nodes_map, name_mapping, key_mapping,
            data_chunk, existing_edges_map, ontology_relationships
        )

        # Add entity to data chunk (lazily initialize the list)
        if data_chunk.contains is None:
            data_chunk.contains = []
        data_chunk.contains.append(entity_node)
def _process_graph_edges(
    graph: KnowledgeGraph,
    name_mapping: dict,
    existing_edges_map: dict,
    relationships: list
) -> None:
    """Append the graph's edges to ``relationships``, resolving ontology-renamed
    endpoints through ``name_mapping`` and skipping edges already seen."""
    for edge in graph.edges:
        # Endpoints may have been renamed to canonical ontology names.
        raw_source = name_mapping.get(edge.source_node_id, edge.source_node_id)
        raw_target = name_mapping.get(edge.target_node_id, edge.target_node_id)

        src_id = generate_node_id(raw_source)
        dst_id = generate_node_id(raw_target)
        rel_name = generate_edge_name(edge.relationship_name)
        key = _create_edge_key(src_id, dst_id, rel_name)

        if key in existing_edges_map:
            continue  # already recorded — keep edges unique

        existing_edges_map[key] = True
        relationships.append(
            (
                src_id,
                dst_id,
                rel_name,
                {
                    "relationship_name": rel_name,
                    "source_node_id": src_id,
                    "target_node_id": dst_id,
                    "ontology_valid": False,
                },
            )
        )
def expand_with_nodes_and_edges(
    data_chunks: list[DocumentChunk],
    chunk_graphs: list[KnowledgeGraph],
    ontology_resolver: OntologyResolver = None,
    existing_edges_map: Optional[dict[str, bool]] = None,
):
    """
    Expand data chunks with nodes and edges, validating against ontology.

    For each (chunk, graph) pair, entity/type nodes are created (and attached
    to the chunk via ``data_chunk.contains``), then the graph's edges are
    resolved using the name remaps produced during node processing.

    Returns:
        Tuple ``(graph_nodes, graph_edges)`` — the ontology-derived nodes and
        the combined graph + ontology relationship tuples. Note that entity
        nodes created from the graphs are returned via the chunks themselves,
        not in ``graph_nodes``.
    """
    if existing_edges_map is None:
        existing_edges_map = {}

    if ontology_resolver is None:
        ontology_resolver = OntologyResolver()

    added_nodes_map = {}            # nodes built from the extracted graphs
    added_ontology_nodes_map = {}   # nodes pulled in from the ontology
    relationships = []              # edges from the extracted graphs
    ontology_relationships = []     # edges pulled in from the ontology
    name_mapping = {}               # original name -> canonical ontology name
    key_mapping = {}                # original node key -> remapped ontology key

    # Process each chunk and its corresponding graph
    for data_chunk, graph in zip(data_chunks, chunk_graphs):
        if not graph:
            continue

        # Process nodes first so name/key remaps exist before edges are resolved
        _process_graph_nodes(
            data_chunk, graph, ontology_resolver, added_nodes_map, added_ontology_nodes_map,
            name_mapping, key_mapping, existing_edges_map, ontology_relationships
        )

        # Then process edges
        _process_graph_edges(graph, name_mapping, existing_edges_map, relationships)

    # Return combined results
    graph_nodes = list(added_ontology_nodes_map.values())
    graph_edges = relationships + ontology_relationships

    return graph_nodes, graph_edges

View file

@ -1,3 +1,4 @@
import os
from typing import Union, BinaryIO, Any from typing import Union, BinaryIO, Any
from cognee.modules.ingestion.exceptions import IngestionError from cognee.modules.ingestion.exceptions import IngestionError
@ -20,7 +21,8 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any], datase
file_path = data_item file_path = data_item
# data is a file path # data is a file path
elif data_item.startswith("file://") or data_item.startswith("/"): elif data_item.startswith("file://") or data_item.startswith("/"):
# TODO: Add check if ACCEPT_LOCAL_FILE_PATH is enabled, if it's not raise an error if os.getenv("ACCEPT_LOCAL_FILE_PATH", "true").lower() == "false":
raise IngestionError(message="Local files are not accepted.")
file_path = data_item.replace("file://", "") file_path = data_item.replace("file://", "")
# data is text # data is text
else: else: