fix: support structured data conversion to data points (#512)

<!-- .github/pull_request_template.md -->

## Description
<!-- Provide a clear description of the changes in this PR -->

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.


<!-- This is an auto-generated comment: release notes by coderabbit.ai -->

## Summary by CodeRabbit

- New Features
  - Introduced version tracking and enhanced metadata in core data models for improved data consistency.

- Bug Fixes
  - Improved error handling during graph data loading to prevent disruptions from unexpected identifier formats.

- Refactor
  - Centralized identifier parsing and streamlined model definitions, ensuring smoother and more consistent operations across search, retrieval, and indexing workflows.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
Commit 8f84713b54 (parent 55a50153c6), authored by Boris on 2025-02-10 17:16:13 +01:00 and committed by GitHub.
GPG key ID: B5690EEEBB952194 (no known key found for this signature in database)
25 changed files with 51 additions and 63 deletions

View file

@@ -25,8 +25,6 @@ from cognee.tasks.documents import (
)
from cognee.tasks.graph import extract_graph_from_data
from cognee.tasks.storage import add_data_points
from cognee.modules.data.methods import store_descriptive_metrics
from cognee.tasks.storage.index_graph_edges import index_graph_edges
from cognee.tasks.summarization import summarize_text
logger = logging.getLogger("cognify.v2")
@@ -112,8 +110,6 @@ async def run_cognify_pipeline(dataset: Dataset, user: User, tasks: list[Task]):
async for result in pipeline:
print(result)
await index_graph_edges()
send_telemetry("cognee.cognify EXECUTION COMPLETED", user.id)
await log_pipeline_status(

View file

@@ -5,7 +5,6 @@ import os
import json
import asyncio
import logging
from re import A
from typing import Dict, Any, List, Union
from uuid import UUID
import aiofiles
@@ -13,6 +12,7 @@ import aiofiles.os as aiofiles_os
import networkx as nx
from cognee.infrastructure.databases.graph.graph_db_interface import GraphDBInterface
from cognee.infrastructure.engine import DataPoint
from cognee.infrastructure.engine.utils import parse_id
from cognee.modules.storage.utils import JSONEncoder
import numpy as np
@@ -268,7 +268,11 @@ class NetworkXAdapter(GraphDBInterface):
for node in graph_data["nodes"]:
try:
if not isinstance(node["id"], UUID):
node["id"] = UUID(node["id"])
try:
node["id"] = UUID(node["id"])
except Exception:
# If conversion fails, keep the original id
pass
except Exception as e:
logger.error(e)
raise e
@@ -285,12 +289,12 @@ class NetworkXAdapter(GraphDBInterface):
for edge in graph_data["links"]:
try:
if not isinstance(edge["source"], UUID):
source_id = UUID(edge["source"])
source_id = parse_id(edge["source"])
else:
source_id = edge["source"]
if not isinstance(edge["target"], UUID):
target_id = UUID(edge["target"])
target_id = parse_id(edge["target"])
else:
target_id = edge["target"]
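
For context on the behaviour change: node and edge identifiers are now normalized instead of being force-converted to UUIDs, so graphs whose ids are plain strings load without raising. A self-contained sketch of the pattern (sample data is illustrative; the real adapter uses the parse_id helper added later in this diff):

```python
from uuid import UUID, uuid4

def normalize(value):
    # Same idea as cognee's new parse_id: convert UUID-like strings, pass everything else through.
    if isinstance(value, str):
        try:
            return UUID(value)
        except Exception:
            return value
    return value

# Hypothetical node-link payload mixing a UUID string with a plain string id.
graph_data = {"links": [{"source": str(uuid4()), "target": "external-node-1"}]}

for edge in graph_data["links"]:
    source_id = normalize(edge["source"])  # becomes a UUID
    target_id = normalize(edge["target"])  # stays "external-node-1" instead of raising ValueError
```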

View file

@@ -69,7 +69,7 @@ class FalkorDBAdapter(VectorDBInterface, GraphDBInterface):
return ",".join([f"{key}:{parse_value(value)}" for key, value in properties.items()])
async def create_data_point_query(self, data_point: DataPoint, vectorized_values: dict):
node_label = type(data_point).__tablename__
node_label = type(data_point).__name__
property_names = DataPoint.get_embeddable_property_names(data_point)
node_properties = await self.stringify_properties(

View file

@@ -1,6 +1,5 @@
import asyncio
from typing import Generic, List, Optional, TypeVar, get_type_hints
from uuid import UUID
import lancedb
from lancedb.pydantic import LanceModel, Vector
@@ -8,6 +7,7 @@ from pydantic import BaseModel
from cognee.exceptions import InvalidValueError
from cognee.infrastructure.engine import DataPoint
from cognee.infrastructure.engine.utils import parse_id
from cognee.infrastructure.files.storage import LocalStorage
from cognee.modules.storage.utils import copy_model, get_own_properties
@@ -133,7 +133,7 @@ class LanceDBAdapter(VectorDBInterface):
return [
ScoredResult(
id=UUID(result["id"]),
id=parse_id(result["id"]),
payload=result["payload"],
score=0,
)
@@ -162,7 +162,7 @@ class LanceDBAdapter(VectorDBInterface):
return [
ScoredResult(
id=UUID(result["id"]),
id=parse_id(result["id"]),
payload=result["payload"],
score=normalized_values[value_index],
)
@@ -195,7 +195,7 @@ class LanceDBAdapter(VectorDBInterface):
return [
ScoredResult(
id=UUID(result["id"]),
id=parse_id(result["id"]),
payload=result["payload"],
score=normalized_values[value_index],
)

View file

@@ -3,9 +3,9 @@ from __future__ import annotations
import asyncio
import logging
from typing import List, Optional
from uuid import UUID
from cognee.infrastructure.engine import DataPoint
from cognee.infrastructure.engine.utils import parse_id
from ..embeddings.EmbeddingEngine import EmbeddingEngine
from ..models.ScoredResult import ScoredResult
@@ -193,7 +193,7 @@ class MilvusAdapter(VectorDBInterface):
return [
ScoredResult(
id=UUID(result["id"]),
id=parse_id(result["id"]),
score=result["distance"],
payload=result.get("entity", {}),
)

View file

@@ -9,6 +9,7 @@ from sqlalchemy.ext.asyncio import create_async_engine, async_sessionmaker
from cognee.exceptions import InvalidValueError
from cognee.infrastructure.databases.exceptions import EntityNotFoundError
from cognee.infrastructure.engine import DataPoint
from cognee.infrastructure.engine.utils import parse_id
from ...relational.ModelBase import Base
from ...relational.sqlalchemy.SqlAlchemyAdapter import SQLAlchemyAdapter
@@ -169,7 +170,7 @@ class PGVectorAdapter(SQLAlchemyAdapter, VectorDBInterface):
results = results.all()
return [
ScoredResult(id=UUID(result.id), payload=result.payload, score=0)
ScoredResult(id=parse_id(result.id), payload=result.payload, score=0)
for result in results
]
@@ -208,7 +209,7 @@ class PGVectorAdapter(SQLAlchemyAdapter, VectorDBInterface):
# Create and return ScoredResult objects
return [
ScoredResult(id=UUID(str(row.id)), payload=row.payload, score=row.similarity)
ScoredResult(id=parse_id(str(row.id)), payload=row.payload, score=row.similarity)
for row in vector_list
]
@@ -249,7 +250,7 @@ class PGVectorAdapter(SQLAlchemyAdapter, VectorDBInterface):
for vector in closest_items:
vector_list.append(
{
"id": UUID(str(vector.id)),
"id": parse_id(str(vector.id)),
"payload": vector.payload,
"_distance": vector.similarity,
}

View file

@@ -1,7 +1,7 @@
import logging
from typing import Dict, List, Optional
from uuid import UUID
from cognee.infrastructure.engine.utils import parse_id
from qdrant_client import AsyncQdrantClient, models
from cognee.exceptions import InvalidValueError
@@ -170,10 +170,10 @@ class QDrantAdapter(VectorDBInterface):
return [
ScoredResult(
id=UUID(result.id),
id=parse_id(result.id),
payload={
**result.payload,
"id": UUID(result.id),
"id": parse_id(result.id),
},
score=1 - result.score,
)
@@ -209,10 +209,10 @@ class QDrantAdapter(VectorDBInterface):
return [
ScoredResult(
id=UUID(result.id),
id=parse_id(result.id),
payload={
**result.payload,
"id": UUID(result.id),
"id": parse_id(result.id),
},
score=1 - result.score,
)

View file

@@ -1,10 +1,10 @@
import asyncio
import logging
from typing import List, Optional
from uuid import UUID
from cognee.exceptions import InvalidValueError
from cognee.infrastructure.engine import DataPoint
from cognee.infrastructure.engine.utils import parse_id
from ..embeddings.EmbeddingEngine import EmbeddingEngine
from ..models.ScoredResult import ScoredResult
@@ -188,7 +188,7 @@ class WeaviateAdapter(VectorDBInterface):
return [
ScoredResult(
id=UUID(str(result.uuid)),
id=parse_id(str(result.uuid)),
payload=result.properties,
score=1 - float(result.metadata.score),
)
@@ -221,7 +221,7 @@ class WeaviateAdapter(VectorDBInterface):
return [
ScoredResult(
id=UUID(str(result.uuid)),
id=parse_id(str(result.uuid)),
payload=result.properties,
score=1 - float(result.metadata.score),
)

View file

@@ -15,7 +15,6 @@ class MetaData(TypedDict):
# Updated DataPoint model with versioning and new fields
class DataPoint(BaseModel):
__tablename__ = "data_point"
id: UUID = Field(default_factory=uuid4)
created_at: int = Field(
default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000)
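
Only part of the model is visible in this hunk. As a rough, hedged sketch of the shape implied by the "versioning and new fields" comment: the version and updated_at fields below are assumptions for illustration, not taken from this diff; id and created_at mirror the context lines shown above.

```python
from datetime import datetime, timezone
from typing import Optional
from uuid import UUID, uuid4
from pydantic import BaseModel, Field

class DataPointSketch(BaseModel):
    # id and created_at mirror the context lines shown in the hunk above.
    id: UUID = Field(default_factory=uuid4)
    created_at: int = Field(
        default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000)
    )
    version: int = 1                  # assumed: integer version for tracking model changes
    updated_at: Optional[int] = None  # assumed companion timestamp to created_at
```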

View file

@@ -0,0 +1 @@
from .parse_id import parse_id

View file

@@ -0,0 +1,10 @@
from uuid import UUID
def parse_id(id: any):
if isinstance(id, str):
try:
return UUID(id)
except Exception:
pass
return id
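
A quick usage sketch of the new helper (example values are illustrative): UUID-like strings are converted, and everything else, including values that are already UUIDs or None, passes through unchanged, which is why callers elsewhere in this diff can drop their own isinstance checks.

```python
from uuid import UUID
from cognee.infrastructure.engine.utils import parse_id

parse_id("0b61b528-53f4-4d7e-8e7c-6a1a5b6d3f21")  # -> UUID("0b61b528-...")
parse_id("external-node-1")                        # -> "external-node-1" (left unchanged)
parse_id(UUID(int=0))                              # -> passes through as-is
parse_id(None)                                     # -> None, so no separate None check is needed
```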

View file

@@ -6,14 +6,12 @@ from cognee.modules.engine.models import Entity
class DocumentChunk(DataPoint):
__tablename__ = "document_chunk"
text: str
word_count: int
token_count: int
chunk_index: int
cut_type: str
is_part_of: Document
pydantic_type: str = "DocumentChunk"
contains: List[Entity] = None
metadata: dict = {"index_fields": ["text"]}
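
Several models in this PR gain a pydantic_type field with a fixed default (e.g. "DocumentChunk" above). A plausible reading, sketched below with illustrative names rather than cognee's actual loading code, is that it serves as a type discriminator so the concrete model can be recovered from a serialized payload:

```python
from pydantic import BaseModel

class DocumentChunkSketch(BaseModel):
    text: str
    chunk_index: int
    pydantic_type: str = "DocumentChunk"

# Hypothetical registry keyed by the discriminator value.
MODEL_REGISTRY = {"DocumentChunk": DocumentChunkSketch}

payload = {"text": "hello", "chunk_index": 0, "pydantic_type": "DocumentChunk"}
chunk = MODEL_REGISTRY[payload["pydantic_type"]](**payload)  # -> DocumentChunkSketch instance
```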

View file

@@ -3,10 +3,8 @@ from cognee.modules.engine.models.EntityType import EntityType
class Entity(DataPoint):
__tablename__ = "entity"
name: str
is_a: EntityType
description: str
pydantic_type: str = "Entity"
metadata: dict = {"index_fields": ["name"]}

View file

@@ -2,9 +2,7 @@ from cognee.infrastructure.engine import DataPoint
class EntityType(DataPoint):
__tablename__ = "entity_type"
name: str
description: str
pydantic_type: str = "EntityType"
metadata: dict = {"index_fields": ["name"]}

View file

@@ -137,7 +137,7 @@ class CogneeGraph(CogneeAbstractGraph):
raise ValueError("Failed to generate query embedding.")
edge_distances = await vector_engine.get_distance_from_collection_elements(
"edge_type_relationship_name", query_text=query
"EdgeType_relationship_name", query_text=query
)
embedding_map = {result.payload["text"]: result.score for result in edge_distances}

View file

@@ -2,7 +2,6 @@ from cognee.infrastructure.engine import DataPoint
class EdgeType(DataPoint):
__tablename__ = "edge_type"
relationship_name: str
number_of_edges: int

View file

@@ -61,9 +61,6 @@ async def get_graph_from_model(
if include_root and str(data_point.id) not in added_nodes:
SimpleDataPointModel = copy_model(
type(data_point),
include_fields={
"__tablename__": (str, data_point.__tablename__),
},
exclude_fields=list(excluded_properties),
)
nodes.append(SimpleDataPointModel(**data_point_properties))

View file

@@ -1,8 +1,8 @@
import json
from uuid import UUID
from typing import Callable
from cognee.exceptions import InvalidValueError
from cognee.infrastructure.engine.utils import parse_id
from cognee.modules.retrieval.code_graph_retrieval import code_graph_retrieval
from cognee.modules.search.types import SearchType
from cognee.modules.storage.utils import JSONEncoder
@@ -32,7 +32,7 @@ async def search(
for search_result in search_results:
document_id = search_result["document_id"] if "document_id" in search_result else None
document_id = UUID(document_id) if isinstance(document_id, str) else document_id
document_id = parse_id(document_id)
if document_id is None or document_id in own_document_ids:
filtered_search_results.append(search_result)

View file

@@ -3,16 +3,12 @@ from cognee.infrastructure.engine import DataPoint
class Repository(DataPoint):
__tablename__ = "Repository"
path: str
pydantic_type: str = "Repository"
metadata: dict = {"index_fields": []}
class CodeFile(DataPoint):
__tablename__ = "codefile"
extracted_id: str # actually file path
pydantic_type: str = "CodeFile"
source_code: Optional[str] = None
part_of: Optional[Repository] = None
depends_on: Optional[List["CodeFile"]] = None
@@ -22,19 +18,15 @@ class CodeFile(DataPoint):
class CodePart(DataPoint):
__tablename__ = "codepart"
file_path: str # file path
# part_of: Optional[CodeFile] = None
pydantic_type: str = "CodePart"
source_code: Optional[str] = None
metadata: dict = {"index_fields": []}
class SourceCodeChunk(DataPoint):
__tablename__ = "sourcecodechunk"
code_chunk_of: Optional[CodePart] = None
source_code: Optional[str] = None
pydantic_type: str = "SourceCodeChunk"
previous_chunk: Optional["SourceCodeChunk"] = None
metadata: dict = {"index_fields": ["source_code"]}

View file

@@ -231,7 +231,6 @@ class SummarizedContent(BaseModel):
summary: str
description: str
pydantic_type: str = "SummarizedContent"
class SummarizedFunction(BaseModel):
@@ -240,7 +239,6 @@ class SummarizedFunction(BaseModel):
inputs: Optional[List[str]] = None
outputs: Optional[List[str]] = None
decorators: Optional[List[str]] = None
pydantic_type: str = "SummarizedFunction"
class SummarizedClass(BaseModel):
@@ -248,7 +246,6 @@ class SummarizedClass(BaseModel):
description: str
methods: Optional[List[SummarizedFunction]] = None
decorators: Optional[List[str]] = None
pydantic_type: str = "SummarizedClass"
class SummarizedCode(BaseModel):
@@ -259,7 +256,6 @@ class SummarizedCode(BaseModel):
classes: List[SummarizedClass] = []
functions: List[SummarizedFunction] = []
workflow_description: Optional[str] = None
pydantic_type: str = "SummarizedCode"
class GraphDBType(Enum):

View file

@@ -3,6 +3,7 @@ from cognee.infrastructure.engine import DataPoint
from cognee.infrastructure.databases.graph import get_graph_engine
from cognee.modules.graph.utils import deduplicate_nodes_and_edges, get_graph_from_model
from .index_data_points import index_data_points
from .index_graph_edges import index_graph_edges
async def add_data_points(data_points: list[DataPoint]):
@@ -38,4 +39,7 @@ async def add_data_points(data_points: list[DataPoint]):
await graph_engine.add_nodes(nodes)
await graph_engine.add_edges(edges)
# This step has to happen after adding nodes and edges because we query the graph.
await index_graph_edges()
return data_points
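
Edge indexing now happens inside add_data_points itself, and the explicit index_graph_edges call was removed from the cognify pipeline at the top of this diff. The ordering matters because index_graph_edges reads edges back from the graph store; a stripped-down sketch of the resulting flow (simplified signature, not the real one):

```python
from cognee.tasks.storage.index_graph_edges import index_graph_edges

async def add_data_points_sketch(graph_engine, nodes, edges):
    await graph_engine.add_nodes(nodes)
    await graph_engine.add_edges(edges)
    # Indexing must run after the writes above, because it queries the graph
    # for the edges it is about to index.
    await index_graph_edges()
```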

View file

@@ -51,10 +51,10 @@ async def index_graph_edges():
data_point_type = type(edge)
for field_name in edge.metadata["index_fields"]:
index_name = f"{data_point_type.__tablename__}.{field_name}"
index_name = f"{data_point_type.__name__}.{field_name}"
if index_name not in created_indexes:
await vector_engine.create_vector_index(data_point_type.__tablename__, field_name)
await vector_engine.create_vector_index(data_point_type.__name__, field_name)
created_indexes[index_name] = True
if index_name not in index_points:
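
Vector index names are now derived from the class name instead of `__tablename__`, which lines up with the collection rename in CogneeGraph above ("edge_type_relationship_name" → "EdgeType_relationship_name"). A small illustration with a stand-in class, assuming relationship_name is the indexed field, as that collection name suggests:

```python
class EdgeType:  # stand-in for the EdgeType data point in this diff
    __tablename__ = "edge_type"
    metadata = {"index_fields": ["relationship_name"]}

edge = EdgeType()
data_point_type = type(edge)
for field_name in edge.metadata["index_fields"]:
    old_name = f"{data_point_type.__tablename__}.{field_name}"  # "edge_type.relationship_name"
    new_name = f"{data_point_type.__name__}.{field_name}"       # "EdgeType.relationship_name"
```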

View file

@@ -6,7 +6,6 @@ from cognee.shared.CodeGraphEntities import CodeFile, CodePart, SourceCodeChunk
class TextSummary(DataPoint):
__tablename__ = "text_summary"
text: str
made_from: DocumentChunk
@@ -14,9 +13,7 @@ class TextSummary(DataPoint):
class CodeSummary(DataPoint):
__tablename__ = "code_summary"
text: str
summarizes: Union[CodeFile, CodePart, SourceCodeChunk]
pydantic_type: str = "CodeSummary"
metadata: dict = {"index_fields": ["text"]}

View file

@@ -3,10 +3,8 @@ from typing import Optional
class GraphitiNode(DataPoint):
__tablename__ = "graphitinode"
content: Optional[str] = None
name: Optional[str] = None
summary: Optional[str] = None
pydantic_type: str = "GraphitiNode"
metadata: dict = {"index_fields": ["name", "summary", "content"]}

View file

@@ -37,10 +37,10 @@ async def index_and_transform_graphiti_nodes_and_edges():
data_point_type = type(graphiti_node)
for field_name in graphiti_node.metadata["index_fields"]:
index_name = f"{data_point_type.__tablename__}.{field_name}"
index_name = f"{data_point_type.__name__}.{field_name}"
if index_name not in created_indexes:
await vector_engine.create_vector_index(data_point_type.__tablename__, field_name)
await vector_engine.create_vector_index(data_point_type.__name__, field_name)
created_indexes[index_name] = True
if index_name not in index_points:
@@ -66,10 +66,10 @@ async def index_and_transform_graphiti_nodes_and_edges():
data_point_type = type(edge)
for field_name in edge.metadata["index_fields"]:
index_name = f"{data_point_type.__tablename__}.{field_name}"
index_name = f"{data_point_type.__name__}.{field_name}"
if index_name not in created_indexes:
await vector_engine.create_vector_index(data_point_type.__tablename__, field_name)
await vector_engine.create_vector_index(data_point_type.__name__, field_name)
created_indexes[index_name] = True
if index_name not in index_points: