Add type to DataPoint metadata (#364)
* Add type to DataPoint metadata * Add missing index_fields * Use DataPoint UUID type in pgvector create_data_points * Make _metadata mandatory everywhere
This commit is contained in:
parent
5360093097
commit
bfa0f06fb4
19 changed files with 167 additions and 65 deletions
|
|
@ -1,21 +1,26 @@
|
|||
import asyncio
|
||||
# from datetime import datetime
|
||||
import json
|
||||
from uuid import UUID
|
||||
from textwrap import dedent
|
||||
from uuid import UUID
|
||||
|
||||
from falkordb import FalkorDB
|
||||
|
||||
from cognee.exceptions import InvalidValueError
|
||||
from cognee.infrastructure.engine import DataPoint
|
||||
from cognee.infrastructure.databases.graph.graph_db_interface import GraphDBInterface
|
||||
from cognee.infrastructure.databases.graph.graph_db_interface import \
|
||||
GraphDBInterface
|
||||
from cognee.infrastructure.databases.vector.embeddings import EmbeddingEngine
|
||||
from cognee.infrastructure.databases.vector.vector_db_interface import VectorDBInterface
|
||||
from cognee.infrastructure.databases.vector.vector_db_interface import \
|
||||
VectorDBInterface
|
||||
from cognee.infrastructure.engine import DataPoint
|
||||
|
||||
|
||||
class IndexSchema(DataPoint):
|
||||
text: str
|
||||
|
||||
_metadata: dict = {
|
||||
"index_fields": ["text"]
|
||||
"index_fields": ["text"],
|
||||
"type": "IndexSchema"
|
||||
}
|
||||
|
||||
class FalkorDBAdapter(VectorDBInterface, GraphDBInterface):
|
||||
|
|
|
|||
|
|
@ -1,25 +1,29 @@
|
|||
from typing import List, Optional, get_type_hints, Generic, TypeVar
|
||||
import asyncio
|
||||
from typing import Generic, List, Optional, TypeVar, get_type_hints
|
||||
from uuid import UUID
|
||||
|
||||
import lancedb
|
||||
from lancedb.pydantic import LanceModel, Vector
|
||||
from pydantic import BaseModel
|
||||
from lancedb.pydantic import Vector, LanceModel
|
||||
|
||||
from cognee.exceptions import InvalidValueError
|
||||
from cognee.infrastructure.engine import DataPoint
|
||||
from cognee.infrastructure.files.storage import LocalStorage
|
||||
from cognee.modules.storage.utils import copy_model, get_own_properties
|
||||
from ..models.ScoredResult import ScoredResult
|
||||
from ..vector_db_interface import VectorDBInterface
|
||||
from ..utils import normalize_distances
|
||||
|
||||
from ..embeddings.EmbeddingEngine import EmbeddingEngine
|
||||
from ..models.ScoredResult import ScoredResult
|
||||
from ..utils import normalize_distances
|
||||
from ..vector_db_interface import VectorDBInterface
|
||||
|
||||
|
||||
class IndexSchema(DataPoint):
|
||||
id: str
|
||||
text: str
|
||||
|
||||
_metadata: dict = {
|
||||
"index_fields": ["text"]
|
||||
"index_fields": ["text"],
|
||||
"type": "IndexSchema"
|
||||
}
|
||||
|
||||
class LanceDBAdapter(VectorDBInterface):
|
||||
|
|
|
|||
|
|
@ -4,10 +4,12 @@ import asyncio
|
|||
import logging
|
||||
from typing import List, Optional
|
||||
from uuid import UUID
|
||||
|
||||
from cognee.infrastructure.engine import DataPoint
|
||||
from ..vector_db_interface import VectorDBInterface
|
||||
from ..models.ScoredResult import ScoredResult
|
||||
|
||||
from ..embeddings.EmbeddingEngine import EmbeddingEngine
|
||||
from ..models.ScoredResult import ScoredResult
|
||||
from ..vector_db_interface import VectorDBInterface
|
||||
|
||||
logger = logging.getLogger("MilvusAdapter")
|
||||
|
||||
|
|
@ -16,7 +18,8 @@ class IndexSchema(DataPoint):
|
|||
text: str
|
||||
|
||||
_metadata: dict = {
|
||||
"index_fields": ["text"]
|
||||
"index_fields": ["text"],
|
||||
"type": "IndexSchema"
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
import asyncio
|
||||
from uuid import UUID
|
||||
from typing import List, Optional, get_type_hints
|
||||
from uuid import UUID
|
||||
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
from sqlalchemy import JSON, Column, Table, select, delete, MetaData
|
||||
from sqlalchemy.ext.asyncio import create_async_engine, async_sessionmaker
|
||||
|
|
@ -9,19 +10,21 @@ from cognee.exceptions import InvalidValueError
|
|||
from cognee.infrastructure.databases.exceptions import EntityNotFoundError
|
||||
from cognee.infrastructure.engine import DataPoint
|
||||
|
||||
from .serialize_data import serialize_data
|
||||
from ..models.ScoredResult import ScoredResult
|
||||
from ..vector_db_interface import VectorDBInterface
|
||||
from ..utils import normalize_distances
|
||||
from ..embeddings.EmbeddingEngine import EmbeddingEngine
|
||||
from ...relational.sqlalchemy.SqlAlchemyAdapter import SQLAlchemyAdapter
|
||||
from ...relational.ModelBase import Base
|
||||
from ...relational.sqlalchemy.SqlAlchemyAdapter import SQLAlchemyAdapter
|
||||
from ..embeddings.EmbeddingEngine import EmbeddingEngine
|
||||
from ..models.ScoredResult import ScoredResult
|
||||
from ..utils import normalize_distances
|
||||
from ..vector_db_interface import VectorDBInterface
|
||||
from .serialize_data import serialize_data
|
||||
|
||||
|
||||
class IndexSchema(DataPoint):
|
||||
text: str
|
||||
|
||||
_metadata: dict = {
|
||||
"index_fields": ["text"]
|
||||
"index_fields": ["text"],
|
||||
"type": "IndexSchema"
|
||||
}
|
||||
|
||||
class PGVectorAdapter(SQLAlchemyAdapter, VectorDBInterface):
|
||||
|
|
@ -89,6 +92,7 @@ class PGVectorAdapter(SQLAlchemyAdapter, VectorDBInterface):
|
|||
async def create_data_points(
|
||||
self, collection_name: str, data_points: List[DataPoint]
|
||||
):
|
||||
data_point_types = get_type_hints(DataPoint)
|
||||
if not await self.has_collection(collection_name):
|
||||
await self.create_collection(
|
||||
collection_name = collection_name,
|
||||
|
|
@ -108,7 +112,7 @@ class PGVectorAdapter(SQLAlchemyAdapter, VectorDBInterface):
|
|||
primary_key: Mapped[int] = mapped_column(
|
||||
primary_key=True, autoincrement=True
|
||||
)
|
||||
id: Mapped[type(data_points[0].id)]
|
||||
id: Mapped[data_point_types["id"]]
|
||||
payload = Column(JSON)
|
||||
vector = Column(self.Vector(vector_size))
|
||||
|
||||
|
|
|
|||
|
|
@ -1,13 +1,16 @@
|
|||
import logging
|
||||
from typing import Dict, List, Optional
|
||||
from uuid import UUID
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
from qdrant_client import AsyncQdrantClient, models
|
||||
|
||||
from cognee.exceptions import InvalidValueError
|
||||
from cognee.infrastructure.databases.vector.models.ScoredResult import ScoredResult
|
||||
from cognee.infrastructure.databases.vector.models.ScoredResult import \
|
||||
ScoredResult
|
||||
from cognee.infrastructure.engine import DataPoint
|
||||
from ..vector_db_interface import VectorDBInterface
|
||||
|
||||
from ..embeddings.EmbeddingEngine import EmbeddingEngine
|
||||
from ..vector_db_interface import VectorDBInterface
|
||||
|
||||
logger = logging.getLogger("QDrantAdapter")
|
||||
|
||||
|
|
@ -15,7 +18,8 @@ class IndexSchema(DataPoint):
|
|||
text: str
|
||||
|
||||
_metadata: dict = {
|
||||
"index_fields": ["text"]
|
||||
"index_fields": ["text"],
|
||||
"type": "IndexSchema"
|
||||
}
|
||||
|
||||
# class CollectionConfig(BaseModel, extra = "forbid"):
|
||||
|
|
|
|||
|
|
@ -5,9 +5,10 @@ from uuid import UUID
|
|||
|
||||
from cognee.exceptions import InvalidValueError
|
||||
from cognee.infrastructure.engine import DataPoint
|
||||
from ..vector_db_interface import VectorDBInterface
|
||||
from ..models.ScoredResult import ScoredResult
|
||||
|
||||
from ..embeddings.EmbeddingEngine import EmbeddingEngine
|
||||
from ..models.ScoredResult import ScoredResult
|
||||
from ..vector_db_interface import VectorDBInterface
|
||||
|
||||
logger = logging.getLogger("WeaviateAdapter")
|
||||
|
||||
|
|
@ -15,7 +16,8 @@ class IndexSchema(DataPoint):
|
|||
text: str
|
||||
|
||||
_metadata: dict = {
|
||||
"index_fields": ["text"]
|
||||
"index_fields": ["text"],
|
||||
"type": "IndexSchema"
|
||||
}
|
||||
|
||||
class WeaviateAdapter(VectorDBInterface):
|
||||
|
|
|
|||
|
|
@ -1,8 +1,10 @@
|
|||
from typing_extensions import TypedDict
|
||||
from uuid import UUID, uuid4
|
||||
from typing import Optional
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
from uuid import UUID, uuid4
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
from typing_extensions import TypedDict
|
||||
|
||||
|
||||
class MetaData(TypedDict):
|
||||
index_fields: list[str]
|
||||
|
|
@ -13,7 +15,8 @@ class DataPoint(BaseModel):
|
|||
updated_at: Optional[datetime] = datetime.now(timezone.utc)
|
||||
topological_rank: Optional[int] = 0
|
||||
_metadata: Optional[MetaData] = {
|
||||
"index_fields": []
|
||||
"index_fields": [],
|
||||
"type": "DataPoint"
|
||||
}
|
||||
|
||||
# class Config:
|
||||
|
|
@ -39,4 +42,4 @@ class DataPoint(BaseModel):
|
|||
|
||||
@classmethod
|
||||
def get_embeddable_property_names(self, data_point):
|
||||
return data_point._metadata["index_fields"] or []
|
||||
return data_point._metadata["index_fields"] or []
|
||||
|
|
@ -1,8 +1,10 @@
|
|||
from typing import List, Optional
|
||||
|
||||
from cognee.infrastructure.engine import DataPoint
|
||||
from cognee.modules.data.processing.document_types import Document
|
||||
from cognee.modules.engine.models import Entity
|
||||
|
||||
|
||||
class DocumentChunk(DataPoint):
|
||||
__tablename__ = "document_chunk"
|
||||
text: str
|
||||
|
|
@ -12,6 +14,7 @@ class DocumentChunk(DataPoint):
|
|||
is_part_of: Document
|
||||
contains: List[Entity] = None
|
||||
|
||||
_metadata: Optional[dict] = {
|
||||
_metadata: dict = {
|
||||
"index_fields": ["text"],
|
||||
"type": "DocumentChunk"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,12 +1,17 @@
|
|||
from cognee.infrastructure.engine import DataPoint
|
||||
from uuid import UUID
|
||||
|
||||
from cognee.infrastructure.engine import DataPoint
|
||||
|
||||
|
||||
class Document(DataPoint):
|
||||
type: str
|
||||
name: str
|
||||
raw_data_location: str
|
||||
metadata_id: UUID
|
||||
mime_type: str
|
||||
_metadata: dict = {
|
||||
"index_fields": ["name"],
|
||||
"type": "Document"
|
||||
}
|
||||
|
||||
def read(self, chunk_size: int) -> str:
|
||||
pass
|
||||
pass
|
||||
|
|
@ -10,4 +10,5 @@ class Entity(DataPoint):
|
|||
|
||||
_metadata: dict = {
|
||||
"index_fields": ["name"],
|
||||
"type": "Entity"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,11 +1,12 @@
|
|||
from cognee.infrastructure.engine import DataPoint
|
||||
|
||||
|
||||
class EntityType(DataPoint):
|
||||
__tablename__ = "entity_type"
|
||||
name: str
|
||||
type: str
|
||||
description: str
|
||||
|
||||
_metadata: dict = {
|
||||
"index_fields": ["name"],
|
||||
"type": "EntityType"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,11 +1,14 @@
|
|||
from typing import Optional
|
||||
|
||||
from cognee.infrastructure.engine import DataPoint
|
||||
|
||||
|
||||
class EdgeType(DataPoint):
|
||||
__tablename__ = "edge_type"
|
||||
relationship_name: str
|
||||
number_of_edges: int
|
||||
|
||||
_metadata: Optional[dict] = {
|
||||
_metadata: dict = {
|
||||
"index_fields": ["relationship_name"],
|
||||
"type": "EdgeType"
|
||||
}
|
||||
|
|
@ -2,7 +2,7 @@ from cognee.infrastructure.engine import DataPoint
|
|||
|
||||
|
||||
def convert_node_to_data_point(node_data: dict) -> DataPoint:
|
||||
subclass = find_subclass_by_name(DataPoint, node_data["type"])
|
||||
subclass = find_subclass_by_name(DataPoint, node_data._metadata["type"])
|
||||
|
||||
return subclass(**node_data)
|
||||
|
||||
|
|
|
|||
|
|
@ -1,15 +1,19 @@
|
|||
from typing import List, Optional
|
||||
|
||||
from cognee.infrastructure.engine import DataPoint
|
||||
|
||||
|
||||
class Repository(DataPoint):
|
||||
__tablename__ = "Repository"
|
||||
path: str
|
||||
type: Optional[str] = "Repository"
|
||||
_metadata: dict = {
|
||||
"index_fields": ["source_code"],
|
||||
"type": "Repository"
|
||||
}
|
||||
|
||||
class CodeFile(DataPoint):
|
||||
__tablename__ = "codefile"
|
||||
extracted_id: str # actually file path
|
||||
type: Optional[str] = "CodeFile"
|
||||
source_code: Optional[str] = None
|
||||
part_of: Optional[Repository] = None
|
||||
depends_on: Optional[List["CodeFile"]] = None
|
||||
|
|
@ -17,24 +21,27 @@ class CodeFile(DataPoint):
|
|||
contains: Optional[List["CodePart"]] = None
|
||||
|
||||
_metadata: dict = {
|
||||
"index_fields": ["source_code"]
|
||||
"index_fields": ["source_code"],
|
||||
"type": "CodeFile"
|
||||
}
|
||||
|
||||
class CodePart(DataPoint):
|
||||
__tablename__ = "codepart"
|
||||
# part_of: Optional[CodeFile]
|
||||
source_code: str
|
||||
type: Optional[str] = "CodePart"
|
||||
|
||||
|
||||
_metadata: dict = {
|
||||
"index_fields": ["source_code"]
|
||||
"index_fields": ["source_code"],
|
||||
"type": "CodePart"
|
||||
}
|
||||
|
||||
class CodeRelationship(DataPoint):
|
||||
source_id: str
|
||||
target_id: str
|
||||
type: str # between files
|
||||
relation: str # depends on or depends directly
|
||||
_metadata: dict = {
|
||||
"type": "CodeRelationship"
|
||||
}
|
||||
|
||||
CodeFile.model_rebuild()
|
||||
CodePart.model_rebuild()
|
||||
|
|
|
|||
|
|
@ -1,79 +1,90 @@
|
|||
from typing import Any, List, Union, Literal, Optional
|
||||
from typing import Any, List, Literal, Optional, Union
|
||||
|
||||
from cognee.infrastructure.engine import DataPoint
|
||||
|
||||
|
||||
class Variable(DataPoint):
|
||||
id: str
|
||||
name: str
|
||||
type: Literal["Variable"] = "Variable"
|
||||
description: str
|
||||
is_static: Optional[bool] = False
|
||||
default_value: Optional[str] = None
|
||||
data_type: str
|
||||
|
||||
_metadata = {
|
||||
"index_fields": ["name"]
|
||||
"index_fields": ["name"],
|
||||
"type": "Variable"
|
||||
}
|
||||
|
||||
class Operator(DataPoint):
|
||||
id: str
|
||||
name: str
|
||||
type: Literal["Operator"] = "Operator"
|
||||
description: str
|
||||
return_type: str
|
||||
_metadata = {
|
||||
"index_fields": ["name"],
|
||||
"type": "Operator"
|
||||
}
|
||||
|
||||
class Class(DataPoint):
|
||||
id: str
|
||||
name: str
|
||||
type: Literal["Class"] = "Class"
|
||||
description: str
|
||||
constructor_parameters: List[Variable]
|
||||
extended_from_class: Optional["Class"] = None
|
||||
has_methods: List["Function"]
|
||||
|
||||
_metadata = {
|
||||
"index_fields": ["name"]
|
||||
"index_fields": ["name"],
|
||||
"type": "Class"
|
||||
}
|
||||
|
||||
class ClassInstance(DataPoint):
|
||||
id: str
|
||||
name: str
|
||||
type: Literal["ClassInstance"] = "ClassInstance"
|
||||
description: str
|
||||
from_class: Class
|
||||
instantiated_by: Union["Function"]
|
||||
instantiation_arguments: List[Variable]
|
||||
|
||||
_metadata = {
|
||||
"index_fields": ["name"]
|
||||
"index_fields": ["name"],
|
||||
"type": "ClassInstance"
|
||||
}
|
||||
|
||||
class Function(DataPoint):
|
||||
id: str
|
||||
name: str
|
||||
type: Literal["Function"] = "Function"
|
||||
description: str
|
||||
parameters: List[Variable]
|
||||
return_type: str
|
||||
is_static: Optional[bool] = False
|
||||
|
||||
_metadata = {
|
||||
"index_fields": ["name"]
|
||||
"index_fields": ["name"],
|
||||
"type": "Function"
|
||||
}
|
||||
|
||||
class FunctionCall(DataPoint):
|
||||
id: str
|
||||
type: Literal["FunctionCall"] = "FunctionCall"
|
||||
called_by: Union[Function, Literal["main"]]
|
||||
function_called: Function
|
||||
function_arguments: List[Any]
|
||||
_metadata = {
|
||||
"index_fields": [],
|
||||
"type": "FunctionCall"
|
||||
}
|
||||
|
||||
class Expression(DataPoint):
|
||||
id: str
|
||||
name: str
|
||||
type: Literal["Expression"] = "Expression"
|
||||
description: str
|
||||
expression: str
|
||||
members: List[Union[Variable, Function, Operator, "Expression"]]
|
||||
_metadata = {
|
||||
"index_fields": ["name"],
|
||||
"type": "Expression"
|
||||
}
|
||||
|
||||
class SourceCodeGraph(DataPoint):
|
||||
id: str
|
||||
|
|
@ -89,8 +100,13 @@ class SourceCodeGraph(DataPoint):
|
|||
Operator,
|
||||
Expression,
|
||||
]]
|
||||
_metadata = {
|
||||
"index_fields": ["name"],
|
||||
"type": "SourceCodeGraph"
|
||||
}
|
||||
|
||||
Class.model_rebuild()
|
||||
ClassInstance.model_rebuild()
|
||||
Expression.model_rebuild()
|
||||
FunctionCall.model_rebuild()
|
||||
SourceCodeGraph.model_rebuild()
|
||||
SourceCodeGraph.model_rebuild()
|
||||
|
|
@ -1,6 +1,7 @@
|
|||
from cognee.infrastructure.databases.vector import get_vector_engine
|
||||
from cognee.infrastructure.engine import DataPoint
|
||||
|
||||
|
||||
async def index_data_points(data_points: list[DataPoint]):
|
||||
created_indexes = {}
|
||||
index_points = {}
|
||||
|
|
@ -80,11 +81,20 @@ if __name__ == "__main__":
|
|||
class Car(DataPoint):
|
||||
model: str
|
||||
color: str
|
||||
_metadata = {
|
||||
"index_fields": ["name"],
|
||||
"type": "Car"
|
||||
}
|
||||
|
||||
|
||||
class Person(DataPoint):
|
||||
name: str
|
||||
age: int
|
||||
owns_car: list[Car]
|
||||
_metadata = {
|
||||
"index_fields": ["name"],
|
||||
"type": "Person"
|
||||
}
|
||||
|
||||
car1 = Car(model = "Tesla Model S", color = "Blue")
|
||||
car2 = Car(model = "Toyota Camry", color = "Red")
|
||||
|
|
@ -92,4 +102,4 @@ if __name__ == "__main__":
|
|||
|
||||
data_points = get_data_points_from_model(person)
|
||||
|
||||
print(data_points)
|
||||
print(data_points)
|
||||
|
|
@ -10,6 +10,7 @@ class TextSummary(DataPoint):
|
|||
|
||||
_metadata: dict = {
|
||||
"index_fields": ["text"],
|
||||
"type": "TextSummary"
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -20,4 +21,5 @@ class CodeSummary(DataPoint):
|
|||
|
||||
_metadata: dict = {
|
||||
"index_fields": ["text"],
|
||||
"type": "CodeSummary"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@ import asyncio
|
|||
import random
|
||||
import time
|
||||
from typing import List
|
||||
from uuid import uuid5, NAMESPACE_OID
|
||||
from uuid import NAMESPACE_OID, uuid5
|
||||
|
||||
from cognee.infrastructure.engine import DataPoint
|
||||
from cognee.modules.graph.utils import get_graph_from_model
|
||||
|
|
@ -11,16 +11,28 @@ random.seed(1500)
|
|||
|
||||
class Repository(DataPoint):
|
||||
path: str
|
||||
_metadata = {
|
||||
"index_fields": [],
|
||||
"type": "Repository"
|
||||
}
|
||||
|
||||
class CodeFile(DataPoint):
|
||||
part_of: Repository
|
||||
contains: List["CodePart"] = []
|
||||
depends_on: List["CodeFile"] = []
|
||||
source_code: str
|
||||
_metadata = {
|
||||
"index_fields": [],
|
||||
"type": "CodeFile"
|
||||
}
|
||||
|
||||
class CodePart(DataPoint):
|
||||
part_of: CodeFile
|
||||
source_code: str
|
||||
_metadata = {
|
||||
"index_fields": [],
|
||||
"type": "CodePart"
|
||||
}
|
||||
|
||||
CodeFile.model_rebuild()
|
||||
CodePart.model_rebuild()
|
||||
|
|
|
|||
|
|
@ -1,25 +1,42 @@
|
|||
import asyncio
|
||||
import random
|
||||
from typing import List
|
||||
from uuid import uuid5, NAMESPACE_OID
|
||||
from uuid import NAMESPACE_OID, uuid5
|
||||
|
||||
from cognee.infrastructure.engine import DataPoint
|
||||
from cognee.modules.graph.utils import get_graph_from_model
|
||||
|
||||
|
||||
class Document(DataPoint):
|
||||
path: str
|
||||
_metadata = {
|
||||
"index_fields": [],
|
||||
"type": "Document"
|
||||
}
|
||||
|
||||
class DocumentChunk(DataPoint):
|
||||
part_of: Document
|
||||
text: str
|
||||
contains: List["Entity"] = None
|
||||
_metadata = {
|
||||
"index_fields": ["text"],
|
||||
"type": "DocumentChunk"
|
||||
}
|
||||
|
||||
class EntityType(DataPoint):
|
||||
name: str
|
||||
_metadata = {
|
||||
"index_fields": ["name"],
|
||||
"type": "EntityType"
|
||||
}
|
||||
|
||||
class Entity(DataPoint):
|
||||
name: str
|
||||
is_type: EntityType
|
||||
_metadata = {
|
||||
"index_fields": ["name"],
|
||||
"type": "Entity"
|
||||
}
|
||||
|
||||
DocumentChunk.model_rebuild()
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue