Feat: add number of tokens and descriptive graph metrics to metric table [COG-1132] (#481)

* Count the number of tokens in documents

* Save token count to relational DB

* Add metrics to metric table

* Store list as json instead of array in relational db table

* Sum in sql instead of python

* Unify naming

* Return data_points in descriptive metric calculation task

---------

Co-authored-by: Igor Ilic <30923996+dexters1@users.noreply.github.com>
This commit is contained in:
alekszievr 2025-01-30 12:39:14 +01:00 committed by GitHub
parent edae2771a5
commit a79f7133fd
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 60 additions and 10 deletions

View file

@ -25,6 +25,7 @@ from cognee.tasks.documents import (
)
from cognee.tasks.graph import extract_graph_from_data
from cognee.tasks.storage import add_data_points
from cognee.tasks.storage.descriptive_metrics import store_descriptive_metrics
from cognee.tasks.storage.index_graph_edges import index_graph_edges
from cognee.tasks.summarization import summarize_text
@ -164,6 +165,7 @@ async def get_default_tasks(
task_config={"batch_size": 10},
),
Task(add_data_points, only_root=True, task_config={"batch_size": 10}),
Task(store_descriptive_metrics),
]
except Exception as error:
send_telemetry("cognee.cognify DEFAULT TASKS CREATION ERRORED", user.id)

View file

@ -1,23 +1,23 @@
from datetime import datetime, timezone
from sqlalchemy import Column, DateTime, Float, Integer, ARRAY, UUID
from sqlalchemy import Column, DateTime, Float, Integer, JSON, UUID
from cognee.infrastructure.databases.relational import Base
from uuid import uuid4
class GraphMetricData(Base):
__tablename__ = "graph_metric_table"
class GraphMetrics(Base):
__tablename__ = "graph_metrics_table"
# TODO: Change ID to reflect unique id of graph database
id = Column(UUID, primary_key=True, default=uuid4)
num_tokens = Column(Integer)
num_nodes = Column(Integer)
num_edges = Column(Integer)
mean_degree = Column(Float)
edge_density = Column(Float)
num_connected_components = Column(Integer)
sizes_of_connected_components = Column(ARRAY(Integer))
num_tokens = Column(Integer, nullable=True)
num_nodes = Column(Integer, nullable=True)
num_edges = Column(Integer, nullable=True)
mean_degree = Column(Float, nullable=True)
edge_density = Column(Float, nullable=True)
num_connected_components = Column(Integer, nullable=True)
sizes_of_connected_components = Column(JSON, nullable=True)
num_selfloops = Column(Integer, nullable=True)
diameter = Column(Integer, nullable=True)
avg_shortest_path_length = Column(Float, nullable=True)

View file

@ -0,0 +1,48 @@
from cognee.infrastructure.engine import DataPoint
from cognee.modules.data.processing.document_types import Document
from cognee.infrastructure.databases.relational import get_relational_engine
from sqlalchemy import select
from sqlalchemy.sql import func
from cognee.modules.data.models import Data
from cognee.modules.data.models import GraphMetrics
import uuid
from cognee.infrastructure.databases.graph import get_graph_engine
async def fetch_token_count(db_engine) -> int:
    """
    Fetches and sums token counts from the database.

    Args:
        db_engine: Relational engine whose ``get_async_session()`` is used as
            an async context manager yielding a session with ``execute()``.

    Returns:
        int: The total number of tokens across all documents, or 0 when the
        aggregate has nothing to sum.
    """
    async with db_engine.get_async_session() as session:
        result = await session.execute(select(func.sum(Data.token_count)))
        token_count_sum = result.scalar()

    # SUM() evaluates to NULL (None) over an empty table or when every
    # token_count is NULL; normalise so the declared ``int`` return type holds.
    return token_count_sum if token_count_sum is not None else 0
async def calculate_graph_metrics(graph_data):
    """
    Derive basic size metrics from graph data.

    Args:
        graph_data: A two-item iterable unpacked as ``(nodes, edges)``; both
            items must support ``len()``.

    Returns:
        dict: ``{"num_nodes": <node count>, "num_edges": <edge count>}``.
    """
    node_items, edge_items = graph_data
    node_count = len(node_items)
    edge_count = len(edge_items)
    return dict(num_nodes=node_count, num_edges=edge_count)
async def store_descriptive_metrics(data_points: list[DataPoint]):
    """
    Compute descriptive metrics and insert them as one row in the metrics table.

    The row combines a freshly generated UUID, the summed token count from the
    relational database, and the node/edge counts of the current graph.

    Args:
        data_points: Passed through untouched; this function does not read them.

    Returns:
        The same ``data_points`` object that was passed in.
    """
    relational_engine = get_relational_engine()
    graph_engine = await get_graph_engine()

    nodes_and_edges = await graph_engine.get_graph_data()
    total_tokens = await fetch_token_count(relational_engine)
    size_metrics = await calculate_graph_metrics(nodes_and_edges)

    # Every call generates a new id, so each run inserts a new row.
    metrics_row = {"id": uuid.uuid4(), "num_tokens": total_tokens, **size_metrics}
    await relational_engine.insert_data("graph_metrics_table", metrics_row)

    return data_points