Feat: add number of tokens and descriptive graph metrics to metric table [COG-1132] (#481)
* Count the number of tokens in documents * save token count to relational db * Add metrics to metric table * Store list as json instead of array in relational db table * Sum in sql instead of python * Unify naming * Return data_points in descriptive metric calculation task --------- Co-authored-by: Igor Ilic <30923996+dexters1@users.noreply.github.com>
This commit is contained in:
parent
edae2771a5
commit
a79f7133fd
3 changed files with 60 additions and 10 deletions
|
|
@ -25,6 +25,7 @@ from cognee.tasks.documents import (
|
|||
)
|
||||
from cognee.tasks.graph import extract_graph_from_data
|
||||
from cognee.tasks.storage import add_data_points
|
||||
from cognee.tasks.storage.descriptive_metrics import store_descriptive_metrics
|
||||
from cognee.tasks.storage.index_graph_edges import index_graph_edges
|
||||
from cognee.tasks.summarization import summarize_text
|
||||
|
||||
|
|
@ -164,6 +165,7 @@ async def get_default_tasks(
|
|||
task_config={"batch_size": 10},
|
||||
),
|
||||
Task(add_data_points, only_root=True, task_config={"batch_size": 10}),
|
||||
Task(store_descriptive_metrics),
|
||||
]
|
||||
except Exception as error:
|
||||
send_telemetry("cognee.cognify DEFAULT TASKS CREATION ERRORED", user.id)
|
||||
|
|
|
|||
|
|
@ -1,23 +1,23 @@
|
|||
from datetime import datetime, timezone
|
||||
|
||||
from sqlalchemy import Column, DateTime, Float, Integer, ARRAY, UUID
|
||||
from sqlalchemy import Column, DateTime, Float, Integer, JSON, UUID
|
||||
|
||||
from cognee.infrastructure.databases.relational import Base
|
||||
from uuid import uuid4
|
||||
|
||||
|
||||
class GraphMetrics(Base):
    # One row of descriptive metrics for a snapshot of the knowledge graph,
    # stored in the relational database.
    __tablename__ = "graph_metrics_table"

    # TODO: Change ID to reflect unique id of graph database
    id = Column(UUID, primary_key=True, default=uuid4)
    # Total token count summed over all documents (see fetch_token_count).
    num_tokens = Column(Integer, nullable=True)
    # Basic graph size metrics.
    num_nodes = Column(Integer, nullable=True)
    num_edges = Column(Integer, nullable=True)
    # Degree/connectivity statistics; all nullable because not every metric
    # is computed on every run.
    mean_degree = Column(Float, nullable=True)
    edge_density = Column(Float, nullable=True)
    num_connected_components = Column(Integer, nullable=True)
    # List of component sizes, serialized as JSON (portable across backends,
    # unlike ARRAY which is Postgres-specific).
    sizes_of_connected_components = Column(JSON, nullable=True)
    num_selfloops = Column(Integer, nullable=True)
    diameter = Column(Integer, nullable=True)
    avg_shortest_path_length = Column(Float, nullable=True)
|
||||
48
cognee/tasks/storage/descriptive_metrics.py
Normal file
48
cognee/tasks/storage/descriptive_metrics.py
Normal file
|
|
@ -0,0 +1,48 @@
|
|||
from cognee.infrastructure.engine import DataPoint
|
||||
from cognee.modules.data.processing.document_types import Document
|
||||
from cognee.infrastructure.databases.relational import get_relational_engine
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.sql import func
|
||||
from cognee.modules.data.models import Data
|
||||
from cognee.modules.data.models import GraphMetrics
|
||||
import uuid
|
||||
from cognee.infrastructure.databases.graph import get_graph_engine
|
||||
|
||||
|
||||
async def fetch_token_count(db_engine) -> int:
    """
    Fetches and sums token counts from the database.

    Args:
        db_engine: Relational engine exposing ``get_async_session()``.

    Returns:
        int: The total number of tokens across all documents.
    """

    async with db_engine.get_async_session() as session:
        token_count_sum = await session.execute(select(func.sum(Data.token_count)))
        token_count_sum = token_count_sum.scalar()

    # SQL SUM over zero rows yields NULL, which scalar() maps to None.
    # Coerce to 0 so the declared `int` contract holds for callers
    # (e.g. store_descriptive_metrics writing num_tokens).
    return token_count_sum or 0
||||
|
||||
|
||||
async def calculate_graph_metrics(graph_data):
    """
    Computes descriptive metrics from raw graph data.

    Args:
        graph_data: A ``(nodes, edges)`` pair as returned by the graph engine.

    Returns:
        dict: Node and edge counts keyed by metric name.
    """
    node_list, edge_list = graph_data
    metrics = dict(
        num_nodes=len(node_list),
        num_edges=len(edge_list),
    )
    return metrics
||||
|
||||
|
||||
async def store_descriptive_metrics(data_points: list[DataPoint]):
    """
    Persists descriptive graph metrics as one row in the relational metrics table.

    Sums document token counts, computes basic graph statistics, writes both
    into ``graph_metrics_table``, and returns ``data_points`` unchanged so the
    task can be chained in a pipeline.
    """
    relational_engine = get_relational_engine()
    graph_engine = await get_graph_engine()

    token_total = await fetch_token_count(relational_engine)
    graph_metrics = await calculate_graph_metrics(await graph_engine.get_graph_data())

    # Assemble the full row: fresh primary key + token total + graph metrics.
    metrics_row = {"id": uuid.uuid4(), "num_tokens": token_total}
    metrics_row.update(graph_metrics)

    await relational_engine.insert_data("graph_metrics_table", metrics_row)
    return data_points
||||
Loading…
Add table
Reference in a new issue