feat: Calculate graph metrics for networkx graph [COG-1082] (#484)
<!-- .github/pull_request_template.md --> ## Description <!-- Provide a clear description of the changes in this PR --> ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit - **New Features** - Enabled an option to retrieve more detailed metrics, providing comprehensive analytics for graph and descriptive data. - **Refactor** - Standardized the way metrics are obtained across components for consistent behavior and improved data accuracy. - **Chore** - Made internal enhancements to support optional detailed metric calculations, streamlining system performance and ensuring future scalability. <!-- end of auto-generated comment: release notes by coderabbit.ai --> --------- Co-authored-by: Igor Ilic <30923996+dexters1@users.noreply.github.com>
This commit is contained in:
parent
5119992fd8
commit
2858a674f5
6 changed files with 66 additions and 17 deletions
|
|
@ -165,7 +165,7 @@ async def get_default_tasks(
|
|||
task_config={"batch_size": 10},
|
||||
),
|
||||
Task(add_data_points, task_config={"batch_size": 10}),
|
||||
Task(store_descriptive_metrics),
|
||||
Task(store_descriptive_metrics, include_optional=True),
|
||||
]
|
||||
except Exception as error:
|
||||
send_telemetry("cognee.cognify DEFAULT TASKS CREATION ERRORED", user.id)
|
||||
|
|
|
|||
|
|
@ -56,5 +56,5 @@ class GraphDBInterface(Protocol):
|
|||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
async def get_graph_metrics(self, include_optional: bool = False):
    """Return a dictionary of structural metrics for the stored graph.

    Args:
        include_optional: When True, implementations also compute the
            expensive optional metrics (self-loops, diameter, average
            shortest path length, average clustering). Defaults to
            False so the protocol signature matches the concrete
            adapters (Neo4j, NetworkX), which declare
            ``include_optional=False``.

    Raises:
        NotImplementedError: Always; concrete adapters must override.
    """
    raise NotImplementedError
|
||||
|
|
|
|||
|
|
@ -531,7 +531,7 @@ class Neo4jAdapter(GraphDBInterface):
|
|||
|
||||
return (nodes, edges)
|
||||
|
||||
async def get_graph_metrics(self):
|
||||
async def get_graph_metrics(self, include_optional=False):
|
||||
return {
|
||||
"num_nodes": -1,
|
||||
"num_edges": -1,
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ import networkx as nx
|
|||
from cognee.infrastructure.databases.graph.graph_db_interface import GraphDBInterface
|
||||
from cognee.infrastructure.engine import DataPoint
|
||||
from cognee.modules.storage.utils import JSONEncoder
|
||||
import numpy as np
|
||||
|
||||
logger = logging.getLogger("NetworkXAdapter")
|
||||
|
||||
|
|
@ -386,16 +387,63 @@ class NetworkXAdapter(GraphDBInterface):
|
|||
|
||||
return filtered_nodes, filtered_edges
|
||||
|
||||
async def get_graph_metrics(self):
|
||||
return {
|
||||
"num_nodes": -1,
|
||||
"num_edges": -1,
|
||||
"mean_degree": -1,
|
||||
"edge_density": -1,
|
||||
"num_connected_components": -1,
|
||||
"sizes_of_connected_components": -1,
|
||||
"num_selfloops": -1,
|
||||
"diameter": -1,
|
||||
"avg_shortest_path_length": -1,
|
||||
"avg_clustering": -1,
|
||||
async def get_graph_metrics(self, include_optional=False):
    """Calculate structural metrics for the in-memory NetworkX graph.

    Args:
        include_optional: When True, also compute the expensive metrics
            (self-loop count, diameter, average shortest path length,
            average clustering). When False those keys carry the
            sentinel value -1 so downstream storage keeps a uniform
            schema.

    Returns:
        dict: Metric name -> value. Optional metrics may be ``None``
        when undefined for the current graph (e.g. diameter of a graph
        that is not strongly connected).
    """
    graph = self.graph

    def _get_mean_degree(graph):
        # Mean over all node degrees; an empty graph has mean degree 0.
        degrees = [degree for _, degree in graph.degree()]
        return np.mean(degrees) if degrees else 0

    def _get_edge_density(graph):
        # Directed density: |E| / (n * (n - 1)); 0 when fewer than 2 nodes
        # (no edge is possible, so division is skipped).
        num_nodes = graph.number_of_nodes()
        num_possible_edges = num_nodes * (num_nodes - 1)
        return graph.number_of_edges() / num_possible_edges if num_possible_edges > 0 else 0

    def _get_diameter(graph):
        # Guard the empty graph: nx.is_strongly_connected raises
        # NetworkXPointlessConcept on a graph with no nodes.
        # NOTE(review): the gate is *strong* connectivity but the diameter
        # is taken on the undirected view — confirm this asymmetry is
        # intended rather than using weak connectivity.
        if graph.number_of_nodes() == 0 or not nx.is_strongly_connected(graph):
            return None
        return nx.diameter(graph.to_undirected())

    def _get_avg_shortest_path_length(graph):
        # Defined only when every ordered node pair is reachable; same
        # empty-graph guard as _get_diameter.
        if graph.number_of_nodes() == 0 or not nx.is_strongly_connected(graph):
            return None
        return nx.average_shortest_path_length(graph)

    def _get_avg_clustering(graph):
        # Collapse to a plain DiGraph first: nx.average_clustering does not
        # support multigraphs, and the adapter's graph may hold parallel
        # edges. Any remaining failure is logged and reported as None.
        try:
            return nx.average_clustering(nx.DiGraph(graph))
        except Exception as e:
            logger.warning("Failed to calculate clustering coefficient: %s", e)
            return None

    # Cheap metrics are always computed, regardless of include_optional.
    mandatory_metrics = {
        "num_nodes": graph.number_of_nodes(),
        "num_edges": graph.number_of_edges(),
        "mean_degree": _get_mean_degree(graph),
        "edge_density": _get_edge_density(graph),
        "num_connected_components": nx.number_weakly_connected_components(graph),
        "sizes_of_connected_components": [
            len(component) for component in nx.weakly_connected_components(graph)
        ],
    }

    if include_optional:
        optional_metrics = {
            # Built-in counter replaces the manual u == v generator scan.
            "num_selfloops": nx.number_of_selfloops(graph),
            "diameter": _get_diameter(graph),
            "avg_shortest_path_length": _get_avg_shortest_path_length(graph),
            "avg_clustering": _get_avg_clustering(graph),
        }
    else:
        # -1 marks "not computed" so the result schema is stable.
        optional_metrics = {
            "num_selfloops": -1,
            "diameter": -1,
            "avg_shortest_path_length": -1,
            "avg_clustering": -1,
        }

    return mandatory_metrics | optional_metrics
|
||||
|
|
|
|||
|
|
@ -23,10 +23,10 @@ async def fetch_token_count(db_engine) -> int:
|
|||
return token_count_sum
|
||||
|
||||
|
||||
async def store_descriptive_metrics(data_points: list[DataPoint]):
|
||||
async def store_descriptive_metrics(data_points: list[DataPoint], include_optional: bool):
|
||||
db_engine = get_relational_engine()
|
||||
graph_engine = await get_graph_engine()
|
||||
graph_metrics = await graph_engine.get_graph_metrics()
|
||||
graph_metrics = await graph_engine.get_graph_metrics(include_optional)
|
||||
|
||||
async with db_engine.get_async_session() as session:
|
||||
metrics = GraphMetrics(
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
from datetime import datetime, timezone
|
||||
from sqlalchemy.sql import func
|
||||
|
||||
from sqlalchemy import Column, DateTime, Float, Integer, JSON, UUID
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue