added lancedb pandas removal

vasilije 2025-08-27 19:14:16 +02:00
parent 64d6d6ede2
commit 38bbfd42cf
4 changed files with 21 additions and 318 deletions


@@ -1,6 +1,10 @@
 import os
 import json
-import pandas as pd
+try:
+    import pandas as pd
+except ImportError:
+    pd = None
+
 import subprocess
 import modal
 import streamlit as st
@@ -12,7 +16,7 @@ metrics_volume = modal.Volume.from_name("evaluation_dashboard_results", create_i

 image = (
     modal.Image.debian_slim(python_version="3.11")
-    .pip_install("streamlit", "pandas", "plotly")
+    .pip_install("streamlit", "plotly")
     .add_local_file(__file__, "/root/serve_dashboard.py")
 )
@@ -78,6 +82,12 @@ def main():
             }
         )

+    if pd is None:
+        st.error(
+            "Pandas is required for the evaluation dashboard. Install with 'pip install cognee[evals]' to use this feature."
+        )
+        return
+
     df = pd.DataFrame(records)
     if df.empty:
         st.warning("No JSON files found in the volume.")


@@ -205,9 +205,12 @@ class LanceDBAdapter(VectorDBInterface):
         collection = await self.get_collection(collection_name)

         if len(data_point_ids) == 1:
-            results = await collection.query().where(f"id = '{data_point_ids[0]}'").to_pandas()
+            results = await collection.query().where(f"id = '{data_point_ids[0]}'")
         else:
-            results = await collection.query().where(f"id IN {tuple(data_point_ids)}").to_pandas()
+            results = await collection.query().where(f"id IN {tuple(data_point_ids)}")

+        # Convert query results to list format
+        results_list = results.to_list() if hasattr(results, "to_list") else list(results)
+
         return [
             ScoredResult(
@@ -215,7 +218,7 @@ class LanceDBAdapter(VectorDBInterface):
                 payload=result["payload"],
                 score=0,
             )
-            for result in results.to_dict("index").values()
+            for result in results_list
         ]

     async def search(
@@ -242,9 +245,9 @@ class LanceDBAdapter(VectorDBInterface):
         if limit == 0:
             return []

-        results = await collection.vector_search(query_vector).limit(limit).to_pandas()
+        result_values = await collection.vector_search(query_vector).limit(limit).to_list()

-        result_values = list(results.to_dict("index").values())
+        # result_values = list(results.to_dict("index").values())

         if not result_values:
             return []
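
Both call sites now consume plain lists of row dicts rather than DataFrames. A minimal standalone sketch of the shape the adapter relies on (fetch_payloads is a hypothetical helper; collection stands in for the adapter's LanceDB table):

async def fetch_payloads(collection, ids: list) -> list:
    # Hypothetical condensed version of the retrieval path above.
    results = await collection.query().where(f"id IN {tuple(ids)}")
    # Same hasattr guard as the commit: prefer to_list(), fall back to list().
    rows = results.to_list() if hasattr(results, "to_list") else list(results)
    # Each row is a dict keyed by column name, matching result["payload"] above.
    return [row["payload"] for row in rows]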


@@ -1,309 +0,0 @@
-# PROPOSED TO BE DEPRECATED
-"""This module contains the OntologyEngine class which is responsible for adding graph ontology from a JSON or CSV file."""
-
-import csv
-import json
-from cognee.shared.logging_utils import get_logger
-from datetime import datetime, timezone
-from fastapi import status
-from typing import Any, Dict, List, Optional, Union, Type
-
-import aiofiles
-import pandas as pd
-from pydantic import BaseModel
-
-from cognee.modules.graph.exceptions import EntityNotFoundError
-from cognee.modules.ingestion.exceptions import IngestionError
-from cognee.infrastructure.data.chunking.config import get_chunk_config
-from cognee.infrastructure.data.chunking.get_chunking_engine import get_chunk_engine
-from cognee.infrastructure.databases.graph.get_graph_engine import get_graph_engine
-from cognee.infrastructure.files.utils.extract_text_from_file import extract_text_from_file
-from cognee.infrastructure.files.utils.guess_file_type import guess_file_type, FileTypeException
-from cognee.modules.data.methods.add_model_class_to_graph import (
-    add_model_class_to_graph,
-)
-from cognee.tasks.graph.models import NodeModel, GraphOntology
-from cognee.shared.data_models import KnowledgeGraph
-from cognee.modules.engine.utils import generate_node_id, generate_node_name
-from cognee.infrastructure.llm.LLMGateway import LLMGateway
-
-logger = get_logger("task:infer_data_ontology")
-
-
-async def extract_ontology(content: str, response_model: Type[BaseModel]):
-    """
-    Extracts structured ontology from the provided content using a pre-defined LLM client.
-
-    This asynchronous function retrieves a system prompt from a file and utilizes an LLM
-    client to create a structured output based on the input content and specified response
-    model.
-
-    Parameters:
-    -----------
-
-        - content (str): The content from which to extract the ontology.
-        - response_model (Type[BaseModel]): The model that defines the structure of the
-          output ontology.
-
-    Returns:
-    --------
-
-        The structured ontology extracted from the content.
-    """
-    system_prompt = LLMGateway.read_query_prompt("extract_ontology.txt")
-
-    ontology = await LLMGateway.acreate_structured_output(content, system_prompt, response_model)
-
-    return ontology
-
-
-class OntologyEngine:
-    """
-    Manage ontology data and operations for graph structures, providing methods for data
-    loading, flattening models, and adding ontological relationships to a graph database.
-
-    Public methods:
-
-    - flatten_model
-    - recursive_flatten
-    - load_data
-    - add_graph_ontology
-    """
-
-    async def flatten_model(
-        self, model: NodeModel, parent_id: Optional[str] = None
-    ) -> Dict[str, Any]:
-        """
-        Flatten the model to a dictionary including optional parent ID and relationship details
-        if available.
-
-        Parameters:
-        -----------
-
-            - model (NodeModel): The NodeModel instance to flatten.
-            - parent_id (Optional[str]): An optional ID of the parent node for hierarchical
-              purposes. (default None)
-
-        Returns:
-        --------
-
-            - Dict[str, Any]: A dictionary representation of the model with flattened
-              attributes.
-        """
-        result = model.dict()
-        result["parent_id"] = parent_id
-
-        if model.default_relationship:
-            result.update(
-                {
-                    "relationship_type": model.default_relationship.type,
-                    "relationship_source": model.default_relationship.source,
-                    "relationship_target": model.default_relationship.target,
-                }
-            )
-
-        return result
-
-    async def recursive_flatten(
-        self, items: Union[List[Dict[str, Any]], Dict[str, Any]], parent_id: Optional[str] = None
-    ) -> List[Dict[str, Any]]:
-        """
-        Recursively flatten a hierarchical structure of models into a flat list of dictionaries.
-
-        Parameters:
-        -----------
-
-            - items (Union[List[Dict[str, Any]], Dict[str, Any]]): A list or dictionary
-              containing models to flatten.
-            - parent_id (Optional[str]): An optional ID of the parent node to maintain hierarchy
-              during flattening. (default None)
-
-        Returns:
-        --------
-
-            - List[Dict[str, Any]]: A flat list of dictionaries representing the hierarchical
-              model structure.
-        """
-        flat_list = []
-
-        if isinstance(items, list):
-            for item in items:
-                flat_list.extend(await self.recursive_flatten(item, parent_id))
-        elif isinstance(items, dict):
-            model = NodeModel.model_validate(items)
-            flat_list.append(await self.flatten_model(model, parent_id))
-
-            for child in model.children:
-                flat_list.extend(await self.recursive_flatten(child, model.node_id))
-
-        return flat_list
-
-    async def load_data(self, file_path: str) -> Union[List[Dict[str, Any]], Dict[str, Any]]:
-        """
-        Load data from a specified JSON or CSV file and return it in a structured format.
-
-        Parameters:
-        -----------
-
-            - file_path (str): The path to the file to load data from.
-
-        Returns:
-        --------
-
-            - Union[List[Dict[str, Any]], Dict[str, Any]]: Parsed data from the file as either a
-              list of dictionaries or a single dictionary depending on content type.
-        """
-        try:
-            if file_path.endswith(".json"):
-                async with aiofiles.open(file_path, mode="r") as f:
-                    data = await f.read()
-                    return json.loads(data)
-            elif file_path.endswith(".csv"):
-                async with aiofiles.open(file_path, mode="r") as f:
-                    content = await f.read()
-                    reader = csv.DictReader(content.splitlines())
-                    return list(reader)
-            else:
-                raise IngestionError(message="Unsupported file format")
-        except Exception as e:
-            raise IngestionError(
-                message=f"Failed to load data from {file_path}: {e}",
-                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
-            )
-
-    async def add_graph_ontology(self, file_path: str = None, documents: list = None):
-        """
-        Add graph ontology from a JSON or CSV file, or infer relationships from provided
-        document content. Raise exceptions for invalid file types or missing entities.
-
-        Parameters:
-        -----------
-
-            - file_path (str): Optional path to a file containing data to be loaded. (default
-              None)
-            - documents (list): Optional list of document objects for content extraction if no
-              file path is provided. (default None)
-        """
-        if file_path is None:
-            initial_chunks_and_ids = []
-            chunk_config = get_chunk_config()
-            chunk_engine = get_chunk_engine()
-            chunk_strategy = chunk_config.chunk_strategy
-
-            for base_file in documents:
-                with open(base_file.raw_data_location, "rb") as file:
-                    try:
-                        file_type = guess_file_type(file)
-                        text = extract_text_from_file(file, file_type)
-                        subchunks, chunks_with_ids = chunk_engine.chunk_data(
-                            chunk_strategy,
-                            text,
-                            chunk_config.chunk_size,
-                            chunk_config.chunk_overlap,
-                        )
-
-                        if chunks_with_ids[0][0] == 1:
-                            initial_chunks_and_ids.append({base_file.id: chunks_with_ids})
-                    except FileTypeException:
-                        logger.warning(
-                            "File (%s) has an unknown file type. We are skipping it.", file["id"]
-                        )
-
-            ontology = await extract_ontology(str(initial_chunks_and_ids), GraphOntology)
-
-            graph_client = await get_graph_engine()
-
-            await graph_client.add_nodes(
-                [
-                    (
-                        node.id,
-                        dict(
-                            uuid=generate_node_id(node.id),
-                            name=generate_node_name(node.name),
-                            type=generate_node_id(node.id),
-                            description=node.description,
-                            updated_at=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
-                        ),
-                    )
-                    for node in ontology.nodes
-                ]
-            )
-
-            await graph_client.add_edges(
-                (
-                    generate_node_id(edge.source_id),
-                    generate_node_id(edge.target_id),
-                    edge.relationship_type,
-                    dict(
-                        source_node_id=generate_node_id(edge.source_id),
-                        target_node_id=generate_node_id(edge.target_id),
-                        relationship_name=edge.relationship_type,
-                        updated_at=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
-                    ),
-                )
-                for edge in ontology.edges
-            )
-        else:
-            dataset_level_information = documents[0][1]
-            # Extract the list of valid IDs from the explanations
-            valid_ids = {item["id"] for item in dataset_level_information}
-
-            try:
-                data = await self.load_data(file_path)
-                flt_ontology = await self.recursive_flatten(data)
-                df = pd.DataFrame(flt_ontology)
-                graph_client = await get_graph_engine()
-
-                for _, row in df.iterrows():
-                    node_data = row.to_dict()
-                    node_id = node_data.pop("node_id", None)
-
-                    if node_id in valid_ids:
-                        await graph_client.add_node(node_id, node_data)
-                    if node_id not in valid_ids:
-                        raise EntityNotFoundError(
-                            message=f"Node ID {node_id} not found in the dataset"
-                        )
-
-                    if pd.notna(row.get("relationship_source")) and pd.notna(
-                        row.get("relationship_target")
-                    ):
-                        await graph_client.add_edge(
-                            row["relationship_source"],
-                            row["relationship_target"],
-                            relationship_name=row["relationship_type"],
-                            edge_properties={
-                                "source_node_id": row["relationship_source"],
-                                "target_node_id": row["relationship_target"],
-                                "relationship_name": row["relationship_type"],
-                                "updated_at": datetime.now(timezone.utc).strftime(
-                                    "%Y-%m-%d %H:%M:%S"
-                                ),
-                            },
-                        )
-                return
-            except Exception as e:
-                raise RuntimeError(f"Failed to add graph ontology from {file_path}: {e}") from e
-
-
-async def infer_data_ontology(documents, ontology_model=KnowledgeGraph, root_node_id=None):
-    """
-    Infer data ontology from provided documents and optionally add it to a graph.
-
-    Parameters:
-    -----------
-
-        - documents: The documents from which to infer the ontology.
-        - ontology_model: The ontology model to use for the inference, defaults to
-          KnowledgeGraph. (default KnowledgeGraph)
-        - root_node_id: An optional root node identifier for the ontology. (default None)
-    """
-    if ontology_model == KnowledgeGraph:
-        ontology_engine = OntologyEngine()
-        root_node_id = await ontology_engine.add_graph_ontology(documents=documents)
-    else:
-        graph_engine = await get_graph_engine()
-        await add_model_class_to_graph(ontology_model, graph_engine)
-
-    yield (documents, root_node_id)
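
The deleted module was the last core consumer of pandas; its DataFrame pass over the flattened ontology is expressible with plain dicts, since recursive_flatten already returns a list of dictionaries. A minimal sketch of that equivalence (add_flattened_ontology is a hypothetical helper, not code from this commit):

from cognee.modules.graph.exceptions import EntityNotFoundError

async def add_flattened_ontology(flt_ontology: list, graph_client, valid_ids: set):
    # Sketch: the df.iterrows() loop from add_graph_ontology above, rewritten
    # over the plain list of dicts that recursive_flatten already returns.
    for node_data in flt_ontology:
        node_id = node_data.pop("node_id", None)
        if node_id not in valid_ids:
            raise EntityNotFoundError(message=f"Node ID {node_id} not found in the dataset")
        await graph_client.add_node(node_id, node_data)
        # pd.notna(...) becomes a plain truthiness check: flatten_model only set
        # the relationship keys when a default_relationship was present.
        if node_data.get("relationship_source") and node_data.get("relationship_target"):
            await graph_client.add_edge(
                node_data["relationship_source"],
                node_data["relationship_target"],
                relationship_name=node_data["relationship_type"],
            )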


@@ -27,8 +27,6 @@ dependencies = [
     "typing_extensions>=4.12.2,<5.0.0",
     "nltk>=3.9.1,<4.0.0",
     "numpy>=1.26.4, <=4.0.0",
-    "pandas>=2.2.2,<3.0.0",
     "sqlalchemy>=2.0.39,<3.0.0",
     "aiosqlite>=0.20.0,<1.0.0",
     "tiktoken>=0.8.0,<1.0.0",
@@ -110,6 +108,7 @@ codegraph = [
 evals = [
     "plotly>=6.0.0,<7",
     "gdown>=5.2.0,<6",
+    "pandas>=2.2.2,<3.0.0",
 ]

 graphiti = ["graphiti-core>=0.7.0,<0.8"]
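
Net effect: pandas is no longer a core dependency and is pulled in only through the evals extra (pip install cognee[evals]), which is exactly what the dashboard's new error message points users to.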