feat: Natural Language Retriever (text2cypher) (#663)
<!-- .github/pull_request_template.md --> ## Description <!-- Provide a clear description of the changes in this PR --> ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin I added one example "get all connected nodes to entity" --------- Co-authored-by: Boris <boris@topoteretes.com>
This commit is contained in:
parent
ebf1f81b35
commit
de5b7f2044
5 changed files with 197 additions and 2 deletions
|
|
@ -0,0 +1,66 @@
|
|||
You are an expert Neo4j Cypher query generator tasked with translating natural language questions into precise, optimized Cypher queries.
|
||||
|
||||
TASK:
|
||||
Generate a valid, executable Cypher query that accurately answers the user's question based on the provided graph schema.
|
||||
|
||||
GRAPH SCHEMA INFORMATION:
|
||||
- You will be given node labels and their properties in format: NodeLabels [list of properties]
|
||||
- You will be given relationship types between nodes
|
||||
- ONLY use node labels, properties, and relationship types that exist in the provided schema
|
||||
- Respect relationship directions (source→target) exactly as specified in the schema
|
||||
- Properties may have specific formats (e.g., dates, codes) - infer these from examples when possible
|
||||
|
||||
QUERY REQUIREMENTS:
|
||||
1. Return ONLY the exact Cypher query with NO explanations, comments, or markdown
|
||||
2. Generate syntactically correct Neo4j Cypher code (Neo4j 4.4+ compatible)
|
||||
3. Be precise - match the exact property names and relationship types from the schema
|
||||
4. Handle complex queries by breaking them into logical pattern matching parts
|
||||
5. Use parameters (e.g., $name) for literal values when appropriate
|
||||
6. Use appropriate data types for parameters (strings, numbers, booleans)
|
||||
|
||||
PERFORMANCE OPTIMIZATION:
|
||||
1. Use indexes and constraints when available (assume they exist on ID properties)
|
||||
2. Include LIMIT clauses for queries that could return large result sets
|
||||
3. Use efficient patterns - avoid unnecessary pattern complexity
|
||||
4. Consider using OPTIONAL MATCH for parts that might not exist
|
||||
5. For aggregation, use efficient aggregation functions (count, sum, avg)
|
||||
6. For pathfinding, consider using shortestPath() or apoc.algo.* procedures
|
||||
|
||||
ERROR PREVENTION:
|
||||
1. Validate your query steps mentally before finalizing
|
||||
2. Ensure relationship directions match schema
|
||||
3. Check property names match exactly what's in the schema
|
||||
4. Use pattern variables consistently throughout the query
|
||||
5. If previous attempts failed, analyze the failures and adjust your approach
|
||||
|
||||
Node schemas:
|
||||
- EntityType
|
||||
Properties: description, ontology_valid, name, created_at, type, version, topological_rank, updated_at, metadata, id
|
||||
Purpose: Represents the categories or classifications for entities in the database.
|
||||
|
||||
- Entity
|
||||
Properties: description, ontology_valid, name, created_at, type, version, topological_rank, updated_at, metadata, id
|
||||
Purpose: Represents individual entities that belong to a specific type or classification.
|
||||
|
||||
- TextDocument
|
||||
Properties: raw_data_location, name, mime_type, external_metadata, created_at, type, version, topological_rank, updated_at, metadata, id
|
||||
Purpose: Represents documents containing text data, along with metadata about their storage and format.
|
||||
|
||||
- DocumentChunk
|
||||
Properties: version, created_at, type, topological_rank, cut_type, text, metadata, chunk_index, chunk_size, updated_at, id
|
||||
Purpose: Represents segmented portions of larger documents, useful for processing or analysis at a more granular level.
|
||||
|
||||
- TextSummary
|
||||
Properties: topological_rank, metadata, id, type, updated_at, created_at, text, version
|
||||
Purpose: Represents summarized content generated from larger text documents, retaining essential information and metadata.
|
||||
|
||||
Edge schema (relationship properties):
|
||||
`{{edge_schemas}}`
|
||||
|
||||
These queries do not work. Do NOT use them:
|
||||
`{{previous_attempts}}`
|
||||
|
||||
Example 1:
|
||||
Get all nodes connected to John
|
||||
MATCH (n:Entity {name: 'John'})--(neighbor)
|
||||
RETURN n, neighbor
|
||||
117
cognee/modules/retrieval/natural_language_retriever.py
Normal file
117
cognee/modules/retrieval/natural_language_retriever.py
Normal file
|
|
@ -0,0 +1,117 @@
|
|||
from typing import Any, Optional
|
||||
import logging
|
||||
from cognee.infrastructure.databases.graph import get_graph_engine
|
||||
from cognee.infrastructure.databases.graph.networkx.adapter import NetworkXAdapter
|
||||
from cognee.infrastructure.llm.get_llm_client import get_llm_client
|
||||
from cognee.infrastructure.llm.prompts import render_prompt
|
||||
from cognee.modules.retrieval.base_retriever import BaseRetriever
|
||||
from cognee.modules.retrieval.exceptions import SearchTypeNotSupported
|
||||
from cognee.infrastructure.databases.graph.graph_db_interface import GraphDBInterface
|
||||
|
||||
logger = logging.getLogger("NaturalLanguageRetriever")
|
||||
|
||||
|
||||
class NaturalLanguageRetriever(BaseRetriever):
    """Retriever that answers natural language questions via generated Cypher.

    Uses an LLM to translate the user's question into a Cypher query based on
    the live graph schema, executes it against the graph database, and retries
    with failure feedback up to ``max_attempts`` times.
    """

    def __init__(
        self,
        system_prompt_path: str = "natural_language_retriever_system.txt",
        max_attempts: int = 3,
    ):
        """Initialize retriever with optional custom prompt paths.

        Args:
            system_prompt_path: Path to the system prompt template rendered
                for Cypher generation.
            max_attempts: Maximum number of generate-and-execute attempts
                before giving up.
        """
        self.system_prompt_path = system_prompt_path
        self.max_attempts = max_attempts

    async def _get_graph_schema(self, graph_engine) -> tuple:
        """Retrieve the node and edge schemas from the graph database.

        Returns:
            A ``(node_schemas, edge_schemas)`` tuple: distinct node labels with
            their property keys, and distinct relationship property keys.
        """
        node_schemas = await graph_engine.query(
            """
            MATCH (n)
            UNWIND keys(n) AS prop
            RETURN DISTINCT labels(n) AS NodeLabels, collect(DISTINCT prop) AS Properties;
            """
        )
        edge_schemas = await graph_engine.query(
            """
            MATCH ()-[r]->()
            UNWIND keys(r) AS key
            RETURN DISTINCT key;
            """
        )
        return node_schemas, edge_schemas

    async def _generate_cypher_query(self, query: str, edge_schemas, previous_attempts=None) -> str:
        """Generate a Cypher query using LLM based on natural language query and schema information.

        Args:
            query: The user's natural language question.
            edge_schemas: Relationship property keys rendered into the prompt.
            previous_attempts: Text describing earlier failed queries so the
                LLM can avoid repeating them; ``None`` on the first attempt.

        Returns:
            The generated Cypher query string.
        """
        llm_client = get_llm_client()
        system_prompt = render_prompt(
            self.system_prompt_path,
            context={
                "edge_schemas": edge_schemas,
                "previous_attempts": previous_attempts or "No attempts yet",
            },
        )

        return await llm_client.acreate_structured_output(
            text_input=query,
            system_prompt=system_prompt,
            response_model=str,
        )

    async def _execute_cypher_query(self, query: str, graph_engine: GraphDBInterface) -> Any:
        """Generate and execute Cypher for the query, retrying on failure.

        Each failed attempt (empty result or execution error) is appended to
        the feedback passed back to the LLM on the next attempt.

        Returns:
            The graph query results, or an empty list if all attempts fail.
        """
        # NOTE(review): node_schemas is fetched but only edge_schemas is fed to
        # the prompt — the node schema text appears to be embedded in the
        # template itself; confirm before removing the first query.
        _node_schemas, edge_schemas = await self._get_graph_schema(graph_engine)
        previous_attempts = ""

        for attempt in range(self.max_attempts):
            logger.info(
                "Starting attempt %d/%d for query generation", attempt + 1, self.max_attempts
            )
            # Reset per attempt so a generation failure is reported as
            # "Not generated" instead of echoing the previous attempt's query.
            cypher_query = ""
            try:
                cypher_query = await self._generate_cypher_query(
                    query, edge_schemas, previous_attempts
                )

                # Truncate only the query text in the log; always keep the
                # attempt prefix so every execution is identifiable.
                displayed_query = (
                    f"{cypher_query[:100]}..." if len(cypher_query) > 100 else cypher_query
                )
                logger.info(
                    "Executing generated Cypher query (attempt %d): %s",
                    attempt + 1,
                    displayed_query,
                )
                context = await graph_engine.query(cypher_query)

                if context:
                    result_count = len(context) if isinstance(context, list) else 1
                    logger.info(
                        "Successfully executed query (attempt %d): returned %d result(s)",
                        attempt + 1,
                        result_count,
                    )
                    return context

                previous_attempts += f"Query: {cypher_query} -> Result: None\n"

            except Exception as e:
                previous_attempts += (
                    f"Query: {cypher_query or 'Not generated'} -> Executed with error: {e}\n"
                )
                logger.error("Error executing query: %s", str(e))

        logger.warning(
            "Failed to get results after %d attempts for query: '%s...'",
            self.max_attempts,
            query[:50],
        )
        return []

    async def get_context(self, query: str) -> Optional[Any]:
        """Retrieves relevant context using a natural language query converted to Cypher.

        Raises:
            SearchTypeNotSupported: If the configured graph engine (NetworkX)
                does not support Cypher-based natural language search.
        """
        try:
            graph_engine = await get_graph_engine()

            # NetworkX adapter has no Cypher endpoint to execute against.
            if isinstance(graph_engine, NetworkXAdapter):
                raise SearchTypeNotSupported("Natural language search type not supported.")

            return await self._execute_cypher_query(query, graph_engine)
        except Exception as e:
            logger.error("Failed to execute natural language search retrieval: %s", str(e))
            raise e

    async def get_completion(self, query: str, context: Optional[Any] = None) -> Any:
        """Returns a completion based on the query and context.

        If no context is supplied, it is retrieved via :meth:`get_context`.
        """
        if context is None:
            context = await self.get_context(query)

        return context
|
||||
|
|
@ -13,6 +13,7 @@ from cognee.modules.retrieval.graph_summary_completion_retriever import (
|
|||
)
|
||||
from cognee.modules.retrieval.code_retriever import CodeRetriever
|
||||
from cognee.modules.retrieval.cypher_search_retriever import CypherSearchRetriever
|
||||
from cognee.modules.retrieval.natural_language_retriever import NaturalLanguageRetriever
|
||||
from cognee.modules.search.types import SearchType
|
||||
from cognee.modules.storage.utils import JSONEncoder
|
||||
from cognee.modules.users.models import User
|
||||
|
|
@ -67,6 +68,7 @@ async def specific_search(
|
|||
).get_completion,
|
||||
SearchType.CODE: CodeRetriever().get_completion,
|
||||
SearchType.CYPHER: CypherSearchRetriever().get_completion,
|
||||
SearchType.NATURAL_LANGUAGE: NaturalLanguageRetriever().get_completion,
|
||||
}
|
||||
|
||||
search_task = search_tasks.get(query_type)
|
||||
|
|
|
|||
|
|
@ -10,3 +10,4 @@ class SearchType(Enum):
|
|||
GRAPH_SUMMARY_COMPLETION = "GRAPH_SUMMARY_COMPLETION"
|
||||
CODE = "CODE"
|
||||
CYPHER = "CYPHER"
|
||||
NATURAL_LANGUAGE = "NATURAL_LANGUAGE"
|
||||
|
|
|
|||
|
|
@ -70,14 +70,23 @@ async def main():
|
|||
query_type=SearchType.SUMMARIES, query_text=random_node_name
|
||||
)
|
||||
assert len(search_results) != 0, "Query related summaries don't exist."
|
||||
print("\nExtracted summaries are:\n")
|
||||
print("\nExtracted results are:\n")
|
||||
for result in search_results:
|
||||
print(f"{result}\n")
|
||||
|
||||
search_results = await cognee.search(
|
||||
query_type=SearchType.NATURAL_LANGUAGE,
|
||||
query_text=f"Find nodes connected to node with name {random_node_name}",
|
||||
)
|
||||
assert len(search_results) != 0, "Query related natural language don't exist."
|
||||
print("\nExtracted results are:\n")
|
||||
for result in search_results:
|
||||
print(f"{result}\n")
|
||||
|
||||
user = await get_default_user()
|
||||
history = await get_history(user.id)
|
||||
|
||||
assert len(history) == 6, "Search history is not correct."
|
||||
assert len(history) == 10, "Search history is not correct."
|
||||
|
||||
await cognee.prune.prune_data()
|
||||
assert not os.path.isdir(data_directory_path), "Local data files are not deleted"
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue