From 96eb0d448a2ed77295bdaac6f83119fb635810b2 Mon Sep 17 00:00:00 2001
From: Chaitany <67532224+patelchaitany@users.noreply.github.com>
Date: Fri, 19 Sep 2025 21:54:33 +0530
Subject: [PATCH] feat(#1357): Lexical chunk retriever (#1392)

## Description

Implemented a lexical chunk retriever. The LexicalRetriever class inherits from BaseRetriever, and DocumentChunks are lazy-loaded when the first query is made, which saves time during object initialization. The get_context and get_completion functions are implemented the same way as in ChunksRetriever; the only difference is that each DocumentChunk is converted to match the ChunksRetriever output type using the get_own_properties function from utils. A short usage example is included after the notes below.

## Type of Change

- [ ] Bug fix (non-breaking change that fixes an issue)
- [x] New feature (non-breaking change that adds functionality)
- [ ] Breaking change (fix or feature that would cause existing functionality to change)
- [ ] Documentation update
- [ ] Code refactoring
- [ ] Performance improvement
- [ ] Other (please specify):

## Changes Made

- Added LexicalRetriever base class with a customizable tokenizer & scorer
- Implemented caching of DocumentChunk tokens and payloads
- Added robust initialization with error handling and logging
- Implemented get_context with top_k ranking and optional scores
- Implemented get_completion consistent with the BaseRetriever interface
- Added a JaccardChunksRetriever demo using set/multiset Jaccard similarity
- Added support for stop words and multiset (frequency-aware) similarity
- Integrated logging for initialization, scoring, and retrieval

## Testing

- Manual tests: initialized the retriever and retrieved chunks from a toy corpus
- Edge cases: empty corpus, empty query, scorer/tokenizer errors
- Verified Jaccard similarity results for set and multiset cases
- Code formatted and linted

## Screenshots/Videos (if applicable)

## Pre-submission Checklist

- [x] **I have tested my changes thoroughly before submitting this PR**
- [x] **This PR contains minimal changes necessary to address the issue/feature**
- [x] My code follows the project's coding standards and style guidelines
- [ ] I have added tests that prove my fix is effective or that my feature works
- [ ] I have added necessary documentation (if applicable)
- [ ] All new and existing tests pass
- [ ] I have searched existing PRs to ensure this change hasn't been submitted already
- [x] I have linked any relevant issues in the description
- [ ] My commits have clear and descriptive messages

## Related Issues

Relates to #1357

## Additional Notes

In cognee/modules/chunking/models/DocumentChunk.py, don't remove the Optional from the is_part_of attribute.
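## Usage Example

A minimal sketch of how the new search type can be invoked end to end. It assumes the usual cognee add/cognify/search flow; the sample text and query are illustrative only, and the SearchType import path follows this patch's module layout:

```python
import asyncio

import cognee
from cognee.modules.search.types import SearchType


async def main():
    # Toy corpus; any previously added and cognified dataset works the same way.
    await cognee.add("Jaccard similarity compares the token sets of two texts.")
    await cognee.cognify()

    # CHUNKS_LEXICAL routes to JaccardChunksRetriever via get_search_type_tools.
    results = await cognee.search(
        query_type=SearchType.CHUNKS_LEXICAL,
        query_text="jaccard token similarity",
    )
    for chunk in results:
        print(chunk)


if __name__ == "__main__":
    asyncio.run(main())
```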
## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

---------

Co-authored-by: Andrej Milicevic
Co-authored-by: Igor Ilic <30923996+dexters1@users.noreply.github.com>
Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
Co-authored-by: Igor Ilic
Co-authored-by: Vasilije <8619304+Vasilije1990@users.noreply.github.com>
Co-authored-by: Boris
Co-authored-by: lxobr <122801072+lxobr@users.noreply.github.com>
---
 .../disable_independent_workflows.sh          |   2 +-
 .github/workflows/test_gemini.yml             |  29 -----
 .github/workflows/test_llms.yml               |  86 +++++++++++++
 .github/workflows/test_openrouter.yml         |  30 -----
 .github/workflows/test_suites.yml             |  23 ++--
 cognee/api/v1/search/search.py                |   3 +
 cognee/modules/retrieval/jaccard_retrival.py  |  56 +++++++++
 cognee/modules/retrieval/lexical_retriever.py | 117 ++++++++++++++++++
 .../search/methods/get_search_type_tools.py   |   5 +
 cognee/modules/search/types/SearchType.py     |   1 +
 10 files changed, 276 insertions(+), 76 deletions(-)
 delete mode 100644 .github/workflows/test_gemini.yml
 create mode 100644 .github/workflows/test_llms.yml
 delete mode 100644 .github/workflows/test_openrouter.yml
 create mode 100644 cognee/modules/retrieval/jaccard_retrival.py
 create mode 100644 cognee/modules/retrieval/lexical_retriever.py

diff --git a/.github/workflows/disable_independent_workflows.sh b/.github/workflows/disable_independent_workflows.sh
index 693c3092d..ff57da80d 100755
--- a/.github/workflows/disable_independent_workflows.sh
+++ b/.github/workflows/disable_independent_workflows.sh
@@ -10,7 +10,7 @@ WORKFLOWS=(
   "test_kuzu.yml"
   "test_multimetric_qa_eval_run.yaml"
   "test_graphrag_vs_rag_notebook.yml"
-  "test_gemini.yml"
+  "test_llms.yml"
   "test_multimedia_example.yaml"
   "test_deduplication.yml"
   "test_eval_framework.yml"
diff --git a/.github/workflows/test_gemini.yml b/.github/workflows/test_gemini.yml
deleted file mode 100644
index 544e15a5e..000000000
--- a/.github/workflows/test_gemini.yml
+++ /dev/null
@@ -1,29 +0,0 @@
-name: test | gemini
-
-on:
-  workflow_call:
-
-jobs:
-  test-gemini:
-    name: Run Gemini Test
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Check out repository
-        uses: actions/checkout@v4
-
-      - name: Cognee Setup
-        uses: ./.github/actions/cognee_setup
-        with:
-          python-version: '3.11.x'
-
-      - name: Run Gemini Simple Example
-        env:
-          LLM_PROVIDER: "gemini"
-          LLM_API_KEY: ${{ secrets.GEMINI_API_KEY }}
-          LLM_MODEL: "gemini/gemini-1.5-flash"
-          EMBEDDING_PROVIDER: "gemini"
-          EMBEDDING_API_KEY: ${{ secrets.GEMINI_API_KEY }}
-          EMBEDDING_MODEL: "gemini/text-embedding-004"
-          EMBEDDING_DIMENSIONS: "768"
-          EMBEDDING_MAX_TOKENS: "8076"
-        run: uv run python ./examples/python/simple_example.py
diff --git a/.github/workflows/test_llms.yml b/.github/workflows/test_llms.yml
new file mode 100644
index 000000000..5a0f947c9
--- /dev/null
+++ b/.github/workflows/test_llms.yml
@@ -0,0 +1,86 @@
+name: LLM Test Suites
+
+permissions:
+  contents: read
+
+on:
+  workflow_call:
+
+env:
+  RUNTIME__LOG_LEVEL: ERROR
+  ENV: 'dev'
+
+jobs:
+  test-gemini:
+    name: Run Gemini Test
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      - name: Cognee Setup
+        uses: ./.github/actions/cognee_setup
+        with:
+          python-version: '3.11.x'
+
+      - name: Run Gemini Simple Example
+        env:
+          LLM_PROVIDER: "gemini"
+          LLM_API_KEY: ${{ secrets.GEMINI_API_KEY }}
+          LLM_MODEL: "gemini/gemini-1.5-flash"
+          EMBEDDING_PROVIDER: "gemini"
+          EMBEDDING_API_KEY: ${{ secrets.GEMINI_API_KEY }}
+          EMBEDDING_MODEL: "gemini/text-embedding-004"
+          EMBEDDING_DIMENSIONS: "768"
+          EMBEDDING_MAX_TOKENS: "8076"
+        run: uv run python ./examples/python/simple_example.py
+
+  test-fastembed:
+    name: Run Fastembed Test
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      - name: Cognee Setup
+        uses: ./.github/actions/cognee_setup
+        with:
+          python-version: '3.11.x'
+
+      - name: Run Fastembed Simple Example
+        env:
+          LLM_PROVIDER: "openai"
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_PROVIDER: "fastembed"
+          EMBEDDING_MODEL: "sentence-transformers/all-MiniLM-L6-v2"
+          EMBEDDING_DIMENSIONS: "384"
+          EMBEDDING_MAX_TOKENS: "256"
+        run: uv run python ./examples/python/simple_example.py
+
+  test-openrouter:
+    name: Run OpenRouter Test
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      - name: Cognee Setup
+        uses: ./.github/actions/cognee_setup
+        with:
+          python-version: '3.11.x'
+
+      - name: Run OpenRouter Simple Example
+        env:
+          LLM_PROVIDER: "custom"
+          LLM_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
+          LLM_MODEL: "openrouter/x-ai/grok-code-fast-1"
+          LLM_ENDPOINT: "https://openrouter.ai/api/v1"
+          EMBEDDING_PROVIDER: "openai"
+          EMBEDDING_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          EMBEDDING_MODEL: "openai/text-embedding-3-large"
+          EMBEDDING_DIMENSIONS: "3072"
+          EMBEDDING_MAX_TOKENS: "8191"
+        run: uv run python ./examples/python/simple_example.py
\ No newline at end of file
diff --git a/.github/workflows/test_openrouter.yml b/.github/workflows/test_openrouter.yml
deleted file mode 100644
index 9c2dcdebe..000000000
--- a/.github/workflows/test_openrouter.yml
+++ /dev/null
@@ -1,30 +0,0 @@
-name: test | openrouter
-
-on:
-  workflow_call:
-
-jobs:
-  test-openrouter:
-    name: Run OpenRouter Test
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Check out repository
-        uses: actions/checkout@v4
-
-      - name: Cognee Setup
-        uses: ./.github/actions/cognee_setup
-        with:
-          python-version: '3.11.x'
-
-      - name: Run OpenRouter Simple Example
-        env:
-          LLM_PROVIDER: "custom"
-          LLM_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
-          LLM_MODEL: "openrouter/x-ai/grok-code-fast-1"
-          LLM_ENDPOINT: "https://openrouter.ai/api/v1"
-          EMBEDDING_PROVIDER: "openai"
-          EMBEDDING_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-          EMBEDDING_MODEL: "openai/text-embedding-3-large"
-          EMBEDDING_DIMENSIONS: "3072"
-          EMBEDDING_MAX_TOKENS: "8191"
-        run: uv run python ./examples/python/simple_example.py
diff --git a/.github/workflows/test_suites.yml b/.github/workflows/test_suites.yml
index 86f89249d..ff18f2962 100644
--- a/.github/workflows/test_suites.yml
+++ b/.github/workflows/test_suites.yml
@@ -115,16 +115,10 @@ jobs:
     secrets: inherit

   # Additional LLM tests
-  gemini-tests:
-    name: Gemini Tests
-    needs: [basic-tests, e2e-tests]
-    uses: ./.github/workflows/test_gemini.yml
-    secrets: inherit
-
-  openrouter-tests:
-    name: OpenRouter Tests
-    needs: [basic-tests, e2e-tests]
-    uses: ./.github/workflows/test_openrouter.yml
+  llm-tests:
+    name: LLM Test Suite
+    needs: [ basic-tests, e2e-tests ]
+    uses: ./.github/workflows/test_llms.yml
     secrets: inherit

   # Ollama tests moved to the end
@@ -138,8 +132,7 @@
       different-operating-systems-tests,
       vector-db-tests,
       example-tests,
-      gemini-tests,
-      openrouter-tests,
+      llm-tests,
       mcp-test,
       relational-db-migration-tests,
       docker-compose-test,
@@ -161,8 +154,7 @@
       example-tests,
       db-examples-tests,
       mcp-test,
-      gemini-tests,
-      openrouter-tests,
+      llm-tests,
       ollama-tests,
       relational-db-migration-tests,
       docker-compose-test,
@@ -183,8 +175,7 @@
             "${{ needs.example-tests.result }}" == "success" &&
             "${{ needs.db-examples-tests.result }}" == "success" &&
             "${{ needs.relational-db-migration-tests.result }}" == "success" &&
-            "${{ needs.gemini-tests.result }}" == "success" &&
-            "${{ needs.openrouter-tests.result }}" == "success" &&
+            "${{ needs.llm-tests.result }}" == "success" &&
             "${{ needs.docker-compose-test.result }}" == "success" &&
             "${{ needs.docker-ci-test.result }}" == "success" &&
             "${{ needs.ollama-tests.result }}" == "success" ]]; then
diff --git a/cognee/api/v1/search/search.py b/cognee/api/v1/search/search.py
index dcebce012..7209c6036 100644
--- a/cognee/api/v1/search/search.py
+++ b/cognee/api/v1/search/search.py
@@ -82,6 +82,9 @@ async def search(
         Best for: General-purpose queries or when you're unsure which search type is best.
         Returns: The results from the automatically selected search type.

+    **CHUNKS_LEXICAL**:
+        Token-based lexical chunk search (e.g., Jaccard). Best for: exact-term matching, stopword-aware lookups.
+        Returns: Ranked text chunks (optionally with scores).

     Args:
         query_text: Your question or search query in natural language.
diff --git a/cognee/modules/retrieval/jaccard_retrival.py b/cognee/modules/retrieval/jaccard_retrival.py
new file mode 100644
index 000000000..91d2b67f7
--- /dev/null
+++ b/cognee/modules/retrieval/jaccard_retrival.py
@@ -0,0 +1,56 @@
+from cognee.modules.retrieval.lexical_retriever import LexicalRetriever
+import re
+from collections import Counter
+from typing import Optional
+
+class JaccardChunksRetriever(LexicalRetriever):
+    """
+    Retriever that specializes LexicalRetriever to use Jaccard similarity.
+    """
+    def __init__(self, top_k: int = 10, with_scores: bool = False,
+                 stop_words: Optional[list[str]] = None, multiset_jaccard: bool = False):
+        """
+        Parameters
+        ----------
+        top_k : int
+            Number of top results to return.
+        with_scores : bool
+            If True, return (payload, score) pairs. Otherwise, only payloads.
+        stop_words : list[str], optional
+            List of tokens to filter out.
+        multiset_jaccard : bool
+            If True, use multiset Jaccard (frequency aware).
+        """
+        self.stop_words = {t.lower() for t in stop_words} if stop_words else set()
+        self.multiset_jaccard = multiset_jaccard
+
+        super().__init__(
+            tokenizer=self._tokenizer,
+            scorer=self._scorer,
+            top_k=top_k,
+            with_scores=with_scores
+        )
+
+    def _tokenizer(self, text: str) -> list[str]:
+        """
+        Tokenizer: lowercases, splits on word characters (\w+), filters stopwords.
+        """
+        tokens = re.findall(r"\w+", text.lower())
+        return [t for t in tokens if t not in self.stop_words]
+
+    def _scorer(self, query_tokens: list[str], chunk_tokens: list[str]) -> float:
+        """
+        Jaccard similarity scorer.
+        - If multiset_jaccard=True, uses frequency-aware Jaccard.
+        - Otherwise, normal set Jaccard.
+        """
+        if self.multiset_jaccard:
+            q_counts, c_counts = Counter(query_tokens), Counter(chunk_tokens)
+            numerator = sum(min(q_counts[t], c_counts[t]) for t in set(q_counts) | set(c_counts))
+            denominator = sum(max(q_counts[t], c_counts[t]) for t in set(q_counts) | set(c_counts))
+            return numerator / denominator if denominator else 0.0
+        else:
+            q_set, c_set = set(query_tokens), set(chunk_tokens)
+            if not q_set or not c_set:
+                return 0.0
+            return len(q_set & c_set) / len(q_set | c_set)
diff --git a/cognee/modules/retrieval/lexical_retriever.py b/cognee/modules/retrieval/lexical_retriever.py
new file mode 100644
index 000000000..2292b64c8
--- /dev/null
+++ b/cognee/modules/retrieval/lexical_retriever.py
@@ -0,0 +1,117 @@
+import asyncio
+from typing import Any, Callable, Optional
+from heapq import nlargest
+
+from cognee.infrastructure.databases.graph import get_graph_engine
+from cognee.modules.retrieval.base_retriever import BaseRetriever
+from cognee.modules.retrieval.exceptions.exceptions import NoDataError
+from cognee.shared.logging_utils import get_logger
+
+
+logger = get_logger("LexicalRetriever")
+
+
+class LexicalRetriever(BaseRetriever):
+
+    def __init__(self, tokenizer: Callable, scorer: Callable, top_k: int = 10, with_scores: bool = False):
+        if not callable(tokenizer) or not callable(scorer):
+            raise TypeError("tokenizer and scorer must be callables")
+        if not isinstance(top_k, int) or top_k <= 0:
+            raise ValueError("top_k must be a positive integer")
+
+        self.tokenizer = tokenizer
+        self.scorer = scorer
+        self.top_k = top_k
+        self.with_scores = bool(with_scores)
+
+        # Cache keyed by dataset context
+        self.chunks: dict[str, Any] = {}  # {chunk_id: tokens}
+        self.payloads: dict[str, Any] = {}  # {chunk_id: original_document}
+        self._initialized = False
+        self._init_lock = asyncio.Lock()
+
+    async def initialize(self):
+        """Initialize retriever by reading all DocumentChunks from graph_engine."""
+        async with self._init_lock:
+            if self._initialized:
+                return
+
+            logger.info("Initializing LexicalRetriever by loading DocumentChunks from graph engine")
+
+            try:
+                graph_engine = await get_graph_engine()
+                nodes, _ = await graph_engine.get_filtered_graph_data([{"type": ["DocumentChunk"]}])
+            except Exception as e:
+                logger.error("Graph engine initialization failed")
+                raise NoDataError("Graph engine initialization failed") from e
+
+            chunk_count = 0
+            for node in nodes:
+                try:
+                    chunk_id, document = node
+                except Exception:
+                    logger.warning("Skipping node with unexpected shape: %r", node)
+                    continue
+
+                if document.get("type") == "DocumentChunk" and document.get("text"):
+                    try:
+                        tokens = self.tokenizer(document["text"])
+                        if not tokens:
+                            continue
+                        self.chunks[str(document.get("id", chunk_id))] = tokens
+                        self.payloads[str(document.get("id", chunk_id))] = document
+                        chunk_count += 1
+                    except Exception as e:
+                        logger.error("Tokenizer failed for chunk %s: %s", chunk_id, str(e))
+
+            if chunk_count == 0:
+                logger.error("Initialization completed but no valid chunks were loaded.")
+                raise NoDataError("No valid chunks loaded during initialization.")
+
+            self._initialized = True
+            logger.info("Initialized with %d document chunks", len(self.chunks))
+
+    async def get_context(self, query: str) -> Any:
+        """Retrieves relevant chunks for the given query."""
+        if not self._initialized:
+            await self.initialize()
+
+        if not self.chunks:
+            logger.warning("No chunks available in retriever")
+            return []
+
+        try:
+            query_tokens = self.tokenizer(query)
+        except Exception as e:
+            logger.error("Failed to tokenize query: %s", str(e))
+            return []
+
+        if not query_tokens:
+            logger.warning("Query produced no tokens")
+            return []
+
+        results = []
+        for chunk_id, chunk_tokens in self.chunks.items():
+            try:
+                score = self.scorer(query_tokens, chunk_tokens)
+                if not isinstance(score, (int, float)):
+                    logger.warning("Non-numeric score for chunk %s → treated as 0.0", chunk_id)
+                    score = 0.0
+            except Exception as e:
+                logger.error("Scorer failed for chunk %s: %s", chunk_id, str(e))
+                score = 0.0
+            results.append((chunk_id, score))
+
+        top_results = nlargest(self.top_k, results, key=lambda x: x[1])
+        logger.info("Retrieved %d/%d chunks for query (len=%d)", len(top_results), len(results), len(query_tokens))
+
+        if self.with_scores:
+            return [(self.payloads[chunk_id], score) for chunk_id, score in top_results]
+        else:
+            return [self.payloads[chunk_id] for chunk_id, _ in top_results]
+
+    async def get_completion(self, query: str, context: Optional[Any] = None) -> Any:
+        """Returns context for the given query (retrieves if not provided)."""
+        if context is None:
+            context = await self.get_context(query)
+        return context
diff --git a/cognee/modules/search/methods/get_search_type_tools.py b/cognee/modules/search/methods/get_search_type_tools.py
index 551f77a16..c5ea53a62 100644
--- a/cognee/modules/search/methods/get_search_type_tools.py
+++ b/cognee/modules/search/methods/get_search_type_tools.py
@@ -15,6 +15,7 @@ from cognee.modules.retrieval.completion_retriever import CompletionRetriever
 from cognee.modules.retrieval.graph_completion_retriever import GraphCompletionRetriever
 from cognee.modules.retrieval.temporal_retriever import TemporalRetriever
 from cognee.modules.retrieval.coding_rules_retriever import CodingRulesRetriever
+from cognee.modules.retrieval.jaccard_retrival import JaccardChunksRetriever
 from cognee.modules.retrieval.graph_summary_completion_retriever import (
     GraphSummaryCompletionRetriever,
 )
@@ -152,6 +153,10 @@ async def get_search_type_tools(
             TemporalRetriever(top_k=top_k).get_completion,
             TemporalRetriever(top_k=top_k).get_context,
         ],
+        SearchType.CHUNKS_LEXICAL: (lambda _r=JaccardChunksRetriever(top_k=top_k): [
+            _r.get_completion,
+            _r.get_context,
+        ])(),
         SearchType.CODING_RULES: [
             CodingRulesRetriever(rules_nodeset_name=node_name).get_existing_rules,
         ],
diff --git a/cognee/modules/search/types/SearchType.py b/cognee/modules/search/types/SearchType.py
index f5a23efff..418aec0b5 100644
--- a/cognee/modules/search/types/SearchType.py
+++ b/cognee/modules/search/types/SearchType.py
@@ -17,3 +17,4 @@ class SearchType(Enum):
     FEEDBACK = "FEEDBACK"
     TEMPORAL = "TEMPORAL"
     CODING_RULES = "CODING_RULES"
+    CHUNKS_LEXICAL = "CHUNKS_LEXICAL"
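
A quick, self-contained sanity check of the two scorer variants added in jaccard_retrival.py. The toy sentences are illustrative; no graph engine is needed because initialize() is never called, and the underscore-prefixed helpers are invoked directly only for demonstration:

```python
from cognee.modules.retrieval.jaccard_retrival import JaccardChunksRetriever

set_based = JaccardChunksRetriever(stop_words=["the"])
multiset = JaccardChunksRetriever(stop_words=["the"], multiset_jaccard=True)

query = set_based._tokenizer("The cat sat")          # -> ['cat', 'sat']
chunk = set_based._tokenizer("The cat saw the cat")  # -> ['cat', 'saw', 'cat']

# Set Jaccard: |{cat}| / |{cat, sat, saw}| = 1/3
print(set_based._scorer(query, chunk))  # 0.3333...

# Multiset Jaccard: sum of min counts (cat: 1) / sum of max counts (cat: 2, sat: 1, saw: 1) = 1/4
print(multiset._scorer(query, chunk))   # 0.25
```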