From 96eb0d448a2ed77295bdaac6f83119fb635810b2 Mon Sep 17 00:00:00 2001
From: Chaitany <67532224+patelchaitany@users.noreply.github.com>
Date: Fri, 19 Sep 2025 21:54:33 +0530
Subject: [PATCH] feat(#1357): Lexical chunk retriever (#1392)

## Description

Implemented a lexical chunk retriever. The LexicalRetriever class inherits from BaseRetriever, and DocumentChunks are lazy-loaded when the first query is made, which saves time during object initialization. The get_context and get_completion functions are implemented the same way as in ChunksRetriever; the only difference is that each DocumentChunk is converted to match the ChunksRetriever output type using the get_own_properties function from utils. A short usage example is included after the notes below.

## Type of Change

- [ ] Bug fix (non-breaking change that fixes an issue)
- [x] New feature (non-breaking change that adds functionality)
- [ ] Breaking change (fix or feature that would cause existing functionality to change)
- [ ] Documentation update
- [ ] Code refactoring
- [ ] Performance improvement
- [ ] Other (please specify):

## Changes Made

- Added LexicalRetriever base class with a customizable tokenizer & scorer
- Implemented caching of DocumentChunk tokens and payloads
- Added robust initialization with error handling and logging
- Implemented get_context with top_k ranking and optional scores
- Implemented get_completion consistent with the BaseRetriever interface
- Added a JaccardChunksRetriever demo using set/multiset Jaccard similarity
- Added support for stop words and multiset (frequency-aware) similarity
- Integrated logging for initialization, scoring, and retrieval

## Testing

- Manual tests: initialized the retriever and retrieved chunks from a toy corpus
- Edge cases: empty corpus, empty query, scorer/tokenizer errors
- Verified Jaccard similarity results for set and multiset cases
- Code formatted and linted

## Screenshots/Videos (if applicable)

## Pre-submission Checklist

- [x] **I have tested my changes thoroughly before submitting this PR**
- [x] **This PR contains minimal changes necessary to address the issue/feature**
- [x] My code follows the project's coding standards and style guidelines
- [ ] I have added tests that prove my fix is effective or that my feature works
- [ ] I have added necessary documentation (if applicable)
- [ ] All new and existing tests pass
- [ ] I have searched existing PRs to ensure this change hasn't been submitted already
- [x] I have linked any relevant issues in the description
- [ ] My commits have clear and descriptive messages

## Related Issues

Relates to #1357

## Additional Notes

In cognee/modules/chunking/models/DocumentChunk.py, don't remove the Optional from the is_part_of attribute.
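## Usage Example

A minimal sketch of how the new search type can be invoked end to end. It assumes the usual cognee add/cognify/search flow; the sample text and query are illustrative only, and the SearchType import path follows this patch's module layout:

```python
import asyncio

import cognee
from cognee.modules.search.types import SearchType


async def main():
    # Toy corpus; any previously added and cognified dataset works the same way.
    await cognee.add("Jaccard similarity compares the token sets of two texts.")
    await cognee.cognify()

    # CHUNKS_LEXICAL routes to JaccardChunksRetriever via get_search_type_tools.
    results = await cognee.search(
        query_type=SearchType.CHUNKS_LEXICAL,
        query_text="jaccard token similarity",
    )
    for chunk in results:
        print(chunk)


if __name__ == "__main__":
    asyncio.run(main())
```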
## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

---------

Co-authored-by: Andrej Milicevic
Co-authored-by: Igor Ilic <30923996+dexters1@users.noreply.github.com>
Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
Co-authored-by: Igor Ilic
Co-authored-by: Vasilije <8619304+Vasilije1990@users.noreply.github.com>
Co-authored-by: Boris
Co-authored-by: lxobr <122801072+lxobr@users.noreply.github.com>
---
 .../disable_independent_workflows.sh          |   2 +-
 .github/workflows/test_gemini.yml             |  29 -----
 .github/workflows/test_llms.yml               |  86 +++++++++++++
 .github/workflows/test_openrouter.yml         |  30 -----
 .github/workflows/test_suites.yml             |  23 ++--
 cognee/api/v1/search/search.py                |   3 +
 cognee/modules/retrieval/jaccard_retrival.py  |  56 +++++++++
 cognee/modules/retrieval/lexical_retriever.py | 117 ++++++++++++++++++
 .../search/methods/get_search_type_tools.py   |   5 +
 cognee/modules/search/types/SearchType.py     |   1 +
 10 files changed, 276 insertions(+), 76 deletions(-)
 delete mode 100644 .github/workflows/test_gemini.yml
 create mode 100644 .github/workflows/test_llms.yml
 delete mode 100644 .github/workflows/test_openrouter.yml
 create mode 100644 cognee/modules/retrieval/jaccard_retrival.py
 create mode 100644 cognee/modules/retrieval/lexical_retriever.py

diff --git a/.github/workflows/disable_independent_workflows.sh b/.github/workflows/disable_independent_workflows.sh
index 693c3092d..ff57da80d 100755
--- a/.github/workflows/disable_independent_workflows.sh
+++ b/.github/workflows/disable_independent_workflows.sh
@@ -10,7 +10,7 @@ WORKFLOWS=(
   "test_kuzu.yml"
   "test_multimetric_qa_eval_run.yaml"
   "test_graphrag_vs_rag_notebook.yml"
-  "test_gemini.yml"
+  "test_llms.yml"
   "test_multimedia_example.yaml"
   "test_deduplication.yml"
   "test_eval_framework.yml"
diff --git a/.github/workflows/test_gemini.yml b/.github/workflows/test_gemini.yml
deleted file mode 100644
index 544e15a5e..000000000
--- a/.github/workflows/test_gemini.yml
+++ /dev/null
@@ -1,29 +0,0 @@
-name: test | gemini
-
-on:
-  workflow_call:
-
-jobs:
-  test-gemini:
-    name: Run Gemini Test
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Check out repository
-        uses: actions/checkout@v4
-
-      - name: Cognee Setup
-        uses: ./.github/actions/cognee_setup
-        with:
-          python-version: '3.11.x'
-
-      - name: Run Gemini Simple Example
-        env:
-          LLM_PROVIDER: "gemini"
-          LLM_API_KEY: ${{ secrets.GEMINI_API_KEY }}
-          LLM_MODEL: "gemini/gemini-1.5-flash"
-          EMBEDDING_PROVIDER: "gemini"
-          EMBEDDING_API_KEY: ${{ secrets.GEMINI_API_KEY }}
-          EMBEDDING_MODEL: "gemini/text-embedding-004"
-          EMBEDDING_DIMENSIONS: "768"
-          EMBEDDING_MAX_TOKENS: "8076"
-        run: uv run python ./examples/python/simple_example.py
diff --git a/.github/workflows/test_llms.yml b/.github/workflows/test_llms.yml
new file mode 100644
index 000000000..5a0f947c9
--- /dev/null
+++ b/.github/workflows/test_llms.yml
@@ -0,0 +1,86 @@
+name: LLM Test Suites
+
+permissions:
+  contents: read
+
+on:
+  workflow_call:
+
+env:
+  RUNTIME__LOG_LEVEL: ERROR
+  ENV: 'dev'
+
+jobs:
+  test-gemini:
+    name: Run Gemini Test
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      - name: Cognee Setup
+        uses: ./.github/actions/cognee_setup
+        with:
+          python-version: '3.11.x'
+
+      - name: Run Gemini Simple Example
+        env:
+          LLM_PROVIDER: "gemini"
+          LLM_API_KEY: ${{ secrets.GEMINI_API_KEY }}
+          LLM_MODEL: "gemini/gemini-1.5-flash"
+          EMBEDDING_PROVIDER: "gemini"
+          EMBEDDING_API_KEY: ${{ secrets.GEMINI_API_KEY }}
+          EMBEDDING_MODEL: "gemini/text-embedding-004"
+          EMBEDDING_DIMENSIONS: "768"
+          EMBEDDING_MAX_TOKENS: "8076"
+        run: uv run python ./examples/python/simple_example.py
+
+  test-fastembed:
+    name: Run Fastembed Test
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      - name: Cognee Setup
+        uses: ./.github/actions/cognee_setup
+        with:
+          python-version: '3.11.x'
+
+      - name: Run Fastembed Simple Example
+        env:
+          LLM_PROVIDER: "openai"
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_PROVIDER: "fastembed"
+          EMBEDDING_MODEL: "sentence-transformers/all-MiniLM-L6-v2"
+          EMBEDDING_DIMENSIONS: "384"
+          EMBEDDING_MAX_TOKENS: "256"
+        run: uv run python ./examples/python/simple_example.py
+
+  test-openrouter:
+    name: Run OpenRouter Test
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      - name: Cognee Setup
+        uses: ./.github/actions/cognee_setup
+        with:
+          python-version: '3.11.x'
+
+      - name: Run OpenRouter Simple Example
+        env:
+          LLM_PROVIDER: "custom"
+          LLM_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
+          LLM_MODEL: "openrouter/x-ai/grok-code-fast-1"
+          LLM_ENDPOINT: "https://openrouter.ai/api/v1"
+          EMBEDDING_PROVIDER: "openai"
+          EMBEDDING_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          EMBEDDING_MODEL: "openai/text-embedding-3-large"
+          EMBEDDING_DIMENSIONS: "3072"
+          EMBEDDING_MAX_TOKENS: "8191"
+        run: uv run python ./examples/python/simple_example.py
\ No newline at end of file
diff --git a/.github/workflows/test_openrouter.yml b/.github/workflows/test_openrouter.yml
deleted file mode 100644
index 9c2dcdebe..000000000
--- a/.github/workflows/test_openrouter.yml
+++ /dev/null
@@ -1,30 +0,0 @@
-name: test | openrouter
-
-on:
-  workflow_call:
-
-jobs:
-  test-openrouter:
-    name: Run OpenRouter Test
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Check out repository
-        uses: actions/checkout@v4
-
-      - name: Cognee Setup
-        uses: ./.github/actions/cognee_setup
-        with:
-          python-version: '3.11.x'
-
-      - name: Run OpenRouter Simple Example
-        env:
-          LLM_PROVIDER: "custom"
-          LLM_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
-          LLM_MODEL: "openrouter/x-ai/grok-code-fast-1"
-          LLM_ENDPOINT: "https://openrouter.ai/api/v1"
-          EMBEDDING_PROVIDER: "openai"
-          EMBEDDING_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-          EMBEDDING_MODEL: "openai/text-embedding-3-large"
-          EMBEDDING_DIMENSIONS: "3072"
-          EMBEDDING_MAX_TOKENS: "8191"
-        run: uv run python ./examples/python/simple_example.py
diff --git a/.github/workflows/test_suites.yml b/.github/workflows/test_suites.yml
index 86f89249d..ff18f2962 100644
--- a/.github/workflows/test_suites.yml
+++ b/.github/workflows/test_suites.yml
@@ -115,16 +115,10 @@ jobs:
     secrets: inherit

   # Additional LLM tests
-  gemini-tests:
-    name: Gemini Tests
-    needs: [basic-tests, e2e-tests]
-    uses: ./.github/workflows/test_gemini.yml
-    secrets: inherit
-
-  openrouter-tests:
-    name: OpenRouter Tests
-    needs: [basic-tests, e2e-tests]
-    uses: ./.github/workflows/test_openrouter.yml
+  llm-tests:
+    name: LLM Test Suite
+    needs: [ basic-tests, e2e-tests ]
+    uses: ./.github/workflows/test_llms.yml
     secrets: inherit

   # Ollama tests moved to the end
@@ -138,8 +132,7 @@
       different-operating-systems-tests,
       vector-db-tests,
       example-tests,
-      gemini-tests,
-      openrouter-tests,
+      llm-tests,
       mcp-test,
       relational-db-migration-tests,
       docker-compose-test,
@@ -161,8 +154,7 @@
       example-tests,
       db-examples-tests,
       mcp-test,
-      gemini-tests,
-      openrouter-tests,
+      llm-tests,
       ollama-tests,
       relational-db-migration-tests,
       docker-compose-test,
@@ -183,8 +175,7 @@
             "${{ needs.example-tests.result }}" == "success" &&
             "${{ needs.db-examples-tests.result }}" == "success" &&
             "${{ needs.relational-db-migration-tests.result }}" == "success" &&
-            "${{ needs.gemini-tests.result }}" == "success" &&
-            "${{ needs.openrouter-tests.result }}" == "success" &&
+            "${{ needs.llm-tests.result }}" == "success" &&
             "${{ needs.docker-compose-test.result }}" == "success" &&
             "${{ needs.docker-ci-test.result }}" == "success" &&
             "${{ needs.ollama-tests.result }}" == "success" ]]; then
diff --git a/cognee/api/v1/search/search.py b/cognee/api/v1/search/search.py
index dcebce012..7209c6036 100644
--- a/cognee/api/v1/search/search.py
+++ b/cognee/api/v1/search/search.py
@@ -82,6 +82,9 @@ async def search(
         Best for: General-purpose queries or when you're unsure which search type is best.
         Returns: The results from the automatically selected search type.

+    **CHUNKS_LEXICAL**:
+        Token-based lexical chunk search (e.g., Jaccard). Best for: exact-term matching, stopword-aware lookups.
+        Returns: Ranked text chunks (optionally with scores).

     Args:
         query_text: Your question or search query in natural language.
diff --git a/cognee/modules/retrieval/jaccard_retrival.py b/cognee/modules/retrieval/jaccard_retrival.py
new file mode 100644
index 000000000..91d2b67f7
--- /dev/null
+++ b/cognee/modules/retrieval/jaccard_retrival.py
@@ -0,0 +1,56 @@
+from cognee.modules.retrieval.lexical_retriever import LexicalRetriever
+import re
+from collections import Counter
+from typing import Optional
+
+class JaccardChunksRetriever(LexicalRetriever):
+    """
+    Retriever that specializes LexicalRetriever to use Jaccard similarity.
+    """
+    def __init__(self, top_k: int = 10, with_scores: bool = False,
+                 stop_words: Optional[list[str]] = None, multiset_jaccard: bool = False):
+        """
+        Parameters
+        ----------
+        top_k : int
+            Number of top results to return.
+        with_scores : bool
+            If True, return (payload, score) pairs. Otherwise, only payloads.
+        stop_words : list[str], optional
+            List of tokens to filter out.
+        multiset_jaccard : bool
+            If True, use multiset Jaccard (frequency aware).
+        """
+        self.stop_words = {t.lower() for t in stop_words} if stop_words else set()
+        self.multiset_jaccard = multiset_jaccard
+
+        super().__init__(
+            tokenizer=self._tokenizer,
+            scorer=self._scorer,
+            top_k=top_k,
+            with_scores=with_scores
+        )
+
+    def _tokenizer(self, text: str) -> list[str]:
+        """
+        Tokenizer: lowercases, splits on word characters (\w+), filters stopwords.
+        """
+        tokens = re.findall(r"\w+", text.lower())
+        return [t for t in tokens if t not in self.stop_words]
+
+    def _scorer(self, query_tokens: list[str], chunk_tokens: list[str]) -> float:
+        """
+        Jaccard similarity scorer.
+        - If multiset_jaccard=True, uses frequency-aware Jaccard.
+        - Otherwise, normal set Jaccard.
+        """
+        if self.multiset_jaccard:
+            q_counts, c_counts = Counter(query_tokens), Counter(chunk_tokens)
+            numerator = sum(min(q_counts[t], c_counts[t]) for t in set(q_counts) | set(c_counts))
+            denominator = sum(max(q_counts[t], c_counts[t]) for t in set(q_counts) | set(c_counts))
+            return numerator / denominator if denominator else 0.0
+        else:
+            q_set, c_set = set(query_tokens), set(chunk_tokens)
+            if not q_set or not c_set:
+                return 0.0
+            return len(q_set & c_set) / len(q_set | c_set)
diff --git a/cognee/modules/retrieval/lexical_retriever.py b/cognee/modules/retrieval/lexical_retriever.py
new file mode 100644
index 000000000..2292b64c8
--- /dev/null
+++ b/cognee/modules/retrieval/lexical_retriever.py
@@ -0,0 +1,117 @@
+import asyncio
+from typing import Any, Callable, Optional
+from heapq import nlargest
+
+from cognee.infrastructure.databases.graph import get_graph_engine
+from cognee.modules.retrieval.base_retriever import BaseRetriever
+from cognee.modules.retrieval.exceptions.exceptions import NoDataError
+from cognee.shared.logging_utils import get_logger
+
+
+logger = get_logger("LexicalRetriever")
+
+
+class LexicalRetriever(BaseRetriever):
+
+    def __init__(self, tokenizer: Callable, scorer: Callable, top_k: int = 10, with_scores: bool = False):
+        if not callable(tokenizer) or not callable(scorer):
+            raise TypeError("tokenizer and scorer must be callables")
+        if not isinstance(top_k, int) or top_k <= 0:
+            raise ValueError("top_k must be a positive integer")
+
+        self.tokenizer = tokenizer
+        self.scorer = scorer
+        self.top_k = top_k
+        self.with_scores = bool(with_scores)
+
+        # Cache keyed by dataset context
+        self.chunks: dict[str, Any] = {}  # {chunk_id: tokens}
+        self.payloads: dict[str, Any] = {}  # {chunk_id: original_document}
+        self._initialized = False
+        self._init_lock = asyncio.Lock()
+
+    async def initialize(self):
+        """Initialize retriever by reading all DocumentChunks from graph_engine."""
+        async with self._init_lock:
+            if self._initialized:
+                return
+
+            logger.info("Initializing LexicalRetriever by loading DocumentChunks from graph engine")
+
+            try:
+                graph_engine = await get_graph_engine()
+                nodes, _ = await graph_engine.get_filtered_graph_data([{"type": ["DocumentChunk"]}])
+            except Exception as e:
+                logger.error("Graph engine initialization failed")
+                raise NoDataError("Graph engine initialization failed") from e
+
+            chunk_count = 0
+            for node in nodes:
+                try:
+                    chunk_id, document = node
+                except Exception:
+                    logger.warning("Skipping node with unexpected shape: %r", node)
+                    continue
+
+                if document.get("type") == "DocumentChunk" and document.get("text"):
+                    try:
+                        tokens = self.tokenizer(document["text"])
+                        if not tokens:
+                            continue
+                        self.chunks[str(document.get("id", chunk_id))] = tokens
+                        self.payloads[str(document.get("id", chunk_id))] = document
+                        chunk_count += 1
+                    except Exception as e:
+                        logger.error("Tokenizer failed for chunk %s: %s", chunk_id, str(e))
+
+            if chunk_count == 0:
+                logger.error("Initialization completed but no valid chunks were loaded.")
+                raise NoDataError("No valid chunks loaded during initialization.")
+
+            self._initialized = True
+            logger.info("Initialized with %d document chunks", len(self.chunks))
+
+    async def get_context(self, query: str) -> Any:
+        """Retrieves relevant chunks for the given query."""
+        if not self._initialized:
+            await self.initialize()
+
+        if not self.chunks:
+            logger.warning("No chunks available in retriever")
+            return []
+
+        try:
+            query_tokens = self.tokenizer(query)
+        except Exception as e:
+            logger.error("Failed to tokenize query: %s", str(e))
+            return []
+
+        if not query_tokens:
+            logger.warning("Query produced no tokens")
+            return []
+
+        results = []
+        for chunk_id, chunk_tokens in self.chunks.items():
+            try:
+                score = self.scorer(query_tokens, chunk_tokens)
+                if not isinstance(score, (int, float)):
+                    logger.warning("Non-numeric score for chunk %s → treated as 0.0", chunk_id)
+                    score = 0.0
+            except Exception as e:
+                logger.error("Scorer failed for chunk %s: %s", chunk_id, str(e))
+                score = 0.0
+            results.append((chunk_id, score))
+
+        top_results = nlargest(self.top_k, results, key=lambda x: x[1])
+        logger.info("Retrieved %d/%d chunks for query (len=%d)", len(top_results), len(results), len(query_tokens))
+
+        if self.with_scores:
+            return [(self.payloads[chunk_id], score) for chunk_id, score in top_results]
+        else:
+            return [self.payloads[chunk_id] for chunk_id, _ in top_results]
+
+    async def get_completion(self, query: str, context: Optional[Any] = None) -> Any:
+        """Returns context for the given query (retrieves if not provided)."""
+        if context is None:
+            context = await self.get_context(query)
+        return context
diff --git a/cognee/modules/search/methods/get_search_type_tools.py b/cognee/modules/search/methods/get_search_type_tools.py
index 551f77a16..c5ea53a62 100644
--- a/cognee/modules/search/methods/get_search_type_tools.py
+++ b/cognee/modules/search/methods/get_search_type_tools.py
@@ -15,6 +15,7 @@ from cognee.modules.retrieval.completion_retriever import CompletionRetriever
 from cognee.modules.retrieval.graph_completion_retriever import GraphCompletionRetriever
 from cognee.modules.retrieval.temporal_retriever import TemporalRetriever
 from cognee.modules.retrieval.coding_rules_retriever import CodingRulesRetriever
+from cognee.modules.retrieval.jaccard_retrival import JaccardChunksRetriever
 from cognee.modules.retrieval.graph_summary_completion_retriever import (
     GraphSummaryCompletionRetriever,
 )
@@ -152,6 +153,10 @@ async def get_search_type_tools(
             TemporalRetriever(top_k=top_k).get_completion,
             TemporalRetriever(top_k=top_k).get_context,
         ],
+        SearchType.CHUNKS_LEXICAL: (lambda _r=JaccardChunksRetriever(top_k=top_k): [
+            _r.get_completion,
+            _r.get_context,
+        ])(),
         SearchType.CODING_RULES: [
             CodingRulesRetriever(rules_nodeset_name=node_name).get_existing_rules,
         ],
diff --git a/cognee/modules/search/types/SearchType.py b/cognee/modules/search/types/SearchType.py
index f5a23efff..418aec0b5 100644
--- a/cognee/modules/search/types/SearchType.py
+++ b/cognee/modules/search/types/SearchType.py
@@ -17,3 +17,4 @@ class SearchType(Enum):
     FEEDBACK = "FEEDBACK"
     TEMPORAL = "TEMPORAL"
     CODING_RULES = "CODING_RULES"
+    CHUNKS_LEXICAL = "CHUNKS_LEXICAL"
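
A quick, self-contained sanity check of the two scorer variants added in jaccard_retrival.py. The toy sentences are illustrative; no graph engine is needed because initialize() is never called, and the underscore-prefixed helpers are invoked directly only for demonstration:

```python
from cognee.modules.retrieval.jaccard_retrival import JaccardChunksRetriever

set_based = JaccardChunksRetriever(stop_words=["the"])
multiset = JaccardChunksRetriever(stop_words=["the"], multiset_jaccard=True)

query = set_based._tokenizer("The cat sat")          # -> ['cat', 'sat']
chunk = set_based._tokenizer("The cat saw the cat")  # -> ['cat', 'saw', 'cat']

# Set Jaccard: |{cat}| / |{cat, sat, saw}| = 1/3
print(set_based._scorer(query, chunk))  # 0.3333...

# Multiset Jaccard: sum of min counts (cat: 1) / sum of max counts (cat: 2, sat: 1, saw: 1) = 1/4
print(multiset._scorer(query, chunk))   # 0.25
```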