diff --git a/.github/workflows/disable_independent_workflows.sh b/.github/workflows/disable_independent_workflows.sh index 693c3092d..ff57da80d 100755 --- a/.github/workflows/disable_independent_workflows.sh +++ b/.github/workflows/disable_independent_workflows.sh @@ -10,7 +10,7 @@ WORKFLOWS=( "test_kuzu.yml" "test_multimetric_qa_eval_run.yaml" "test_graphrag_vs_rag_notebook.yml" - "test_gemini.yml" + "test_llms.yml" "test_multimedia_example.yaml" "test_deduplication.yml" "test_eval_framework.yml" diff --git a/.github/workflows/test_gemini.yml b/.github/workflows/test_gemini.yml deleted file mode 100644 index 544e15a5e..000000000 --- a/.github/workflows/test_gemini.yml +++ /dev/null @@ -1,29 +0,0 @@ -name: test | gemini - -on: - workflow_call: - -jobs: - test-gemini: - name: Run Gemini Test - runs-on: ubuntu-22.04 - steps: - - name: Check out repository - uses: actions/checkout@v4 - - - name: Cognee Setup - uses: ./.github/actions/cognee_setup - with: - python-version: '3.11.x' - - - name: Run Gemini Simple Example - env: - LLM_PROVIDER: "gemini" - LLM_API_KEY: ${{ secrets.GEMINI_API_KEY }} - LLM_MODEL: "gemini/gemini-1.5-flash" - EMBEDDING_PROVIDER: "gemini" - EMBEDDING_API_KEY: ${{ secrets.GEMINI_API_KEY }} - EMBEDDING_MODEL: "gemini/text-embedding-004" - EMBEDDING_DIMENSIONS: "768" - EMBEDDING_MAX_TOKENS: "8076" - run: uv run python ./examples/python/simple_example.py diff --git a/.github/workflows/test_llms.yml b/.github/workflows/test_llms.yml new file mode 100644 index 000000000..5a0f947c9 --- /dev/null +++ b/.github/workflows/test_llms.yml @@ -0,0 +1,86 @@ +name: LLM Test Suites + +permissions: + contents: read + +on: + workflow_call: + +env: + RUNTIME__LOG_LEVEL: ERROR + ENV: 'dev' + +jobs: + test-gemini: + name: Run Gemini Test + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + + - name: Run Gemini Simple Example + 
env: + LLM_PROVIDER: "gemini" + LLM_API_KEY: ${{ secrets.GEMINI_API_KEY }} + LLM_MODEL: "gemini/gemini-1.5-flash" + EMBEDDING_PROVIDER: "gemini" + EMBEDDING_API_KEY: ${{ secrets.GEMINI_API_KEY }} + EMBEDDING_MODEL: "gemini/text-embedding-004" + EMBEDDING_DIMENSIONS: "768" + EMBEDDING_MAX_TOKENS: "8076" + run: uv run python ./examples/python/simple_example.py + + test-fastembed: + name: Run Fastembed Test + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + + - name: Run Fastembed Simple Example + env: + LLM_PROVIDER: "openai" + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_PROVIDER: "fastembed" + EMBEDDING_MODEL: "sentence-transformers/all-MiniLM-L6-v2" + EMBEDDING_DIMENSIONS: "384" + EMBEDDING_MAX_TOKENS: "256" + run: uv run python ./examples/python/simple_example.py + + test-openrouter: + name: Run OpenRouter Test + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + + - name: Run OpenRouter Simple Example + env: + LLM_PROVIDER: "custom" + LLM_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} + LLM_MODEL: "openrouter/x-ai/grok-code-fast-1" + LLM_ENDPOINT: "https://openrouter.ai/api/v1" + EMBEDDING_PROVIDER: "openai" + EMBEDDING_API_KEY: ${{ secrets.OPENAI_API_KEY }} + EMBEDDING_MODEL: "openai/text-embedding-3-large" + EMBEDDING_DIMENSIONS: "3072" + EMBEDDING_MAX_TOKENS: "8191" + run: uv run python ./examples/python/simple_example.py \ No newline at end of file diff --git a/.github/workflows/test_openrouter.yml b/.github/workflows/test_openrouter.yml deleted file mode 100644 index 9c2dcdebe..000000000 --- a/.github/workflows/test_openrouter.yml +++ /dev/null @@ 
-1,30 +0,0 @@ -name: test | openrouter - -on: - workflow_call: - -jobs: - test-openrouter: - name: Run OpenRouter Test - runs-on: ubuntu-22.04 - steps: - - name: Check out repository - uses: actions/checkout@v4 - - - name: Cognee Setup - uses: ./.github/actions/cognee_setup - with: - python-version: '3.11.x' - - - name: Run OpenRouter Simple Example - env: - LLM_PROVIDER: "custom" - LLM_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} - LLM_MODEL: "openrouter/x-ai/grok-code-fast-1" - LLM_ENDPOINT: "https://openrouter.ai/api/v1" - EMBEDDING_PROVIDER: "openai" - EMBEDDING_API_KEY: ${{ secrets.OPENAI_API_KEY }} - EMBEDDING_MODEL: "openai/text-embedding-3-large" - EMBEDDING_DIMENSIONS: "3072" - EMBEDDING_MAX_TOKENS: "8191" - run: uv run python ./examples/python/simple_example.py diff --git a/.github/workflows/test_suites.yml b/.github/workflows/test_suites.yml index 86f89249d..ff18f2962 100644 --- a/.github/workflows/test_suites.yml +++ b/.github/workflows/test_suites.yml @@ -115,16 +115,10 @@ jobs: secrets: inherit # Additional LLM tests - gemini-tests: - name: Gemini Tests - needs: [basic-tests, e2e-tests] - uses: ./.github/workflows/test_gemini.yml - secrets: inherit - - openrouter-tests: - name: OpenRouter Tests - needs: [basic-tests, e2e-tests] - uses: ./.github/workflows/test_openrouter.yml + llm-tests: + name: LLM Test Suite + needs: [ basic-tests, e2e-tests ] + uses: ./.github/workflows/test_llms.yml secrets: inherit # Ollama tests moved to the end @@ -138,8 +132,7 @@ jobs: different-operating-systems-tests, vector-db-tests, example-tests, - gemini-tests, - openrouter-tests, + llm-tests, mcp-test, relational-db-migration-tests, docker-compose-test, @@ -161,8 +154,7 @@ jobs: example-tests, db-examples-tests, mcp-test, - gemini-tests, - openrouter-tests, + llm-tests, ollama-tests, relational-db-migration-tests, docker-compose-test, @@ -183,8 +175,7 @@ jobs: "${{ needs.example-tests.result }}" == "success" && "${{ needs.db-examples-tests.result }}" == "success" && "${{ 
needs.relational-db-migration-tests.result }}" == "success" && - "${{ needs.gemini-tests.result }}" == "success" && - "${{ needs.openrouter-tests.result }}" == "success" && + "${{ needs.llm-tests.result }}" == "success" && "${{ needs.docker-compose-test.result }}" == "success" && "${{ needs.docker-ci-test.result }}" == "success" && "${{ needs.ollama-tests.result }}" == "success" ]]; then diff --git a/cognee/api/v1/search/search.py b/cognee/api/v1/search/search.py index dcebce012..7209c6036 100644 --- a/cognee/api/v1/search/search.py +++ b/cognee/api/v1/search/search.py @@ -82,6 +82,9 @@ async def search( Best for: General-purpose queries or when you're unsure which search type is best. Returns: The results from the automatically selected search type. + **CHUNKS_LEXICAL**: + Token-based lexical chunk search (e.g., Jaccard). Best for: exact-term matching, stopword-aware lookups. + Returns: Ranked text chunks (optionally with scores). Args: query_text: Your question or search query in natural language. diff --git a/cognee/modules/retrieval/jaccard_retrival.py b/cognee/modules/retrieval/jaccard_retrival.py new file mode 100644 index 000000000..91d2b67f7 --- /dev/null +++ b/cognee/modules/retrieval/jaccard_retrival.py @@ -0,0 +1,56 @@ +from cognee.modules.retrieval.lexical_retriever import LexicalRetriever +import re +from collections import Counter +from typing import Optional +class JaccardChunksRetriever(LexicalRetriever): + """ + Retriever that specializes LexicalRetriever to use Jaccard similarity. + """ + + def __init__(self, top_k: int = 10, with_scores: bool = False, + stop_words: Optional[list[str]] = None, multiset_jaccard: bool = False): + """ + Parameters + ---------- + top_k : int + Number of top results to return. + with_scores : bool + If True, return (payload, score) pairs. Otherwise, only payloads. + stop_words : list[str], optional + List of tokens to filter out. + multiset_jaccard : bool + If True, use multiset Jaccard (frequency aware). 
+ """ + self.stop_words = {t.lower() for t in stop_words} if stop_words else set() + self.multiset_jaccard = multiset_jaccard + + super().__init__( + tokenizer=self._tokenizer, + scorer=self._scorer, + top_k=top_k, + with_scores=with_scores + ) + + def _tokenizer(self, text: str) -> list[str]: + """ + Tokenizer: lowercases, splits on word characters (\w+), filters stopwords. + """ + tokens = re.findall(r"\w+", text.lower()) + return [t for t in tokens if t not in self.stop_words] + + def _scorer(self, query_tokens: list[str], chunk_tokens: list[str]) -> float: + """ + Jaccard similarity scorer. + - If multiset_jaccard=True, uses frequency-aware Jaccard. + - Otherwise, normal set Jaccard. + """ + if self.multiset_jaccard: + q_counts, c_counts = Counter(query_tokens), Counter(chunk_tokens) + numerator = sum(min(q_counts[t], c_counts[t]) for t in set(q_counts) | set(c_counts)) + denominator = sum(max(q_counts[t], c_counts[t]) for t in set(q_counts) | set(c_counts)) + return numerator / denominator if denominator else 0.0 + else: + q_set, c_set = set(query_tokens), set(chunk_tokens) + if not q_set or not c_set: + return 0.0 + return len(q_set & c_set) / len(q_set | c_set) diff --git a/cognee/modules/retrieval/lexical_retriever.py b/cognee/modules/retrieval/lexical_retriever.py new file mode 100644 index 000000000..2292b64c8 --- /dev/null +++ b/cognee/modules/retrieval/lexical_retriever.py @@ -0,0 +1,117 @@ +import asyncio +from typing import Any, Callable, Optional +from heapq import nlargest + +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.modules.retrieval.base_retriever import BaseRetriever +from cognee.modules.retrieval.exceptions.exceptions import NoDataError +from cognee.shared.logging_utils import get_logger + + +logger = get_logger("LexicalRetriever") + + +class LexicalRetriever(BaseRetriever): + + def __init__(self, tokenizer: Callable, scorer: Callable, top_k: int = 10, with_scores: bool = False): + if not callable(tokenizer)
or not callable(scorer): + raise TypeError("tokenizer and scorer must be callables") + if not isinstance(top_k, int) or top_k <= 0: + raise ValueError("top_k must be a positive integer") + + self.tokenizer = tokenizer + self.scorer = scorer + self.top_k = top_k + self.with_scores = bool(with_scores) + + # Cache keyed by dataset context + self.chunks: dict[str, Any] = {} # {chunk_id: tokens} + self.payloads: dict[str, Any] = {} # {chunk_id: original_document} + self._initialized = False + self._init_lock = asyncio.Lock() + + async def initialize(self): + """Initialize retriever by reading all DocumentChunks from graph_engine.""" + async with self._init_lock: + if self._initialized: + return + + logger.info("Initializing LexicalRetriever by loading DocumentChunks from graph engine") + + try: + graph_engine = await get_graph_engine() + nodes, _ = await graph_engine.get_filtered_graph_data([{"type": ["DocumentChunk"]}]) + except Exception as e: + logger.error("Graph engine initialization failed") + raise NoDataError("Graph engine initialization failed") from e + + chunk_count = 0 + for node in nodes: + try: + chunk_id, document = node + except Exception: + logger.warning("Skipping node with unexpected shape: %r", node) + continue + + if document.get("type") == "DocumentChunk" and document.get("text"): + try: + tokens = self.tokenizer(document["text"]) + if not tokens: + continue + self.chunks[str(document.get("id",chunk_id))] = tokens + self.payloads[str(document.get("id",chunk_id))] = document + chunk_count += 1 + except Exception as e: + logger.error("Tokenizer failed for chunk %s: %s", chunk_id, str(e)) + + if chunk_count == 0: + logger.error("Initialization completed but no valid chunks were loaded.") + raise NoDataError("No valid chunks loaded during initialization.") + + self._initialized = True + logger.info("Initialized with %d document chunks", len(self.chunks)) + + async def get_context(self, query: str) -> Any: + """Retrieves relevant chunks for the given 
query.""" + if not self._initialized: + await self.initialize() + + if not self.chunks: + logger.warning("No chunks available in retriever") + return [] + + try: + query_tokens = self.tokenizer(query) + except Exception as e: + logger.error("Failed to tokenize query: %s", str(e)) + return [] + + if not query_tokens: + logger.warning("Query produced no tokens") + return [] + + results = [] + for chunk_id, chunk_tokens in self.chunks.items(): + try: + score = self.scorer(query_tokens, chunk_tokens) + if not isinstance(score, (int, float)): + logger.warning("Non-numeric score for chunk %s → treated as 0.0", chunk_id) + score = 0.0 + except Exception as e: + logger.error("Scorer failed for chunk %s: %s", chunk_id, str(e)) + score = 0.0 + results.append((chunk_id, score)) + + top_results = nlargest(self.top_k, results, key=lambda x: x[1]) + logger.info("Retrieved %d/%d chunks for query (len=%d)", len(top_results), len(results), len(query_tokens)) + + if self.with_scores: + return [(self.payloads[chunk_id], score) for chunk_id, score in top_results] + else: + return [self.payloads[chunk_id] for chunk_id, _ in top_results] + + async def get_completion(self, query: str, context: Optional[Any] = None) -> Any: + """Returns context for the given query (retrieves if not provided).""" + if context is None: + context = await self.get_context(query) + return context diff --git a/cognee/modules/search/methods/get_search_type_tools.py b/cognee/modules/search/methods/get_search_type_tools.py index 551f77a16..c5ea53a62 100644 --- a/cognee/modules/search/methods/get_search_type_tools.py +++ b/cognee/modules/search/methods/get_search_type_tools.py @@ -15,6 +15,7 @@ from cognee.modules.retrieval.completion_retriever import CompletionRetriever from cognee.modules.retrieval.graph_completion_retriever import GraphCompletionRetriever from cognee.modules.retrieval.temporal_retriever import TemporalRetriever from cognee.modules.retrieval.coding_rules_retriever import CodingRulesRetriever 
+from cognee.modules.retrieval.jaccard_retrival import JaccardChunksRetriever from cognee.modules.retrieval.graph_summary_completion_retriever import ( GraphSummaryCompletionRetriever, ) @@ -152,6 +153,10 @@ async def get_search_type_tools( TemporalRetriever(top_k=top_k).get_completion, TemporalRetriever(top_k=top_k).get_context, ], + SearchType.CHUNKS_LEXICAL: (lambda _r=JaccardChunksRetriever(top_k=top_k): [ + _r.get_completion, + _r.get_context, + ])(), SearchType.CODING_RULES: [ CodingRulesRetriever(rules_nodeset_name=node_name).get_existing_rules, ], diff --git a/cognee/modules/search/types/SearchType.py b/cognee/modules/search/types/SearchType.py index f5a23efff..418aec0b5 100644 --- a/cognee/modules/search/types/SearchType.py +++ b/cognee/modules/search/types/SearchType.py @@ -17,3 +17,4 @@ class SearchType(Enum): FEEDBACK = "FEEDBACK" TEMPORAL = "TEMPORAL" CODING_RULES = "CODING_RULES" + CHUNKS_LEXICAL = "CHUNKS_LEXICAL"