add support for structured outputs with llama.cpp via instructor and litellm

This commit is contained in:
dgarnitz 2025-12-30 16:20:56 -08:00
parent 27f2aa03b3
commit d578971b60
5 changed files with 4912 additions and 4578 deletions

View file

@@ -97,6 +97,21 @@ git checkout -b feature/your-feature-name
python cognee/cognee/tests/test_library.py
```
### Running the Simple Example
Rename `.env.example` to `.env` and provide your `OPENAI_API_KEY` as `LLM_API_KEY`.
Make sure to run `uv sync` in the root of the cloned repository, or set up a virtual environment, so that cognee's dependencies are installed.
```shell
python cognee/cognee/examples/python/simple_example.py
```
or
```shell
uv run python cognee/cognee/examples/python/simple_example.py
```
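If you want a feel for what the example exercises before running it, the sketch below shows a typical add → cognify → search flow. It is an illustrative approximation using cognee's public API names, not the actual contents of `simple_example.py`; treat the script itself as the source of truth.
```python
# Illustrative sketch only (assumed public API: cognee.add / cognee.cognify /
# cognee.search); see examples/python/simple_example.py for the real code.
import asyncio

import cognee


async def main():
    await cognee.add("Natural language processing is a subfield of computer science.")
    await cognee.cognify()  # build the knowledge graph from the added text
    results = await cognee.search("Tell me about NLP")
    for result in results:
        print(result)


asyncio.run(main())
```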
## 4. 📤 Submitting Changes
1. Install ruff on your system

View file

@@ -34,6 +34,7 @@ class LLMProvider(Enum):
    GEMINI = "gemini"
    MISTRAL = "mistral"
    BEDROCK = "bedrock"
    LLAMA_CPP = "llama_cpp"


def get_llm_client(raise_api_key_error: bool = True):
@@ -187,5 +188,28 @@ def get_llm_client(raise_api_key_error: bool = True):
            instructor_mode=llm_config.llm_instructor_mode.lower(),
        )
    elif provider == LLMProvider.LLAMA_CPP:
        from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llama_cpp.adapter import (
            LlamaCppAPIAdapter,
        )

        # Optional local-mode parameters: model_path is None unless configured,
        # the others fall back to the defaults below.
        model_path = getattr(llm_config, "llama_cpp_model_path", None)
        n_ctx = getattr(llm_config, "llama_cpp_n_ctx", 2048)
        n_gpu_layers = getattr(llm_config, "llama_cpp_n_gpu_layers", 0)
        chat_format = getattr(llm_config, "llama_cpp_chat_format", "chatml")

        return LlamaCppAPIAdapter(
            model=llm_config.llm_model,
            max_completion_tokens=max_completion_tokens,
            instructor_mode=llm_config.llm_instructor_mode.lower(),
            endpoint=llm_config.llm_endpoint,
            api_key=llm_config.llm_api_key,
            model_path=model_path,
            n_ctx=n_ctx,
            n_gpu_layers=n_gpu_layers,
            chat_format=chat_format,
        )
    else:
        raise UnsupportedLLMProviderError(provider)
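To see how this branch gets selected in practice, here is a hedged configuration sketch. The environment variable names (`LLM_PROVIDER`, `LLM_ENDPOINT`, `LLM_API_KEY`, `LLM_MODEL`) and the import path of `get_llm_client` are assumptions inferred from the `llm_config` fields used above; consult cognee's `.env.example` for the authoritative names.
```python
# Hedged sketch: selecting the new llama_cpp provider through cognee's LLM
# config. Variable names and the import path are assumptions, not confirmed
# by this commit.
import os

os.environ["LLM_PROVIDER"] = "llama_cpp"                  # matches LLMProvider.LLAMA_CPP
os.environ["LLM_ENDPOINT"] = "http://localhost:8000/v1"   # a running llama-cpp-python server
os.environ["LLM_API_KEY"] = "not-needed-locally"          # placeholder; local servers often ignore it
os.environ["LLM_MODEL"] = "local-model"                   # placeholder model name

from cognee.infrastructure.llm.get_llm_client import get_llm_client  # import path assumed

client = get_llm_client()  # returns a LlamaCppAPIAdapter in server mode
```
With `llama_cpp_model_path` left unset, the `getattr` defaults above leave `model_path` as `None`, so the adapter is constructed in server mode, which is the path exercised here.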

View file

@@ -0,0 +1,191 @@
"""Adapter for Instructor-backed Structured Output Framework for Llama CPP"""

import litellm
import logging
import instructor
from typing import Type, Optional
from openai import AsyncOpenAI
from pydantic import BaseModel

from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llm_interface import (
    LLMInterface,
)
from cognee.shared.logging_utils import get_logger
from cognee.shared.rate_limiting import llm_rate_limiter_context_manager
from tenacity import (
    retry,
    stop_after_delay,
    wait_exponential_jitter,
    retry_if_not_exception_type,
    before_sleep_log,
)

logger = get_logger()


class LlamaCppAPIAdapter(LLMInterface):
    """
    Adapter for the Llama CPP LLM provider with support for TWO modes:

    1. SERVER MODE (OpenAI-compatible):
       - Connects to a llama-cpp-python server via HTTP (local or remote)
       - Uses instructor.from_openai()
       - Requires: endpoint, api_key, model

    2. LOCAL MODE (in-process):
       - Loads the model directly using the llama-cpp-python library
       - Uses instructor.patch() on the llama_cpp.Llama object
       - Requires: model_path

    Public methods:
    - acreate_structured_output

    Instance variables:
    - name
    - model (for server mode) or model_path (for local mode)
    - mode_type: "server" or "local"
    - max_completion_tokens
    - aclient
    """

    name: str
    model: Optional[str]
    model_path: Optional[str]
    mode_type: str  # "server" or "local"

    default_instructor_mode = instructor.Mode.JSON

    def __init__(
        self,
        name: str = "LlamaCpp",
        max_completion_tokens: int = 2048,
        instructor_mode: Optional[str] = None,
        # Server mode parameters
        endpoint: Optional[str] = None,
        api_key: Optional[str] = None,
        model: Optional[str] = None,
        # Local mode parameters
        model_path: Optional[str] = None,
        n_ctx: int = 2048,
        n_gpu_layers: int = 0,
        chat_format: str = "chatml",
    ):
        self.name = name
        self.max_completion_tokens = max_completion_tokens
        self.instructor_mode = instructor_mode if instructor_mode else self.default_instructor_mode

        # Determine which mode to use
        if model_path:
            self._init_local_mode(model_path, n_ctx, n_gpu_layers, chat_format)
        elif endpoint:
            self._init_server_mode(endpoint, api_key, model)
        else:
            raise ValueError(
                "Must provide either 'model_path' (for local mode) or 'endpoint' (for server mode)"
            )

    def _init_local_mode(self, model_path: str, n_ctx: int, n_gpu_layers: int, chat_format: str):
        """Initialize local mode using the llama-cpp-python library directly"""
        try:
            import llama_cpp
        except ImportError:
            raise ImportError(
                "llama-cpp-python is not installed. Install with: pip install llama-cpp-python"
            )

        logger.info(f"Initializing LlamaCpp in LOCAL mode with model: {model_path}")

        self.mode_type = "local"
        self.model_path = model_path
        self.model = None

        # Initialize llama-cpp-python with the model
        self.llama = llama_cpp.Llama(
            model_path=model_path,
            n_gpu_layers=n_gpu_layers,  # -1 for all layers on GPU, 0 for CPU only
            chat_format=chat_format,
            n_ctx=n_ctx,
            verbose=False,
        )

        self.aclient = instructor.patch(
            create=self.llama.create_chat_completion_openai_v1,
            mode=instructor.Mode(self.instructor_mode),
        )

    def _init_server_mode(self, endpoint: str, api_key: Optional[str], model: Optional[str]):
        """Initialize server mode connecting to a llama-cpp-python server"""
        logger.info(f"Initializing LlamaCpp in SERVER mode with endpoint: {endpoint}")

        self.mode_type = "server"
        self.model = model
        self.model_path = None
        self.endpoint = endpoint
        self.api_key = api_key

        # Use instructor.from_openai() for server mode (OpenAI-compatible API)
        self.aclient = instructor.from_openai(
            AsyncOpenAI(base_url=self.endpoint, api_key=self.api_key),
            mode=instructor.Mode(self.instructor_mode),
        )

    @retry(
        stop=stop_after_delay(128),
        wait=wait_exponential_jitter(8, 128),
        retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
        before_sleep=before_sleep_log(logger, logging.DEBUG),
        reraise=True,
    )
    async def acreate_structured_output(
        self, text_input: str, system_prompt: str, response_model: Type[BaseModel], **kwargs
    ) -> BaseModel:
        """
        Generate a structured output from the LLM using the provided text and system prompt.
        Works transparently in both local and server modes.

        Parameters:
        -----------
        - text_input (str): The input text provided by the user.
        - system_prompt (str): The system prompt that guides the response generation.
        - response_model (Type[BaseModel]): The model type that the response should conform to.

        Returns:
        --------
        - BaseModel: A structured output that conforms to the specified response model.
        """
        async with llm_rate_limiter_context_manager():
            # Standard message order: system prompt first, then the user input
            messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text_input},
            ]

            if self.mode_type == "server":
                # Server mode: use the async client with the OpenAI-compatible API
                response = await self.aclient.chat.completions.create(
                    model=self.model,
                    messages=messages,
                    response_model=response_model,
                    max_retries=2,
                    max_completion_tokens=self.max_completion_tokens,
                    **kwargs,
                )
            else:
                import asyncio

                # Local mode: instructor.patch() returns a SYNC callable
                # Per docs: https://python.useinstructor.com/integrations/llama-cpp-python/
                def _call_sync():
                    return self.aclient(
                        messages=messages,
                        response_model=response_model,
                        max_tokens=self.max_completion_tokens,
                        **kwargs,
                    )

                # Run the sync call in a thread pool so it does not block the event loop
                response = await asyncio.to_thread(_call_sync)

            return response
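For reference, a minimal usage sketch of the new adapter in both modes; the GGUF path, endpoint URL, API key, and model name are placeholders rather than values shipped in this commit, and the sketch assumes a model file on disk (local mode) or a running llama-cpp-python server (server mode).
```python
# Minimal usage sketch; paths, endpoint, and model name are placeholders.
import asyncio

from pydantic import BaseModel

from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llama_cpp.adapter import (
    LlamaCppAPIAdapter,
)


class Summary(BaseModel):
    title: str
    key_points: list[str]


# Local mode: loads the model in-process via llama-cpp-python
local_adapter = LlamaCppAPIAdapter(
    model_path="/models/your-model.Q4_K_M.gguf",  # placeholder GGUF path
    n_ctx=4096,
    n_gpu_layers=-1,  # offload all layers to the GPU; use 0 for CPU only
)

# Server mode: talks to an OpenAI-compatible llama-cpp-python server
server_adapter = LlamaCppAPIAdapter(
    endpoint="http://localhost:8000/v1",  # placeholder endpoint
    api_key="not-needed-locally",
    model="local-model",
)

result = asyncio.run(
    server_adapter.acreate_structured_output(
        text_input="llama.cpp runs quantized GGUF models on commodity hardware.",
        system_prompt="Extract a structured summary of the input.",
        response_model=Summary,
    )
)
print(result)
```
In server mode the structured call goes through `AsyncOpenAI` directly; in local mode the patched synchronous `create_chat_completion_openai_v1` is pushed onto a worker thread via `asyncio.to_thread`, so both paths stay awaitable.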

View file

@@ -104,6 +104,7 @@ anthropic = ["anthropic>=0.27"]
deepeval = ["deepeval>=3.0.1,<4"]
posthog = ["posthog>=3.5.0,<4"]
groq = ["groq>=0.8.0,<1.0.0"]
llama-cpp = ["llama-cpp-python[server]>=0.3.0,<1.0.0"]
chromadb = [
"chromadb>=0.6,<0.7",
"pypika==0.48.9",

uv.lock (generated, 9259 changes)
File diff suppressed because it is too large.