add support for structured outputs with llama cpp via instructor and litellm
This commit is contained in:
parent
27f2aa03b3
commit
d578971b60
5 changed files with 4912 additions and 4578 deletions
@@ -97,6 +97,21 @@ git checkout -b feature/your-feature-name
python cognee/cognee/tests/test_library.py
```

### Running Simple Example

Rename .env.example to .env and provide your OPENAI_API_KEY as the LLM_API_KEY value.
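
For reference, the resulting .env entry would look roughly like this (the key value is a placeholder, not a real credential):

```shell
# .env — placeholder value; substitute your real OpenAI API key
LLM_API_KEY="sk-..."
```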

Make sure to run `uv sync` in the root of the cloned folder, or set up a virtual environment, so that cognee's dependencies are available.

```shell
python cognee/cognee/examples/python/simple_example.py
```
or
```shell
uv run python cognee/cognee/examples/python/simple_example.py
```

## 4. 📤 Submitting Changes

1. Install ruff on your system

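In practice that presumably means something like the following before committing (the exact commands depend on the project's lint configuration):

```shell
pip install ruff   # or add it as a dev dependency with uv
ruff format .      # auto-format the codebase
ruff check .       # run the linter
```
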
@@ -34,6 +34,7 @@ class LLMProvider(Enum):
    GEMINI = "gemini"
    MISTRAL = "mistral"
    BEDROCK = "bedrock"
    LLAMA_CPP = "llama_cpp"


def get_llm_client(raise_api_key_error: bool = True):

@@ -187,5 +188,28 @@ def get_llm_client(raise_api_key_error: bool = True):
            instructor_mode=llm_config.llm_instructor_mode.lower(),
        )

    elif provider == LLMProvider.LLAMA_CPP:
        from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llama_cpp.adapter import (
            LlamaCppAPIAdapter,
        )

        # Optional local-mode parameters (model_path stays None unless configured; the rest use defaults)
        model_path = getattr(llm_config, "llama_cpp_model_path", None)
        n_ctx = getattr(llm_config, "llama_cpp_n_ctx", 2048)
        n_gpu_layers = getattr(llm_config, "llama_cpp_n_gpu_layers", 0)
        chat_format = getattr(llm_config, "llama_cpp_chat_format", "chatml")

        return LlamaCppAPIAdapter(
            model=llm_config.llm_model,
            max_completion_tokens=max_completion_tokens,
            instructor_mode=llm_config.llm_instructor_mode.lower(),
            endpoint=llm_config.llm_endpoint,
            api_key=llm_config.llm_api_key,
            model_path=model_path,
            n_ctx=n_ctx,
            n_gpu_layers=n_gpu_layers,
            chat_format=chat_format,
        )

    else:
        raise UnsupportedLLMProviderError(provider)
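
As a quick illustration of the two configuration paths this branch dispatches to, here is a minimal sketch instantiating the adapter directly; the model name, endpoint, and GGUF path are placeholders for illustration, not values from this commit:

```python
from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llama_cpp.adapter import (
    LlamaCppAPIAdapter,
)

# Server mode: talk to a running llama-cpp-python server over its
# OpenAI-compatible API (endpoint and model name are placeholders).
server_adapter = LlamaCppAPIAdapter(
    model="llama-3-8b-instruct",
    endpoint="http://localhost:8000/v1",
    api_key="sk-no-key-required",
)

# Local mode: load a GGUF file in-process; in the constructor, model_path
# takes precedence over endpoint (path is a placeholder).
local_adapter = LlamaCppAPIAdapter(
    model_path="/models/llama-3-8b-instruct.Q4_K_M.gguf",
    n_ctx=4096,
    n_gpu_layers=-1,  # offload all layers to the GPU; 0 keeps everything on CPU
)
```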

@@ -0,0 +1,191 @@
"""Adapter for Instructor-backed Structured Output Framework for Llama CPP"""

import litellm
import logging
import instructor
from typing import Type, Optional
from openai import AsyncOpenAI
from pydantic import BaseModel

from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llm_interface import (
    LLMInterface,
)
from cognee.shared.logging_utils import get_logger
from cognee.shared.rate_limiting import llm_rate_limiter_context_manager

from tenacity import (
    retry,
    stop_after_delay,
    wait_exponential_jitter,
    retry_if_not_exception_type,
    before_sleep_log,
)

logger = get_logger()


class LlamaCppAPIAdapter(LLMInterface):
    """
    Adapter for the Llama CPP LLM provider with support for two modes:

    1. SERVER MODE (OpenAI-compatible):
       - Connects to a llama-cpp-python server via HTTP (local or remote)
       - Uses instructor.from_openai()
       - Requires: endpoint, api_key, model

    2. LOCAL MODE (in-process):
       - Loads the model directly using the llama-cpp-python library
       - Uses instructor.patch() on the llama.Llama object
       - Requires: model_path

    Public methods:

    - acreate_structured_output

    Instance variables:

    - name
    - model (for server mode) or model_path (for local mode)
    - mode_type: "server" or "local"
    - max_completion_tokens
    - aclient
    """

    name: str
    model: Optional[str]
    model_path: Optional[str]
    mode_type: str  # "server" or "local"
    default_instructor_mode = instructor.Mode.JSON

    def __init__(
        self,
        name: str = "LlamaCpp",
        max_completion_tokens: int = 2048,
        instructor_mode: Optional[str] = None,
        # Server mode parameters
        endpoint: Optional[str] = None,
        api_key: Optional[str] = None,
        model: Optional[str] = None,
        # Local mode parameters
        model_path: Optional[str] = None,
        n_ctx: int = 2048,
        n_gpu_layers: int = 0,
        chat_format: str = "chatml",
    ):
        self.name = name
        self.max_completion_tokens = max_completion_tokens
        self.instructor_mode = instructor_mode if instructor_mode else self.default_instructor_mode

        # Determine which mode to use: a model_path selects local mode,
        # otherwise an endpoint selects server mode
        if model_path:
            self._init_local_mode(model_path, n_ctx, n_gpu_layers, chat_format)
        elif endpoint:
            self._init_server_mode(endpoint, api_key, model)
        else:
            raise ValueError(
                "Must provide either 'model_path' (for local mode) or 'endpoint' (for server mode)"
            )

    def _init_local_mode(self, model_path: str, n_ctx: int, n_gpu_layers: int, chat_format: str):
        """Initialize local mode using the llama-cpp-python library directly."""
        try:
            import llama_cpp
        except ImportError:
            raise ImportError(
                "llama-cpp-python is not installed. Install with: pip install llama-cpp-python"
            )

        logger.info(f"Initializing LlamaCpp in LOCAL mode with model: {model_path}")

        self.mode_type = "local"
        self.model_path = model_path
        self.model = None

        # Initialize llama-cpp-python with the model
        self.llama = llama_cpp.Llama(
            model_path=model_path,
            n_gpu_layers=n_gpu_layers,  # -1 for all layers on GPU, 0 for CPU only
            chat_format=chat_format,
            n_ctx=n_ctx,
            verbose=False,
        )

        self.aclient = instructor.patch(
            create=self.llama.create_chat_completion_openai_v1,
            mode=instructor.Mode(self.instructor_mode),
        )

    def _init_server_mode(self, endpoint: str, api_key: Optional[str], model: Optional[str]):
        """Initialize server mode, connecting to a llama-cpp-python server."""
        logger.info(f"Initializing LlamaCpp in SERVER mode with endpoint: {endpoint}")

        self.mode_type = "server"
        self.model = model
        self.model_path = None
        self.endpoint = endpoint
        self.api_key = api_key

        # Use instructor.from_openai() for server mode (OpenAI-compatible API)
        self.aclient = instructor.from_openai(
            AsyncOpenAI(base_url=self.endpoint, api_key=self.api_key),
            mode=instructor.Mode(self.instructor_mode),
        )

    @retry(
        stop=stop_after_delay(128),
        wait=wait_exponential_jitter(8, 128),
        retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
        before_sleep=before_sleep_log(logger, logging.DEBUG),
        reraise=True,
    )
    async def acreate_structured_output(
        self, text_input: str, system_prompt: str, response_model: Type[BaseModel], **kwargs
    ) -> BaseModel:
        """
        Generate a structured output from the LLM using the provided text and system prompt.

        Works transparently in both local and server modes.

        Parameters:
        -----------

        - text_input (str): The input text provided by the user.
        - system_prompt (str): The system prompt that guides the response generation.
        - response_model (Type[BaseModel]): The model type that the response should conform to.

        Returns:
        --------

        - BaseModel: A structured output that conforms to the specified response model.
        """
        async with llm_rate_limiter_context_manager():
            # Prepare messages: system prompt first, then the user input
            messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text_input},
            ]

            if self.mode_type == "server":
                # Server mode: use the async client with the OpenAI-compatible API
                response = await self.aclient.chat.completions.create(
                    model=self.model,
                    messages=messages,
                    response_model=response_model,
                    max_retries=2,
                    max_completion_tokens=self.max_completion_tokens,
                    **kwargs,
                )
            else:
                import asyncio

                # Local mode: instructor.patch() returns a SYNC callable
                # (see https://python.useinstructor.com/integrations/llama-cpp-python/)
                def _call_sync():
                    return self.aclient(
                        messages=messages,
                        response_model=response_model,
                        max_tokens=self.max_completion_tokens,
                        **kwargs,
                    )

                # Run the sync call in a thread pool to avoid blocking the event loop
                response = await asyncio.to_thread(_call_sync)

            return response
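
To show how the adapter is driven end to end, here is a hedged usage sketch; the Pydantic model, prompts, endpoint, and model name are all illustrative, not part of this commit:

```python
import asyncio

from pydantic import BaseModel

from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llama_cpp.adapter import (
    LlamaCppAPIAdapter,
)


class PersonInfo(BaseModel):
    """Illustrative response model for the structured-output call."""

    name: str
    age: int


async def main():
    # Placeholder endpoint and model name for a local llama-cpp-python server.
    adapter = LlamaCppAPIAdapter(
        endpoint="http://localhost:8000/v1",
        api_key="sk-no-key-required",
        model="llama-3-8b-instruct",
    )
    person = await adapter.acreate_structured_output(
        text_input="John Doe is thirty years old.",
        system_prompt="Extract the person's name and age.",
        response_model=PersonInfo,
    )
    print(person)  # expected shape: PersonInfo(name='John Doe', age=30)


asyncio.run(main())
```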

@@ -104,6 +104,7 @@ anthropic = ["anthropic>=0.27"]
deepeval = ["deepeval>=3.0.1,<4"]
posthog = ["posthog>=3.5.0,<4"]
groq = ["groq>=0.8.0,<1.0.0"]
llama-cpp = ["llama-cpp-python[server]>=0.3.0,<1.0.0"]
chromadb = [
    "chromadb>=0.6,<0.7",
    "pypika==0.48.9",
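
With this extra in place, pulling in the optional dependency presumably looks like one of the following, depending on how cognee is installed (the extra name comes from the pyproject entry above):

```shell
uv sync --extra llama-cpp          # from a cloned repo
pip install "cognee[llama-cpp]"    # if installing the published package
```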