add support for structured outputs with llama.cpp via instructor and litellm

This commit is contained in:
dgarnitz 2025-12-30 16:20:56 -08:00
parent 27f2aa03b3
commit d578971b60
5 changed files with 4912 additions and 4578 deletions

View file

@@ -97,6 +97,21 @@ git checkout -b feature/your-feature-name
python cognee/cognee/tests/test_library.py
```
### Running the Simple Example
Rename `.env.example` to `.env` and provide your `OPENAI_API_KEY` as `LLM_API_KEY`.
Make sure to run `uv sync` in the root of the cloned repository, or set up a virtual environment, so that cognee's dependencies are installed.
```shell
python cognee/cognee/examples/python/simple_example.py
```
or
```shell
uv run python cognee/cognee/examples/python/simple_example.py
```
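If you want a feel for what the example exercises before running it, the sketch below shows a typical add → cognify → search flow. It is an illustrative approximation using cognee's public API names, not the actual contents of `simple_example.py`; treat the script itself as the source of truth.
```python
# Illustrative sketch only (assumed public API: cognee.add / cognee.cognify /
# cognee.search); see examples/python/simple_example.py for the real code.
import asyncio

import cognee


async def main():
    await cognee.add("Natural language processing is a subfield of computer science.")
    await cognee.cognify()  # build the knowledge graph from the added text
    results = await cognee.search("Tell me about NLP")
    for result in results:
        print(result)


asyncio.run(main())
```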
## 4. 📤 Submitting Changes
1. Install ruff on your system

View file

@@ -34,6 +34,7 @@ class LLMProvider(Enum):
    GEMINI = "gemini"
    MISTRAL = "mistral"
    BEDROCK = "bedrock"
    LLAMA_CPP = "llama_cpp"


def get_llm_client(raise_api_key_error: bool = True):
@@ -187,5 +188,28 @@ def get_llm_client(raise_api_key_error: bool = True):
            instructor_mode=llm_config.llm_instructor_mode.lower(),
        )
    elif provider == LLMProvider.LLAMA_CPP:
        from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llama_cpp.adapter import (
            LlamaCppAPIAdapter,
        )

        # Optional local-mode parameters: model_path is None unless configured,
        # the others fall back to the defaults below.
        model_path = getattr(llm_config, "llama_cpp_model_path", None)
        n_ctx = getattr(llm_config, "llama_cpp_n_ctx", 2048)
        n_gpu_layers = getattr(llm_config, "llama_cpp_n_gpu_layers", 0)
        chat_format = getattr(llm_config, "llama_cpp_chat_format", "chatml")

        return LlamaCppAPIAdapter(
            model=llm_config.llm_model,
            max_completion_tokens=max_completion_tokens,
            instructor_mode=llm_config.llm_instructor_mode.lower(),
            endpoint=llm_config.llm_endpoint,
            api_key=llm_config.llm_api_key,
            model_path=model_path,
            n_ctx=n_ctx,
            n_gpu_layers=n_gpu_layers,
            chat_format=chat_format,
        )
    else:
        raise UnsupportedLLMProviderError(provider)
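To see how this branch gets selected in practice, here is a hedged configuration sketch. The environment variable names (`LLM_PROVIDER`, `LLM_ENDPOINT`, `LLM_API_KEY`, `LLM_MODEL`) and the import path of `get_llm_client` are assumptions inferred from the `llm_config` fields used above; consult cognee's `.env.example` for the authoritative names.
```python
# Hedged sketch: selecting the new llama_cpp provider through cognee's LLM
# config. Variable names and the import path are assumptions, not confirmed
# by this commit.
import os

os.environ["LLM_PROVIDER"] = "llama_cpp"                  # matches LLMProvider.LLAMA_CPP
os.environ["LLM_ENDPOINT"] = "http://localhost:8000/v1"   # a running llama-cpp-python server
os.environ["LLM_API_KEY"] = "not-needed-locally"          # placeholder; local servers often ignore it
os.environ["LLM_MODEL"] = "local-model"                   # placeholder model name

from cognee.infrastructure.llm.get_llm_client import get_llm_client  # import path assumed

client = get_llm_client()  # returns a LlamaCppAPIAdapter in server mode
```
With `llama_cpp_model_path` left unset, the `getattr` defaults above leave `model_path` as `None`, so the adapter is constructed in server mode, which is the path exercised here.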

View file

@@ -0,0 +1,191 @@
"""Adapter for Instructor-backed Structured Output Framework for Llama CPP"""

import litellm
import logging
import instructor
from typing import Type, Optional
from openai import AsyncOpenAI
from pydantic import BaseModel

from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llm_interface import (
    LLMInterface,
)
from cognee.shared.logging_utils import get_logger
from cognee.shared.rate_limiting import llm_rate_limiter_context_manager
from tenacity import (
    retry,
    stop_after_delay,
    wait_exponential_jitter,
    retry_if_not_exception_type,
    before_sleep_log,
)

logger = get_logger()


class LlamaCppAPIAdapter(LLMInterface):
    """
    Adapter for the Llama CPP LLM provider with support for TWO modes:

    1. SERVER MODE (OpenAI-compatible):
       - Connects to a llama-cpp-python server via HTTP (local or remote)
       - Uses instructor.from_openai()
       - Requires: endpoint, api_key, model

    2. LOCAL MODE (in-process):
       - Loads the model directly using the llama-cpp-python library
       - Uses instructor.patch() on the llama_cpp.Llama object
       - Requires: model_path

    Public methods:
    - acreate_structured_output

    Instance variables:
    - name
    - model (for server mode) or model_path (for local mode)
    - mode_type: "server" or "local"
    - max_completion_tokens
    - aclient
    """

    name: str
    model: Optional[str]
    model_path: Optional[str]
    mode_type: str  # "server" or "local"

    default_instructor_mode = instructor.Mode.JSON

    def __init__(
        self,
        name: str = "LlamaCpp",
        max_completion_tokens: int = 2048,
        instructor_mode: Optional[str] = None,
        # Server mode parameters
        endpoint: Optional[str] = None,
        api_key: Optional[str] = None,
        model: Optional[str] = None,
        # Local mode parameters
        model_path: Optional[str] = None,
        n_ctx: int = 2048,
        n_gpu_layers: int = 0,
        chat_format: str = "chatml",
    ):
        self.name = name
        self.max_completion_tokens = max_completion_tokens
        self.instructor_mode = instructor_mode if instructor_mode else self.default_instructor_mode

        # Determine which mode to use
        if model_path:
            self._init_local_mode(model_path, n_ctx, n_gpu_layers, chat_format)
        elif endpoint:
            self._init_server_mode(endpoint, api_key, model)
        else:
            raise ValueError(
                "Must provide either 'model_path' (for local mode) or 'endpoint' (for server mode)"
            )

    def _init_local_mode(self, model_path: str, n_ctx: int, n_gpu_layers: int, chat_format: str):
        """Initialize local mode using the llama-cpp-python library directly"""
        try:
            import llama_cpp
        except ImportError:
            raise ImportError(
                "llama-cpp-python is not installed. Install with: pip install llama-cpp-python"
            )

        logger.info(f"Initializing LlamaCpp in LOCAL mode with model: {model_path}")

        self.mode_type = "local"
        self.model_path = model_path
        self.model = None

        # Initialize llama-cpp-python with the model
        self.llama = llama_cpp.Llama(
            model_path=model_path,
            n_gpu_layers=n_gpu_layers,  # -1 for all layers on GPU, 0 for CPU only
            chat_format=chat_format,
            n_ctx=n_ctx,
            verbose=False,
        )

        self.aclient = instructor.patch(
            create=self.llama.create_chat_completion_openai_v1,
            mode=instructor.Mode(self.instructor_mode),
        )

    def _init_server_mode(self, endpoint: str, api_key: Optional[str], model: Optional[str]):
        """Initialize server mode connecting to a llama-cpp-python server"""
        logger.info(f"Initializing LlamaCpp in SERVER mode with endpoint: {endpoint}")

        self.mode_type = "server"
        self.model = model
        self.model_path = None
        self.endpoint = endpoint
        self.api_key = api_key

        # Use instructor.from_openai() for server mode (OpenAI-compatible API)
        self.aclient = instructor.from_openai(
            AsyncOpenAI(base_url=self.endpoint, api_key=self.api_key),
            mode=instructor.Mode(self.instructor_mode),
        )

    @retry(
        stop=stop_after_delay(128),
        wait=wait_exponential_jitter(8, 128),
        retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
        before_sleep=before_sleep_log(logger, logging.DEBUG),
        reraise=True,
    )
    async def acreate_structured_output(
        self, text_input: str, system_prompt: str, response_model: Type[BaseModel], **kwargs
    ) -> BaseModel:
        """
        Generate a structured output from the LLM using the provided text and system prompt.
        Works transparently in both local and server modes.

        Parameters:
        -----------
        - text_input (str): The input text provided by the user.
        - system_prompt (str): The system prompt that guides the response generation.
        - response_model (Type[BaseModel]): The model type that the response should conform to.

        Returns:
        --------
        - BaseModel: A structured output that conforms to the specified response model.
        """
        async with llm_rate_limiter_context_manager():
            # Standard message order: system prompt first, then the user input
            messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text_input},
            ]

            if self.mode_type == "server":
                # Server mode: use the async client with the OpenAI-compatible API
                response = await self.aclient.chat.completions.create(
                    model=self.model,
                    messages=messages,
                    response_model=response_model,
                    max_retries=2,
                    max_completion_tokens=self.max_completion_tokens,
                    **kwargs,
                )
            else:
                import asyncio

                # Local mode: instructor.patch() returns a SYNC callable
                # Per docs: https://python.useinstructor.com/integrations/llama-cpp-python/
                def _call_sync():
                    return self.aclient(
                        messages=messages,
                        response_model=response_model,
                        max_tokens=self.max_completion_tokens,
                        **kwargs,
                    )

                # Run the sync call in a thread pool so it does not block the event loop
                response = await asyncio.to_thread(_call_sync)

            return response
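For reference, a minimal usage sketch of the new adapter in both modes; the GGUF path, endpoint URL, API key, and model name are placeholders rather than values shipped in this commit, and the sketch assumes a model file on disk (local mode) or a running llama-cpp-python server (server mode).
```python
# Minimal usage sketch; paths, endpoint, and model name are placeholders.
import asyncio

from pydantic import BaseModel

from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llama_cpp.adapter import (
    LlamaCppAPIAdapter,
)


class Summary(BaseModel):
    title: str
    key_points: list[str]


# Local mode: loads the model in-process via llama-cpp-python
local_adapter = LlamaCppAPIAdapter(
    model_path="/models/your-model.Q4_K_M.gguf",  # placeholder GGUF path
    n_ctx=4096,
    n_gpu_layers=-1,  # offload all layers to the GPU; use 0 for CPU only
)

# Server mode: talks to an OpenAI-compatible llama-cpp-python server
server_adapter = LlamaCppAPIAdapter(
    endpoint="http://localhost:8000/v1",  # placeholder endpoint
    api_key="not-needed-locally",
    model="local-model",
)

result = asyncio.run(
    server_adapter.acreate_structured_output(
        text_input="llama.cpp runs quantized GGUF models on commodity hardware.",
        system_prompt="Extract a structured summary of the input.",
        response_model=Summary,
    )
)
print(result)
```
In server mode the structured call goes through `AsyncOpenAI` directly; in local mode the patched synchronous `create_chat_completion_openai_v1` is pushed onto a worker thread via `asyncio.to_thread`, so both paths stay awaitable.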

View file

@@ -104,6 +104,7 @@ anthropic = ["anthropic>=0.27"]
deepeval = ["deepeval>=3.0.1,<4"]
posthog = ["posthog>=3.5.0,<4"]
groq = ["groq>=0.8.0,<1.0.0"]
llama-cpp = ["llama-cpp-python[server]>=0.3.0,<1.0.0"]
chromadb = [
"chromadb>=0.6,<0.7",
"pypika==0.48.9",

uv.lock (generated, 9259 changes)
File diff suppressed because it is too large.