Support Structured Outputs with Llama CPP using LiteLLM & Instructor (#1949)
<!-- .github/pull_request_template.md --> ## Description This PR adds support for structured outputs with llama cpp using litellm and instructor. It returns a Pydantic instance. Based on the github issue described [here](https://github.com/topoteretes/cognee/issues/1947). It features the following: - works for both local and server modes (OpenAI api compatible) - defaults to `JSON` mode (**not JSON schema mode, which is too rigid**) - uses existing patterns around logging & tenacity decorator consistent with other adapters - Respects max_completion_tokens / max_tokens ## Acceptance Criteria <!-- * Key requirements to the new feature or modification; * Proof that the changes work and meet the requirements; * Include instructions on how to verify the changes. Describe how to test it locally; * Proof that it's sufficiently tested. --> I used the script below to test it with the [Phi-3-mini-4k-instruct model](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf). This tests a basic structured data extraction and a more complex one locally, then verifies that data extraction works in server mode. There are instructions in the script on how to set up the models. If you are testing this on a mac, run `brew install llama.cpp` to get llama cpp working locally. If you don't have Apple silicon chips, you will need to alter the script or the configs to run this on GPU. ``` """ Comprehensive test script for LlamaCppAPIAdapter - Tests LOCAL and SERVER modes SETUP INSTRUCTIONS: =================== 1. Download a small model (pick ONE): # Phi-3-mini (2.3GB, recommended - best balance) wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf # OR TinyLlama (1.1GB, smallest but lower quality) wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf 2. 
For SERVER mode tests, start a server: python -m llama_cpp.server --model ./Phi-3-mini-4k-instruct-q4.gguf --port 8080 --n_gpu_layers -1 """ import asyncio import os from pydantic import BaseModel from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llama_cpp.adapter import ( LlamaCppAPIAdapter, ) class Person(BaseModel): """Simple test model for person extraction""" name: str age: int class EntityExtraction(BaseModel): """Test model for entity extraction""" entities: list[str] summary: str # Configuration - UPDATE THESE PATHS MODEL_PATHS = [ "./Phi-3-mini-4k-instruct-q4.gguf", "./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf", ] def find_model() -> str: """Find the first available model file""" for path in MODEL_PATHS: if os.path.exists(path): return path return None async def test_local_mode(): """Test LOCAL mode (in-process, no server needed)""" print("=" * 70) print("Test 1: LOCAL MODE (In-Process)") print("=" * 70) model_path = find_model() if not model_path: print("❌ No model found! 
Download a model first:") print() return False print(f"Using model: {model_path}") try: adapter = LlamaCppAPIAdapter( name="LlamaCpp-Local", model_path=model_path, # Local mode parameter max_completion_tokens=4096, n_ctx=2048, n_gpu_layers=-1, # 0 for CPU, -1 for all GPU layers ) print(f"✓ Adapter initialized in {adapter.mode_type.upper()} mode") print(" Sending request...") result = await adapter.acreate_structured_output( text_input="John Smith is 30 years old", system_prompt="Extract the person's name and age.", response_model=Person, ) print(f"✅ Success!") print(f" Name: {result.name}") print(f" Age: {result.age}") print() return True except ImportError as e: print(f"❌ ImportError: {e}") print(" Install llama-cpp-python: pip install llama-cpp-python") print() return False except Exception as e: print(f"❌ Failed: {e}") print() return False async def test_server_mode(): """Test SERVER mode (localhost HTTP endpoint)""" print("=" * 70) print("Test 3: SERVER MODE (Localhost HTTP)") print("=" * 70) try: adapter = LlamaCppAPIAdapter( name="LlamaCpp-Server", endpoint="http://localhost:8080/v1", # Server mode parameter api_key="dummy", model="Phi-3-mini-4k-instruct-q4.gguf", max_completion_tokens=1024, chat_format="phi-3" ) print(f"✓ Adapter initialized in {adapter.mode_type.upper()} mode") print(f" Endpoint: {adapter.endpoint}") print(" Sending request...") result = await adapter.acreate_structured_output( text_input="Sarah Johnson is 25 years old", system_prompt="Extract the person's name and age.", response_model=Person, ) print(f"✅ Success!") print(f" Name: {result.name}") print(f" Age: {result.age}") print() return True except Exception as e: print(f"❌ Failed: {e}") print(" Make sure llama-cpp-python server is running on port 8080:") print(" python -m llama_cpp.server --model your-model.gguf --port 8080") print() return False async def test_entity_extraction_local(): """Test more complex extraction with local mode""" print("=" * 70) print("Test 2: Complex Entity 
Extraction (Local Mode)") print("=" * 70) model_path = find_model() if not model_path: print("❌ No model found!") print() return False try: adapter = LlamaCppAPIAdapter( name="LlamaCpp-Local", model_path=model_path, max_completion_tokens=1024, n_ctx=2048, n_gpu_layers=-1, ) print(f"✓ Adapter initialized") print(" Sending complex extraction request...") result = await adapter.acreate_structured_output( text_input="Natural language processing (NLP) is a subfield of artificial intelligence (AI) and computer science.", system_prompt="Extract all technical entities mentioned and provide a brief summary.", response_model=EntityExtraction, ) print(f"✅ Success!") print(f" Entities: {', '.join(result.entities)}") print(f" Summary: {result.summary}") print() return True except Exception as e: print(f"❌ Failed: {e}") print() return False async def main(): """Run all tests""" print("\n" + "🦙" * 35) print("Llama CPP Adapter - Comprehensive Test Suite") print("Testing LOCAL and SERVER modes") print("🦙" * 35 + "\n") results = {} # Test 1: Local mode (no server needed) print("=" * 70) print("PHASE 1: Testing LOCAL mode (in-process)") print("=" * 70) print() results["local_basic"] = await test_local_mode() results["local_complex"] = await test_entity_extraction_local() # Test 2: Server mode (requires server on 8080) print("\n" + "=" * 70) print("PHASE 2: Testing SERVER mode (requires server running)") print("=" * 70) print() results["server"] = await test_server_mode() # Summary print("\n" + "=" * 70) print("TEST SUMMARY") print("=" * 70) for test_name, passed in results.items(): status = "✅ PASSED" if passed else "❌ FAILED" print(f" {test_name:20s}: {status}") passed_count = sum(results.values()) total_count = len(results) print() print(f"Total: {passed_count}/{total_count} tests passed") if passed_count == total_count: print("\n🎉 All tests passed! The adapter is working correctly.") elif results.get("local_basic"): print("\n✓ Local mode works! 
Server/cloud tests need llama-cpp-python server running.") else: print("\n⚠️ Please check setup instructions at the top of this file.") if __name__ == "__main__": asyncio.run(main()) ``` **The following screenshots show the tests passing** <img width="622" height="149" alt="image" src="https://github.com/user-attachments/assets/9df02f66-39a9-488a-96a6-dc79b47e3001" /> Test 1 <img width="939" height="750" alt="image" src="https://github.com/user-attachments/assets/87759189-8fd2-450f-af7f-0364101a5690" /> Test 2 <img width="938" height="746" alt="image" src="https://github.com/user-attachments/assets/61e423c0-3d41-4fde-acaf-ae77c3463d66" /> Test 3 <img width="944" height="232" alt="image" src="https://github.com/user-attachments/assets/f7302777-2004-447c-a2fe-b12762241ba9" /> **note** I also tried to test it with the `TinyLlama-1.1B-Chat` model but such a small model is bad at producing structured JSON consistently. ## Type of Change <!-- Please check the relevant option --> - [ ] Bug fix (non-breaking change that fixes an issue) - [ X] New feature (non-breaking change that adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to change) - [ ] Documentation update - [ ] Code refactoring - [ ] Performance improvement - [ ] Other (please specify): ## Screenshots/Videos (if applicable) see above ## Pre-submission Checklist <!-- Please check all boxes that apply before submitting your PR --> - [X] **I have tested my changes thoroughly before submitting this PR** - [X] **This PR contains minimal changes necessary to address the issue/feature** - [X] My code follows the project's coding standards and style guidelines - [X] I have added tests that prove my fix is effective or that my feature works - [X] I have added necessary documentation (if applicable) - [X] All new and existing tests pass - [X] I have searched existing PRs to ensure this change hasn't been submitted already - [X] I have linked any relevant issues in the 
description - [X] My commits have clear and descriptive messages ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **New Features** * Llama CPP integration supporting local (in-process) and server (OpenAI‑compatible) modes. * Selectable provider with configurable model path, context size, GPU layers, and chat format. * Asynchronous structured-output generation with rate limiting, retries/backoff, and debug logging. * **Chores** * Added llama-cpp-python dependency and bumped project version. * **Documentation** * CONTRIBUTING updated with a “Running Simple Example” walkthrough for local/server usage. <sub>✏️ Tip: You can customize this high-level summary in your review settings.</sub> <!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
commit
e5341c5f49
5 changed files with 4911 additions and 4577 deletions
|
|
@ -97,6 +97,21 @@ git checkout -b feature/your-feature-name
|
|||
python cognee/cognee/tests/test_library.py
|
||||
```
|
||||
|
||||
### Running Simple Example
|
||||
|
||||
Copy `.env.example` to `.env` and provide your OpenAI API key (`OPENAI_API_KEY`) as the `LLM_API_KEY` value
|
||||
|
||||
Make sure to run `uv sync` in the root of the cloned folder, or set up a virtual environment, to run cognee
|
||||
|
||||
```shell
|
||||
python cognee/cognee/examples/python/simple_example.py
|
||||
```
|
||||
or
|
||||
|
||||
```shell
|
||||
uv run python cognee/cognee/examples/python/simple_example.py
|
||||
```
|
||||
|
||||
## 4. 📤 Submitting Changes
|
||||
|
||||
1. Install ruff on your system
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@ class LLMProvider(Enum):
|
|||
GEMINI = "gemini"
|
||||
MISTRAL = "mistral"
|
||||
BEDROCK = "bedrock"
|
||||
LLAMA_CPP = "llama_cpp"
|
||||
|
||||
|
||||
def get_llm_client(raise_api_key_error: bool = True):
|
||||
|
|
@ -187,5 +188,28 @@ def get_llm_client(raise_api_key_error: bool = True):
|
|||
instructor_mode=llm_config.llm_instructor_mode.lower(),
|
||||
)
|
||||
|
||||
|
||||
elif provider == LLMProvider.LLAMA_CPP:
|
||||
from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llama_cpp.adapter import (
|
||||
LlamaCppAPIAdapter,
|
||||
)
|
||||
|
||||
# Get optional local mode parameters (will be None if not set)
|
||||
model_path = getattr(llm_config, 'llama_cpp_model_path', None)
|
||||
n_ctx = getattr(llm_config, 'llama_cpp_n_ctx', 2048)
|
||||
n_gpu_layers = getattr(llm_config, 'llama_cpp_n_gpu_layers', 0)
|
||||
chat_format = getattr(llm_config, 'llama_cpp_chat_format', 'chatml')
|
||||
|
||||
return LlamaCppAPIAdapter(
|
||||
model=llm_config.llm_model,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
instructor_mode=llm_config.llm_instructor_mode.lower(),
|
||||
endpoint=llm_config.llm_endpoint,
|
||||
api_key=llm_config.llm_api_key,
|
||||
model_path=model_path,
|
||||
n_ctx=n_ctx,
|
||||
n_gpu_layers=n_gpu_layers,
|
||||
chat_format=chat_format,
|
||||
)
|
||||
else:
|
||||
raise UnsupportedLLMProviderError(provider)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,191 @@
|
|||
"""Adapter for Instructor-backed Structured Output Framework for Llama CPP"""
|
||||
|
||||
import asyncio
import logging
from typing import Optional, Type

import instructor
import litellm
from openai import AsyncOpenAI
from pydantic import BaseModel
from tenacity import (
    before_sleep_log,
    retry,
    retry_if_not_exception_type,
    stop_after_delay,
    wait_exponential_jitter,
)

from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llm_interface import (
    LLMInterface,
)
from cognee.shared.logging_utils import get_logger
from cognee.shared.rate_limiting import llm_rate_limiter_context_manager
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
|
||||
class LlamaCppAPIAdapter(LLMInterface):
    """
    Adapter for Llama CPP LLM provider with support for TWO modes:

    1. SERVER MODE (OpenAI-compatible):
       - Connects to llama-cpp-python server via HTTP (local or remote)
       - Uses instructor.from_openai()
       - Requires: endpoint, api_key, model

    2. LOCAL MODE (In-process):
       - Loads model directly using llama-cpp-python library
       - Uses instructor.patch() on llama.Llama object
       - Requires: model_path

    Public methods:
    - acreate_structured_output

    Instance variables:
    - name
    - model (for server mode) or model_path (for local mode)
    - mode_type: "server" or "local"
    - max_completion_tokens
    - aclient
    """

    name: str
    model: Optional[str]
    model_path: Optional[str]
    mode_type: str  # "server" or "local"
    # JSON mode (not JSON-schema mode) is the default; it is more forgiving
    # for small local models that struggle with rigid schema constraints.
    default_instructor_mode = instructor.Mode.JSON

    def __init__(
        self,
        name: str = "LlamaCpp",
        max_completion_tokens: int = 2048,
        instructor_mode: Optional[str] = None,
        # Server mode parameters
        endpoint: Optional[str] = None,
        api_key: Optional[str] = None,
        model: Optional[str] = None,
        # Local mode parameters
        model_path: Optional[str] = None,
        n_ctx: int = 2048,
        n_gpu_layers: int = 0,
        chat_format: str = "chatml",
    ):
        """
        Initialize the adapter in either local or server mode.

        Local mode is selected when `model_path` is provided (it takes
        precedence over `endpoint`); server mode is selected when only
        `endpoint` is provided.

        Raises:
            ValueError: if neither `model_path` nor `endpoint` is given.
        """
        self.name = name
        self.max_completion_tokens = max_completion_tokens
        self.instructor_mode = instructor_mode if instructor_mode else self.default_instructor_mode

        # Determine which mode to use (model_path wins when both are set)
        if model_path:
            self._init_local_mode(model_path, n_ctx, n_gpu_layers, chat_format)
        elif endpoint:
            self._init_server_mode(endpoint, api_key, model)
        else:
            raise ValueError(
                "Must provide either 'model_path' (for local mode) or 'endpoint' (for server mode)"
            )

    def _init_local_mode(self, model_path: str, n_ctx: int, n_gpu_layers: int, chat_format: str):
        """Initialize local mode using llama-cpp-python library directly."""
        try:
            # Imported lazily so server-mode users don't need llama-cpp-python installed
            import llama_cpp
        except ImportError:
            raise ImportError(
                "llama-cpp-python is not installed. Install with: pip install llama-cpp-python"
            )

        logger.info(f"Initializing LlamaCpp in LOCAL mode with model: {model_path}")

        self.mode_type = "local"
        self.model_path = model_path
        self.model = None

        # Initialize llama-cpp-python with the model
        self.llama = llama_cpp.Llama(
            model_path=model_path,
            n_gpu_layers=n_gpu_layers,  # -1 for all GPU, 0 for CPU only
            chat_format=chat_format,
            n_ctx=n_ctx,
            verbose=False,
        )

        # instructor.patch() wraps the (synchronous) OpenAI-compatible chat callable
        self.aclient = instructor.patch(
            create=self.llama.create_chat_completion_openai_v1,
            mode=instructor.Mode(self.instructor_mode),
        )

    def _init_server_mode(self, endpoint: str, api_key: Optional[str], model: Optional[str]):
        """Initialize server mode connecting to llama-cpp-python server."""
        logger.info(f"Initializing LlamaCpp in SERVER mode with endpoint: {endpoint}")

        self.mode_type = "server"
        self.model = model
        self.model_path = None
        self.endpoint = endpoint
        self.api_key = api_key

        # Use instructor.from_openai() for server mode (OpenAI-compatible API)
        self.aclient = instructor.from_openai(
            AsyncOpenAI(base_url=self.endpoint, api_key=self.api_key),
            mode=instructor.Mode(self.instructor_mode),
        )

    @retry(
        stop=stop_after_delay(128),
        wait=wait_exponential_jitter(8, 128),
        retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
        before_sleep=before_sleep_log(logger, logging.DEBUG),
        reraise=True,
    )
    async def acreate_structured_output(
        self, text_input: str, system_prompt: str, response_model: Type[BaseModel], **kwargs
    ) -> BaseModel:
        """
        Generate a structured output from the LLM using the provided text and system prompt.

        Works in both local and server modes transparently.

        Parameters:
        -----------
        - text_input (str): The input text provided by the user.
        - system_prompt (str): The system prompt that guides the response generation.
        - response_model (Type[BaseModel]): The model type that the response should conform to.

        Returns:
        --------
        - BaseModel: A structured output that conforms to the specified response model.
        """
        async with llm_rate_limiter_context_manager():
            # Prepare messages (system first, then user is more standard)
            messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text_input},
            ]

            if self.mode_type == "server":
                # Server mode: use async client with OpenAI-compatible API
                response = await self.aclient.chat.completions.create(
                    model=self.model,
                    messages=messages,
                    response_model=response_model,
                    max_retries=2,
                    max_completion_tokens=self.max_completion_tokens,
                    **kwargs,
                )
            else:
                # Local mode: instructor.patch() returns a SYNC callable
                # Per docs: https://python.useinstructor.com/integrations/llama-cpp-python/
                def _call_sync():
                    return self.aclient(
                        messages=messages,
                        response_model=response_model,
                        max_tokens=self.max_completion_tokens,
                        **kwargs,
                    )

                # Run the blocking call in a worker thread so the event loop
                # stays responsive (asyncio is imported at module level).
                response = await asyncio.to_thread(_call_sync)

            return response
|
||||
|
|
@ -104,6 +104,7 @@ anthropic = ["anthropic>=0.27"]
|
|||
deepeval = ["deepeval>=3.0.1,<4"]
|
||||
posthog = ["posthog>=3.5.0,<4"]
|
||||
groq = ["groq>=0.8.0,<1.0.0"]
|
||||
llama-cpp = ["llama-cpp-python[server]>=0.3.0,<1.0.0"]
|
||||
chromadb = [
|
||||
"chromadb>=0.6,<0.7",
|
||||
"pypika==0.48.9",
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue