Compare commits
2 commits
main
...
ollama_mod
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c0cfcdd1f1 | ||
|
|
2ab5683303 |
2 changed files with 121 additions and 0 deletions
16
ollama_modal_deployment/ollama.service
Normal file
16
ollama_modal_deployment/ollama.service
Normal file
|
|
@ -0,0 +1,16 @@
|
||||||
|
# systemd unit for the Ollama server. Installed into the container image at
# /etc/systemd/system/ollama.service by the image definition in ollama_api.py.

[Unit]
Description=Ollama Service
# Start only after the network is up; Ollama binds a TCP port below.
After=network-online.target

[Service]
ExecStart=/usr/bin/ollama serve
# Run as the dedicated unprivileged user/group created during image build.
User=ollama
Group=ollama
# Restart the server unconditionally, 3 seconds after any exit.
Restart=always
RestartSec=3
# NOTE(review): systemd does NOT shell-expand variables in Environment= lines,
# so this sets PATH to the literal string "$PATH". The official Ollama installer
# substitutes the real PATH at install time — confirm this is intended here.
Environment="PATH=$PATH"
# Per Ollama's env-var docs: accept requests from any origin, and bind on all
# interfaces (port 11434) so the FastAPI proxy in the same container can reach it.
Environment="OLLAMA_ORIGINS=*"
Environment="OLLAMA_HOST=0.0.0.0:11434"

[Install]
WantedBy=default.target
105
ollama_modal_deployment/ollama_api.py
Normal file
105
ollama_modal_deployment/ollama_api.py
Normal file
|
|
@ -0,0 +1,105 @@
|
||||||
|
import modal
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import time
|
||||||
|
from fastapi import FastAPI, HTTPException
|
||||||
|
from typing import List, Any, Optional, Dict
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
import ollama
|
||||||
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
from fastapi import Request, Response
|
||||||
|
|
||||||
|
# Chat model to serve; overridable at deploy time via the MODEL env var.
MODEL = os.environ.get("MODEL", "deepseek-r1:70b")
# Embedding model; overridable via the EMBEDDING_MODEL env var.
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "avr/sfr-embedding-mistral")
|
||||||
|
|
||||||
|
|
||||||
|
def pull() -> None:
    """Start the Ollama systemd service and pre-pull the chat and embedding models.

    Runs at image-build time (via ``Image.run_function(pull)`` below) so the
    model weights are baked into the image instead of downloaded on cold start.

    Raises:
        subprocess.CalledProcessError: If either ``ollama pull`` fails.
        TimeoutError: Propagated from ``wait_for_ollama`` if the server never starts.
    """
    # Register and launch the unit installed at /etc/systemd/system/ollama.service.
    subprocess.run(["systemctl", "daemon-reload"])
    subprocess.run(["systemctl", "enable", "ollama"])
    subprocess.run(["systemctl", "start", "ollama"])
    # Pulling before the HTTP API is up would fail, so block until it answers.
    wait_for_ollama()
    # check=True is the fix: without it a failed pull was silently ignored and
    # the build produced an image missing its models. stdout is captured only
    # to keep the noisy progress output out of the build log.
    subprocess.run(["ollama", "pull", MODEL], stdout=subprocess.PIPE, check=True)
    subprocess.run(["ollama", "pull", EMBEDDING_MODEL], stdout=subprocess.PIPE, check=True)
|
||||||
|
|
||||||
|
|
||||||
|
def wait_for_ollama(timeout: int = 30, interval: int = 2) -> None:
    """Poll the local Ollama HTTP API until it is ready, or raise.

    Args:
        timeout: Maximum seconds to wait before giving up.
        interval: Seconds to sleep between polls.

    Raises:
        TimeoutError: If the service does not return HTTP 200 within ``timeout``.
    """
    import httpx
    from loguru import logger

    start_time = time.time()
    while True:
        try:
            response = httpx.get("http://localhost:11434/api/version")
            if response.status_code == 200:
                logger.info("Ollama service is ready")
                return
        except httpx.ConnectError:
            # Socket not open yet — fall through to the deadline check below.
            pass
        # BUG FIX: the deadline was previously checked only inside the
        # ConnectError handler, so a server that kept answering with a non-200
        # status made this loop forever. Check it on every iteration instead.
        if time.time() - start_time > timeout:
            raise TimeoutError("Ollama service failed to start")
        logger.info(f"Waiting for Ollama service... ({int(time.time() - start_time)}s)")
        time.sleep(interval)
|
||||||
|
|
||||||
|
|
||||||
|
# Container image: Debian slim with Ollama installed from the official Linux
# tarball and managed by systemd; models are pre-pulled at build time by pull().
image = (
    modal.Image.debian_slim()
    .apt_install("curl", "systemctl")
    .run_commands(  # from https://github.com/ollama/ollama/blob/main/docs/linux.md
        "curl -L https://ollama.com/download/ollama-linux-amd64.tgz -o ollama-linux-amd64.tgz",
        "tar -C /usr -xzf ollama-linux-amd64.tgz",
        # Dedicated unprivileged account the service runs as (see ollama.service).
        "useradd -r -s /bin/false -U -m -d /usr/share/ollama ollama",
        "usermod -a -G ollama $(whoami)",
    )
    # Install the unit file so systemctl can manage the Ollama server.
    .copy_local_file("ollama.service", "/etc/systemd/system/ollama.service")
    .pip_install("ollama", "httpx", "loguru", "fastapi")
    # .env({"OLLAMA_MODELS": "/persistent/ollama-models"})
    # .run_function(check_blobs_directory)
    # Bake the model weights into the image at build time (see pull()).
    .run_function(pull)
)
# Modal application plus the FastAPI app (the catch-all proxy defined below).
app = modal.App(name="ollama", image=image)
api = FastAPI()
|
||||||
|
|
||||||
|
|
||||||
|
@api.api_route("/{full_path:path}", methods=["GET", "POST", "PUT", "DELETE"])
async def proxy(full_path: str, request: Request):
    """Transparently forward any request to the Ollama server on localhost.

    Relays method, path, query string, headers and body to
    ``http://localhost:11434`` and mirrors the upstream response back.
    """
    # Construct the local Ollama endpoint URL
    local_url = f"http://localhost:11434/{full_path}"
    print(f"Forwarding {request.method} request to: {local_url}")  # Logging the target URL

    # FIX: do not forward the inbound Host header — it names the public Modal
    # endpoint, not localhost, and relaying it verbatim can break origin checks.
    forward_headers = [(k, v) for k, v in request.headers.raw if k.lower() != b"host"]

    # Forward the request (generous timeout: large model responses are slow).
    async with httpx.AsyncClient(timeout=httpx.Timeout(180.0)) as client:
        response = await client.request(
            method=request.method,
            url=local_url,
            headers=forward_headers,
            params=request.query_params,
            content=await request.body(),
        )

    print(f"Received response with status: {response.status_code}")  # Logging the response status
    # FIX: strip framing/encoding headers before mirroring. httpx has already
    # read and decoded the body, so the upstream Content-Length /
    # Transfer-Encoding / Content-Encoding no longer describe what we send;
    # relaying them can corrupt framing or make clients double-decompress.
    mirrored_headers = {
        k: v
        for k, v in response.headers.items()
        if k.lower() not in ("content-length", "transfer-encoding", "content-encoding", "connection")
    }
    return Response(
        content=response.content, status_code=response.status_code, headers=mirrored_headers
    )
|
||||||
|
|
||||||
|
|
||||||
|
@app.cls(
    gpu="L40S:1",
    # Keep the GPU container alive for 5 minutes after the last request.
    scaledown_window=5 * 60,
)
class Ollama:
    """Modal service class that runs the Ollama server and exposes the FastAPI proxy."""

    def __init__(self):
        # NOTE(review): this calls the serve() method directly at construction
        # time (it just returns `api`, and the return value is discarded).
        # Presumably a no-op — confirm this is intentional and not a leftover.
        self.serve()

    @modal.build()
    def build(self):
        # Image-build-time hook: register the Ollama unit with systemd so it
        # can be started when the container boots.
        subprocess.run(["systemctl", "daemon-reload"])
        subprocess.run(["systemctl", "enable", "ollama"])

    @modal.enter()
    def enter(self):
        # Container-start hook: launch Ollama and block until its API answers,
        # so the proxy never forwards to a server that isn't up yet.
        subprocess.run(["systemctl", "start", "ollama"])
        wait_for_ollama()
        # subprocess.run(["ollama", "pull", MODEL])

    @modal.asgi_app()
    def serve(self):
        # Expose the module-level FastAPI catch-all proxy as this class's ASGI app.
        return api
|
||||||
Loading…
Add table
Reference in a new issue