Compare commits
2 commits
main
...
ollama_mod
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c0cfcdd1f1 | ||
|
|
2ab5683303 |
2 changed files with 121 additions and 0 deletions
16
ollama_modal_deployment/ollama.service
Normal file
16
ollama_modal_deployment/ollama.service
Normal file
|
|
@ -0,0 +1,16 @@
|
||||||
|
# systemd unit for the Ollama server. Installed into the container image at
# /etc/systemd/system/ollama.service by the image definition in ollama_api.py.

[Unit]
Description=Ollama Service
# Start only after the network is up; Ollama binds a TCP port below.
After=network-online.target

[Service]
ExecStart=/usr/bin/ollama serve
# Run as the dedicated unprivileged user/group created during image build.
User=ollama
Group=ollama
# Restart the server unconditionally, 3 seconds after any exit.
Restart=always
RestartSec=3
# NOTE(review): systemd does NOT shell-expand variables in Environment= lines,
# so this sets PATH to the literal string "$PATH". The official Ollama installer
# substitutes the real PATH at install time — confirm this is intended here.
Environment="PATH=$PATH"
# Per Ollama's env-var docs: accept requests from any origin, and bind on all
# interfaces (port 11434) so the FastAPI proxy in the same container can reach it.
Environment="OLLAMA_ORIGINS=*"
Environment="OLLAMA_HOST=0.0.0.0:11434"

[Install]
WantedBy=default.target
105
ollama_modal_deployment/ollama_api.py
Normal file
105
ollama_modal_deployment/ollama_api.py
Normal file
|
|
@ -0,0 +1,105 @@
|
||||||
|
import modal
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import time
|
||||||
|
from fastapi import FastAPI, HTTPException
|
||||||
|
from typing import List, Any, Optional, Dict
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
import ollama
|
||||||
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
from fastapi import Request, Response
|
||||||
|
|
||||||
|
# Chat model to serve; overridable at deploy time via the MODEL env var.
MODEL = os.environ.get("MODEL", "deepseek-r1:70b")
# Embedding model; overridable via the EMBEDDING_MODEL env var.
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "avr/sfr-embedding-mistral")
|
||||||
|
|
||||||
|
|
||||||
|
def pull() -> None:
    """Start the Ollama systemd service and pre-pull the chat and embedding models.

    Runs at image-build time (via ``Image.run_function(pull)`` below) so the
    model weights are baked into the image instead of downloaded on cold start.

    Raises:
        subprocess.CalledProcessError: If either ``ollama pull`` fails.
        TimeoutError: Propagated from ``wait_for_ollama`` if the server never starts.
    """
    # Register and launch the unit installed at /etc/systemd/system/ollama.service.
    subprocess.run(["systemctl", "daemon-reload"])
    subprocess.run(["systemctl", "enable", "ollama"])
    subprocess.run(["systemctl", "start", "ollama"])
    # Pulling before the HTTP API is up would fail, so block until it answers.
    wait_for_ollama()
    # check=True is the fix: without it a failed pull was silently ignored and
    # the build produced an image missing its models. stdout is captured only
    # to keep the noisy progress output out of the build log.
    subprocess.run(["ollama", "pull", MODEL], stdout=subprocess.PIPE, check=True)
    subprocess.run(["ollama", "pull", EMBEDDING_MODEL], stdout=subprocess.PIPE, check=True)
|
||||||
|
|
||||||
|
|
||||||
|
def wait_for_ollama(timeout: int = 30, interval: int = 2) -> None:
    """Poll the local Ollama HTTP API until it is ready, or raise.

    Args:
        timeout: Maximum seconds to wait before giving up.
        interval: Seconds to sleep between polls.

    Raises:
        TimeoutError: If the service does not return HTTP 200 within ``timeout``.
    """
    import httpx
    from loguru import logger

    start_time = time.time()
    while True:
        try:
            response = httpx.get("http://localhost:11434/api/version")
            if response.status_code == 200:
                logger.info("Ollama service is ready")
                return
        except httpx.ConnectError:
            # Socket not open yet — fall through to the deadline check below.
            pass
        # BUG FIX: the deadline was previously checked only inside the
        # ConnectError handler, so a server that kept answering with a non-200
        # status made this loop forever. Check it on every iteration instead.
        if time.time() - start_time > timeout:
            raise TimeoutError("Ollama service failed to start")
        logger.info(f"Waiting for Ollama service... ({int(time.time() - start_time)}s)")
        time.sleep(interval)
|
||||||
|
|
||||||
|
|
||||||
|
# Container image: Debian slim with Ollama installed from the official Linux
# tarball and managed by systemd; models are pre-pulled at build time by pull().
image = (
    modal.Image.debian_slim()
    .apt_install("curl", "systemctl")
    .run_commands(  # from https://github.com/ollama/ollama/blob/main/docs/linux.md
        "curl -L https://ollama.com/download/ollama-linux-amd64.tgz -o ollama-linux-amd64.tgz",
        "tar -C /usr -xzf ollama-linux-amd64.tgz",
        # Dedicated unprivileged account the service runs as (see ollama.service).
        "useradd -r -s /bin/false -U -m -d /usr/share/ollama ollama",
        "usermod -a -G ollama $(whoami)",
    )
    # Install the unit file so systemctl can manage the Ollama server.
    .copy_local_file("ollama.service", "/etc/systemd/system/ollama.service")
    .pip_install("ollama", "httpx", "loguru", "fastapi")
    # .env({"OLLAMA_MODELS": "/persistent/ollama-models"})
    # .run_function(check_blobs_directory)
    # Bake the model weights into the image at build time (see pull()).
    .run_function(pull)
)
# Modal application plus the FastAPI app (the catch-all proxy defined below).
app = modal.App(name="ollama", image=image)
api = FastAPI()
|
||||||
|
|
||||||
|
|
||||||
|
@api.api_route("/{full_path:path}", methods=["GET", "POST", "PUT", "DELETE"])
async def proxy(full_path: str, request: Request):
    """Transparently forward any request to the Ollama server on localhost.

    Relays method, path, query string, headers and body to
    ``http://localhost:11434`` and mirrors the upstream response back.
    """
    # Construct the local Ollama endpoint URL
    local_url = f"http://localhost:11434/{full_path}"
    print(f"Forwarding {request.method} request to: {local_url}")  # Logging the target URL

    # FIX: do not forward the inbound Host header — it names the public Modal
    # endpoint, not localhost, and relaying it verbatim can break origin checks.
    forward_headers = [(k, v) for k, v in request.headers.raw if k.lower() != b"host"]

    # Forward the request (generous timeout: large model responses are slow).
    async with httpx.AsyncClient(timeout=httpx.Timeout(180.0)) as client:
        response = await client.request(
            method=request.method,
            url=local_url,
            headers=forward_headers,
            params=request.query_params,
            content=await request.body(),
        )

    print(f"Received response with status: {response.status_code}")  # Logging the response status
    # FIX: strip framing/encoding headers before mirroring. httpx has already
    # read and decoded the body, so the upstream Content-Length /
    # Transfer-Encoding / Content-Encoding no longer describe what we send;
    # relaying them can corrupt framing or make clients double-decompress.
    mirrored_headers = {
        k: v
        for k, v in response.headers.items()
        if k.lower() not in ("content-length", "transfer-encoding", "content-encoding", "connection")
    }
    return Response(
        content=response.content, status_code=response.status_code, headers=mirrored_headers
    )
|
||||||
|
|
||||||
|
|
||||||
|
@app.cls(
    gpu="L40S:1",
    # Keep the GPU container alive for 5 minutes after the last request.
    scaledown_window=5 * 60,
)
class Ollama:
    """Modal service class that runs the Ollama server and exposes the FastAPI proxy."""

    def __init__(self):
        # NOTE(review): this calls the serve() method directly at construction
        # time (it just returns `api`, and the return value is discarded).
        # Presumably a no-op — confirm this is intentional and not a leftover.
        self.serve()

    @modal.build()
    def build(self):
        # Image-build-time hook: register the Ollama unit with systemd so it
        # can be started when the container boots.
        subprocess.run(["systemctl", "daemon-reload"])
        subprocess.run(["systemctl", "enable", "ollama"])

    @modal.enter()
    def enter(self):
        # Container-start hook: launch Ollama and block until its API answers,
        # so the proxy never forwards to a server that isn't up yet.
        subprocess.run(["systemctl", "start", "ollama"])
        wait_for_ollama()
        # subprocess.run(["ollama", "pull", MODEL])

    @modal.asgi_app()
    def serve(self):
        # Expose the module-level FastAPI catch-all proxy as this class's ASGI app.
        return api
|
||||||
Loading…
Add table
Reference in a new issue