Adds initial LightRAG app integration with schema and processors

Introduces the LightRAG Retrieval-Augmented Generation framework as an Apolo app, including input/output schemas, types, and processors.
Adds Helm chart value processing, environment and persistence configurations, and output service discovery for deployment.
Includes scripts for generating type schemas and testing support, along with CI and linting setup tailored for the new app.
Provides a documentation loader script to ingest markdown files into LightRAG with flexible referencing modes.

Relates to MLO-469
This commit is contained in:
Taddeus 2025-11-03 13:59:52 +02:00
parent 748ded40fb
commit 2255b91210
27 changed files with 5166 additions and 66 deletions

0
.apolo/__init__.py Normal file
View file

55
.apolo/applications.yaml Normal file
View file

@ -0,0 +1,55 @@
- app_type: lightrag
name: lightrag
title: LightRAG
install_type: workflow
helm_path: k8s-deploy/lightrag
app_package_name: apolo_apps_lightrag
inputs:
schema_path: .apolo/src/apolo_apps_lightrag/schemas/LightRAGAppInputs.json
types_name: LightRAGAppInputs
processor: LightRAGInputsProcessor
image: ghcr.io/neuro-inc/app-lightrag
outputs:
schema_path: .apolo/src/apolo_apps_lightrag/schemas/LightRAGAppOutputs.json
types_name: LightRAGAppOutputs
processor: LightRAGOutputsProcessor
image: ghcr.io/neuro-inc/app-lightrag
short_description: Advanced RAG framework with graph-enhanced retrieval capabilities
description: |
LightRAG is a simple and fast Retrieval-Augmented Generation (RAG) system that incorporates
graph structure into text indexing and retrieval processes. Unlike traditional RAG approaches,
LightRAG utilizes both low-level and high-level knowledge discovery to enhance text retrieval.
It supports both naive and local search methods, making it suitable for comprehensive
question-answering tasks. The system includes a user-friendly web interface for document
management and querying, with built-in support for various LLM providers and embedding models.
pub_date: "2025-06-25T00:00:00+00:00"
logo: https://storage.googleapis.com/development-421920-assets/app-logos/lightrag-logo.png
tags:
- "RAG"
- "LightRAG"
- "Knowledge Graph"
- "Vector Search"
- "Document Processing"
- "LLM"
- "Embeddings"
- "PostgreSQL"
- "Graph"
- "AI"
- "NLP"
assets:
- type: image
url: https://storage.googleapis.com/development-421920-assets/app-logos/lightrag-banner.png
- type: video
url: https://www.youtube.com/watch?v=oageL-1I0GE
- type: pdf
url: https://arxiv.org/abs/2410.05779
urls:
- name: LightRAG GitHub Repository
type: documentation
url: https://github.com/HKUDS/LightRAG
- name: LightRAG DeepWiki Documentation
type: documentation
url: https://deepwiki.com/HKUDS/LightRAG
- name: LightRAG Official Repository
type: external
url: https://github.com/HKUDS/LightRAG

1
.apolo/project.yaml Normal file
View file

@ -0,0 +1 @@
id: lightrag

View file

@ -0,0 +1,3 @@
.PHONY: gen-types-schemas
gen-types-schemas:
@.apolo/scripts/gen_types_schemas.sh

View file

@ -0,0 +1,26 @@
#!/usr/bin/env bash
# Regenerate the JSON schemas for the LightRAG app input/output types
# using the app-types CLI.
set -euo pipefail
# Resolve the repository root relative to this script's own location.
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
# Paths are relative to the repo root (the subshell below cd's there).
APP_PACKAGE_DIR=".apolo/src/apolo_apps_lightrag"
INPUT_SCHEMA="${APP_PACKAGE_DIR}/schemas/LightRAGAppInputs.json"
OUTPUT_SCHEMA="${APP_PACKAGE_DIR}/schemas/LightRAGAppOutputs.json"
# Locate the app-types CLI: prefer poetry, then the project venv, then PATH.
if command -v poetry >/dev/null 2>&1; then
APP_TYPES_CMD=(poetry run app-types)
elif [[ -x "${REPO_ROOT}/.venv/bin/app-types" ]]; then
APP_TYPES_CMD=("${REPO_ROOT}/.venv/bin/app-types")
elif command -v app-types >/dev/null 2>&1; then
APP_TYPES_CMD=(app-types)
else
echo "app-types CLI not found. Install dependencies via 'poetry install --with dev'." >&2
exit 1
fi
# Run from the repo root so the relative paths above resolve; the subshell
# keeps the caller's working directory untouched.
(
cd "${REPO_ROOT}"
"${APP_TYPES_CMD[@]}" dump-types-schema "${APP_PACKAGE_DIR}" LightRAGAppInputs "${INPUT_SCHEMA}"
"${APP_TYPES_CMD[@]}" dump-types-schema "${APP_PACKAGE_DIR}" LightRAGAppOutputs "${OUTPUT_SCHEMA}"
)

View file

@ -0,0 +1,15 @@
from apolo_apps_lightrag.inputs_processor import (
LightRAGInputsProcessor,
)
from apolo_apps_lightrag.outputs_processor import (
LightRAGOutputsProcessor,
)
from apolo_apps_lightrag.types import LightRAGAppInputs, LightRAGAppOutputs
# Public API of the apolo_apps_lightrag package.
__all__ = [
    "LightRAGInputsProcessor",
    "LightRAGOutputsProcessor",
    "LightRAGAppInputs",
    "LightRAGAppOutputs",
]

View file

@ -0,0 +1,246 @@
import logging
import typing as t
from apolo_app_types.app_types import AppType
from apolo_app_types.helm.apps.base import BaseChartValueProcessor
from apolo_app_types.helm.apps.common import gen_extra_values
from apolo_app_types.helm.utils.deep_merging import merge_list_of_dicts
from apolo_app_types.protocols.common.openai_compat import (
OpenAICompatChatAPI,
OpenAICompatEmbeddingsAPI,
)
from apolo_app_types.protocols.common.secrets_ import serialize_optional_secret
from .types import (
AnthropicLLMProvider,
GeminiLLMProvider,
LightRAGAppInputs,
OllamaEmbeddingProvider,
OllamaLLMProvider,
OpenAIEmbeddingProvider,
OpenAILLMProvider,
)
logger = logging.getLogger(__name__)
class LightRAGInputsProcessor(BaseChartValueProcessor[LightRAGAppInputs]):
    """Builds Helm chart values for a LightRAG deployment from app inputs.

    Normalizes the various LLM/embedding provider models into a flat
    binding/model/host/api_key mapping, then merges base chart values,
    environment values, persistence values, and platform values.
    """

    def _extract_llm_config(self, llm_config: t.Any) -> dict[str, t.Any]:
        """Extract LLM configuration from provider-specific config.

        Returns a dict with keys: "binding", "model", "host", "api_key".
        """
        if isinstance(llm_config, OpenAICompatChatAPI):
            # OpenAI-compatible endpoints carry the model via hf_model.
            if not llm_config.hf_model:
                msg = "OpenAI compatible chat API must have hf_model configured"
                raise ValueError(msg)
            model = llm_config.hf_model.model_hf_name
            host = llm_config.complete_url
            return {
                "binding": "openai",
                "model": model,
                "host": host,
                "api_key": getattr(llm_config, "api_key", None),
            }
        if isinstance(llm_config, OpenAILLMProvider):
            host = llm_config.complete_url
            return {
                "binding": "openai",
                "model": llm_config.model,
                "host": host,
                "api_key": llm_config.api_key,
            }
        if isinstance(llm_config, AnthropicLLMProvider):
            host = llm_config.complete_url
            return {
                "binding": "anthropic",
                "model": llm_config.model,
                "host": host,
                "api_key": llm_config.api_key,
            }
        if isinstance(llm_config, OllamaLLMProvider):
            host = llm_config.complete_url
            return {
                "binding": "ollama",
                "model": llm_config.model,
                "host": host,
                # Ollama provider model declares no api_key field; none sent.
                "api_key": None,
            }
        if isinstance(llm_config, GeminiLLMProvider):
            host = llm_config.complete_url
            return {
                "binding": "gemini",
                "model": llm_config.model,
                "host": host,
                "api_key": llm_config.api_key,
            }
        # Fallback for unrecognized config objects: duck-type the common
        # fields and default to the OpenAI binding.
        binding = getattr(llm_config, "provider", "openai")
        model = getattr(llm_config, "model", "gpt-4o-mini")
        api_key = getattr(llm_config, "api_key", None)
        host = ""
        if hasattr(llm_config, "complete_url"):
            host = llm_config.complete_url
        elif hasattr(llm_config, "host") and llm_config.host:
            protocol = getattr(llm_config, "protocol", "https")
            port = getattr(llm_config, "port", 443)
            host = f"{protocol}://{llm_config.host}:{port}"
        return {"binding": binding, "model": model, "host": host, "api_key": api_key}

    def _extract_embedding_config(self, embedding_config: t.Any) -> dict[str, t.Any]:
        """Extract embedding configuration from provider-specific config.

        Returns a dict with keys: "binding", "model", "api_key",
        "dimensions", "host".
        """
        if isinstance(embedding_config, OpenAICompatEmbeddingsAPI):
            if embedding_config.hf_model is None:
                msg = "OpenAI compatible embeddings API must have hf_model configured"
                raise ValueError(msg)
            model = embedding_config.hf_model.model_hf_name
            host = embedding_config.complete_url
            return {
                "binding": "openai",
                "model": model,
                "api_key": getattr(embedding_config, "api_key", None),
                # NOTE(review): dimensions hard-coded to 1536 regardless of
                # the actual model -- confirm this matches the served model.
                "dimensions": 1536,
                "host": host,
            }
        if isinstance(embedding_config, OpenAIEmbeddingProvider):
            host = embedding_config.complete_url
            return {
                "binding": "openai",
                "model": embedding_config.model,
                "api_key": embedding_config.api_key,
                "dimensions": 1536,
                "host": host,
            }
        if isinstance(embedding_config, OllamaEmbeddingProvider):
            host = embedding_config.complete_url
            return {
                "binding": "ollama",
                "model": embedding_config.model,
                "api_key": None,
                # NOTE(review): fixed 1024 dims assumed for Ollama embedding
                # models -- confirm for the configured model.
                "dimensions": 1024,
                "host": host,
            }
        # Fallback for unrecognized config objects: duck-type common fields.
        binding = getattr(embedding_config, "provider", "openai")
        model = getattr(embedding_config, "model", "text-embedding-ada-002")
        api_key = getattr(embedding_config, "api_key", None)
        dimensions = 1536
        if hasattr(embedding_config, "dimensions"):
            dimensions = embedding_config.dimensions
        host = ""
        if hasattr(embedding_config, "complete_url"):
            host = embedding_config.complete_url
        elif hasattr(embedding_config, "host") and embedding_config.host:
            protocol = getattr(embedding_config, "protocol", "https")
            port = getattr(embedding_config, "port", 443)
            host = f"{protocol}://{embedding_config.host}:{port}"
        return {
            "binding": binding,
            "model": model,
            "api_key": api_key,
            "dimensions": dimensions,
            "host": host,
        }

    async def _get_environment_values(
        self,
        input_: LightRAGAppInputs,
        app_secrets_name: str,
    ) -> dict[str, t.Any]:
        """Build the ``env`` section of the chart values.

        Maps the normalized LLM/embedding configs and the Postgres
        credentials onto the environment variables LightRAG reads.
        """
        llm_config = self._extract_llm_config(input_.llm_config)
        embedding_config = self._extract_embedding_config(input_.embedding_config)
        env_config = {
            "HOST": "0.0.0.0",
            "PORT": 9621,
            "WEBUI_TITLE": "Graph RAG Engine",
            "WEBUI_DESCRIPTION": "Simple and Fast Graph Based RAG System",
            "LLM_BINDING": llm_config["binding"],
            "LLM_MODEL": llm_config["model"],
            "LLM_BINDING_HOST": llm_config["host"],
            # NOTE(review): unlike the two keys below this one is not
            # coalesced with `or ""`, so it may be None -- confirm the chart
            # tolerates a null env value.
            "LLM_BINDING_API_KEY": serialize_optional_secret(
                llm_config["api_key"], app_secrets_name
            ),
            # Same secret also exposed under OPENAI_API_KEY ("" when unset).
            "OPENAI_API_KEY": serialize_optional_secret(
                llm_config["api_key"], app_secrets_name
            )
            or "",
            "EMBEDDING_BINDING": embedding_config["binding"],
            "EMBEDDING_MODEL": embedding_config["model"],
            "EMBEDDING_DIM": embedding_config["dimensions"],
            "EMBEDDING_BINDING_HOST": embedding_config["host"],
            "EMBEDDING_BINDING_API_KEY": serialize_optional_secret(
                embedding_config["api_key"], app_secrets_name
            )
            or "",
            # KV, vector, and doc-status storages are Postgres-backed;
            # graph storage uses NetworkXStorage instead.
            "LIGHTRAG_KV_STORAGE": "PGKVStorage",
            "LIGHTRAG_VECTOR_STORAGE": "PGVectorStorage",
            "LIGHTRAG_DOC_STATUS_STORAGE": "PGDocStatusStorage",
            "LIGHTRAG_GRAPH_STORAGE": "NetworkXStorage",
            # Connect through pgbouncer rather than directly to Postgres.
            "POSTGRES_HOST": input_.pgvector_user.pgbouncer_host,
            "POSTGRES_PORT": input_.pgvector_user.pgbouncer_port,
            "POSTGRES_USER": input_.pgvector_user.user,
            "POSTGRES_PASSWORD": input_.pgvector_user.password,
            "POSTGRES_DATABASE": input_.pgvector_user.dbname,
            "POSTGRES_WORKSPACE": "default",
        }
        return {"env": env_config}

    async def _get_persistence_values(
        self,
        input_: LightRAGAppInputs,
    ) -> dict[str, t.Any]:
        """Build the ``persistence`` chart values (sizes rendered as GiB)."""
        return {
            "persistence": {
                "enabled": True,
                "ragStorage": {
                    "size": f"{input_.persistence.rag_storage_size}Gi",
                },
                "inputs": {
                    "size": f"{input_.persistence.inputs_storage_size}Gi",
                },
            }
        }

    async def gen_extra_values(
        self,
        input_: LightRAGAppInputs,
        app_name: str,
        namespace: str,
        app_id: str,
        app_secrets_name: str,
        *_: t.Any,
        **kwargs: t.Any,
    ) -> dict[str, t.Any]:
        """Generate the complete Helm values for a LightRAG deployment.

        Combines base chart values, environment values, persistence
        values, and platform-generated values via merge_list_of_dicts
        (precedence on conflicting keys follows that helper's semantics
        -- TODO confirm which side wins).
        """
        env_values = await self._get_environment_values(input_, app_secrets_name)
        persistence_values = await self._get_persistence_values(input_)
        # Platform-level values: preset, HTTP ingress, namespace wiring.
        platform_values = await gen_extra_values(
            apolo_client=self.client,
            preset_type=input_.preset,
            ingress_http=input_.ingress_http,
            ingress_grpc=None,
            namespace=namespace,
            app_id=app_id,
            app_type=AppType.LightRAG,
        )
        base_values = {
            "replicaCount": 1,
            "image": {
                "repository": "ghcr.io/hkuds/lightrag",
                "tag": "1.3.8",
                "pullPolicy": "IfNotPresent",
            },
            "service": {
                "type": "ClusterIP",
                # Must match the PORT env var set above.
                "port": 9621,
            },
            "nameOverride": "",
            "fullnameOverride": app_name,
        }
        logger.debug("Generated LightRAG values for app %s", app_name)
        return merge_list_of_dicts(
            [
                base_values,
                env_values,
                persistence_values,
                platform_values,
            ]
        )
# Public re-export surface of this module.
__all__ = ["LightRAGInputsProcessor"]

View file

@ -0,0 +1,66 @@
import logging
import typing as t
from apolo_app_types.clients.kube import get_service_host_port
from apolo_app_types.outputs.base import BaseAppOutputsProcessor
from apolo_app_types.outputs.common import (
INSTANCE_LABEL,
get_internal_external_web_urls,
)
from apolo_app_types.outputs.utils.ingress import get_ingress_host_port
from apolo_app_types.protocols.common.networking import HttpApi, ServiceAPI, WebApp
from .types import LightRAGAppOutputs
logger = logging.getLogger(__name__)
async def _generate_lightrag_outputs(
    helm_values: dict[str, t.Any],
    app_instance_id: str,
) -> LightRAGAppOutputs:
    """Discover the deployed LightRAG services and assemble app outputs."""
    match_labels = {
        "app.kubernetes.io/name": "lightrag",
        INSTANCE_LABEL: app_instance_id,
    }
    web_internal, web_external = await get_internal_external_web_urls(match_labels)
    # Cluster-internal API endpoint, present only if the service resolves.
    api_internal = None
    svc_host, svc_port = await get_service_host_port(match_labels=match_labels)
    if svc_host:
        api_internal = HttpApi(
            host=svc_host,
            port=int(svc_port),
            protocol="http",
        )
    # External API endpoint, present only if an ingress is configured.
    api_external = None
    ingress = await get_ingress_host_port(match_labels=match_labels)
    if ingress:
        ing_host, ing_port = ingress
        api_external = HttpApi(
            host=ing_host,
            port=int(ing_port),
            protocol="https",
        )
    return LightRAGAppOutputs(
        app_url=ServiceAPI[WebApp](
            internal_url=web_internal,
            external_url=web_external,
        ),
        server_url=ServiceAPI[HttpApi](
            internal_url=api_internal,
            external_url=api_external,
        ),
    )
class LightRAGOutputsProcessor(BaseAppOutputsProcessor[LightRAGAppOutputs]):
    """Outputs processor that resolves deployed LightRAG endpoints."""

    async def _generate_outputs(
        self,
        helm_values: dict[str, t.Any],
        app_instance_id: str,
    ) -> LightRAGAppOutputs:
        result = await _generate_lightrag_outputs(helm_values, app_instance_id)
        logger.info("Got outputs: %s", result)
        return result
# Public re-export surface of this module.
__all__ = ["LightRAGOutputsProcessor"]

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,245 @@
{
"$defs": {
"HttpApi": {
"properties": {
"host": {
"title": "Host",
"type": "string",
"x-description": "The hostname of the HTTP endpoint.",
"x-is-advanced-field": false,
"x-meta-type": "inline",
"x-title": "Hostname"
},
"port": {
"default": 80,
"exclusiveMinimum": 0,
"title": "Port",
"type": "integer",
"x-description": "The port of the HTTP endpoint.",
"x-is-advanced-field": false,
"x-meta-type": "inline",
"x-title": "Port"
},
"protocol": {
"default": "http",
"title": "Protocol",
"type": "string",
"x-description": "The protocol to use, e.g., http or https.",
"x-is-advanced-field": false,
"x-meta-type": "inline",
"x-title": "Protocol"
},
"timeout": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 30.0,
"title": "Timeout",
"x-description": "Connection timeout in seconds.",
"x-is-advanced-field": false,
"x-meta-type": "inline",
"x-title": "Connection Timeout"
},
"base_path": {
"default": "/",
"title": "Base Path",
"type": "string"
}
},
"required": [
"host"
],
"title": "HttpApi",
"type": "object",
"x-description": "HTTP API Configuration.",
"x-is-advanced-field": false,
"x-meta-type": "inline",
"x-title": "HTTP API",
"x-type": "OpenAICompatibleEmbeddingsRestAPI"
},
"ServiceAPI_HttpApi_": {
"properties": {
"internal_url": {
"anyOf": [
{
"$ref": "#/$defs/HttpApi"
},
{
"type": "null"
}
],
"default": null,
"x-description": "Internal URL to access the service. This route is not protected by platform authorization and only workloads from the same project can access it.",
"x-is-advanced-field": false,
"x-meta-type": "inline",
"x-title": "Internal URL"
},
"external_url": {
"anyOf": [
{
"$ref": "#/$defs/HttpApi"
},
{
"type": "null"
}
],
"default": null,
"x-description": "External URL for accessing the service from outside the cluster. This route might be secured by platform authorization and is accessible from any network with a valid platform authorization token that has appropriate permissions.",
"x-is-advanced-field": false,
"x-meta-type": "inline",
"x-title": "External URL"
}
},
"title": "ServiceAPI[HttpApi]",
"type": "object",
"x-description": "Service APIs URLs.",
"x-is-advanced-field": false,
"x-meta-type": "inline",
"x-title": "Service APIs",
"x-type": "ServiceAPI[GrpcAPI]"
},
"ServiceAPI_WebApp_": {
"properties": {
"internal_url": {
"anyOf": [
{
"$ref": "#/$defs/WebApp"
},
{
"type": "null"
}
],
"default": null,
"x-description": "Internal URL to access the service. This route is not protected by platform authorization and only workloads from the same project can access it.",
"x-is-advanced-field": false,
"x-meta-type": "inline",
"x-title": "Internal URL"
},
"external_url": {
"anyOf": [
{
"$ref": "#/$defs/WebApp"
},
{
"type": "null"
}
],
"default": null,
"x-description": "External URL for accessing the service from outside the cluster. This route might be secured by platform authorization and is accessible from any network with a valid platform authorization token that has appropriate permissions.",
"x-is-advanced-field": false,
"x-meta-type": "inline",
"x-title": "External URL"
}
},
"title": "ServiceAPI[WebApp]",
"type": "object",
"x-description": "Service APIs URLs.",
"x-is-advanced-field": false,
"x-meta-type": "inline",
"x-title": "Service APIs",
"x-type": "ServiceAPI[GrpcAPI]"
},
"WebApp": {
"properties": {
"host": {
"title": "Host",
"type": "string",
"x-description": "The hostname of the HTTP endpoint.",
"x-is-advanced-field": false,
"x-meta-type": "inline",
"x-title": "Hostname"
},
"port": {
"default": 80,
"exclusiveMinimum": 0,
"title": "Port",
"type": "integer",
"x-description": "The port of the HTTP endpoint.",
"x-is-advanced-field": false,
"x-meta-type": "inline",
"x-title": "Port"
},
"protocol": {
"default": "http",
"title": "Protocol",
"type": "string",
"x-description": "The protocol to use, e.g., http or https.",
"x-is-advanced-field": false,
"x-meta-type": "inline",
"x-title": "Protocol"
},
"timeout": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 30.0,
"title": "Timeout",
"x-description": "Connection timeout in seconds.",
"x-is-advanced-field": false,
"x-meta-type": "inline",
"x-title": "Connection Timeout"
},
"base_path": {
"default": "/",
"title": "Base Path",
"type": "string"
},
"api_type": {
"const": "webapp",
"default": "webapp",
"title": "Api Type",
"type": "string"
}
},
"required": [
"host"
],
"title": "WebApp",
"type": "object",
"x-description": "HTTP API Configuration.",
"x-is-advanced-field": false,
"x-meta-type": "inline",
"x-title": "HTTP API",
"x-type": "OpenAICompatibleEmbeddingsRestAPI"
}
},
"description": "LightRAG outputs.",
"properties": {
"app_url": {
"anyOf": [
{
"$ref": "#/$defs/ServiceAPI_WebApp_"
},
{
"type": "null"
}
],
"default": null,
"description": "The main application URL for accessing the service. This is the primary endpoint users should use to access the application.",
"title": "Application URL"
},
"server_url": {
"anyOf": [
{
"$ref": "#/$defs/ServiceAPI_HttpApi_"
},
{
"type": "null"
}
],
"default": null
}
},
"title": "LightRAGAppOutputs",
"type": "object"
}

View file

@ -0,0 +1,426 @@
from typing import Literal
from pydantic import BaseModel, ConfigDict, Field, field_validator
from apolo_app_types import AppInputs, AppOutputs
from apolo_app_types.protocols.common import (
IngressHttp,
Preset,
SchemaExtraMetadata,
SchemaMetaType,
)
from apolo_app_types.protocols.common.networking import HttpApi, RestAPI, ServiceAPI
from apolo_app_types.protocols.common.openai_compat import (
OpenAICompatChatAPI,
OpenAICompatEmbeddingsAPI,
)
from apolo_app_types.protocols.postgres import CrunchyPostgresUserCredentials
class LightRAGPersistence(BaseModel):
    """Persistent-volume sizing for LightRAG RAG data and input files."""

    model_config = ConfigDict(
        protected_namespaces=(),
        json_schema_extra=SchemaExtraMetadata(
            title="LightRAG Persistence",
            description="Configure persistent storage for LightRAG data and inputs.",
        ).as_json_schema_extra(),
    )
    # Size in GiB of the volume holding RAG indexes/graph data.
    rag_storage_size: int = Field(
        default=10,
        gt=0,
        json_schema_extra=SchemaExtraMetadata(
            title="RAG Storage Size (GB)",
            description="Size of the persistent volume for RAG data storage.",
        ).as_json_schema_extra(),
    )
    # Size in GiB of the volume holding uploaded input documents.
    inputs_storage_size: int = Field(
        default=5,
        gt=0,
        json_schema_extra=SchemaExtraMetadata(
            title="Inputs Storage Size (GB)",
            description="Size of the persistent volume for input files.",
        ).as_json_schema_extra(),
    )

    @field_validator("rag_storage_size", "inputs_storage_size", mode="before")
    @classmethod
    def validate_storage_size(cls, value: int) -> int:
        """Reject integer sizes below 1 GB with a clear message.

        Values of exactly 1 are accepted; non-positive values that slip
        past this check are still rejected by the ``gt=0`` constraint.
        """
        if value and isinstance(value, int) and value < 1:
            # Fixed: the old message claimed "greater than 1GB" although
            # the check only rejects values below 1 (1 itself is valid).
            error_message = "Storage size must be at least 1 GB."
            raise ValueError(error_message)
        return value
class OpenAILLMProvider(RestAPI):
    """OpenAI LLM provider configuration.

    Defaults target the public OpenAI API (https://api.openai.com/v1).
    """
    model_config = ConfigDict(
        protected_namespaces=(),
        json_schema_extra=SchemaExtraMetadata(
            title="OpenAI LLM Provider",
            description="OpenAI chat completion API configuration.",
            meta_type=SchemaMetaType.INLINE,
        ).as_json_schema_extra(),
    )
    host: str = Field(
        default="api.openai.com",
        json_schema_extra=SchemaExtraMetadata(
            title="Host",
            description="OpenAI API host",
        ).as_json_schema_extra(),
    )
    port: int = Field(
        default=443,
        json_schema_extra=SchemaExtraMetadata(
            title="Port",
            description="Set the port.",
        ).as_json_schema_extra(),
    )
    protocol: Literal["https"] = "https"
    timeout: int | None = Field(
        default=60,
        json_schema_extra=SchemaExtraMetadata(
            title="Timeout",
            description="Set the connection timeout in seconds.",
        ).as_json_schema_extra(),
    )
    base_path: str = "/v1"
    # Literal tag identifying this provider within the LLMProvider union.
    provider: Literal["openai"] = "openai"
    model: str = Field(
        default="gpt-4o-mini",
        json_schema_extra=SchemaExtraMetadata(
            title="Model",
            description="Chat completion model name.",
        ).as_json_schema_extra(),
    )
    api_key: str = Field(
        default="",
        json_schema_extra=SchemaExtraMetadata(
            title="API Key",
            description="OpenAI API key.",
        ).as_json_schema_extra(),
    )
class AnthropicLLMProvider(RestAPI):
    """Anthropic LLM provider configuration.

    Defaults target the public Anthropic API (https://api.anthropic.com/v1).
    """
    model_config = ConfigDict(
        protected_namespaces=(),
        json_schema_extra=SchemaExtraMetadata(
            title="Anthropic LLM Provider",
            description="Anthropic Claude API configuration.",
            meta_type=SchemaMetaType.INLINE,
        ).as_json_schema_extra(),
    )
    host: str = Field(
        default="api.anthropic.com",
        json_schema_extra=SchemaExtraMetadata(
            title="Host",
            description="Anthropic API host",
        ).as_json_schema_extra(),
    )
    port: int = Field(
        default=443,
        json_schema_extra=SchemaExtraMetadata(
            title="Port",
            description="Set the port.",
        ).as_json_schema_extra(),
    )
    protocol: Literal["https"] = "https"
    timeout: int | None = Field(
        default=60,
        json_schema_extra=SchemaExtraMetadata(
            title="Timeout",
            description="Set the connection timeout in seconds.",
        ).as_json_schema_extra(),
    )
    base_path: str = "/v1"
    # Literal tag identifying this provider within the LLMProvider union.
    provider: Literal["anthropic"] = "anthropic"
    model: str = Field(
        default="claude-3-5-sonnet-20241022",
        json_schema_extra=SchemaExtraMetadata(
            title="Model",
            description="Anthropic Claude model name.",
        ).as_json_schema_extra(),
    )
    api_key: str = Field(
        default="",
        json_schema_extra=SchemaExtraMetadata(
            title="API Key",
            description="Anthropic API key.",
        ).as_json_schema_extra(),
    )
class OllamaLLMProvider(RestAPI):
    """Ollama LLM provider configuration.

    Host is required (no default) since Ollama is self-hosted; no API key
    field is declared for this provider.
    """
    model_config = ConfigDict(
        protected_namespaces=(),
        json_schema_extra=SchemaExtraMetadata(
            title="Ollama LLM Provider",
            description="Configuration for a self-hosted Ollama server.",
            meta_type=SchemaMetaType.INLINE,
        ).as_json_schema_extra(),
    )
    host: str = Field(
        json_schema_extra=SchemaExtraMetadata(
            title="Host",
            description="Ollama server host.",
        ).as_json_schema_extra(),
    )
    port: int = Field(
        default=11434,
        json_schema_extra=SchemaExtraMetadata(
            title="Port",
            description="Ollama server port.",
        ).as_json_schema_extra(),
    )
    protocol: Literal["http", "https"] = Field(
        default="http",
        json_schema_extra=SchemaExtraMetadata(
            title="Protocol",
            description="Ollama server protocol.",
        ).as_json_schema_extra(),
    )
    # Longer default timeout than the hosted providers: local generation
    # can be slow.
    timeout: int | None = Field(
        default=300,
        json_schema_extra=SchemaExtraMetadata(
            title="Timeout",
            description="Configure connection timeout in seconds.",
        ).as_json_schema_extra(),
    )
    base_path: str = "/api"
    # Literal tag identifying this provider within the LLMProvider union.
    provider: Literal["ollama"] = "ollama"
    model: str = Field(
        default="llama3.1:8b-instruct-q4_0",
        json_schema_extra=SchemaExtraMetadata(
            title="Model",
            description="Ollama model name.",
        ).as_json_schema_extra(),
    )
class GeminiLLMProvider(RestAPI):
    """Google Gemini LLM provider configuration.

    NOTE(review): base_path is "/v1" -- confirm this matches the Gemini
    endpoint layout expected by the LightRAG binding.
    """
    model_config = ConfigDict(
        protected_namespaces=(),
        json_schema_extra=SchemaExtraMetadata(
            title="Gemini LLM Provider",
            description="Google Gemini API configuration.",
            meta_type=SchemaMetaType.INLINE,
        ).as_json_schema_extra(),
    )
    host: str = Field(
        default="generativelanguage.googleapis.com",
        json_schema_extra=SchemaExtraMetadata(
            title="Host",
            description="Google AI API host",
        ).as_json_schema_extra(),
    )
    port: int = Field(
        default=443,
        json_schema_extra=SchemaExtraMetadata(
            title="Port",
            description="Set the port.",
        ).as_json_schema_extra(),
    )
    protocol: Literal["https"] = "https"
    timeout: int | None = Field(
        default=60,
        json_schema_extra=SchemaExtraMetadata(
            title="Timeout",
            description="Configure connection timeout in seconds.",
        ).as_json_schema_extra(),
    )
    base_path: str = "/v1"
    # Literal tag identifying this provider within the LLMProvider union.
    provider: Literal["gemini"] = "gemini"
    model: str = Field(
        default="gemini-1.5-flash",
        json_schema_extra=SchemaExtraMetadata(
            title="Model",
            description="Google Gemini model name.",
        ).as_json_schema_extra(),
    )
    api_key: str = Field(
        default="",
        json_schema_extra=SchemaExtraMetadata(
            title="API Key",
            description="Google AI API key.",
        ).as_json_schema_extra(),
    )
# Union of every supported LLM provider configuration model.
LLMProvider = (
    OpenAICompatChatAPI
    | OpenAILLMProvider
    | AnthropicLLMProvider
    | OllamaLLMProvider
    | GeminiLLMProvider
)
class OpenAIEmbeddingProvider(RestAPI):
    """OpenAI embedding provider configuration.

    Defaults target the public OpenAI API (https://api.openai.com/v1).
    """
    model_config = ConfigDict(
        protected_namespaces=(),
        json_schema_extra=SchemaExtraMetadata(
            title="OpenAI Embedding Provider",
            description="OpenAI embeddings API configuration.",
            meta_type=SchemaMetaType.INLINE,
        ).as_json_schema_extra(),
    )
    host: str = Field(
        default="api.openai.com",
        json_schema_extra=SchemaExtraMetadata(
            title="Host",
            description="OpenAI API host",
        ).as_json_schema_extra(),
    )
    port: int = Field(
        default=443,
        json_schema_extra=SchemaExtraMetadata(
            title="Port",
            description="Set the port.",
        ).as_json_schema_extra(),
    )
    protocol: Literal["https"] = "https"
    timeout: int | None = Field(
        default=60,
        json_schema_extra=SchemaExtraMetadata(
            title="Timeout",
            description="Set the connection timeout in seconds.",
        ).as_json_schema_extra(),
    )
    base_path: str = "/v1"
    # Literal tag identifying this provider within the EmbeddingProvider union.
    provider: Literal["openai"] = "openai"
    model: str = Field(
        default="text-embedding-ada-002",
        json_schema_extra=SchemaExtraMetadata(
            title="Model",
            description="Embedding model name.",
        ).as_json_schema_extra(),
    )
    api_key: str = Field(
        default="",
        json_schema_extra=SchemaExtraMetadata(
            title="API Key",
            description="OpenAI API key.",
        ).as_json_schema_extra(),
    )
class OllamaEmbeddingProvider(RestAPI):
    """Ollama embedding provider configuration.

    Host is required (no default) since Ollama is self-hosted; no API key
    field is declared for this provider.
    """
    model_config = ConfigDict(
        protected_namespaces=(),
        json_schema_extra=SchemaExtraMetadata(
            title="Ollama Embedding Provider",
            description="Ollama local embedding model configuration.",
            meta_type=SchemaMetaType.INLINE,
        ).as_json_schema_extra(),
    )
    host: str = Field(
        json_schema_extra=SchemaExtraMetadata(
            title="Host",
            description="Ollama server host.",
        ).as_json_schema_extra(),
    )
    port: int = Field(
        default=11434,
        json_schema_extra=SchemaExtraMetadata(
            title="Port",
            description="Ollama server port.",
        ).as_json_schema_extra(),
    )
    protocol: Literal["http", "https"] = Field(
        default="http",
        json_schema_extra=SchemaExtraMetadata(
            title="Protocol",
            description="Ollama server protocol.",
        ).as_json_schema_extra(),
    )
    timeout: int | None = Field(
        default=300,
        json_schema_extra=SchemaExtraMetadata(
            title="Timeout",
            description="Configure connection timeout in seconds.",
        ).as_json_schema_extra(),
    )
    base_path: str = "/api"
    # Literal tag identifying this provider within the EmbeddingProvider union.
    provider: Literal["ollama"] = "ollama"
    model: str = Field(
        default="nomic-embed-text",
        json_schema_extra=SchemaExtraMetadata(
            title="Model",
            description="Ollama embedding model name.",
        ).as_json_schema_extra(),
    )
# Union of every supported embedding provider configuration model.
EmbeddingProvider = (
    OpenAICompatEmbeddingsAPI | OpenAIEmbeddingProvider | OllamaEmbeddingProvider
)
# Public aliases referenced by the app inputs model below.
LightRAGLLMConfig = LLMProvider
LightRAGEmbeddingConfig = EmbeddingProvider
class LightRAGAppInputs(AppInputs):
    """User-facing inputs for deploying the LightRAG app."""
    # Compute preset and HTTP ingress are required; the Postgres (pgvector)
    # credentials back LightRAG's KV/vector/doc-status storage.
    preset: Preset
    ingress_http: IngressHttp
    pgvector_user: CrunchyPostgresUserCredentials
    # Defaults are placeholder OpenAI-compatible endpoints with an empty
    # host; users are expected to point these at a real service.
    llm_config: LightRAGLLMConfig = Field(
        default=OpenAICompatChatAPI(host="", port=443, protocol="https"),
        json_schema_extra=SchemaExtraMetadata(
            title="LLM Configuration",
            description="LLM provider configuration.",
        ).as_json_schema_extra(),
    )
    embedding_config: LightRAGEmbeddingConfig = Field(
        default=OpenAICompatEmbeddingsAPI(host="", port=443, protocol="https"),
        json_schema_extra=SchemaExtraMetadata(
            title="Embedding Configuration",
            description="Embedding provider configuration.",
        ).as_json_schema_extra(),
    )
    persistence: LightRAGPersistence = Field(
        default_factory=LightRAGPersistence,
        json_schema_extra=SchemaExtraMetadata(
            title="Persistence Configuration",
            description="Configure persistent storage for LightRAG data and inputs.",
        ).as_json_schema_extra(),
    )
class LightRAGAppOutputs(AppOutputs):
    """LightRAG outputs."""
    # Internal/external URLs of the LightRAG REST API; the web-UI `app_url`
    # presumably comes from the AppOutputs base class -- TODO confirm.
    server_url: ServiceAPI[HttpApi] | None = None
# Public API of this types module.
__all__ = [
    "LightRAGAppInputs",
    "LightRAGAppOutputs",
    "LightRAGEmbeddingConfig",
    "LightRAGLLMConfig",
    "LightRAGPersistence",
    "OpenAILLMProvider",
    "AnthropicLLMProvider",
    "OllamaLLMProvider",
    "GeminiLLMProvider",
    "OpenAIEmbeddingProvider",
    "OllamaEmbeddingProvider",
]

0
.apolo/tests/__init__.py Normal file
View file

6
.apolo/tests/conftest.py Normal file
View file

@ -0,0 +1,6 @@
import pytest
# Load shared pytest fixture plugins shipped with the apolo-app-types
# fixtures package (Apolo client fixtures and common constants).
pytest_plugins = [
    "apolo_app_types_fixtures.apolo_clients",
    "apolo_app_types_fixtures.constants",
]

View file

View file

@ -23,7 +23,7 @@ LLM_BINDING_API_KEY=sk-your-openai-api-key-here
# EMBEDDING_BINDING_HOST=https://api.openai.com/v1 # EMBEDDING_BINDING_HOST=https://api.openai.com/v1
# EMBEDDING_BINDING_API_KEY=sk-your-openai-api-key-here # EMBEDDING_BINDING_API_KEY=sk-your-openai-api-key-here
EMBEDDING_BINDING=openai EMBEDDING_BINDING=openai
EMBEDDING_MODEL=gemini-embedding-001 EMBEDDING_MODEL=gemini-embedding-001
EMBEDDING_DIM=3072 EMBEDDING_DIM=3072
EMBEDDING_BINDING_HOST=https://generativelanguage.googleapis.com/v1beta/openai/ EMBEDDING_BINDING_HOST=https://generativelanguage.googleapis.com/v1beta/openai/
EMBEDDING_BINDING_API_KEY=AI-your-gemini-api-key-here EMBEDDING_BINDING_API_KEY=AI-your-gemini-api-key-here

17
.github/actionlint-matcher.json vendored Normal file
View file

@ -0,0 +1,17 @@
{
"problemMatcher": [
{
"owner": "actionlint",
"pattern": [
{
"code": 5,
"column": 3,
"file": 1,
"line": 2,
"message": 4,
"regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$"
}
]
}
]
}

106
.github/workflows/ci.yaml vendored Normal file
View file

@ -0,0 +1,106 @@
name: CI
on:
push:
branches: [master]
tags: ["v*"]
pull_request:
branches: [master]
schedule:
- cron: 0 4 * * *
jobs:
test:
name: All checks are passed
uses: ./.github/workflows/test.yaml
secrets: inherit
approve:
name: Approve bot PR
runs-on: ubuntu-latest
if: endsWith(github.actor, '[bot]')
needs: test
permissions:
contents: write
pull-requests: write
steps:
- name: Checkout
uses: actions/checkout@v4
- name: metadata
id: metadata
if: github.actor == 'dependabot[bot]'
uses: dependabot/fetch-metadata@v2
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
- name: Enable auto-merge for bot PRs
run: gh pr merge --auto --squash --delete-branch "$PR_URL"
env:
PR_URL: ${{ github.event.pull_request.html_url }}
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
release-processor-image:
name: Release processor image
runs-on: ubuntu-latest
needs: test
if: github.event_name == 'push' && (startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/master')
steps:
- name: Checkout commit
uses: actions/checkout@v4
- name: Purge old artifacts
uses: kolpav/purge-artifacts-action@v1
with:
token: ${{ github.token }}
expire-in: 30mins
- name: Login to ghcr.io
uses: docker/login-action@v3.5.0
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ github.token }}
- name: Release development image
run: |
make build-hook-image
- name: Release development image
run: |
export IMAGE_TAG=development
make push-hook-image
- name: Release prod image
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
run: |
export IMAGE_TAG=${GITHUB_REF#refs/tags/v}
make push-hook-image
release-processor-image-pr:
name: Release processor image from PR
runs-on: ubuntu-latest
needs: test
if: github.event_name == 'pull_request'
steps:
- name: Checkout commit
uses: actions/checkout@v4
- name: Purge old artifacts
uses: kolpav/purge-artifacts-action@v1
with:
token: ${{ github.token }}
expire-in: 30mins
- name: Login to ghcr.io
uses: docker/login-action@v3.5.0
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ github.token }}
- name: Extract metadata (tags, labels) for Docker
id: meta
uses: docker/metadata-action@v5
with:
images: ghcr.io/neuro-inc/app-lightrag
tags: |
type=ref,event=pr
- name: Build Docker image
run: make build-hook-image
- name: Push release for testing
if: github.actor != 'dependabot[bot]'
run: |
FULL_IMAGE_NAME=${{ steps.meta.outputs.tags }}
export IMAGE_TAG=${FULL_IMAGE_NAME##*:}
make push-hook-image

View file

@ -24,7 +24,7 @@ jobs:
- name: Install dependencies - name: Install dependencies
run: | run: |
python -m pip install --upgrade pip python -m pip install --upgrade pip
pip install pre-commit pip install pre-commit apolo-app-types
- name: Run pre-commit - name: Run pre-commit
run: pre-commit run --all-files --show-diff-on-failure run: pre-commit run --all-files --show-diff-on-failure

74
.github/workflows/test.yaml vendored Normal file
View file

@ -0,0 +1,74 @@
on:
workflow_call: {}
jobs:
lint:
name: Lint
runs-on: ubuntu-latest
steps:
- name: Checkout commit
uses: actions/checkout@v4
- name: Install python
uses: actions/setup-python@v5
with:
python-version: 3.12
- name: Cache pre-commit hooks
uses: actions/cache@v4
with:
path: ~/.cache/pre-commit
key: pre-commit|py3.12|${{ hashFiles('.pre-commit-config.yaml') }}
- name: Install Poetry
run: pip install poetry
- name: Setup Python dependencies cache
uses: actions/cache@v4
with:
path: ~/.cache/pypoetry
key: poetry-${{ hashFiles('**/poetry.lock') }}
restore-keys: |
poetry-
- name: Install dependencies
run: make install
- name: Add local venv to PATH
run: echo "${{ github.workspace }}/.venv/bin" >> $GITHUB_PATH
- name: Lint
run: |
echo "::add-matcher::.github/actionlint-matcher.json"
make lint
test-unit:
name: Unit tests
runs-on: ubuntu-latest
steps:
- name: Checkout commit
uses: actions/checkout@v4
- name: Install python
uses: actions/setup-python@v5
with:
python-version-file: .python-version
cache: pip
- name: Install Poetry
run: pip install poetry
- name: Setup Python dependencies cache
uses: actions/cache@v4
with:
path: ~/.cache/pypoetry
key: poetry-${{ hashFiles('**/poetry.lock') }}
restore-keys: |
poetry-
- name: Install dependencies
run: make setup
- name: Add local venv to PATH
run: echo "${{ github.workspace }}/.venv/bin" >> $GITHUB_PATH
- name: Run unit tests
run: make test-unit
check:
name: Check
needs:
- lint
- test-unit
runs-on: ubuntu-latest
if: always()
steps:
- name: Decide whether the needed jobs succeeded or failed
uses: re-actors/alls-green@release/v1
with:
jobs: ${{ toJSON(needs) }}

View file

@ -26,3 +26,23 @@ repos:
- id: check-manifest - id: check-manifest
stages: [manual] stages: [manual]
exclude: ^lightrag/api/webui/ exclude: ^lightrag/api/webui/
- repo: https://github.com/mrtazz/checkmake.git
rev: 0.2.2
hooks:
- id: checkmake
- repo: https://github.com/gruntwork-io/pre-commit
rev: v0.1.30
hooks:
- id: helmlint
- repo: local
hooks:
- id: generate-types-schemas
name: Generate types schemas
entry: make -f .apolo/scripts/gen_types_schemas.mk gen-types-schemas
language: system

1
.python-version Normal file
View file

@ -0,0 +1 @@
3.11

View file

@ -7,16 +7,7 @@ CHART_PACKAGE_DIR := dist/charts
HELM_REGISTRY := ghcr.io/neuro-inc/helm-charts HELM_REGISTRY := ghcr.io/neuro-inc/helm-charts
RAW_VERSION := $(if $(VERSION),$(VERSION),$(shell git describe --tags --always --dirty 2>/dev/null)) RAW_VERSION := $(if $(VERSION),$(VERSION),$(shell git describe --tags --always --dirty 2>/dev/null))
SANITIZED_VERSION := $(shell RAW="$(RAW_VERSION)" python - <<'PY' SANITIZED_VERSION := $(shell python -c 'import re; raw = "$(RAW_VERSION)".strip(); raw = raw[1:] if raw.startswith("v") else raw; raw = raw or "0.0.0"; sanitized = re.sub(r"[^0-9A-Za-z.\-]", "-", raw); print(sanitized or "0.0.0")')
import os, re
raw = os.environ.get("RAW", "").strip()
if not raw:
raw = "0.0.0"
raw = raw.lstrip("v")
sanitized = re.sub(r"[^0-9A-Za-z\\.\\-]", "-", raw)
print(sanitized or "0.0.0")
PY
)
CHART_VERSION := $(SANITIZED_VERSION) CHART_VERSION := $(SANITIZED_VERSION)
CHART_PACKAGE := $(CHART_PACKAGE_DIR)/$(CHART_NAME)-$(CHART_VERSION).tgz CHART_PACKAGE := $(CHART_PACKAGE_DIR)/$(CHART_NAME)-$(CHART_VERSION).tgz

View file

@ -8,7 +8,7 @@ Advanced script to load markdown documentation into LightRAG with flexible refer
# Default mode (file path references) # Default mode (file path references)
python load_docs.py /path/to/your/docs python load_docs.py /path/to/your/docs
# URL mode (website link references) # URL mode (website link references)
python load_docs.py /path/to/docs --mode urls --base-url https://docs.example.com/ python load_docs.py /path/to/docs --mode urls --base-url https://docs.example.com/
``` ```
@ -28,7 +28,7 @@ python load_docs.py docs/ --mode files
- [KG] administration/setup.md - [KG] administration/setup.md
``` ```
### URLs Mode ### URLs Mode
Uses website URLs in query response citations: Uses website URLs in query response citations:
```bash ```bash
python load_docs.py docs/ --mode urls --base-url https://docs.apolo.us/index/ python load_docs.py docs/ --mode urls --base-url https://docs.apolo.us/index/
@ -37,7 +37,7 @@ python load_docs.py docs/ --mode urls --base-url https://my-docs.com/v1/
**Query Response Example:** **Query Response Example:**
``` ```
### References ### References
- [DC] https://docs.apolo.us/index/getting-started/installation - [DC] https://docs.apolo.us/index/getting-started/installation
- [KG] https://docs.apolo.us/index/administration/setup - [KG] https://docs.apolo.us/index/administration/setup
``` ```
@ -68,7 +68,7 @@ docs/
```bash ```bash
# Visit your docs site and note the URL patterns: # Visit your docs site and note the URL patterns:
# https://docs.example.com/getting-started/installation # https://docs.example.com/getting-started/installation
# https://docs.example.com/api/authentication # https://docs.example.com/api/authentication
# https://docs.example.com/guides/deployment # https://docs.example.com/guides/deployment
``` ```
@ -80,7 +80,7 @@ mkdir -p docs/{getting-started,api,guides}
**Step 3: Organize Your Markdown Files** **Step 3: Organize Your Markdown Files**
```bash ```bash
# Match each URL to a file path: # Match each URL to a file path:
docs/getting-started/installation.md # → /getting-started/installation docs/getting-started/installation.md # → /getting-started/installation
docs/api/authentication.md # → /api/authentication docs/api/authentication.md # → /api/authentication
docs/guides/deployment.md # → /guides/deployment docs/guides/deployment.md # → /guides/deployment
docs/guides/README.md # → /guides (overview page) docs/guides/README.md # → /guides (overview page)
@ -109,7 +109,7 @@ curl -I https://docs.example.com/api/authentication
apolo-docs/ apolo-docs/
├── getting-started/ ├── getting-started/
│ ├── first-steps/ │ ├── first-steps/
│ │ ├── getting-started.md → /index/getting-started/first-steps/getting-started │ │ ├── getting-started.md → /index/getting-started/first-steps/getting-started
│ │ └── README.md → /index/getting-started/first-steps │ │ └── README.md → /index/getting-started/first-steps
│ ├── apolo-base-docker-image.md → /index/getting-started/apolo-base-docker-image │ ├── apolo-base-docker-image.md → /index/getting-started/apolo-base-docker-image
│ └── faq.md → /index/getting-started/faq │ └── faq.md → /index/getting-started/faq
@ -135,14 +135,14 @@ python load_docs.py docs/ --endpoint https://lightrag.example.com
# Load to local instance, skip test query # Load to local instance, skip test query
python load_docs.py docs/ --no-test python load_docs.py docs/ --no-test
# Files mode with custom endpoint # Files mode with custom endpoint
python load_docs.py docs/ --mode files --endpoint http://localhost:9621 python load_docs.py docs/ --mode files --endpoint http://localhost:9621
``` ```
## Features ## Features
- **Dual Reference Modes**: File paths or live website URLs in citations - **Dual Reference Modes**: File paths or live website URLs in citations
- **Flexible Base URL**: Works with any documentation site structure - **Flexible Base URL**: Works with any documentation site structure
- **Simple dependency**: Only requires `httpx` and Python standard library - **Simple dependency**: Only requires `httpx` and Python standard library
- **Automatic discovery**: Finds all `.md` files recursively - **Automatic discovery**: Finds all `.md` files recursively
- **Smart metadata**: Adds appropriate title, path/URL, and source information - **Smart metadata**: Adds appropriate title, path/URL, and source information
@ -171,4 +171,4 @@ This loader is perfect for:
pip install httpx pip install httpx
``` ```
**Note**: This script is included with LightRAG deployments and provides a simple way to load any markdown documentation into your LightRAG instance. **Note**: This script is included with LightRAG deployments and provides a simple way to load any markdown documentation into your LightRAG instance.

22
hooks.Dockerfile Normal file
View file

@ -0,0 +1,22 @@
FROM python:3.12-slim
LABEL org.opencontainers.image.source="https://github.com/neuro-inc/LightRAG"
ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \
POETRY_NO_INTERACTION=1 \
POETRY_VIRTUALENVS_CREATE=0
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
&& apt-get clean && rm -rf /var/lib/apt/lists/*
WORKDIR /app
COPY README.md poetry.lock pyproject.toml ./
RUN pip --no-cache-dir install poetry && poetry install --no-root --no-cache
COPY .apolo .apolo
RUN poetry install --only-root --no-cache
ENTRYPOINT ["app-types"]

View file

@ -14,8 +14,8 @@ from typing import Dict, List, Optional
async def load_document_to_lightrag( async def load_document_to_lightrag(
content: str, content: str,
title: str, title: str,
doc_url: str, doc_url: str,
endpoint: str = "http://localhost:9621", endpoint: str = "http://localhost:9621",
headers: Optional[Dict[str, str]] = None headers: Optional[Dict[str, str]] = None
@ -34,7 +34,7 @@ async def load_document_to_lightrag(
"file_source": doc_url "file_source": doc_url
} }
) )
if response.status_code == 200: if response.status_code == 200:
print(f"✅ Loaded: {title}") print(f"✅ Loaded: {title}")
return True return True
@ -47,7 +47,7 @@ async def load_document_to_lightrag(
except: except:
print(f" Response: {response.text}") print(f" Response: {response.text}")
return False return False
except Exception as e: except Exception as e:
print(f"❌ Error loading {title}: {e}") print(f"❌ Error loading {title}: {e}")
return False return False
@ -58,27 +58,27 @@ def convert_file_path_to_url(relative_path: str, base_url: str) -> str:
# Ensure base URL ends with / # Ensure base URL ends with /
if not base_url.endswith('/'): if not base_url.endswith('/'):
base_url += '/' base_url += '/'
# Handle special cases # Handle special cases
if relative_path in ["README.md", "SUMMARY.md"]: if relative_path in ["README.md", "SUMMARY.md"]:
return base_url.rstrip('/') return base_url.rstrip('/')
# Remove .md extension and convert path # Remove .md extension and convert path
url_path = relative_path.replace(".md", "") url_path = relative_path.replace(".md", "")
# Handle README files in subdirectories - they map to the directory URL # Handle README files in subdirectories - they map to the directory URL
if url_path.endswith("/README"): if url_path.endswith("/README"):
url_path = url_path[:-7] # Remove "/README" url_path = url_path[:-7] # Remove "/README"
# Clean up any double slashes # Clean up any double slashes
url_path = url_path.strip("/") url_path = url_path.strip("/")
return f"{base_url}{url_path}" return f"{base_url}{url_path}"
def load_markdown_files(docs_path: Path, mode: str = "files", base_url: str = None) -> List[tuple]: def load_markdown_files(docs_path: Path, mode: str = "files", base_url: str = None) -> List[tuple]:
"""Load all markdown files from directory structure """Load all markdown files from directory structure
Args: Args:
docs_path: Path to documentation directory docs_path: Path to documentation directory
mode: 'files' for file paths, 'urls' for URL references mode: 'files' for file paths, 'urls' for URL references
@ -86,42 +86,42 @@ def load_markdown_files(docs_path: Path, mode: str = "files", base_url: str = No
""" """
if not docs_path.exists(): if not docs_path.exists():
raise FileNotFoundError(f"Documentation directory not found: {docs_path}") raise FileNotFoundError(f"Documentation directory not found: {docs_path}")
if mode == "urls" and not base_url: if mode == "urls" and not base_url:
raise ValueError("base_url is required when mode is 'urls'") raise ValueError("base_url is required when mode is 'urls'")
# Find all markdown files, excluding SUMMARY.md as it's just the table of contents # Find all markdown files, excluding SUMMARY.md as it's just the table of contents
md_files = [f for f in docs_path.rglob("*.md") if f.name != "SUMMARY.md"] md_files = [f for f in docs_path.rglob("*.md") if f.name != "SUMMARY.md"]
print(f"📚 Found {len(md_files)} markdown files") print(f"📚 Found {len(md_files)} markdown files")
print(f"🔧 Mode: {mode}") print(f"🔧 Mode: {mode}")
if mode == "urls": if mode == "urls":
print(f"🌐 Base URL: {base_url}") print(f"🌐 Base URL: {base_url}")
documents = [] documents = []
for file_path in md_files: for file_path in md_files:
try: try:
# Load content # Load content
with open(file_path, 'r', encoding='utf-8') as f: with open(file_path, 'r', encoding='utf-8') as f:
content = f.read().strip() content = f.read().strip()
if not content: if not content:
continue continue
# Generate title from filename # Generate title from filename
title = file_path.stem.replace("-", " ").replace("_", " ").title() title = file_path.stem.replace("-", " ").replace("_", " ").title()
if title.lower() == "readme": if title.lower() == "readme":
# Use parent directory name for README files # Use parent directory name for README files
title = f"{file_path.parent.name.replace('-', ' ').replace('_', ' ').title()} Overview" title = f"{file_path.parent.name.replace('-', ' ').replace('_', ' ').title()} Overview"
# Get relative path for metadata # Get relative path for metadata
relative_path = str(file_path.relative_to(docs_path)) relative_path = str(file_path.relative_to(docs_path))
if mode == "files": if mode == "files":
# Use file path as reference # Use file path as reference
reference = relative_path reference = relative_path
source_info = f"File: {file_path.name}" source_info = f"File: {file_path.name}"
# Prepare content with file metadata # Prepare content with file metadata
content_with_metadata = f""" content_with_metadata = f"""
Title: {title} Title: {title}
@ -134,7 +134,7 @@ Source: {source_info}
# Convert file path to documentation URL # Convert file path to documentation URL
reference = convert_file_path_to_url(relative_path, base_url) reference = convert_file_path_to_url(relative_path, base_url)
source_info = f"Documentation Site" source_info = f"Documentation Site"
# Prepare content with URL metadata # Prepare content with URL metadata
content_with_metadata = f""" content_with_metadata = f"""
Title: {title} Title: {title}
@ -143,13 +143,13 @@ Source: {source_info}
{content} {content}
""" """
documents.append((content_with_metadata, title, reference)) documents.append((content_with_metadata, title, reference))
except Exception as e: except Exception as e:
print(f"⚠️ Error processing {file_path}: {e}") print(f"⚠️ Error processing {file_path}: {e}")
continue continue
return documents return documents
@ -189,7 +189,7 @@ async def test_query(
headers=request_headers, headers=request_headers,
json={"query": "What is this documentation about?", "mode": "local"} json={"query": "What is this documentation about?", "mode": "local"}
) )
if response.status_code == 200: if response.status_code == 200:
result = response.json() result = response.json()
print(f"✅ Query successful!") print(f"✅ Query successful!")
@ -202,7 +202,7 @@ async def test_query(
print(f" Error details: {error_detail}") print(f" Error details: {error_detail}")
except: except:
print(f" Response: {response.text}") print(f" Response: {response.text}")
except Exception as e: except Exception as e:
print(f"❌ Query error: {e}") print(f"❌ Query error: {e}")
@ -216,22 +216,22 @@ async def main():
Examples: Examples:
# Load with file path references (default mode) # Load with file path references (default mode)
python load_docs.py ../apolo-copilot/docs/official-apolo-documentation/docs python load_docs.py ../apolo-copilot/docs/official-apolo-documentation/docs
# Load with URL references # Load with URL references
python load_docs.py docs/ --mode urls --base-url https://docs.apolo.us/index/ python load_docs.py docs/ --mode urls --base-url https://docs.apolo.us/index/
# Load Apolo docs with URL references (common use case) # Load Apolo docs with URL references (common use case)
python load_docs.py ../apolo-copilot/docs/official-apolo-documentation/docs \\ python load_docs.py ../apolo-copilot/docs/official-apolo-documentation/docs \\
--mode urls --base-url https://docs.apolo.us/index/ --mode urls --base-url https://docs.apolo.us/index/
# Use custom endpoint # Use custom endpoint
python load_docs.py docs/ --endpoint https://lightrag.example.com python load_docs.py docs/ --endpoint https://lightrag.example.com
# Load with different documentation base URL # Load with different documentation base URL
python load_docs.py docs/ --mode urls --base-url https://my-docs.example.com/docs/ python load_docs.py docs/ --mode urls --base-url https://my-docs.example.com/docs/
""" """
) )
parser.add_argument( parser.add_argument(
"docs_path", "docs_path",
nargs="?", nargs="?",
@ -259,7 +259,7 @@ Examples:
action="store_true", action="store_true",
help="Skip test query after loading" help="Skip test query after loading"
) )
args = parser.parse_args() args = parser.parse_args()
api_key = os.getenv("LIGHTRAG_API_KEY") api_key = os.getenv("LIGHTRAG_API_KEY")
if api_key: if api_key:
@ -267,7 +267,7 @@ Examples:
else: else:
auth_headers = None auth_headers = None
print(" LIGHTRAG_API_KEY not set, continuing without authentication.") print(" LIGHTRAG_API_KEY not set, continuing without authentication.")
print("🚀 Loading Documentation into LightRAG") print("🚀 Loading Documentation into LightRAG")
print("=" * 60) print("=" * 60)
print(f"📁 Documentation path: {args.docs_path}") print(f"📁 Documentation path: {args.docs_path}")
@ -280,12 +280,12 @@ Examples:
sys.exit(1) sys.exit(1)
print(f"🌐 LightRAG endpoint: {args.endpoint}") print(f"🌐 LightRAG endpoint: {args.endpoint}")
print() print()
# Test LightRAG connectivity # Test LightRAG connectivity
if not await test_lightrag_health(args.endpoint, headers=auth_headers): if not await test_lightrag_health(args.endpoint, headers=auth_headers):
print("❌ Cannot connect to LightRAG. Please ensure it's running and accessible.") print("❌ Cannot connect to LightRAG. Please ensure it's running and accessible.")
sys.exit(1) sys.exit(1)
# Load documents # Load documents
docs_path = Path(args.docs_path).resolve() docs_path = Path(args.docs_path).resolve()
try: try:
@ -293,24 +293,24 @@ Examples:
except (FileNotFoundError, ValueError) as e: except (FileNotFoundError, ValueError) as e:
print(f"{e}") print(f"{e}")
sys.exit(1) sys.exit(1)
if not documents: if not documents:
print("❌ No markdown files found to load") print("❌ No markdown files found to load")
sys.exit(1) sys.exit(1)
# Calculate statistics # Calculate statistics
total_content = sum(len(content) for content, _, _ in documents) total_content = sum(len(content) for content, _, _ in documents)
avg_content = total_content // len(documents) if documents else 0 avg_content = total_content // len(documents) if documents else 0
print(f"📊 Total content: {total_content:,} characters") print(f"📊 Total content: {total_content:,} characters")
print(f"📊 Average length: {avg_content:,} characters") print(f"📊 Average length: {avg_content:,} characters")
# Load documents # Load documents
successful = 0 successful = 0
failed = 0 failed = 0
print(f"\n🔄 Starting to load documents...") print(f"\n🔄 Starting to load documents...")
for i, (content, title, doc_url) in enumerate(documents): for i, (content, title, doc_url) in enumerate(documents):
success = await load_document_to_lightrag( success = await load_document_to_lightrag(
content, content,
@ -319,23 +319,23 @@ Examples:
args.endpoint, args.endpoint,
headers=auth_headers headers=auth_headers
) )
if success: if success:
successful += 1 successful += 1
else: else:
failed += 1 failed += 1
# Progress update # Progress update
if (i + 1) % 10 == 0: if (i + 1) % 10 == 0:
print(f"📈 Progress: {i + 1}/{len(documents)} ({successful} success, {failed} failed)") print(f"📈 Progress: {i + 1}/{len(documents)} ({successful} success, {failed} failed)")
# Small delay to avoid overwhelming the service # Small delay to avoid overwhelming the service
await asyncio.sleep(0.3) await asyncio.sleep(0.3)
print(f"\n✅ Loading complete!") print(f"\n✅ Loading complete!")
print(f"📊 Successful: {successful}") print(f"📊 Successful: {successful}")
print(f"📊 Failed: {failed}") print(f"📊 Failed: {failed}")
# Test query unless disabled # Test query unless disabled
if not args.no_test and successful > 0: if not args.no_test and successful > 0:
await test_query(args.endpoint, headers=auth_headers) await test_query(args.endpoint, headers=auth_headers)

2377
poetry.lock generated Normal file

File diff suppressed because it is too large Load diff

111
pyproject.toml Normal file
View file

@ -0,0 +1,111 @@
[project]
name = "apolo-apps-lightrag"
version = "0.0.1"
description = "Apolo LightRAG application"
authors = [
{name = "Apolo", email = "dev@apolo.us"}
]
readme = "README.md"
requires-python = ">=3.11.0,<4.0"
[tool.poetry]
name = "apolo-apps-lightrag"
authors = ["Apolo.us"]
packages = [
{ include = "apolo_apps_lightrag", from = ".apolo/src" },
]
[tool.poetry.dependencies]
apolo-sdk = "^25.7.2"
pydantic = "^2.9.2"
pyyaml = "^6.0.2"
yarl = "^1.18.3"
apolo-app-types = "^25.9.0"
[tool.poetry.group.dev]
optional = true
[tool.poetry.group.dev.dependencies]
pre-commit = "^4.2.0"
types-PyYAML = "^6.0.12.20241230"
pytest = "^8.3.4"
pytest-asyncio = "^0.25.3"
pytest-cov = "^6.2.1"
mypy = "^1.17.1"
[tool.ruff]
target-version = "py311"
lint.select = [
"E", "F", "I", "C90", "UP", "B", "ASYNC", "N", "FBT", "A", "C4", "EM", "FA", "ICN",
"G", "PIE", "T20", "PYI", "PT", "RET", "PTH"
]
lint.ignore = [
"A003",
"N818"
]
[tool.ruff.lint.isort.sections]
ApoloSDK = ["apolo-sdk"]
[tool.ruff.lint.isort]
combine-as-imports = true
lines-after-imports = 2
section-order = ["future", "standard-library", "third-party", "ApoloSDK", "first-party", "local-folder"]
known-first-party = ["apolo_app_types"]
known-local-folder = ["tests"]
[tool.mypy]
check_untyped_defs = true
disallow_any_generics = true
disallow_untyped_defs = true
follow_imports = "silent"
strict_optional = true
warn_redundant_casts = true
warn_unused_ignores = true
warn_unused_configs = true
plugins = ['pydantic.mypy']
exclude = [
"tests/"
]
[[tool.mypy.overrides]]
module = "pytest"
ignore_missing_imports = true
[tool.flake8]
extend-exclude = [
".git",
".env",
"__pycache__",
".eggs",
]
max-line-length = 88
extend-ignore = [
"N801",
"N802",
"N803",
"E252",
"W503",
"E133",
"E203",
"F541",
]
[tool.coverage.report]
fail_under = 0
skip_empty = true
sort = "-cover"
omit = [
"./.apolo/tests/*",
]
[tool.pytest.ini_options]
asyncio_mode = "auto"
asyncio_default_fixture_loop_scope = "session"
log_cli = false
log_level = "INFO"
junit_family = "xunit2"
testpaths = [
"./.apolo/tests/",
]