Adds initial LightRAG app integration with schema and processors

Introduces the LightRAG Retrieval-Augmented Generation framework as an Apolo app, including input/output schemas, types, and processors.
Adds Helm chart value processing, environment and persistence configurations, and output service discovery for deployment.
Includes scripts for generating type schemas and testing support, along with CI and linting setup tailored for the new app.
Provides a documentation loader script to ingest markdown files into LightRAG with flexible referencing modes.

Relates to MLO-469
This commit is contained in:
Taddeus 2025-11-03 13:59:52 +02:00
parent 748ded40fb
commit 2255b91210
27 changed files with 5166 additions and 66 deletions

0
.apolo/__init__.py Normal file
View file

55
.apolo/applications.yaml Normal file
View file

@ -0,0 +1,55 @@
- app_type: lightrag
name: lightrag
title: LightRAG
install_type: workflow
helm_path: k8s-deploy/lightrag
app_package_name: apolo_apps_lightrag
inputs:
schema_path: .apolo/src/apolo_apps_lightrag/schemas/LightRAGAppInputs.json
types_name: LightRAGAppInputs
processor: LightRAGInputsProcessor
image: ghcr.io/neuro-inc/app-lightrag
outputs:
schema_path: .apolo/src/apolo_apps_lightrag/schemas/LightRAGAppOutputs.json
types_name: LightRAGAppOutputs
processor: LightRAGOutputsProcessor
image: ghcr.io/neuro-inc/app-lightrag
short_description: Advanced RAG framework with graph-enhanced retrieval capabilities
description: |
LightRAG is a simple and fast Retrieval-Augmented Generation (RAG) system that incorporates
graph structure into text indexing and retrieval processes. Unlike traditional RAG approaches,
LightRAG utilizes both low-level and high-level knowledge discovery to enhance text retrieval.
It supports both naive and local search methods, making it suitable for comprehensive
question-answering tasks. The system includes a user-friendly web interface for document
management and querying, with built-in support for various LLM providers and embedding models.
pub_date: "2025-06-25T00:00:00+00:00"
logo: https://storage.googleapis.com/development-421920-assets/app-logos/lightrag-logo.png
tags:
- "RAG"
- "LightRAG"
- "Knowledge Graph"
- "Vector Search"
- "Document Processing"
- "LLM"
- "Embeddings"
- "PostgreSQL"
- "Graph"
- "AI"
- "NLP"
assets:
- type: image
url: https://storage.googleapis.com/development-421920-assets/app-logos/lightrag-banner.png
- type: video
url: https://www.youtube.com/watch?v=oageL-1I0GE
- type: pdf
url: https://arxiv.org/abs/2410.05779
urls:
- name: LightRAG GitHub Repository
type: documentation
url: https://github.com/HKUDS/LightRAG
- name: LightRAG DeepWiki Documentation
type: documentation
url: https://deepwiki.com/HKUDS/LightRAG
- name: LightRAG Official Repository
type: external
url: https://github.com/HKUDS/LightRAG

1
.apolo/project.yaml Normal file
View file

@ -0,0 +1 @@
id: lightrag

View file

@ -0,0 +1,3 @@
# Regenerate the JSON schemas for the app input/output pydantic types.
.PHONY: gen-types-schemas
gen-types-schemas:
	@.apolo/scripts/gen_types_schemas.sh

View file

@ -0,0 +1,26 @@
#!/usr/bin/env bash
# Regenerate the JSON schemas for the LightRAG app input/output types
# using the `app-types` CLI.
set -euo pipefail

# Repository root resolved relative to this script's own location.
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
APP_PACKAGE_DIR=".apolo/src/apolo_apps_lightrag"
INPUT_SCHEMA="${APP_PACKAGE_DIR}/schemas/LightRAGAppInputs.json"
OUTPUT_SCHEMA="${APP_PACKAGE_DIR}/schemas/LightRAGAppOutputs.json"

# Locate the app-types CLI: prefer poetry, then the repo-local venv, then PATH.
if command -v poetry >/dev/null 2>&1; then
    APP_TYPES_CMD=(poetry run app-types)
elif [[ -x "${REPO_ROOT}/.venv/bin/app-types" ]]; then
    APP_TYPES_CMD=("${REPO_ROOT}/.venv/bin/app-types")
elif command -v app-types >/dev/null 2>&1; then
    APP_TYPES_CMD=(app-types)
else
    echo "app-types CLI not found. Install dependencies via 'poetry install --with dev'." >&2
    exit 1
fi

# Run from the repo root in a subshell so the caller's CWD is untouched.
(
    cd "${REPO_ROOT}"
    "${APP_TYPES_CMD[@]}" dump-types-schema "${APP_PACKAGE_DIR}" LightRAGAppInputs "${INPUT_SCHEMA}"
    "${APP_TYPES_CMD[@]}" dump-types-schema "${APP_PACKAGE_DIR}" LightRAGAppOutputs "${OUTPUT_SCHEMA}"
)

View file

@ -0,0 +1,15 @@
"""Public surface of the LightRAG Apolo app package.

Re-exports the input/output chart-value processors and the pydantic
input/output models so they can be imported from the package root.
"""

from apolo_apps_lightrag.inputs_processor import (
    LightRAGInputsProcessor,
)
from apolo_apps_lightrag.outputs_processor import (
    LightRAGOutputsProcessor,
)
from apolo_apps_lightrag.types import LightRAGAppInputs, LightRAGAppOutputs

__all__ = [
    "LightRAGInputsProcessor",
    "LightRAGOutputsProcessor",
    "LightRAGAppInputs",
    "LightRAGAppOutputs",
]

View file

@ -0,0 +1,246 @@
import logging
import typing as t
from apolo_app_types.app_types import AppType
from apolo_app_types.helm.apps.base import BaseChartValueProcessor
from apolo_app_types.helm.apps.common import gen_extra_values
from apolo_app_types.helm.utils.deep_merging import merge_list_of_dicts
from apolo_app_types.protocols.common.openai_compat import (
OpenAICompatChatAPI,
OpenAICompatEmbeddingsAPI,
)
from apolo_app_types.protocols.common.secrets_ import serialize_optional_secret
from .types import (
AnthropicLLMProvider,
GeminiLLMProvider,
LightRAGAppInputs,
OllamaEmbeddingProvider,
OllamaLLMProvider,
OpenAIEmbeddingProvider,
OpenAILLMProvider,
)
logger = logging.getLogger(__name__)
class LightRAGInputsProcessor(BaseChartValueProcessor[LightRAGAppInputs]):
    """Translate ``LightRAGAppInputs`` into Helm chart values.

    Normalizes the provider-specific LLM/embedding configs into a flat
    ``{binding, model, host, api_key[, dimensions]}`` shape, then assembles
    environment, persistence, platform, and base chart values.
    """

    def _extract_llm_config(self, llm_config: t.Any) -> dict[str, t.Any]:
        """Extract LLM configuration from provider-specific config."""
        if isinstance(llm_config, OpenAICompatChatAPI):
            # OpenAI-compatible endpoints carry the model via hf_model.
            if not llm_config.hf_model:
                msg = "OpenAI compatible chat API must have hf_model configured"
                raise ValueError(msg)
            model = llm_config.hf_model.model_hf_name
            host = llm_config.complete_url
            return {
                "binding": "openai",
                "model": model,
                "host": host,
                "api_key": getattr(llm_config, "api_key", None),
            }
        if isinstance(llm_config, OpenAILLMProvider):
            host = llm_config.complete_url
            return {
                "binding": "openai",
                "model": llm_config.model,
                "host": host,
                "api_key": llm_config.api_key,
            }
        if isinstance(llm_config, AnthropicLLMProvider):
            host = llm_config.complete_url
            return {
                "binding": "anthropic",
                "model": llm_config.model,
                "host": host,
                "api_key": llm_config.api_key,
            }
        if isinstance(llm_config, OllamaLLMProvider):
            host = llm_config.complete_url
            return {
                "binding": "ollama",
                "model": llm_config.model,
                "host": host,
                # Ollama is self-hosted and has no API key field.
                "api_key": None,
            }
        if isinstance(llm_config, GeminiLLMProvider):
            host = llm_config.complete_url
            return {
                "binding": "gemini",
                "model": llm_config.model,
                "host": host,
                "api_key": llm_config.api_key,
            }
        # Fallback for unrecognized config objects: probe common attributes
        # and default to an OpenAI-style binding.
        binding = getattr(llm_config, "provider", "openai")
        model = getattr(llm_config, "model", "gpt-4o-mini")
        api_key = getattr(llm_config, "api_key", None)
        host = ""
        if hasattr(llm_config, "complete_url"):
            host = llm_config.complete_url
        elif hasattr(llm_config, "host") and llm_config.host:
            # Rebuild a URL from host/protocol/port parts when no
            # complete_url is available.
            protocol = getattr(llm_config, "protocol", "https")
            port = getattr(llm_config, "port", 443)
            host = f"{protocol}://{llm_config.host}:{port}"
        return {"binding": binding, "model": model, "host": host, "api_key": api_key}

    def _extract_embedding_config(self, embedding_config: t.Any) -> dict[str, t.Any]:
        """Extract embedding configuration from provider-specific config."""
        if isinstance(embedding_config, OpenAICompatEmbeddingsAPI):
            if embedding_config.hf_model is None:
                msg = "OpenAI compatible embeddings API must have hf_model configured"
                raise ValueError(msg)
            model = embedding_config.hf_model.model_hf_name
            host = embedding_config.complete_url
            return {
                "binding": "openai",
                "model": model,
                "api_key": getattr(embedding_config, "api_key", None),
                # NOTE(review): dimensions are hard-coded per binding here
                # (1536 for OpenAI-style, 1024 for Ollama) — confirm they
                # match the actual embedding model in use.
                "dimensions": 1536,
                "host": host,
            }
        if isinstance(embedding_config, OpenAIEmbeddingProvider):
            host = embedding_config.complete_url
            return {
                "binding": "openai",
                "model": embedding_config.model,
                "api_key": embedding_config.api_key,
                "dimensions": 1536,
                "host": host,
            }
        if isinstance(embedding_config, OllamaEmbeddingProvider):
            host = embedding_config.complete_url
            return {
                "binding": "ollama",
                "model": embedding_config.model,
                "api_key": None,
                "dimensions": 1024,
                "host": host,
            }
        # Fallback for unrecognized config objects, mirroring
        # _extract_llm_config's attribute probing.
        binding = getattr(embedding_config, "provider", "openai")
        model = getattr(embedding_config, "model", "text-embedding-ada-002")
        api_key = getattr(embedding_config, "api_key", None)
        dimensions = 1536
        if hasattr(embedding_config, "dimensions"):
            dimensions = embedding_config.dimensions
        host = ""
        if hasattr(embedding_config, "complete_url"):
            host = embedding_config.complete_url
        elif hasattr(embedding_config, "host") and embedding_config.host:
            protocol = getattr(embedding_config, "protocol", "https")
            port = getattr(embedding_config, "port", 443)
            host = f"{protocol}://{embedding_config.host}:{port}"
        return {
            "binding": binding,
            "model": model,
            "api_key": api_key,
            "dimensions": dimensions,
            "host": host,
        }

    async def _get_environment_values(
        self,
        input_: LightRAGAppInputs,
        app_secrets_name: str,
    ) -> dict[str, t.Any]:
        """Build the ``env`` section of the chart values.

        Covers server settings, LLM/embedding bindings, storage backends,
        and PostgreSQL connection details taken from the pgvector user.
        """
        llm_config = self._extract_llm_config(input_.llm_config)
        embedding_config = self._extract_embedding_config(input_.embedding_config)
        env_config = {
            "HOST": "0.0.0.0",
            "PORT": 9621,
            "WEBUI_TITLE": "Graph RAG Engine",
            "WEBUI_DESCRIPTION": "Simple and Fast Graph Based RAG System",
            "LLM_BINDING": llm_config["binding"],
            "LLM_MODEL": llm_config["model"],
            "LLM_BINDING_HOST": llm_config["host"],
            "LLM_BINDING_API_KEY": serialize_optional_secret(
                llm_config["api_key"], app_secrets_name
            ),
            # Some bindings read OPENAI_API_KEY directly; fall back to "".
            "OPENAI_API_KEY": serialize_optional_secret(
                llm_config["api_key"], app_secrets_name
            )
            or "",
            "EMBEDDING_BINDING": embedding_config["binding"],
            "EMBEDDING_MODEL": embedding_config["model"],
            "EMBEDDING_DIM": embedding_config["dimensions"],
            "EMBEDDING_BINDING_HOST": embedding_config["host"],
            "EMBEDDING_BINDING_API_KEY": serialize_optional_secret(
                embedding_config["api_key"], app_secrets_name
            )
            or "",
            # Postgres-backed KV/vector/doc-status storage; graph stays in
            # NetworkX (file-based) storage.
            "LIGHTRAG_KV_STORAGE": "PGKVStorage",
            "LIGHTRAG_VECTOR_STORAGE": "PGVectorStorage",
            "LIGHTRAG_DOC_STATUS_STORAGE": "PGDocStatusStorage",
            "LIGHTRAG_GRAPH_STORAGE": "NetworkXStorage",
            "POSTGRES_HOST": input_.pgvector_user.pgbouncer_host,
            "POSTGRES_PORT": input_.pgvector_user.pgbouncer_port,
            "POSTGRES_USER": input_.pgvector_user.user,
            "POSTGRES_PASSWORD": input_.pgvector_user.password,
            "POSTGRES_DATABASE": input_.pgvector_user.dbname,
            "POSTGRES_WORKSPACE": "default",
        }
        return {"env": env_config}

    async def _get_persistence_values(
        self,
        input_: LightRAGAppInputs,
    ) -> dict[str, t.Any]:
        """Build the ``persistence`` chart values (sizes rendered as "<n>Gi")."""
        return {
            "persistence": {
                "enabled": True,
                "ragStorage": {
                    "size": f"{input_.persistence.rag_storage_size}Gi",
                },
                "inputs": {
                    "size": f"{input_.persistence.inputs_storage_size}Gi",
                },
            }
        }

    async def gen_extra_values(
        self,
        input_: LightRAGAppInputs,
        app_name: str,
        namespace: str,
        app_id: str,
        app_secrets_name: str,
        *_: t.Any,
        **kwargs: t.Any,
    ) -> dict[str, t.Any]:
        """Assemble the full set of extra Helm values for a LightRAG release.

        Merges (in order) base image/service values, environment values,
        persistence values, and platform-generated values (preset, ingress).
        Extra positional/keyword args are accepted for interface
        compatibility and ignored here.
        """
        env_values = await self._get_environment_values(input_, app_secrets_name)
        persistence_values = await self._get_persistence_values(input_)
        platform_values = await gen_extra_values(
            apolo_client=self.client,
            preset_type=input_.preset,
            ingress_http=input_.ingress_http,
            ingress_grpc=None,
            namespace=namespace,
            app_id=app_id,
            app_type=AppType.LightRAG,
        )
        base_values = {
            "replicaCount": 1,
            "image": {
                "repository": "ghcr.io/hkuds/lightrag",
                "tag": "1.3.8",
                "pullPolicy": "IfNotPresent",
            },
            "service": {
                "type": "ClusterIP",
                "port": 9621,
            },
            "nameOverride": "",
            "fullnameOverride": app_name,
        }
        logger.debug("Generated LightRAG values for app %s", app_name)
        return merge_list_of_dicts(
            [
                base_values,
                env_values,
                persistence_values,
                platform_values,
            ]
        )
__all__ = ["LightRAGInputsProcessor"]

View file

@ -0,0 +1,66 @@
import logging
import typing as t
from apolo_app_types.clients.kube import get_service_host_port
from apolo_app_types.outputs.base import BaseAppOutputsProcessor
from apolo_app_types.outputs.common import (
INSTANCE_LABEL,
get_internal_external_web_urls,
)
from apolo_app_types.outputs.utils.ingress import get_ingress_host_port
from apolo_app_types.protocols.common.networking import HttpApi, ServiceAPI, WebApp
from .types import LightRAGAppOutputs
logger = logging.getLogger(__name__)
async def _generate_lightrag_outputs(
    helm_values: dict[str, t.Any],
    app_instance_id: str,
) -> LightRAGAppOutputs:
    """Discover the deployed LightRAG services and build the app outputs.

    Looks up the web app URLs, the internal service host/port, and the
    ingress host/port by the app's Kubernetes labels.
    """
    match_labels = {
        "app.kubernetes.io/name": "lightrag",
        INSTANCE_LABEL: app_instance_id,
    }

    web_internal, web_external = await get_internal_external_web_urls(match_labels)

    # Internal API endpoint, present only when the service is discoverable.
    svc_host, svc_port = await get_service_host_port(match_labels=match_labels)
    api_internal = (
        HttpApi(host=svc_host, port=int(svc_port), protocol="http")
        if svc_host
        else None
    )

    # External API endpoint, present only when an ingress is configured.
    api_external = None
    ingress = await get_ingress_host_port(match_labels=match_labels)
    if ingress:
        ingress_host, ingress_port = ingress
        api_external = HttpApi(
            host=ingress_host,
            port=int(ingress_port),
            protocol="https",
        )

    return LightRAGAppOutputs(
        app_url=ServiceAPI[WebApp](
            internal_url=web_internal,
            external_url=web_external,
        ),
        server_url=ServiceAPI[HttpApi](
            internal_url=api_internal,
            external_url=api_external,
        ),
    )
class LightRAGOutputsProcessor(BaseAppOutputsProcessor[LightRAGAppOutputs]):
    """Output processor that resolves LightRAG service URLs after deploy."""

    async def _generate_outputs(
        self,
        helm_values: dict[str, t.Any],
        app_instance_id: str,
    ) -> LightRAGAppOutputs:
        """Delegate to the module-level discovery helper and log the result."""
        generated = await _generate_lightrag_outputs(helm_values, app_instance_id)
        logger.info("Got outputs: %s", generated)
        return generated
__all__ = ["LightRAGOutputsProcessor"]

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,245 @@
{
"$defs": {
"HttpApi": {
"properties": {
"host": {
"title": "Host",
"type": "string",
"x-description": "The hostname of the HTTP endpoint.",
"x-is-advanced-field": false,
"x-meta-type": "inline",
"x-title": "Hostname"
},
"port": {
"default": 80,
"exclusiveMinimum": 0,
"title": "Port",
"type": "integer",
"x-description": "The port of the HTTP endpoint.",
"x-is-advanced-field": false,
"x-meta-type": "inline",
"x-title": "Port"
},
"protocol": {
"default": "http",
"title": "Protocol",
"type": "string",
"x-description": "The protocol to use, e.g., http or https.",
"x-is-advanced-field": false,
"x-meta-type": "inline",
"x-title": "Protocol"
},
"timeout": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 30.0,
"title": "Timeout",
"x-description": "Connection timeout in seconds.",
"x-is-advanced-field": false,
"x-meta-type": "inline",
"x-title": "Connection Timeout"
},
"base_path": {
"default": "/",
"title": "Base Path",
"type": "string"
}
},
"required": [
"host"
],
"title": "HttpApi",
"type": "object",
"x-description": "HTTP API Configuration.",
"x-is-advanced-field": false,
"x-meta-type": "inline",
"x-title": "HTTP API",
"x-type": "OpenAICompatibleEmbeddingsRestAPI"
},
"ServiceAPI_HttpApi_": {
"properties": {
"internal_url": {
"anyOf": [
{
"$ref": "#/$defs/HttpApi"
},
{
"type": "null"
}
],
"default": null,
"x-description": "Internal URL to access the service. This route is not protected by platform authorization and only workloads from the same project can access it.",
"x-is-advanced-field": false,
"x-meta-type": "inline",
"x-title": "Internal URL"
},
"external_url": {
"anyOf": [
{
"$ref": "#/$defs/HttpApi"
},
{
"type": "null"
}
],
"default": null,
"x-description": "External URL for accessing the service from outside the cluster. This route might be secured by platform authorization and is accessible from any network with a valid platform authorization token that has appropriate permissions.",
"x-is-advanced-field": false,
"x-meta-type": "inline",
"x-title": "External URL"
}
},
"title": "ServiceAPI[HttpApi]",
"type": "object",
"x-description": "Service APIs URLs.",
"x-is-advanced-field": false,
"x-meta-type": "inline",
"x-title": "Service APIs",
"x-type": "ServiceAPI[GrpcAPI]"
},
"ServiceAPI_WebApp_": {
"properties": {
"internal_url": {
"anyOf": [
{
"$ref": "#/$defs/WebApp"
},
{
"type": "null"
}
],
"default": null,
"x-description": "Internal URL to access the service. This route is not protected by platform authorization and only workloads from the same project can access it.",
"x-is-advanced-field": false,
"x-meta-type": "inline",
"x-title": "Internal URL"
},
"external_url": {
"anyOf": [
{
"$ref": "#/$defs/WebApp"
},
{
"type": "null"
}
],
"default": null,
"x-description": "External URL for accessing the service from outside the cluster. This route might be secured by platform authorization and is accessible from any network with a valid platform authorization token that has appropriate permissions.",
"x-is-advanced-field": false,
"x-meta-type": "inline",
"x-title": "External URL"
}
},
"title": "ServiceAPI[WebApp]",
"type": "object",
"x-description": "Service APIs URLs.",
"x-is-advanced-field": false,
"x-meta-type": "inline",
"x-title": "Service APIs",
"x-type": "ServiceAPI[GrpcAPI]"
},
"WebApp": {
"properties": {
"host": {
"title": "Host",
"type": "string",
"x-description": "The hostname of the HTTP endpoint.",
"x-is-advanced-field": false,
"x-meta-type": "inline",
"x-title": "Hostname"
},
"port": {
"default": 80,
"exclusiveMinimum": 0,
"title": "Port",
"type": "integer",
"x-description": "The port of the HTTP endpoint.",
"x-is-advanced-field": false,
"x-meta-type": "inline",
"x-title": "Port"
},
"protocol": {
"default": "http",
"title": "Protocol",
"type": "string",
"x-description": "The protocol to use, e.g., http or https.",
"x-is-advanced-field": false,
"x-meta-type": "inline",
"x-title": "Protocol"
},
"timeout": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 30.0,
"title": "Timeout",
"x-description": "Connection timeout in seconds.",
"x-is-advanced-field": false,
"x-meta-type": "inline",
"x-title": "Connection Timeout"
},
"base_path": {
"default": "/",
"title": "Base Path",
"type": "string"
},
"api_type": {
"const": "webapp",
"default": "webapp",
"title": "Api Type",
"type": "string"
}
},
"required": [
"host"
],
"title": "WebApp",
"type": "object",
"x-description": "HTTP API Configuration.",
"x-is-advanced-field": false,
"x-meta-type": "inline",
"x-title": "HTTP API",
"x-type": "OpenAICompatibleEmbeddingsRestAPI"
}
},
"description": "LightRAG outputs.",
"properties": {
"app_url": {
"anyOf": [
{
"$ref": "#/$defs/ServiceAPI_WebApp_"
},
{
"type": "null"
}
],
"default": null,
"description": "The main application URL for accessing the service. This is the primary endpoint users should use to access the application.",
"title": "Application URL"
},
"server_url": {
"anyOf": [
{
"$ref": "#/$defs/ServiceAPI_HttpApi_"
},
{
"type": "null"
}
],
"default": null
}
},
"title": "LightRAGAppOutputs",
"type": "object"
}

View file

@ -0,0 +1,426 @@
from typing import Literal
from pydantic import BaseModel, ConfigDict, Field, field_validator
from apolo_app_types import AppInputs, AppOutputs
from apolo_app_types.protocols.common import (
IngressHttp,
Preset,
SchemaExtraMetadata,
SchemaMetaType,
)
from apolo_app_types.protocols.common.networking import HttpApi, RestAPI, ServiceAPI
from apolo_app_types.protocols.common.openai_compat import (
OpenAICompatChatAPI,
OpenAICompatEmbeddingsAPI,
)
from apolo_app_types.protocols.postgres import CrunchyPostgresUserCredentials
class LightRAGPersistence(BaseModel):
    """Persistent-volume sizing for LightRAG's RAG data and input files.

    Sizes are expressed in gibibytes; the inputs processor renders them
    as Kubernetes quantities ("<n>Gi").
    """

    model_config = ConfigDict(
        protected_namespaces=(),
        json_schema_extra=SchemaExtraMetadata(
            title="LightRAG Persistence",
            description="Configure persistent storage for LightRAG data and inputs.",
        ).as_json_schema_extra(),
    )
    rag_storage_size: int = Field(
        default=10,
        gt=0,
        json_schema_extra=SchemaExtraMetadata(
            title="RAG Storage Size (GB)",
            description="Size of the persistent volume for RAG data storage.",
        ).as_json_schema_extra(),
    )
    inputs_storage_size: int = Field(
        default=5,
        gt=0,
        json_schema_extra=SchemaExtraMetadata(
            title="Inputs Storage Size (GB)",
            description="Size of the persistent volume for input files.",
        ).as_json_schema_extra(),
    )

    @field_validator("rag_storage_size", "inputs_storage_size", mode="before")
    @classmethod
    def validate_storage_size(cls, value: int) -> int:
        """Reject integer sizes below 1 GB before field coercion.

        Values of exactly 1 are accepted by this check (``value < 1``), so
        the message says "at least 1GB" (the original text incorrectly said
        "greater than 1GB"). Zero/falsy values pass through here and are
        caught by the fields' ``gt=0`` constraint instead.
        """
        if value and isinstance(value, int) and value < 1:
            error_message = "Storage size must be at least 1GB."
            raise ValueError(error_message)
        return value
class OpenAILLMProvider(RestAPI):
    """OpenAI LLM provider configuration."""

    model_config = ConfigDict(
        protected_namespaces=(),
        json_schema_extra=SchemaExtraMetadata(
            title="OpenAI LLM Provider",
            description="OpenAI chat completion API configuration.",
            meta_type=SchemaMetaType.INLINE,
        ).as_json_schema_extra(),
    )
    host: str = Field(
        default="api.openai.com",
        json_schema_extra=SchemaExtraMetadata(
            title="Host",
            description="OpenAI API host",
        ).as_json_schema_extra(),
    )
    port: int = Field(
        default=443,
        json_schema_extra=SchemaExtraMetadata(
            title="Port",
            description="Set the port.",
        ).as_json_schema_extra(),
    )
    protocol: Literal["https"] = "https"
    timeout: int | None = Field(
        default=60,
        json_schema_extra=SchemaExtraMetadata(
            title="Timeout",
            description="Set the connection timeout in seconds.",
        ).as_json_schema_extra(),
    )
    base_path: str = "/v1"
    # Literal tag used by the inputs processor to select the "openai" binding.
    provider: Literal["openai"] = "openai"
    model: str = Field(
        default="gpt-4o-mini",
        json_schema_extra=SchemaExtraMetadata(
            title="Model",
            description="Chat completion model name.",
        ).as_json_schema_extra(),
    )
    api_key: str = Field(
        default="",
        json_schema_extra=SchemaExtraMetadata(
            title="API Key",
            description="OpenAI API key.",
        ).as_json_schema_extra(),
    )
class AnthropicLLMProvider(RestAPI):
    """Anthropic LLM provider configuration."""

    model_config = ConfigDict(
        protected_namespaces=(),
        json_schema_extra=SchemaExtraMetadata(
            title="Anthropic LLM Provider",
            description="Anthropic Claude API configuration.",
            meta_type=SchemaMetaType.INLINE,
        ).as_json_schema_extra(),
    )
    host: str = Field(
        default="api.anthropic.com",
        json_schema_extra=SchemaExtraMetadata(
            title="Host",
            description="Anthropic API host",
        ).as_json_schema_extra(),
    )
    port: int = Field(
        default=443,
        json_schema_extra=SchemaExtraMetadata(
            title="Port",
            description="Set the port.",
        ).as_json_schema_extra(),
    )
    protocol: Literal["https"] = "https"
    timeout: int | None = Field(
        default=60,
        json_schema_extra=SchemaExtraMetadata(
            title="Timeout",
            description="Set the connection timeout in seconds.",
        ).as_json_schema_extra(),
    )
    base_path: str = "/v1"
    # Literal tag used by the inputs processor to select the "anthropic" binding.
    provider: Literal["anthropic"] = "anthropic"
    model: str = Field(
        default="claude-3-5-sonnet-20241022",
        json_schema_extra=SchemaExtraMetadata(
            title="Model",
            description="Anthropic Claude model name.",
        ).as_json_schema_extra(),
    )
    api_key: str = Field(
        default="",
        json_schema_extra=SchemaExtraMetadata(
            title="API Key",
            description="Anthropic API key.",
        ).as_json_schema_extra(),
    )
class OllamaLLMProvider(RestAPI):
    """Ollama LLM provider configuration."""

    model_config = ConfigDict(
        protected_namespaces=(),
        json_schema_extra=SchemaExtraMetadata(
            title="Ollama LLM Provider",
            description="Configuration for a self-hosted Ollama server.",
            meta_type=SchemaMetaType.INLINE,
        ).as_json_schema_extra(),
    )
    # Host has no default: a self-hosted Ollama endpoint must be supplied.
    host: str = Field(
        json_schema_extra=SchemaExtraMetadata(
            title="Host",
            description="Ollama server host.",
        ).as_json_schema_extra(),
    )
    port: int = Field(
        default=11434,
        json_schema_extra=SchemaExtraMetadata(
            title="Port",
            description="Ollama server port.",
        ).as_json_schema_extra(),
    )
    protocol: Literal["http", "https"] = Field(
        default="http",
        json_schema_extra=SchemaExtraMetadata(
            title="Protocol",
            description="Ollama server protocol.",
        ).as_json_schema_extra(),
    )
    timeout: int | None = Field(
        default=300,
        json_schema_extra=SchemaExtraMetadata(
            title="Timeout",
            description="Configure connection timeout in seconds.",
        ).as_json_schema_extra(),
    )
    base_path: str = "/api"
    # Literal tag used by the inputs processor to select the "ollama" binding.
    # Note: no api_key field — the processor sends api_key=None for Ollama.
    provider: Literal["ollama"] = "ollama"
    model: str = Field(
        default="llama3.1:8b-instruct-q4_0",
        json_schema_extra=SchemaExtraMetadata(
            title="Model",
            description="Ollama model name.",
        ).as_json_schema_extra(),
    )
class GeminiLLMProvider(RestAPI):
    """Google Gemini LLM provider configuration."""

    model_config = ConfigDict(
        protected_namespaces=(),
        json_schema_extra=SchemaExtraMetadata(
            title="Gemini LLM Provider",
            description="Google Gemini API configuration.",
            meta_type=SchemaMetaType.INLINE,
        ).as_json_schema_extra(),
    )
    host: str = Field(
        default="generativelanguage.googleapis.com",
        json_schema_extra=SchemaExtraMetadata(
            title="Host",
            description="Google AI API host",
        ).as_json_schema_extra(),
    )
    port: int = Field(
        default=443,
        json_schema_extra=SchemaExtraMetadata(
            title="Port",
            description="Set the port.",
        ).as_json_schema_extra(),
    )
    protocol: Literal["https"] = "https"
    timeout: int | None = Field(
        default=60,
        json_schema_extra=SchemaExtraMetadata(
            title="Timeout",
            description="Configure connection timeout in seconds.",
        ).as_json_schema_extra(),
    )
    base_path: str = "/v1"
    # Literal tag used by the inputs processor to select the "gemini" binding.
    provider: Literal["gemini"] = "gemini"
    model: str = Field(
        default="gemini-1.5-flash",
        json_schema_extra=SchemaExtraMetadata(
            title="Model",
            description="Google Gemini model name.",
        ).as_json_schema_extra(),
    )
    api_key: str = Field(
        default="",
        json_schema_extra=SchemaExtraMetadata(
            title="API Key",
            description="Google AI API key.",
        ).as_json_schema_extra(),
    )
# Union of every supported chat-LLM configuration shape accepted by
# LightRAGAppInputs.llm_config.
LLMProvider = (
    OpenAICompatChatAPI
    | OpenAILLMProvider
    | AnthropicLLMProvider
    | OllamaLLMProvider
    | GeminiLLMProvider
)
class OpenAIEmbeddingProvider(RestAPI):
    """OpenAI embedding provider configuration."""

    model_config = ConfigDict(
        protected_namespaces=(),
        json_schema_extra=SchemaExtraMetadata(
            title="OpenAI Embedding Provider",
            description="OpenAI embeddings API configuration.",
            meta_type=SchemaMetaType.INLINE,
        ).as_json_schema_extra(),
    )
    host: str = Field(
        default="api.openai.com",
        json_schema_extra=SchemaExtraMetadata(
            title="Host",
            description="OpenAI API host",
        ).as_json_schema_extra(),
    )
    port: int = Field(
        default=443,
        json_schema_extra=SchemaExtraMetadata(
            title="Port",
            description="Set the port.",
        ).as_json_schema_extra(),
    )
    protocol: Literal["https"] = "https"
    timeout: int | None = Field(
        default=60,
        json_schema_extra=SchemaExtraMetadata(
            title="Timeout",
            description="Set the connection timeout in seconds.",
        ).as_json_schema_extra(),
    )
    base_path: str = "/v1"
    # Literal tag used by the inputs processor to select the "openai" binding.
    provider: Literal["openai"] = "openai"
    model: str = Field(
        default="text-embedding-ada-002",
        json_schema_extra=SchemaExtraMetadata(
            title="Model",
            description="Embedding model name.",
        ).as_json_schema_extra(),
    )
    api_key: str = Field(
        default="",
        json_schema_extra=SchemaExtraMetadata(
            title="API Key",
            description="OpenAI API key.",
        ).as_json_schema_extra(),
    )
class OllamaEmbeddingProvider(RestAPI):
    """Ollama embedding provider configuration."""

    model_config = ConfigDict(
        protected_namespaces=(),
        json_schema_extra=SchemaExtraMetadata(
            title="Ollama Embedding Provider",
            description="Ollama local embedding model configuration.",
            meta_type=SchemaMetaType.INLINE,
        ).as_json_schema_extra(),
    )
    # Host has no default: a self-hosted Ollama endpoint must be supplied.
    host: str = Field(
        json_schema_extra=SchemaExtraMetadata(
            title="Host",
            description="Ollama server host.",
        ).as_json_schema_extra(),
    )
    port: int = Field(
        default=11434,
        json_schema_extra=SchemaExtraMetadata(
            title="Port",
            description="Ollama server port.",
        ).as_json_schema_extra(),
    )
    protocol: Literal["http", "https"] = Field(
        default="http",
        json_schema_extra=SchemaExtraMetadata(
            title="Protocol",
            description="Ollama server protocol.",
        ).as_json_schema_extra(),
    )
    timeout: int | None = Field(
        default=300,
        json_schema_extra=SchemaExtraMetadata(
            title="Timeout",
            description="Configure connection timeout in seconds.",
        ).as_json_schema_extra(),
    )
    base_path: str = "/api"
    # Literal tag used by the inputs processor to select the "ollama" binding.
    provider: Literal["ollama"] = "ollama"
    model: str = Field(
        default="nomic-embed-text",
        json_schema_extra=SchemaExtraMetadata(
            title="Model",
            description="Ollama embedding model name.",
        ).as_json_schema_extra(),
    )
# Union of every supported embedding configuration shape accepted by
# LightRAGAppInputs.embedding_config.
EmbeddingProvider = (
    OpenAICompatEmbeddingsAPI | OpenAIEmbeddingProvider | OllamaEmbeddingProvider
)

# Aliases used by the app inputs model below.
LightRAGLLMConfig = LLMProvider
LightRAGEmbeddingConfig = EmbeddingProvider
class LightRAGAppInputs(AppInputs):
    """Top-level inputs for deploying LightRAG as an Apolo app."""

    # Compute preset and HTTP ingress are required platform inputs.
    preset: Preset
    ingress_http: IngressHttp
    # Crunchy Postgres (pgvector) credentials used for KV/vector/doc-status
    # storage by the inputs processor.
    pgvector_user: CrunchyPostgresUserCredentials
    llm_config: LightRAGLLMConfig = Field(
        default=OpenAICompatChatAPI(host="", port=443, protocol="https"),
        json_schema_extra=SchemaExtraMetadata(
            title="LLM Configuration",
            description="LLM provider configuration.",
        ).as_json_schema_extra(),
    )
    embedding_config: LightRAGEmbeddingConfig = Field(
        default=OpenAICompatEmbeddingsAPI(host="", port=443, protocol="https"),
        json_schema_extra=SchemaExtraMetadata(
            title="Embedding Configuration",
            description="Embedding provider configuration.",
        ).as_json_schema_extra(),
    )
    persistence: LightRAGPersistence = Field(
        default_factory=LightRAGPersistence,
        json_schema_extra=SchemaExtraMetadata(
            title="Persistence Configuration",
            description="Configure persistent storage for LightRAG data and inputs.",
        ).as_json_schema_extra(),
    )
class LightRAGAppOutputs(AppOutputs):
    """LightRAG outputs."""

    # Internal/external URLs of the LightRAG HTTP API, populated by the
    # outputs processor after deployment; None when not discoverable.
    server_url: ServiceAPI[HttpApi] | None = None
__all__ = [
"LightRAGAppInputs",
"LightRAGAppOutputs",
"LightRAGEmbeddingConfig",
"LightRAGLLMConfig",
"LightRAGPersistence",
"OpenAILLMProvider",
"AnthropicLLMProvider",
"OllamaLLMProvider",
"GeminiLLMProvider",
"OpenAIEmbeddingProvider",
"OllamaEmbeddingProvider",
]

0
.apolo/tests/__init__.py Normal file
View file

6
.apolo/tests/conftest.py Normal file
View file

@ -0,0 +1,6 @@
import pytest

# Load the shared Apolo app-types fixture plugins (mock platform clients
# and common constants) for all tests in this package.
pytest_plugins = [
    "apolo_app_types_fixtures.apolo_clients",
    "apolo_app_types_fixtures.constants",
]

View file

View file

@ -23,7 +23,7 @@ LLM_BINDING_API_KEY=sk-your-openai-api-key-here
# EMBEDDING_BINDING_HOST=https://api.openai.com/v1
# EMBEDDING_BINDING_API_KEY=sk-your-openai-api-key-here
EMBEDDING_BINDING=openai
EMBEDDING_MODEL=gemini-embedding-001
EMBEDDING_MODEL=gemini-embedding-001
EMBEDDING_DIM=3072
EMBEDDING_BINDING_HOST=https://generativelanguage.googleapis.com/v1beta/openai/
EMBEDDING_BINDING_API_KEY=AI-your-gemini-api-key-here

17
.github/actionlint-matcher.json vendored Normal file
View file

@ -0,0 +1,17 @@
{
"problemMatcher": [
{
"owner": "actionlint",
"pattern": [
{
"code": 5,
"column": 3,
"file": 1,
"line": 2,
"message": 4,
"regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$"
}
]
}
]
}

106
.github/workflows/ci.yaml vendored Normal file
View file

@ -0,0 +1,106 @@
name: CI
on:
push:
branches: [master]
tags: ["v*"]
pull_request:
branches: [master]
schedule:
- cron: 0 4 * * *
jobs:
test:
name: All checks are passed
uses: ./.github/workflows/test.yaml
secrets: inherit
approve:
name: Approve bot PR
runs-on: ubuntu-latest
if: endsWith(github.actor, '[bot]')
needs: test
permissions:
contents: write
pull-requests: write
steps:
- name: Checkout
uses: actions/checkout@v4
- name: metadata
id: metadata
if: github.actor == 'dependabot[bot]'
uses: dependabot/fetch-metadata@v2
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
- name: Enable auto-merge for bot PRs
run: gh pr merge --auto --squash --delete-branch "$PR_URL"
env:
PR_URL: ${{ github.event.pull_request.html_url }}
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
release-processor-image:
name: Release processor image
runs-on: ubuntu-latest
needs: test
if: github.event_name == 'push' && (startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/master')
steps:
- name: Checkout commit
uses: actions/checkout@v4
- name: Purge old artifacts
uses: kolpav/purge-artifacts-action@v1
with:
token: ${{ github.token }}
expire-in: 30mins
- name: Login to ghcr.io
uses: docker/login-action@v3.5.0
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ github.token }}
- name: Build development image
run: |
make build-hook-image
- name: Release development image
run: |
export IMAGE_TAG=development
make push-hook-image
- name: Release prod image
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
run: |
export IMAGE_TAG=${GITHUB_REF#refs/tags/v}
make push-hook-image
release-processor-image-pr:
name: Release processor image from PR
runs-on: ubuntu-latest
needs: test
if: github.event_name == 'pull_request'
steps:
- name: Checkout commit
uses: actions/checkout@v4
- name: Purge old artifacts
uses: kolpav/purge-artifacts-action@v1
with:
token: ${{ github.token }}
expire-in: 30mins
- name: Login to ghcr.io
uses: docker/login-action@v3.5.0
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ github.token }}
- name: Extract metadata (tags, labels) for Docker
id: meta
uses: docker/metadata-action@v5
with:
images: ghcr.io/neuro-inc/app-lightrag
tags: |
type=ref,event=pr
- name: Build Docker image
run: make build-hook-image
- name: Push release for testing
if: github.actor != 'dependabot[bot]'
run: |
FULL_IMAGE_NAME=${{ steps.meta.outputs.tags }}
export IMAGE_TAG=${FULL_IMAGE_NAME##*:}
make push-hook-image

View file

@ -24,7 +24,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install pre-commit
pip install pre-commit apolo-app-types
- name: Run pre-commit
run: pre-commit run --all-files --show-diff-on-failure

74
.github/workflows/test.yaml vendored Normal file
View file

@ -0,0 +1,74 @@
on:
  workflow_call: {}
jobs:
  lint:
    name: Lint
    runs-on: ubuntu-latest
    steps:
    - name: Checkout commit
      uses: actions/checkout@v4
    - name: Install python
      uses: actions/setup-python@v5
      with:
        python-version: 3.12
    - name: Cache pre-commit hooks
      uses: actions/cache@v4
      with:
        path: ~/.cache/pre-commit
        key: pre-commit|py3.12|${{ hashFiles('.pre-commit-config.yaml') }}
    - name: Install Poetry
      run: pip install poetry
    - name: Setup Python dependencies cache
      uses: actions/cache@v4
      with:
        path: ~/.cache/pypoetry
        key: poetry-${{ hashFiles('**/poetry.lock') }}
        restore-keys: |
          poetry-
    - name: Install dependencies
      run: make install
    - name: Add local venv to PATH
      run: echo "${{ github.workspace }}/.venv/bin" >> $GITHUB_PATH
    - name: Lint
      run: |
        echo "::add-matcher::.github/actionlint-matcher.json"
        make lint
  test-unit:
    name: Unit tests
    runs-on: ubuntu-latest
    steps:
    - name: Checkout commit
      uses: actions/checkout@v4
    - name: Install python
      uses: actions/setup-python@v5
      with:
        python-version-file: .python-version
        cache: pip
    - name: Install Poetry
      run: pip install poetry
    - name: Setup Python dependencies cache
      uses: actions/cache@v4
      with:
        path: ~/.cache/pypoetry
        key: poetry-${{ hashFiles('**/poetry.lock') }}
        restore-keys: |
          poetry-
    - name: Install dependencies
      run: make setup
    - name: Add local venv to PATH
      run: echo "${{ github.workspace }}/.venv/bin" >> $GITHUB_PATH
    - name: Run unit tests
      run: make test-unit
  check:
    name: Check
    # Fixed: test-unit was missing from needs, so unit-test failures
    # did not fail the aggregating check job.
    needs:
    - lint
    - test-unit
    runs-on: ubuntu-latest
    if: always()
    steps:
    - name: Decide whether the needed jobs succeeded or failed
      uses: re-actors/alls-green@release/v1
      with:
        jobs: ${{ toJSON(needs) }}

View file

@ -26,3 +26,23 @@ repos:
- id: check-manifest
stages: [manual]
exclude: ^lightrag/api/webui/
- repo: https://github.com/mrtazz/checkmake.git
rev: 0.2.2
hooks:
- id: checkmake
- repo: https://github.com/gruntwork-io/pre-commit
rev: v0.1.30
hooks:
- id: helmlint
- repo: local
hooks:
- id: generate-types-schemas
name: Generate types schemas
entry: make -f .apolo/scripts/gen_types_schemas.mk gen-types-schemas
language: system

1
.python-version Normal file
View file

@ -0,0 +1 @@
3.11

View file

@ -7,16 +7,7 @@ CHART_PACKAGE_DIR := dist/charts
HELM_REGISTRY := ghcr.io/neuro-inc/helm-charts
RAW_VERSION := $(if $(VERSION),$(VERSION),$(shell git describe --tags --always --dirty 2>/dev/null))
SANITIZED_VERSION := $(shell RAW="$(RAW_VERSION)" python - <<'PY'
import os, re
raw = os.environ.get("RAW", "").strip()
if not raw:
raw = "0.0.0"
raw = raw.lstrip("v")
sanitized = re.sub(r"[^0-9A-Za-z\\.\\-]", "-", raw)
print(sanitized or "0.0.0")
PY
)
SANITIZED_VERSION := $(shell python -c 'import re; raw = "$(RAW_VERSION)".strip(); raw = raw[1:] if raw.startswith("v") else raw; raw = raw or "0.0.0"; sanitized = re.sub(r"[^0-9A-Za-z.\-]", "-", raw); print(sanitized or "0.0.0")')
CHART_VERSION := $(SANITIZED_VERSION)
CHART_PACKAGE := $(CHART_PACKAGE_DIR)/$(CHART_NAME)-$(CHART_VERSION).tgz

View file

@ -8,7 +8,7 @@ Advanced script to load markdown documentation into LightRAG with flexible refer
# Default mode (file path references)
python load_docs.py /path/to/your/docs
# URL mode (website link references)
# URL mode (website link references)
python load_docs.py /path/to/docs --mode urls --base-url https://docs.example.com/
```
@ -28,7 +28,7 @@ python load_docs.py docs/ --mode files
- [KG] administration/setup.md
```
### URLs Mode
### URLs Mode
Uses website URLs in query response citations:
```bash
python load_docs.py docs/ --mode urls --base-url https://docs.apolo.us/index/
@ -37,7 +37,7 @@ python load_docs.py docs/ --mode urls --base-url https://my-docs.com/v1/
**Query Response Example:**
```
### References
### References
- [DC] https://docs.apolo.us/index/getting-started/installation
- [KG] https://docs.apolo.us/index/administration/setup
```
@ -68,7 +68,7 @@ docs/
```bash
# Visit your docs site and note the URL patterns:
# https://docs.example.com/getting-started/installation
# https://docs.example.com/api/authentication
# https://docs.example.com/api/authentication
# https://docs.example.com/guides/deployment
```
@ -80,7 +80,7 @@ mkdir -p docs/{getting-started,api,guides}
**Step 3: Organize Your Markdown Files**
```bash
# Match each URL to a file path:
docs/getting-started/installation.md # → /getting-started/installation
docs/getting-started/installation.md # → /getting-started/installation
docs/api/authentication.md # → /api/authentication
docs/guides/deployment.md # → /guides/deployment
docs/guides/README.md # → /guides (overview page)
@ -109,7 +109,7 @@ curl -I https://docs.example.com/api/authentication
apolo-docs/
├── getting-started/
│ ├── first-steps/
│ │ ├── getting-started.md → /index/getting-started/first-steps/getting-started
│ │ ├── getting-started.md → /index/getting-started/first-steps/getting-started
│ │ └── README.md → /index/getting-started/first-steps
│ ├── apolo-base-docker-image.md → /index/getting-started/apolo-base-docker-image
│ └── faq.md → /index/getting-started/faq
@ -135,14 +135,14 @@ python load_docs.py docs/ --endpoint https://lightrag.example.com
# Load to local instance, skip test query
python load_docs.py docs/ --no-test
# Files mode with custom endpoint
# Files mode with custom endpoint
python load_docs.py docs/ --mode files --endpoint http://localhost:9621
```
## Features
- **Dual Reference Modes**: File paths or live website URLs in citations
- **Flexible Base URL**: Works with any documentation site structure
- **Flexible Base URL**: Works with any documentation site structure
- **Simple dependency**: Only requires `httpx` and Python standard library
- **Automatic discovery**: Finds all `.md` files recursively
- **Smart metadata**: Adds appropriate title, path/URL, and source information
@ -171,4 +171,4 @@ This loader is perfect for:
pip install httpx
```
**Note**: This script is included with LightRAG deployments and provides a simple way to load any markdown documentation into your LightRAG instance.
**Note**: This script is included with LightRAG deployments and provides a simple way to load any markdown documentation into your LightRAG instance.

22
hooks.Dockerfile Normal file
View file

@ -0,0 +1,22 @@
FROM python:3.12-slim
LABEL org.opencontainers.image.source="https://github.com/neuro-inc/LightRAG"

# No .pyc files, unbuffered logs, and let Poetry install into the system
# site-packages (no virtualenv inside the container).
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    POETRY_NO_INTERACTION=1 \
    POETRY_VIRTUALENVS_CREATE=0

RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    && apt-get clean && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Fixed: with multiple COPY sources the destination must be a directory
# ending with "/"; a bare "." is rejected by the classic builder.
COPY README.md poetry.lock pyproject.toml ./

# Install third-party dependencies first so this layer is cached until the
# lockfile changes; the project sources are copied and installed afterwards.
RUN pip --no-cache-dir install poetry && poetry install --no-root --no-cache

COPY .apolo .apolo
RUN poetry install --only-root --no-cache

ENTRYPOINT ["app-types"]

View file

@ -14,8 +14,8 @@ from typing import Dict, List, Optional
async def load_document_to_lightrag(
content: str,
title: str,
content: str,
title: str,
doc_url: str,
endpoint: str = "http://localhost:9621",
headers: Optional[Dict[str, str]] = None
@ -34,7 +34,7 @@ async def load_document_to_lightrag(
"file_source": doc_url
}
)
if response.status_code == 200:
print(f"✅ Loaded: {title}")
return True
@ -47,7 +47,7 @@ async def load_document_to_lightrag(
except:
print(f" Response: {response.text}")
return False
except Exception as e:
print(f"❌ Error loading {title}: {e}")
return False
@ -58,27 +58,27 @@ def convert_file_path_to_url(relative_path: str, base_url: str) -> str:
# Ensure base URL ends with /
if not base_url.endswith('/'):
base_url += '/'
# Handle special cases
if relative_path in ["README.md", "SUMMARY.md"]:
return base_url.rstrip('/')
# Remove .md extension and convert path
url_path = relative_path.replace(".md", "")
# Handle README files in subdirectories - they map to the directory URL
if url_path.endswith("/README"):
url_path = url_path[:-7] # Remove "/README"
# Clean up any double slashes
url_path = url_path.strip("/")
return f"{base_url}{url_path}"
def load_markdown_files(docs_path: Path, mode: str = "files", base_url: str = None) -> List[tuple]:
"""Load all markdown files from directory structure
Args:
docs_path: Path to documentation directory
mode: 'files' for file paths, 'urls' for URL references
@ -86,42 +86,42 @@ def load_markdown_files(docs_path: Path, mode: str = "files", base_url: str = No
"""
if not docs_path.exists():
raise FileNotFoundError(f"Documentation directory not found: {docs_path}")
if mode == "urls" and not base_url:
raise ValueError("base_url is required when mode is 'urls'")
# Find all markdown files, excluding SUMMARY.md as it's just the table of contents
md_files = [f for f in docs_path.rglob("*.md") if f.name != "SUMMARY.md"]
print(f"📚 Found {len(md_files)} markdown files")
print(f"🔧 Mode: {mode}")
if mode == "urls":
print(f"🌐 Base URL: {base_url}")
documents = []
for file_path in md_files:
try:
# Load content
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read().strip()
if not content:
continue
# Generate title from filename
title = file_path.stem.replace("-", " ").replace("_", " ").title()
if title.lower() == "readme":
# Use parent directory name for README files
title = f"{file_path.parent.name.replace('-', ' ').replace('_', ' ').title()} Overview"
# Get relative path for metadata
relative_path = str(file_path.relative_to(docs_path))
if mode == "files":
# Use file path as reference
reference = relative_path
source_info = f"File: {file_path.name}"
# Prepare content with file metadata
content_with_metadata = f"""
Title: {title}
@ -134,7 +134,7 @@ Source: {source_info}
# Convert file path to documentation URL
reference = convert_file_path_to_url(relative_path, base_url)
source_info = f"Documentation Site"
# Prepare content with URL metadata
content_with_metadata = f"""
Title: {title}
@ -143,13 +143,13 @@ Source: {source_info}
{content}
"""
documents.append((content_with_metadata, title, reference))
except Exception as e:
print(f"⚠️ Error processing {file_path}: {e}")
continue
return documents
@ -189,7 +189,7 @@ async def test_query(
headers=request_headers,
json={"query": "What is this documentation about?", "mode": "local"}
)
if response.status_code == 200:
result = response.json()
print(f"✅ Query successful!")
@ -202,7 +202,7 @@ async def test_query(
print(f" Error details: {error_detail}")
except:
print(f" Response: {response.text}")
except Exception as e:
print(f"❌ Query error: {e}")
@ -216,22 +216,22 @@ async def main():
Examples:
# Load with file path references (default mode)
python load_docs.py ../apolo-copilot/docs/official-apolo-documentation/docs
# Load with URL references
python load_docs.py docs/ --mode urls --base-url https://docs.apolo.us/index/
# Load Apolo docs with URL references (common use case)
python load_docs.py ../apolo-copilot/docs/official-apolo-documentation/docs \\
--mode urls --base-url https://docs.apolo.us/index/
# Use custom endpoint
python load_docs.py docs/ --endpoint https://lightrag.example.com
# Load with different documentation base URL
python load_docs.py docs/ --mode urls --base-url https://my-docs.example.com/docs/
"""
)
parser.add_argument(
"docs_path",
nargs="?",
@ -259,7 +259,7 @@ Examples:
action="store_true",
help="Skip test query after loading"
)
args = parser.parse_args()
api_key = os.getenv("LIGHTRAG_API_KEY")
if api_key:
@ -267,7 +267,7 @@ Examples:
else:
auth_headers = None
print(" LIGHTRAG_API_KEY not set, continuing without authentication.")
print("🚀 Loading Documentation into LightRAG")
print("=" * 60)
print(f"📁 Documentation path: {args.docs_path}")
@ -280,12 +280,12 @@ Examples:
sys.exit(1)
print(f"🌐 LightRAG endpoint: {args.endpoint}")
print()
# Test LightRAG connectivity
if not await test_lightrag_health(args.endpoint, headers=auth_headers):
print("❌ Cannot connect to LightRAG. Please ensure it's running and accessible.")
sys.exit(1)
# Load documents
docs_path = Path(args.docs_path).resolve()
try:
@ -293,24 +293,24 @@ Examples:
except (FileNotFoundError, ValueError) as e:
print(f"{e}")
sys.exit(1)
if not documents:
print("❌ No markdown files found to load")
sys.exit(1)
# Calculate statistics
total_content = sum(len(content) for content, _, _ in documents)
avg_content = total_content // len(documents) if documents else 0
print(f"📊 Total content: {total_content:,} characters")
print(f"📊 Average length: {avg_content:,} characters")
# Load documents
successful = 0
failed = 0
print(f"\n🔄 Starting to load documents...")
for i, (content, title, doc_url) in enumerate(documents):
success = await load_document_to_lightrag(
content,
@ -319,23 +319,23 @@ Examples:
args.endpoint,
headers=auth_headers
)
if success:
successful += 1
else:
failed += 1
# Progress update
if (i + 1) % 10 == 0:
print(f"📈 Progress: {i + 1}/{len(documents)} ({successful} success, {failed} failed)")
# Small delay to avoid overwhelming the service
await asyncio.sleep(0.3)
print(f"\n✅ Loading complete!")
print(f"📊 Successful: {successful}")
print(f"📊 Failed: {failed}")
# Test query unless disabled
if not args.no_test and successful > 0:
await test_query(args.endpoint, headers=auth_headers)

2377
poetry.lock generated Normal file

File diff suppressed because it is too large Load diff

111
pyproject.toml Normal file
View file

@ -0,0 +1,111 @@
[project]
name = "apolo-apps-lightrag"
# Fixed: "version" was both set statically and listed in "dynamic",
# which PEP 621 forbids; keep the static value.
version = "0.0.1"
description = "Apolo LightRAG application"
authors = [
    {name = "Apolo", email = "dev@apolo.us"}
]
readme = "README.md"
requires-python = ">=3.11.0,<4.0"

[tool.poetry]
name = "apolo-apps-lightrag"
authors = ["Apolo.us"]
packages = [
    { include = "apolo_apps_lightrag", from = ".apolo/src" },
]

[tool.poetry.dependencies]
apolo-sdk = "^25.7.2"
pydantic = "^2.9.2"
pyyaml = "^6.0.2"
yarl = "^1.18.3"
apolo-app-types = "^25.9.0"

[tool.poetry.group.dev]
optional = true

[tool.poetry.group.dev.dependencies]
pre-commit = "^4.2.0"
types-PyYAML = "^6.0.12.20241230"
pytest = "^8.3.4"
pytest-asyncio = "^0.25.3"
pytest-cov = "^6.2.1"
mypy = "^1.17.1"

[tool.ruff]
target-version = "py311"
lint.select = [
    "E", "F", "I", "C90", "UP", "B", "ASYNC", "N", "FBT", "A", "C4", "EM", "FA", "ICN",
    "G", "PIE", "T20", "PYI", "PT", "RET", "PTH"
]
lint.ignore = [
    "A003",
    "N818"
]

[tool.ruff.lint.isort.sections]
ApoloSDK = ["apolo-sdk"]

[tool.ruff.lint.isort]
combine-as-imports = true
lines-after-imports = 2
section-order = ["future", "standard-library", "third-party", "ApoloSDK", "first-party", "local-folder"]
known-first-party = ["apolo_app_types"]
known-local-folder = ["tests"]

[tool.mypy]
check_untyped_defs = true
disallow_any_generics = true
disallow_untyped_defs = true
follow_imports = "silent"
strict_optional = true
warn_redundant_casts = true
warn_unused_ignores = true
warn_unused_configs = true
plugins = ['pydantic.mypy']
exclude = [
    "tests/"
]

[[tool.mypy.overrides]]
module = "pytest"
ignore_missing_imports = true

[tool.flake8]
extend-exclude = [
    ".git",
    ".env",
    "__pycache__",
    ".eggs",
]
max-line-length = 88
extend-ignore = [
    "N801",
    "N802",
    "N803",
    "E252",
    "W503",
    "E133",
    "E203",
    "F541",
]

[tool.coverage.report]
fail_under = 0
skip_empty = true
sort = "-cover"
omit = [
    # Fixed: "./apolo/..." did not match the actual ".apolo/..." layout
    # used by the package include above.
    ".apolo/tests/*",
]

[tool.pytest.ini_options]
asyncio_mode = "auto"
asyncio_default_fixture_loop_scope = "session"
log_cli = false
log_level = "INFO"
junit_family = "xunit2"
testpaths = [
    # Fixed: same ".apolo" typo as in coverage omit above.
    ".apolo/tests/",
]