* Partial implementation of phase-0 * Partial implementation of phase-1 * add report * add postgress * Revert "add postgress" This reverts commit 27778dc6bb3906b5220dd386e47fe32ca7415332. * remove junk * Cleaned up annd setup docs * update docs * moved report * Updated load_markdown_files function: Now returns tuples with (content, title, relative_path) instead of just (content, title) * fixes to load docs script and more env variables for llm configuration * update prod values * update docs * apolo docs support with linking * update docs to reflect url conventions and mapping with docs * Adds ingress and forwardAuth configurations Adds ingress configuration to expose the application. Adds forwardAuth configuration to enable user authentication. Includes middleware to strip headers. * Adds ingress and forward authentication middleware support
319 lines
No EOL
11 KiB
Python
Executable file
319 lines
No EOL
11 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Simplified script to load documentation into LightRAG
|
|
Loads all markdown files from a directory structure
|
|
"""
|
|
|
|
import asyncio
|
|
import httpx
|
|
import argparse
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import List, Optional
|
|
|
|
|
|
async def load_document_to_lightrag(
|
|
content: str,
|
|
title: str,
|
|
doc_url: str,
|
|
endpoint: str = "http://localhost:9621"
|
|
) -> bool:
|
|
"""Load a single document to LightRAG with URL reference"""
|
|
try:
|
|
async with httpx.AsyncClient(timeout=30.0) as client:
|
|
response = await client.post(
|
|
f"{endpoint}/documents/text",
|
|
headers={"Content-Type": "application/json"},
|
|
json={
|
|
"text": content,
|
|
"file_source": doc_url
|
|
}
|
|
)
|
|
|
|
if response.status_code == 200:
|
|
print(f"✅ Loaded: {title}")
|
|
return True
|
|
else:
|
|
print(f"❌ Failed to load {title}: {response.status_code}")
|
|
if response.status_code == 500:
|
|
try:
|
|
error_detail = response.json()
|
|
print(f" Error details: {error_detail}")
|
|
except:
|
|
print(f" Response: {response.text}")
|
|
return False
|
|
|
|
except Exception as e:
|
|
print(f"❌ Error loading {title}: {e}")
|
|
return False
|
|
|
|
|
|
def convert_file_path_to_url(relative_path: str, base_url: str) -> str:
|
|
"""Convert file path to documentation URL"""
|
|
# Ensure base URL ends with /
|
|
if not base_url.endswith('/'):
|
|
base_url += '/'
|
|
|
|
# Handle special cases
|
|
if relative_path in ["README.md", "SUMMARY.md"]:
|
|
return base_url.rstrip('/')
|
|
|
|
# Remove .md extension and convert path
|
|
url_path = relative_path.replace(".md", "")
|
|
|
|
# Handle README files in subdirectories - they map to the directory URL
|
|
if url_path.endswith("/README"):
|
|
url_path = url_path[:-7] # Remove "/README"
|
|
|
|
# Clean up any double slashes
|
|
url_path = url_path.strip("/")
|
|
|
|
return f"{base_url}{url_path}"
|
|
|
|
|
|
def load_markdown_files(docs_path: Path, mode: str = "files", base_url: str = None) -> List[tuple]:
|
|
"""Load all markdown files from directory structure
|
|
|
|
Args:
|
|
docs_path: Path to documentation directory
|
|
mode: 'files' for file paths, 'urls' for URL references
|
|
base_url: Base URL for documentation site (required for 'urls' mode)
|
|
"""
|
|
if not docs_path.exists():
|
|
raise FileNotFoundError(f"Documentation directory not found: {docs_path}")
|
|
|
|
if mode == "urls" and not base_url:
|
|
raise ValueError("base_url is required when mode is 'urls'")
|
|
|
|
# Find all markdown files, excluding SUMMARY.md as it's just the table of contents
|
|
md_files = [f for f in docs_path.rglob("*.md") if f.name != "SUMMARY.md"]
|
|
print(f"📚 Found {len(md_files)} markdown files")
|
|
print(f"🔧 Mode: {mode}")
|
|
if mode == "urls":
|
|
print(f"🌐 Base URL: {base_url}")
|
|
|
|
documents = []
|
|
|
|
for file_path in md_files:
|
|
try:
|
|
# Load content
|
|
with open(file_path, 'r', encoding='utf-8') as f:
|
|
content = f.read().strip()
|
|
|
|
if not content:
|
|
continue
|
|
|
|
# Generate title from filename
|
|
title = file_path.stem.replace("-", " ").replace("_", " ").title()
|
|
if title.lower() == "readme":
|
|
# Use parent directory name for README files
|
|
title = f"{file_path.parent.name.replace('-', ' ').replace('_', ' ').title()} Overview"
|
|
|
|
# Get relative path for metadata
|
|
relative_path = str(file_path.relative_to(docs_path))
|
|
|
|
if mode == "files":
|
|
# Use file path as reference
|
|
reference = relative_path
|
|
source_info = f"File: {file_path.name}"
|
|
|
|
# Prepare content with file metadata
|
|
content_with_metadata = f"""
|
|
Title: {title}
|
|
Path: {relative_path}
|
|
Source: {source_info}
|
|
|
|
{content}
|
|
"""
|
|
else: # urls mode
|
|
# Convert file path to documentation URL
|
|
reference = convert_file_path_to_url(relative_path, base_url)
|
|
source_info = f"Documentation Site"
|
|
|
|
# Prepare content with URL metadata
|
|
content_with_metadata = f"""
|
|
Title: {title}
|
|
URL: {reference}
|
|
Source: {source_info}
|
|
|
|
{content}
|
|
"""
|
|
|
|
documents.append((content_with_metadata, title, reference))
|
|
|
|
except Exception as e:
|
|
print(f"⚠️ Error processing {file_path}: {e}")
|
|
continue
|
|
|
|
return documents
|
|
|
|
|
|
async def test_lightrag_health(endpoint: str = "http://localhost:9621") -> bool:
|
|
"""Test if LightRAG is accessible"""
|
|
try:
|
|
async with httpx.AsyncClient(timeout=10.0) as client:
|
|
response = await client.get(f"{endpoint}/health")
|
|
if response.status_code == 200:
|
|
health_data = response.json()
|
|
print(f"✅ LightRAG is healthy: {health_data.get('status')}")
|
|
return True
|
|
else:
|
|
print(f"❌ LightRAG health check failed: {response.status_code}")
|
|
return False
|
|
except Exception as e:
|
|
print(f"❌ Cannot connect to LightRAG: {e}")
|
|
return False
|
|
|
|
|
|
async def test_query(endpoint: str = "http://localhost:9621") -> None:
|
|
"""Test a sample query"""
|
|
print(f"\n🧪 Testing query...")
|
|
try:
|
|
async with httpx.AsyncClient(timeout=30.0) as client:
|
|
response = await client.post(
|
|
f"{endpoint}/query",
|
|
headers={"Content-Type": "application/json"},
|
|
json={"query": "What is this documentation about?", "mode": "local"}
|
|
)
|
|
|
|
if response.status_code == 200:
|
|
result = response.json()
|
|
print(f"✅ Query successful!")
|
|
print(f"Response: {result['response'][:200]}...")
|
|
else:
|
|
print(f"❌ Query failed: {response.status_code}")
|
|
if response.status_code == 500:
|
|
try:
|
|
error_detail = response.json()
|
|
print(f" Error details: {error_detail}")
|
|
except:
|
|
print(f" Response: {response.text}")
|
|
|
|
except Exception as e:
|
|
print(f"❌ Query error: {e}")
|
|
|
|
|
|
async def main():
|
|
"""Main loading function"""
|
|
parser = argparse.ArgumentParser(
|
|
description="Load documentation into LightRAG with file paths or URL references",
|
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
epilog="""
|
|
Examples:
|
|
# Load with file path references (default mode)
|
|
python load_docs.py ../apolo-copilot/docs/official-apolo-documentation/docs
|
|
|
|
# Load with URL references
|
|
python load_docs.py docs/ --mode urls --base-url https://docs.apolo.us/index/
|
|
|
|
# Load Apolo docs with URL references (common use case)
|
|
python load_docs.py ../apolo-copilot/docs/official-apolo-documentation/docs \\
|
|
--mode urls --base-url https://docs.apolo.us/index/
|
|
|
|
# Use custom endpoint
|
|
python load_docs.py docs/ --endpoint https://lightrag.example.com
|
|
|
|
# Load with different documentation base URL
|
|
python load_docs.py docs/ --mode urls --base-url https://my-docs.example.com/docs/
|
|
"""
|
|
)
|
|
|
|
parser.add_argument(
|
|
"docs_path",
|
|
nargs="?",
|
|
default="../apolo-copilot/docs/official-apolo-documentation/docs",
|
|
help="Path to documentation directory (default: ../apolo-copilot/docs/official-apolo-documentation/docs)"
|
|
)
|
|
parser.add_argument(
|
|
"--mode",
|
|
choices=["files", "urls"],
|
|
default="files",
|
|
help="Reference mode: 'files' for file paths, 'urls' for URL references (default: files)"
|
|
)
|
|
parser.add_argument(
|
|
"--base-url",
|
|
dest="base_url",
|
|
help="Base URL for documentation site (required when mode=urls). Example: https://docs.apolo.us/index/"
|
|
)
|
|
parser.add_argument(
|
|
"--endpoint",
|
|
default="http://localhost:9621",
|
|
help="LightRAG endpoint URL (default: http://localhost:9621)"
|
|
)
|
|
parser.add_argument(
|
|
"--no-test",
|
|
action="store_true",
|
|
help="Skip test query after loading"
|
|
)
|
|
|
|
args = parser.parse_args()
|
|
|
|
print("🚀 Loading Documentation into LightRAG")
|
|
print("=" * 60)
|
|
print(f"📁 Documentation path: {args.docs_path}")
|
|
print(f"🔧 Reference mode: {args.mode}")
|
|
if args.mode == "urls":
|
|
if args.base_url:
|
|
print(f"🌐 Base URL: {args.base_url}")
|
|
else:
|
|
print("❌ Error: --base-url is required when mode is 'urls'")
|
|
sys.exit(1)
|
|
print(f"🌐 LightRAG endpoint: {args.endpoint}")
|
|
print()
|
|
|
|
# Test LightRAG connectivity
|
|
if not await test_lightrag_health(args.endpoint):
|
|
print("❌ Cannot connect to LightRAG. Please ensure it's running and accessible.")
|
|
sys.exit(1)
|
|
|
|
# Load documents
|
|
docs_path = Path(args.docs_path).resolve()
|
|
try:
|
|
documents = load_markdown_files(docs_path, args.mode, args.base_url)
|
|
except (FileNotFoundError, ValueError) as e:
|
|
print(f"❌ {e}")
|
|
sys.exit(1)
|
|
|
|
if not documents:
|
|
print("❌ No markdown files found to load")
|
|
sys.exit(1)
|
|
|
|
# Calculate statistics
|
|
total_content = sum(len(content) for content, _, _ in documents)
|
|
avg_content = total_content // len(documents) if documents else 0
|
|
|
|
print(f"📊 Total content: {total_content:,} characters")
|
|
print(f"📊 Average length: {avg_content:,} characters")
|
|
|
|
# Load documents
|
|
successful = 0
|
|
failed = 0
|
|
|
|
print(f"\n🔄 Starting to load documents...")
|
|
|
|
for i, (content, title, doc_url) in enumerate(documents):
|
|
success = await load_document_to_lightrag(content, title, doc_url, args.endpoint)
|
|
|
|
if success:
|
|
successful += 1
|
|
else:
|
|
failed += 1
|
|
|
|
# Progress update
|
|
if (i + 1) % 10 == 0:
|
|
print(f"📈 Progress: {i + 1}/{len(documents)} ({successful} success, {failed} failed)")
|
|
|
|
# Small delay to avoid overwhelming the service
|
|
await asyncio.sleep(0.3)
|
|
|
|
print(f"\n✅ Loading complete!")
|
|
print(f"📊 Successful: {successful}")
|
|
print(f"📊 Failed: {failed}")
|
|
|
|
# Test query unless disabled
|
|
if not args.no_test and successful > 0:
|
|
await test_query(args.endpoint)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(main()) |