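"""
LightRAG + TigerGraph ingestion demo.

Ingests a few hard-coded sample documents (and, optionally, texts from a JSON
file of {"text": ...} objects) into a LightRAG instance backed by
TigerGraphStorage, prints basic graph statistics, and runs a sample query.

Assumptions not covered by this script: the OpenAI-based LLM and embedding
functions need OpenAI credentials available in the environment, and the
TigerGraph backend needs its connection settings configured as described in
the LightRAG documentation.
"""
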
import os
import asyncio
import argparse
import logging
import logging.config
import json
from pathlib import Path

from lightrag import LightRAG
from lightrag.llm.openai import gpt_4o_mini_complete, openai_embed
from lightrag.kg.shared_storage import initialize_pipeline_status
from lightrag.utils import logger, set_verbose_debug

WORKING_DIR = "./dickens"

if not os.path.exists(WORKING_DIR):
    os.mkdir(WORKING_DIR)


def configure_logging():
    """Configure logging for the application"""

    # Reset any existing handlers to ensure clean configuration
    for logger_name in ["uvicorn", "uvicorn.access", "uvicorn.error", "lightrag"]:
        logger_instance = logging.getLogger(logger_name)
        logger_instance.handlers = []
        logger_instance.filters = []

    # Get log directory path from environment variable or use current directory
    log_dir = os.getenv("LOG_DIR", os.getcwd())
    log_file_path = os.path.abspath(
        os.path.join(log_dir, "lightrag_tigergraph_demo.log")
    )

    print(f"\nLightRAG TigerGraph demo log file: {log_file_path}\n")
    os.makedirs(os.path.dirname(log_file_path), exist_ok=True)

    # Get log file max size and backup count from environment variables
    log_max_bytes = int(os.getenv("LOG_MAX_BYTES", 10485760))  # Default 10MB
    log_backup_count = int(os.getenv("LOG_BACKUP_COUNT", 5))  # Default 5 backups

    logging.config.dictConfig(
        {
            "version": 1,
            "disable_existing_loggers": False,
            "formatters": {
                "default": {
                    "format": "%(levelname)s: %(message)s",
                },
                "detailed": {
                    "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
                },
            },
            "handlers": {
                "console": {
                    "formatter": "default",
                    "class": "logging.StreamHandler",
                    "stream": "ext://sys.stderr",
                },
                "file": {
                    "formatter": "detailed",
                    "class": "logging.handlers.RotatingFileHandler",
                    "filename": log_file_path,
                    "maxBytes": log_max_bytes,
                    "backupCount": log_backup_count,
                    "encoding": "utf-8",
                },
            },
            "loggers": {
                "lightrag": {
                    "handlers": ["console", "file"],
                    "level": "INFO",
                    "propagate": False,
                },
            },
        }
    )

    # Set the logger level to INFO
    logger.setLevel(logging.INFO)
    # Enable verbose debug if needed
    set_verbose_debug(os.getenv("VERBOSE_DEBUG", "false").lower() == "true")


def load_json_texts(json_path: str | Path) -> list[str]:
    """
    Load texts from a plain JSON file.

    Expects JSON array format: [{"text": "..."}, {"text": "..."}]

    Args:
        json_path: Path to JSON file

    Returns:
        List of text strings extracted from "text" field
    """
    json_path = Path(json_path)

    if not json_path.exists():
        raise FileNotFoundError(f"JSON file not found: {json_path}")

    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if not isinstance(data, list):
        raise ValueError(f"Expected JSON array, got {type(data).__name__}")

    texts = []
    for item in data:
        if isinstance(item, dict) and "text" in item:
            texts.append(item["text"])
        else:
            raise ValueError(
                f"Expected object with 'text' field, got {type(item).__name__}"
            )

    return texts
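
# Illustrative example of the input load_json_texts() accepts, e.g. the
# contents of a test_data.json file (texts below are made up):
#
#     [
#         {"text": "First document to ingest."},
#         {"text": "Second document to ingest."}
#     ]
#
# Any element that is not an object with a "text" field raises ValueError.
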
async def initialize_rag():
    """Initialize LightRAG with TigerGraph implementation."""
    rag = LightRAG(
        working_dir=WORKING_DIR,
        llm_model_func=gpt_4o_mini_complete,  # Use gpt_4o_mini_complete LLM model
        embedding_func=openai_embed,  # Use OpenAI embedding function
        graph_storage="TigerGraphStorage",
    )

    # Initialize database connections
    await rag.initialize_storages()
    # Initialize pipeline status for document processing
    await initialize_pipeline_status()

    return rag


async def test_ingestion(json_file=None):
    """Test document ingestion into TigerGraph"""
    print("=" * 60)
    print("Initializing LightRAG with TigerGraph...")
    print("=" * 60)

    rag = await initialize_rag()
    print(f"✓ LightRAG initialized: {type(rag)}")

    # Test documents for ingestion
    test_documents = [
        "TigerGraph is a graph database platform designed for enterprise-scale graph analytics. It supports distributed graph processing and real-time queries.",
        "LightRAG is a framework that combines retrieval-augmented generation with knowledge graphs. It uses graph storage backends like TigerGraph, Neo4j, and Memgraph.",
        "Graph databases store data as nodes and edges, making them ideal for relationship-heavy data. They excel at traversing complex connections between entities.",
    ]

    print("\n" + "=" * 60)
    print("Ingesting test documents...")
    print("=" * 60)

    # Insert documents
    for i, doc in enumerate(test_documents, 1):
        print(f"\n[{i}/{len(test_documents)}] Inserting document...")
        track_id = await rag.ainsert(input=doc, file_paths=f"test_doc_{i}.txt")
        print(f" ✓ Document inserted with track_id: {track_id}")

    # Test JSON ingestion if a JSON file is provided, or if the default
    # test_data.json exists (matches the --json-file help text)
    json_test_file = Path(json_file) if json_file else Path("test_data.json")
    if json_test_file.exists():
        print("\n" + "=" * 60)
        print("Ingesting JSON file...")
        print("=" * 60)

        try:
            texts = load_json_texts(json_test_file)
            print(f"✓ Loaded {len(texts)} texts from {json_test_file}")

            for i, text in enumerate(texts, 1):
                print(f"\n[{i}/{len(texts)}] Inserting from JSON...")
                track_id = await rag.ainsert(
                    input=text, file_paths=str(json_test_file)
                )
                print(f" ✓ Text inserted with track_id: {track_id}")
        except Exception as e:
            print(f"✗ Error loading JSON file: {e}")
            import traceback

            traceback.print_exc()
    else:
        print(
            f"\nℹ No JSON file found at {json_test_file} (skipping JSON ingestion test)"
        )
        print(" Create a test_data.json file with format:")
        print(' [{"text": "Your text here"}, {"text": "Another text"}]')
        print(" Or use --json-file parameter to specify a JSON file")

print("\n" + "=" * 60)
|
||
print("Verifying ingestion...")
|
||
print("=" * 60)
|
||
|
||
# Verify by checking graph stats
|
||
try:
|
||
# Get all labels (entity IDs) from the graph
|
||
all_labels = await rag.chunk_entity_relation_graph.get_all_labels()
|
||
print(f"\n✓ Found {len(all_labels)} entities in the graph")
|
||
if all_labels:
|
||
print(f" Sample entities: {all_labels[:5]}")
|
||
|
||
# Get all nodes
|
||
all_nodes = await rag.chunk_entity_relation_graph.get_all_nodes()
|
||
print(f"✓ Found {len(all_nodes)} nodes in the graph")
|
||
|
||
# Get all edges
|
||
all_edges = await rag.chunk_entity_relation_graph.get_all_edges()
|
||
print(f"✓ Found {len(all_edges)} edges in the graph")
|
||
|
||
# Test a simple query
|
||
print("\n" + "=" * 60)
|
||
print("Testing query...")
|
||
print("=" * 60)
|
||
response = await rag.aquery("What is TigerGraph?")
|
||
print("\nQuery: 'What is TigerGraph?'")
|
||
print(f"Response: {response}")
|
||
|
||
except Exception as e:
|
||
print(f"\n✗ Error during verification: {e}")
|
||
import traceback
|
||
|
||
traceback.print_exc()
|
||
|
||
print("\n" + "=" * 60)
|
||
print("Ingestion test completed!")
|
||
print("=" * 60)
|
||
|
||
|
||
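# Note: this demo leaves the storage connections open when it finishes. If the
# LightRAG version in use exposes rag.finalize_storages(), awaiting it after
# ingestion (e.g. in a finally block inside test_ingestion) gives a cleaner
# shutdown.
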
if __name__ == "__main__":
|
||
parser = argparse.ArgumentParser(
|
||
description="LightRAG TigerGraph demo",
|
||
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
||
)
|
||
parser.add_argument(
|
||
"--json-file",
|
||
type=str,
|
||
default=None,
|
||
help='Path to JSON file with texts to ingest (format: [{"text": "..."}, ...]). Defaults to test_data.json if not specified.',
|
||
)
|
||
args = parser.parse_args()
|
||
|
||
# Configure logging before running the main function
|
||
configure_logging()
|
||
asyncio.run(test_ingestion(json_file=args.json_file))
|
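
# Example invocations (assuming this file is saved as lightrag_tigergraph_demo.py
# and the OpenAI / TigerGraph settings described at the top are configured):
#
#     python lightrag_tigergraph_demo.py
#     python lightrag_tigergraph_demo.py --json-file ./my_texts.json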