feat(examples, lightrag): fix logging and improve code

Fix logging output in the evaluation test harness and examples:
- Replace print() statements with logger calls in e2e_test_harness.py
- Update copy_llm_cache_to_another_storage.py to use logger instead of print
- Remove redundant logging configuration in copy_llm_cache_to_another_storage.py

Fix path handling and typos:
- Correct makedirs() call in lightrag_openai_demo.py to create log_dir directly
- Update constants.py comments to clarify SOURCE_IDS_LIMIT_METHOD options
- Remove duplicate return statement in utils.py normalize_extracted_info()
- Fix error string formatting in chroma_impl.py with !s conversion
- Remove unused pipmaster import from chroma_impl.py
clssck 2025-12-05 18:10:19 +01:00
parent dd1413f3eb
commit 65d2cd16b1
6 changed files with 50 additions and 77 deletions

lightrag_openai_demo.py

@@ -24,7 +24,7 @@ def configure_logging():
log_file_path = os.path.abspath(os.path.join(log_dir, 'lightrag_demo.log'))
print(f'\nLightRAG demo log file: {log_file_path}\n')
-os.makedirs(os.path.dirname(log_dir), exist_ok=True)
+os.makedirs(log_dir, exist_ok=True)
# Get log file max size and backup count from environment variables
log_max_bytes = int(os.getenv('LOG_MAX_BYTES', 10485760)) # Default 10MB
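
Note on the fix: os.path.dirname() strips the last path component, so the old call only created the parent of log_dir, and the directory the log file lives in could still be missing. A minimal sketch of the difference (the path is illustrative):

import os

log_dir = '/tmp/lightrag_demo/logs'  # hypothetical path for illustration

# Old call: dirname() drops the last component, so only /tmp/lightrag_demo is created.
os.makedirs(os.path.dirname(log_dir), exist_ok=True)
print(os.path.isdir(log_dir))  # False on a clean run: logs/ itself was never created

# Fixed call: create the log directory itself.
os.makedirs(log_dir, exist_ok=True)
print(os.path.isdir(log_dir))  # True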

copy_llm_cache_to_another_storage.py

@@ -5,7 +5,6 @@ This handy script helps you to copy the LLM caches from one storage solution to
"""
import asyncio
-import logging
import os
from dotenv import load_dotenv
@@ -13,13 +12,12 @@ from dotenv import load_dotenv
from lightrag.kg.json_kv_impl import JsonKVStorage
from lightrag.kg.postgres_impl import PGKVStorage, PostgreSQLDB
from lightrag.namespace import NameSpace
+from lightrag.utils import logger
load_dotenv()
ROOT_DIR = os.environ.get('ROOT_DIR')
WORKING_DIR = f'{ROOT_DIR}/dickens'
-logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
if not os.path.exists(WORKING_DIR):
os.mkdir(WORKING_DIR)
@@ -66,13 +64,13 @@ async def copy_from_postgres_to_json():
if mode not in kv:
kv[mode] = {}
kv[mode][hash_value] = cache_entry
-print(f'Copying {flattened_key} -> {mode}[{hash_value}]')
+logger.info(f'Copying {flattened_key} -> {mode}[{hash_value}]')
else:
-print(f'Skipping invalid key format: {flattened_key}')
+logger.warning(f'Skipping invalid key format: {flattened_key}')
await to_llm_response_cache.upsert(kv)
await to_llm_response_cache.index_done_callback()
-print('Mission accomplished!')
+logger.info('Mission accomplished!')
async def copy_from_json_to_postgres():

constants.py

@@ -61,9 +61,9 @@ DEFAULT_RERANK_BINDING = 'null'
# Default source ids limit in meta data for entity and relation
DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 300
DEFAULT_MAX_SOURCE_IDS_PER_RELATION = 300
-### control chunk_ids limitation method: FIFO, FIFO
-### FIFO: First in first out
+### control chunk_ids limitation method: KEEP, FIFO
+### KEEP: Keep oldest (less merge action and faster)
+### FIFO: First in first out
SOURCE_IDS_LIMIT_METHOD_KEEP = 'KEEP'
SOURCE_IDS_LIMIT_METHOD_FIFO = 'FIFO'
DEFAULT_SOURCE_IDS_LIMIT_METHOD = SOURCE_IDS_LIMIT_METHOD_FIFO
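
For context, a sketch of what the two limit methods imply when a source-id list exceeds the cap. This is illustrative only (limit_source_ids is a made-up helper, not LightRAG's actual merge code), assuming the list is ordered oldest to newest:

SOURCE_IDS_LIMIT_METHOD_KEEP = 'KEEP'
SOURCE_IDS_LIMIT_METHOD_FIFO = 'FIFO'

def limit_source_ids(source_ids: list[str], limit: int, method: str) -> list[str]:
    """Hypothetical trimming helper; assumes oldest entries come first."""
    if len(source_ids) <= limit:
        return source_ids
    if method == SOURCE_IDS_LIMIT_METHOD_KEEP:
        # KEEP: retain the oldest ids and drop newcomers
        # (no re-merge of existing data, hence faster).
        return source_ids[:limit]
    # FIFO: first in, first out -- evict the oldest, keep the newest.
    return source_ids[-limit:]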

e2e_test_harness.py

@@ -36,6 +36,7 @@ from pathlib import Path
import httpx
from dotenv import load_dotenv
+from lightrag.utils import logger
# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
@@ -85,28 +86,26 @@ class E2ETestHarness:
async with httpx.AsyncClient(timeout=30.0) as client:
response = await client.get(f'{self.rag_url}/health')
response.raise_for_status()
-print(f'[OK] LightRAG API accessible at {self.rag_url}')
+logger.info(f'LightRAG API accessible at {self.rag_url}')
return True
except Exception as e:
-print(f'[ERROR] Cannot connect to LightRAG API: {e}')
+logger.error(f'Cannot connect to LightRAG API: {e}')
return False
async def download_papers(self) -> list[str]:
"""Download arXiv papers."""
if self.skip_download:
-print('[SKIP] Paper download (--skip-download)')
+logger.info('Paper download skipped (--skip-download)')
# Check existing papers
existing = [
str(self.papers_dir / f'{pid}.pdf')
for pid in self.paper_ids
if (self.papers_dir / f'{pid}.pdf').exists()
]
-print(f'[INFO] Found {len(existing)} existing papers')
+logger.info(f'Found {len(existing)} existing papers')
return existing
-print('\n' + '=' * 60)
-print('STEP 1: Download arXiv Papers')
-print('=' * 60)
+logger.info('STEP 1: Download arXiv Papers')
from lightrag.evaluation.download_arxiv import download_papers
@@ -115,7 +114,7 @@ class E2ETestHarness:
async def clear_existing_data(self) -> bool:
"""Clear existing documents in LightRAG (optional)."""
-print('\n[INFO] Clearing existing data...')
+logger.info('Clearing existing data...')
try:
headers = {'X-API-Key': self.api_key} if self.api_key else {}
async with httpx.AsyncClient(timeout=60.0) as client:
@@ -134,7 +133,7 @@ class E2ETestHarness:
all_docs.extend(status_docs)
if all_docs:
-print(f'[INFO] Clearing {len(all_docs)} existing documents...')
+logger.info(f'Clearing {len(all_docs)} existing documents...')
for doc in all_docs:
doc_id = doc.get('id')
if doc_id:
@@ -142,24 +141,22 @@ class E2ETestHarness:
f'{self.rag_url}/documents/{doc_id}',
headers=headers,
)
-print('[OK] Cleared existing documents')
+logger.info('Cleared existing documents')
else:
-print('[OK] No existing documents to clear')
+logger.info('No existing documents to clear')
return True
except Exception as e:
-print(f'[WARN] Could not clear data: {e}')
+logger.warning(f'Could not clear data: {e}')
return False
async def ingest_papers(self, paper_paths: list[str]) -> bool:
"""Ingest papers into LightRAG."""
if self.skip_ingest:
-print('[SKIP] Paper ingestion (--skip-ingest)')
+logger.info('Paper ingestion skipped (--skip-ingest)')
return True
-print('\n' + '=' * 60)
-print('STEP 2: Ingest Papers into LightRAG')
-print('=' * 60)
+logger.info('STEP 2: Ingest Papers into LightRAG')
headers = {'X-API-Key': self.api_key} if self.api_key else {}
@@ -167,10 +164,10 @@ class E2ETestHarness:
for paper_path in paper_paths:
path = Path(paper_path)
if not path.exists():
-print(f'[WARN] Paper not found: {paper_path}')
+logger.warning(f'Paper not found: {paper_path}')
continue
-print(f'[UPLOAD] {path.name}')
+logger.info(f'Uploading {path.name}')
try:
with open(path, 'rb') as f:
@@ -182,17 +179,15 @@ class E2ETestHarness:
)
response.raise_for_status()
result = response.json()
-print(f' [OK] Uploaded: {result}')
+logger.info(f'Uploaded: {result}')
except Exception as e:
-print(f' [ERROR] Upload failed: {e}')
+logger.error(f'Upload failed: {e}')
return True
async def wait_for_processing(self) -> bool:
"""Wait for all documents to finish processing."""
-print('\n' + '=' * 60)
-print('STEP 3: Wait for Document Processing')
-print('=' * 60)
+logger.info('STEP 3: Wait for Document Processing')
headers = {'X-API-Key': self.api_key} if self.api_key else {}
start_time = time.time()
@@ -215,31 +210,29 @@ class E2ETestHarness:
failed = len(statuses.get('failed', []))
elapsed = int(time.time() - start_time)
-print(
-f' [{elapsed}s] Processing: {processing}, Pending: {pending}, Completed: {completed}, Failed: {failed}'
+logger.info(
+f'[{elapsed}s] Processing: {processing}, Pending: {pending}, Completed: {completed}, Failed: {failed}'
)
if processing == 0 and pending == 0:
-print('[OK] All documents processed')
+logger.info('All documents processed')
return True
except Exception as e:
-print(f' [WARN] Status check failed: {e}')
+logger.warning(f'Status check failed: {e}')
await asyncio.sleep(POLL_INTERVAL_SECONDS)
-print('[ERROR] Timeout waiting for document processing')
+logger.error('Timeout waiting for document processing')
return False
async def generate_dataset(self) -> Path:
"""Generate Q&A dataset from ingested papers."""
if self.dataset_path and self.dataset_path.exists():
-print(f'[SKIP] Using existing dataset: {self.dataset_path}')
+logger.info(f'Using existing dataset: {self.dataset_path}')
return self.dataset_path
-print('\n' + '=' * 60)
-print('STEP 4: Generate Q&A Dataset')
-print('=' * 60)
+logger.info('STEP 4: Generate Q&A Dataset')
from lightrag.evaluation.generate_arxiv_dataset import generate_dataset
@@ -256,9 +249,7 @@ class E2ETestHarness:
async def run_ragas_evaluation(self, dataset_path: Path) -> dict:
"""Run RAGAS evaluation."""
-print('\n' + '=' * 60)
-print('STEP 5: Run RAGAS Evaluation')
-print('=' * 60)
+logger.info('STEP 5: Run RAGAS Evaluation')
from lightrag.evaluation.eval_rag_quality import RAGEvaluator
@@ -272,14 +263,11 @@ class E2ETestHarness:
async def run_full_pipeline(self) -> dict:
"""Run the complete E2E test pipeline."""
-print('=' * 70)
-print('E2E RAGAS TEST HARNESS FOR LIGHTRAG')
-print('=' * 70)
-print(f'RAG URL: {self.rag_url}')
-print(f'Papers: {", ".join(self.paper_ids)}')
-print(f'Questions: {self.questions_per_paper} per paper')
-print(f'Results: {self.results_dir}')
-print('=' * 70)
+logger.info('E2E RAGAS TEST HARNESS FOR LIGHTRAG')
+logger.info(f'RAG URL: {self.rag_url}')
+logger.info(f'Papers: {", ".join(self.paper_ids)}')
+logger.info(f'Questions: {self.questions_per_paper} per paper')
+logger.info(f'Results: {self.results_dir}')
start_time = time.time()
@@ -321,12 +309,9 @@ class E2ETestHarness:
with open(summary_path, 'w') as f:
json.dump(summary, f, indent=2)
-print('\n' + '=' * 70)
-print('E2E PIPELINE COMPLETE')
-print('=' * 70)
-print(f'Total time: {elapsed_time:.1f} seconds')
-print(f'Summary saved: {summary_path}')
-print('=' * 70)
+logger.info('E2E PIPELINE COMPLETE')
+logger.info(f'Total time: {elapsed_time:.1f} seconds')
+logger.info(f'Summary saved: {summary_path}')
return summary
@@ -345,14 +330,12 @@ async def run_ab_test(
Returns:
A/B comparison results
"""
-print('=' * 70)
-print('A/B TEST: WITH vs WITHOUT ORPHAN CONNECTIONS')
-print('=' * 70)
+logger.info('A/B TEST: WITH vs WITHOUT ORPHAN CONNECTIONS')
results = {}
# Test A: WITHOUT orphan connections
-print('\n[A] Running WITHOUT orphan connections...')
+logger.info('[A] Running WITHOUT orphan connections...')
os.environ['AUTO_CONNECT_ORPHANS'] = 'false'
harness_a = E2ETestHarness(**harness_config)
@@ -363,7 +346,7 @@ async def run_ab_test(
await harness_a.clear_existing_data()
# Test B: WITH orphan connections
-print('\n[B] Running WITH orphan connections...')
+logger.info('[B] Running WITH orphan connections...')
os.environ['AUTO_CONNECT_ORPHANS'] = 'true'
# Force re-ingest for test B
@@ -375,9 +358,7 @@ async def run_ab_test(
results['with_orphans'] = await harness_b.run_full_pipeline()
# Compare results
-print('\n' + '=' * 70)
-print('A/B COMPARISON')
-print('=' * 70)
+logger.info('A/B COMPARISON')
a_stats = results['without_orphans'].get('ragas_results', {}).get('average_metrics', {})
b_stats = results['with_orphans'].get('ragas_results', {}).get('average_metrics', {})
@@ -401,7 +382,7 @@ async def run_ab_test(
}
status = 'UP' if diff > 0 else ('DOWN' if diff < 0 else '~')
-print(f' {metric:<20} A: {a_val:.4f} B: {b_val:.4f} [{status}] {pct:+.1f}%')
+logger.info(f'{metric:<20} A: {a_val:.4f} B: {b_val:.4f} [{status}] {pct:+.1f}%')
# Verdict
ragas_improvement = comparison['improvement'].get('ragas_score', {}).get('percent', 0)
@@ -413,13 +394,13 @@ async def run_ab_test(
verdict = 'NO SIGNIFICANT DIFFERENCE'
comparison['verdict'] = verdict
-print(f'\nVERDICT: {verdict}')
+logger.info(f'VERDICT: {verdict}')
# Save comparison
comp_path = harness_a.results_dir / f'ab_comparison_{datetime.now().strftime("%Y%m%d_%H%M%S")}.json'
with open(comp_path, 'w') as f:
json.dump(comparison, f, indent=2)
-print(f'\nComparison saved: {comp_path}')
+logger.info(f'Comparison saved: {comp_path}')
return comparison
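
A side note on the converted calls: they keep f-strings, which build the message eagerly even when the log level filters it out. Stdlib logging also accepts deferred %-style arguments, which skip formatting for suppressed records; a small sketch of both styles (the logger name is illustrative):

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('e2e_harness')  # illustrative name

elapsed, processing = 42, 3
# f-string: formatted immediately, even if INFO were disabled.
logger.info(f'[{elapsed}s] Processing: {processing}')
# %-style args: formatting deferred until a handler accepts the record.
logger.info('[%ds] Processing: %d', elapsed, processing)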

chroma_impl.py

@@ -4,14 +4,10 @@ from dataclasses import dataclass
from typing import Any, final
import numpy as np
-import pipmaster as pm
from lightrag.base import BaseVectorStorage
from lightrag.utils import logger
-if not pm.is_installed('chromadb'):
-pm.install('chromadb')
from chromadb import HttpClient, PersistentClient # type: ignore
from chromadb.config import Settings # type: ignore
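
With the pipmaster auto-install gone, a missing chromadb now surfaces as a plain ImportError at import time. If a friendlier message were wanted, one option (not part of this diff) is a guarded import:

try:
    from chromadb import HttpClient, PersistentClient  # noqa: F401
except ImportError as exc:
    # Hypothetical guard; the commit itself just lets the ImportError propagate.
    raise ImportError('chromadb is required: pip install chromadb') from exc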
@@ -217,7 +213,7 @@ class ChromaVectorDBStorage(BaseVectorStorage):
self._collection.delete(ids=ids)
logger.debug(f'Successfully deleted {len(ids)} vectors from {self.namespace}')
except Exception as e:
-logger.error(f'Error while deleting vectors from {self.namespace}: {e}')
+logger.error(f'Error while deleting vectors from {self.namespace}: {e!s}')
raise
async def get_by_id(self, id: str) -> dict[str, Any] | None:
@@ -245,7 +241,7 @@ class ChromaVectorDBStorage(BaseVectorStorage):
**result['metadatas'][0],
}
except Exception as e:
-logger.error(f'Error retrieving vector data for ID {id}: {e}')
+logger.error(f'Error retrieving vector data for ID {id}: {e!s}')
return None
async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
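
On the !s fix: it makes the str() conversion explicit. For ordinary exceptions f'{e}' and f'{e!s}' render the same text; they differ only if a class overrides __format__, which the explicit form bypasses. A sketch (OddError is a made-up class):

class OddError(Exception):
    def __format__(self, spec: str) -> str:
        return '<formatted>'

e = OddError('boom')
print(f'{e}')    # '<formatted>' -- __format__ wins without a conversion
print(f'{e!s}')  # 'boom'        -- !s forces str() first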

utils.py

@@ -2042,8 +2042,6 @@ def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str:
if len(name) < 6 and should_filter_by_dots(name):
# Filter out mixed numeric and dot content with length < 6
return ''
-# Filter out mixed numeric and dot content with length < 6, requiring at least one dot
-return ''
return name