LightRAG/tests/test_prompt_quality_deep.py

"""
Deep Quality Analysis for Optimized Prompts.
Tests prompts on challenging, diverse inputs to identify weaknesses.
Run with: uv run --extra test python tests/test_prompt_quality_deep.py
"""
from __future__ import annotations
import asyncio
import json
import sys
from dataclasses import dataclass, field
from pathlib import Path
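# Make the repository root importable so `lightrag` resolves without installing the package.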
sys.path.insert(0, str(Path(__file__).parent.parent))
from lightrag.prompt import PROMPTS
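# NOTE: these tests call the OpenAI API directly; OPENAI_API_KEY must be set in the
# environment (openai.AsyncOpenAI() reads it by default) and each run incurs API usage.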
# =============================================================================
# Test Data: Entity Extraction (8 Domains)
# =============================================================================
ENTITY_TEST_TEXTS = {
"technical": {
"name": "Technical/API Documentation",
"text": """
The FastAPI framework provides automatic OpenAPI documentation via Swagger UI at /docs.
To enable OAuth2 authentication, use the OAuth2PasswordBearer class from fastapi.security.
The dependency injection system allows you to declare dependencies using Depends().
Response models are validated using Pydantic's BaseModel class. For async database
operations, SQLAlchemy 2.0 with asyncpg driver is recommended. Rate limiting can be
implemented using slowapi middleware with Redis as the backend store.
""",
"expected_entities": ["FastAPI", "OpenAPI", "Swagger UI", "OAuth2", "Pydantic", "SQLAlchemy", "Redis"],
"expected_relations": ["FastAPI provides OpenAPI", "OAuth2 for authentication", "Pydantic validates responses"],
},
"legal": {
"name": "Legal/Contract",
"text": """
WHEREAS, Acme Corporation ("Licensor") owns certain intellectual property rights in
the Software known as "DataSync Pro"; and WHEREAS, Beta Technologies Inc. ("Licensee")
desires to obtain a non-exclusive license to use said Software; NOW THEREFORE, in
consideration of the mutual covenants herein, the parties agree: Licensee shall pay
Licensor a royalty of 5% of gross revenues derived from Software usage. This Agreement
shall be governed by the laws of the State of Delaware. Any disputes shall be resolved
through binding arbitration administered by JAMS in San Francisco, California.
""",
"expected_entities": ["Acme Corporation", "Beta Technologies Inc", "DataSync Pro", "Delaware", "JAMS", "San Francisco"],
"expected_relations": ["Licensor owns Software", "Licensee pays royalty", "Delaware governs agreement"],
},
"scientific": {
"name": "Scientific/Research Abstract",
"text": """
We investigated the effects of CRISPR-Cas9 gene editing on tumor suppressor p53 expression
in HeLa cell lines. Using Western blot analysis and qPCR, we observed a 73% reduction in
p53 protein levels after 48 hours of transfection. Control groups treated with scrambled
sgRNA showed no significant change (p > 0.05). Our findings suggest that targeted p53
knockout can be achieved with high efficiency, supporting further research into cancer
immunotherapy applications. Funding was provided by NIH grant R01-CA123456.
""",
"expected_entities": ["CRISPR-Cas9", "p53", "HeLa", "Western blot", "qPCR", "NIH"],
"expected_relations": ["CRISPR-Cas9 edits p53", "Western blot measures protein", "NIH funds research"],
},
"news": {
"name": "News/Current Events",
"text": """
BREAKING: Tesla CEO Elon Musk announced today that the company will invest $5 billion
in a new Gigafactory in Austin, Texas, creating an estimated 10,000 jobs by 2025.
The announcement came during Tesla's Q3 earnings call, where the company reported
record revenue of $25.2 billion. Analysts at Goldman Sachs raised their price target
to $300, citing strong demand for the Model Y in European markets. Shares rose 8%
in after-hours trading on the NASDAQ exchange.
""",
"expected_entities": ["Tesla", "Elon Musk", "Austin", "Texas", "Gigafactory", "Goldman Sachs", "Model Y", "NASDAQ"],
"expected_relations": ["Musk leads Tesla", "Tesla invests in Gigafactory", "Goldman Sachs analyzes Tesla"],
},
"conversational": {
"name": "Conversational/Interview",
"text": """
Interviewer: Dr. Chen, your research on quantum computing has gained significant attention.
Can you explain the breakthrough?
Dr. Chen: Certainly. Our team at MIT developed a new error correction method using topological
qubits. Unlike traditional approaches by IBM or Google, we achieved 99.9% fidelity at room
temperature. My colleague Dr. Sarah Martinez deserves much of the credit - her algorithm
made this possible.
Interviewer: What are the practical applications?
Dr. Chen: Drug discovery is the immediate target. Pfizer has already licensed our technology
for protein folding simulations.
""",
"expected_entities": ["Dr. Chen", "MIT", "IBM", "Google", "Dr. Sarah Martinez", "Pfizer"],
"expected_relations": ["Dr. Chen works at MIT", "Martinez developed algorithm", "Pfizer licenses technology"],
},
# New domains for expanded coverage
"medical": {
"name": "Medical/Healthcare",
"text": """
The patient presented with Type 2 diabetes mellitus complicated by diabetic retinopathy.
Treatment included metformin 500mg twice daily and monthly intravitreal injections of
Avastin (bevacizumab). Dr. Sarah Chen at Johns Hopkins recommended adding Jardiance
(empagliflozin) given the patient's elevated HbA1c of 8.2%. Insurance coverage through
Aetna required prior authorization.
""",
"expected_entities": ["Type 2 diabetes", "diabetic retinopathy", "metformin", "Avastin",
"Dr. Sarah Chen", "Johns Hopkins", "Jardiance", "Aetna"],
"expected_relations": ["metformin treats diabetes", "Avastin treats retinopathy", "Aetna covers insurance"],
},
"financial": {
"name": "Financial/Trading",
"text": """
The S&P 500 index futures dropped 2.3% following hawkish Fed minutes. BlackRock's
iShares ETF (SPY) saw record outflows of $4.2 billion. Morgan Stanley upgraded
NVIDIA to Overweight with a $500 price target, citing AI datacenter demand.
Bitcoin fell below $40,000 as Coinbase reported a 30% decline in trading volume.
""",
"expected_entities": ["S&P 500", "BlackRock", "SPY", "Morgan Stanley", "NVIDIA",
"Bitcoin", "Coinbase"],
"expected_relations": ["BlackRock manages SPY", "Morgan Stanley analyzes NVIDIA", "Coinbase trades Bitcoin"],
},
"social_media": {
"name": "Social Media/Informal",
"text": """
OMG just saw @elonmusk tweet about #Dogecoin again! Price pumped 15% in an hour!
Meanwhile TikTok is banning crypto content and YouTube demonetized half the crypto
channels. My Discord server is going crazy. Even r/wallstreetbets is talking about it.
""",
"expected_entities": ["Elon Musk", "Dogecoin", "TikTok", "YouTube", "Discord",
"r/wallstreetbets"],
"expected_relations": ["Musk tweets about Dogecoin", "TikTok bans crypto", "YouTube demonetizes channels"],
},
}
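# Each fixture pairs a raw passage with the entities/relations a good extraction should
# surface; the checks below match expected entities by case-insensitive substring.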
# =============================================================================
# Test Data: Keywords Extraction (Query Types)
# =============================================================================
KEYWORD_TEST_QUERIES = {
"factual": {
"query": "What is the capital of France?",
"expected_high": ["geography", "capital city"],
"expected_low": ["France", "capital"],
},
"analytical": {
"query": "Why does inflation cause interest rates to rise?",
"expected_high": ["economics", "monetary policy", "cause-effect"],
"expected_low": ["inflation", "interest rates"],
},
"comparison": {
"query": "How does Python compare to JavaScript for web development?",
"expected_high": ["programming languages", "comparison", "web development"],
"expected_low": ["Python", "JavaScript"],
},
"procedural": {
"query": "How to deploy a Docker container to AWS ECS?",
"expected_high": ["deployment", "cloud computing", "containerization"],
"expected_low": ["Docker", "AWS", "ECS", "container"],
},
"multi_topic": {
"query": "What is machine learning and how does it relate to artificial intelligence?",
"expected_high": ["machine learning", "artificial intelligence", "technology"],
"expected_low": ["machine learning", "artificial intelligence"],
},
# New query types for expanded coverage
"negation": {
"query": "What programming languages are NOT object-oriented?",
"expected_high": ["programming languages", "paradigms"],
"expected_low": ["object-oriented", "programming"],
},
"implicit": {
"query": "Tell me about climate",
"expected_high": ["climate", "environment"],
"expected_low": ["climate"],
},
"multi_hop": {
"query": "Who is the CEO of the company that makes the iPhone?",
"expected_high": ["business", "leadership", "technology"],
"expected_low": ["CEO", "iPhone"],
},
"ambiguous": {
"query": "What is Java used for?",
"expected_high": ["technology", "programming"],
"expected_low": ["Java"],
},
}
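# Note: only `expected_low` is asserted programmatically below; `expected_high` documents
# the thematic keywords we would like to see but is not currently checked.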
# =============================================================================
# Test Data: Orphan Validation (Edge Cases)
# =============================================================================
ORPHAN_TEST_CASES = [
# Same domain, directly related
{
"orphan": {"name": "Python", "type": "concept", "desc": "Programming language known for simplicity"},
"candidate": {"name": "Django", "type": "concept", "desc": "Web framework written in Python"},
"expected": True,
"difficulty": "easy",
"category": "same_domain_direct",
},
# Same domain, tangentially related
{
"orphan": {"name": "Bitcoin", "type": "concept", "desc": "Decentralized cryptocurrency"},
"candidate": {"name": "Visa", "type": "organization", "desc": "Payment processing company"},
"expected": True, # Both are payment/financial
"difficulty": "medium",
"category": "same_domain_tangential",
},
# Same domain, unrelated
{
"orphan": {"name": "Photosynthesis", "type": "concept", "desc": "Process plants use to convert sunlight to energy"},
"candidate": {"name": "Mitosis", "type": "concept", "desc": "Cell division process"},
"expected": False, # Both biology but not directly related
"difficulty": "medium",
"category": "same_domain_unrelated",
},
# Different domains, surprisingly related
{
"orphan": {"name": "Netflix", "type": "organization", "desc": "Streaming entertainment company"},
"candidate": {"name": "AWS", "type": "organization", "desc": "Amazon's cloud computing platform"},
"expected": True, # Netflix runs on AWS
"difficulty": "hard",
"category": "cross_domain_related",
},
# False positive trap (high similarity, no logic)
{
"orphan": {"name": "Java", "type": "concept", "desc": "Programming language developed by Sun Microsystems"},
"candidate": {"name": "Java", "type": "location", "desc": "Indonesian island known for coffee production"},
"expected": False, # Same name, completely different things
"difficulty": "hard",
"category": "false_positive_trap",
},
# Temporal relationship
{
"orphan": {"name": "Windows 11", "type": "product", "desc": "Microsoft operating system released in 2021"},
"candidate": {"name": "Windows 10", "type": "product", "desc": "Microsoft operating system released in 2015"},
"expected": True, # Successor relationship
"difficulty": "easy",
"category": "temporal",
},
# Causal relationship
{
"orphan": {"name": "COVID-19 Pandemic", "type": "event", "desc": "Global health crisis starting in 2020"},
"candidate": {"name": "Remote Work", "type": "concept", "desc": "Working from home or outside traditional office"},
"expected": True, # Pandemic caused remote work surge
"difficulty": "medium",
"category": "causal",
},
# Hierarchical
{
"orphan": {"name": "Toyota Camry", "type": "product", "desc": "Mid-size sedan manufactured by Toyota"},
"candidate": {"name": "Toyota", "type": "organization", "desc": "Japanese automotive manufacturer"},
"expected": True, # Part-of relationship
"difficulty": "easy",
"category": "hierarchical",
},
# Competitor relationship
{
"orphan": {"name": "Uber", "type": "organization", "desc": "Ride-sharing company"},
"candidate": {"name": "Lyft", "type": "organization", "desc": "Ride-sharing company"},
"expected": True, # Direct competitors
"difficulty": "easy",
"category": "competitor",
},
# No relationship
{
"orphan": {"name": "Beethoven", "type": "person", "desc": "Classical music composer from 18th century"},
"candidate": {"name": "Kubernetes", "type": "concept", "desc": "Container orchestration platform"},
"expected": False, # Completely unrelated
"difficulty": "easy",
"category": "no_relationship",
},
# New edge cases for expanded coverage
# Subsidiary relationship (ownership)
{
"orphan": {"name": "YouTube", "type": "organization", "desc": "Video sharing platform"},
"candidate": {"name": "Google", "type": "organization", "desc": "Technology company and search engine"},
"expected": True, # Google owns YouTube
"difficulty": "medium",
"category": "subsidiary",
},
# Scientific alias (same thing, different names)
{
"orphan": {"name": "SARS-CoV-2", "type": "concept", "desc": "Coronavirus that causes COVID-19"},
"candidate": {"name": "Coronavirus", "type": "concept", "desc": "Family of viruses affecting respiratory system"},
"expected": True, # SARS-CoV-2 is a type of coronavirus
"difficulty": "medium",
"category": "scientific_alias",
},
# Pseudonym (same person, different names)
{
"orphan": {"name": "Mark Twain", "type": "person", "desc": "American author of Tom Sawyer and Huckleberry Finn"},
"candidate": {"name": "Samuel Clemens", "type": "person", "desc": "American writer born in Missouri in 1835"},
"expected": True, # Same person (pen name)
"difficulty": "hard",
"category": "pseudonym",
},
# Weak false positive (similar names/categories but unrelated)
{
"orphan": {"name": "Mount Everest", "type": "location", "desc": "Highest mountain in the Himalayas"},
"candidate": {"name": "Mount Rushmore", "type": "location", "desc": "Memorial carved into mountain in South Dakota"},
"expected": False, # Both "Mount" but no logical connection
"difficulty": "medium",
"category": "weak_false_positive",
},
# Spurious correlation (statistically correlated but no causal link)
{
"orphan": {"name": "Ice Cream Sales", "type": "concept", "desc": "Consumer purchases of frozen dessert"},
"candidate": {"name": "Drowning Deaths", "type": "concept", "desc": "Fatalities from submersion in water"},
"expected": False, # Both increase in summer but no causal relationship
"difficulty": "hard",
"category": "spurious_correlation",
},
]
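# `difficulty` is informational only; the validation test keys off `expected` and
# reports results per `category`.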
# =============================================================================
# Helper Functions
# =============================================================================
async def call_llm(prompt: str, model: str = "gpt-4o-mini") -> str:
    """Call the OpenAI chat completions API and return the response text."""
    import openai

    client = openai.AsyncOpenAI()
    response = await client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0,  # keep runs as reproducible as the model allows
    )
    return response.choices[0].message.content or ""
def format_entity_prompt(text: str) -> str:
"""Format entity extraction prompt."""
examples = "\n".join(PROMPTS["entity_extraction_examples"])
tuple_del = PROMPTS["DEFAULT_TUPLE_DELIMITER"]
comp_del = PROMPTS["DEFAULT_COMPLETION_DELIMITER"]
examples = examples.format(
tuple_delimiter=tuple_del,
completion_delimiter=comp_del,
)
return PROMPTS["entity_extraction_system_prompt"].format(
tuple_delimiter=tuple_del,
completion_delimiter=comp_del,
entity_types="person, organization, location, concept, product, event, category, method",
language="English",
examples=examples,
input_text=text,
) + "\n" + PROMPTS["entity_extraction_user_prompt"].format(
completion_delimiter=comp_del,
language="English",
)
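# parse_entities expects delimiter-separated lines, e.g. (with the default "<|#|>" markers):
#   entity<|#|>FastAPI<|#|>concept<|#|>Web framework for building APIs
#   relation<|#|>FastAPI<|#|>OpenAPI<|#|>provides<|#|>...
# terminated by the completion delimiter on the final line.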
def parse_entities(output: str) -> tuple[list[dict], list[dict]]:
"""Parse entity extraction output."""
entities = []
relations = []
    tuple_del = PROMPTS["DEFAULT_TUPLE_DELIMITER"]
    comp_del = PROMPTS["DEFAULT_COMPLETION_DELIMITER"]
    for line in output.strip().split("\n"):
        line = line.strip()
        if not line or comp_del in line:
continue
parts = line.split(tuple_del)
if len(parts) >= 4 and parts[0].lower() == "entity":
entities.append({
"name": parts[1].strip(),
"type": parts[2].strip(),
"desc": parts[3].strip() if len(parts) > 3 else "",
})
elif len(parts) >= 5 and parts[0].lower() == "relation":
relations.append({
"source": parts[1].strip(),
"target": parts[2].strip(),
"keywords": parts[3].strip(),
})
return entities, relations
@dataclass
class EntityResult:
domain: str
entities: list[dict]
relations: list[dict]
expected_entities: list[str]
issues: list[str] = field(default_factory=list)
precision_notes: str = ""
recall_notes: str = ""
@dataclass
class KeywordResult:
query_type: str
query: str
high_keywords: list[str]
low_keywords: list[str]
issues: list[str] = field(default_factory=list)
@dataclass
class OrphanResult:
category: str
orphan: str
candidate: str
expected: bool
actual: bool
confidence: float
correct: bool
reasoning: str = ""
# =============================================================================
# Test Functions
# =============================================================================
async def test_entity_extraction_deep() -> list[EntityResult]:
    """Deep test of entity extraction across all domains in ENTITY_TEST_TEXTS."""
results = []
for _domain, data in ENTITY_TEST_TEXTS.items():
print(f" Testing {data['name']}...")
prompt = format_entity_prompt(data["text"])
output = await call_llm(prompt)
entities, relations = parse_entities(output)
result = EntityResult(
domain=data["name"],
entities=entities,
relations=relations,
expected_entities=data["expected_entities"],
)
# Analyze precision (are extracted entities important?)
entity_names = [e["name"].lower() for e in entities]
found = [exp for exp in data["expected_entities"] if any(exp.lower() in n for n in entity_names)]
missed = [exp for exp in data["expected_entities"] if not any(exp.lower() in n for n in entity_names)]
if missed:
result.issues.append(f"RECALL: Missed expected entities: {missed}")
result.recall_notes = f"Found {len(found)}/{len(data['expected_entities'])}"
else:
result.recall_notes = f"Found all {len(found)} expected"
# Check for generic/unhelpful entities
generic_entities = [e for e in entities if len(e["desc"]) < 20]
if generic_entities:
result.issues.append(f"QUALITY: {len(generic_entities)} entities have very short descriptions")
# Check relationship density
if len(entities) > 0:
ratio = len(relations) / len(entities)
if ratio < 0.5:
result.issues.append(f"CONNECTIVITY: Low relation/entity ratio ({ratio:.2f})")
results.append(result)
return results
async def test_keywords_extraction_deep() -> list[KeywordResult]:
"""Deep test keywords extraction on varied query types."""
results = []
examples = "\n".join(PROMPTS["keywords_extraction_examples"])
for query_type, data in KEYWORD_TEST_QUERIES.items():
print(f" Testing {query_type} query...")
prompt = PROMPTS["keywords_extraction"].format(
examples=examples,
query=data["query"],
)
output = await call_llm(prompt)
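        # The prompt is expected to return JSON shaped like
        #   {"high_level_keywords": [...], "low_level_keywords": [...]}
        # sometimes wrapped in a ```json code fence, which is stripped below.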
try:
            clean = output.strip()
            if clean.startswith("```"):
                # Strip the Markdown fence and an optional leading "json" tag.
                clean = clean.split("```")[1]
                clean = clean.removeprefix("json").strip()
parsed = json.loads(clean)
result = KeywordResult(
query_type=query_type,
query=data["query"],
high_keywords=parsed.get("high_level_keywords", []),
low_keywords=parsed.get("low_level_keywords", []),
)
# Check if key concepts are captured
all_kw = " ".join(result.high_keywords + result.low_keywords).lower()
for exp in data["expected_low"]:
if exp.lower() not in all_kw:
result.issues.append(f"MISS: Expected '{exp}' not in keywords")
# Check for reasonable count
if len(result.high_keywords) == 0:
result.issues.append("EMPTY: No high-level keywords")
if len(result.low_keywords) == 0 and query_type != "factual":
result.issues.append("EMPTY: No low-level keywords")
except json.JSONDecodeError:
result = KeywordResult(
query_type=query_type,
query=data["query"],
high_keywords=[],
low_keywords=[],
issues=["PARSE ERROR: Invalid JSON output"],
)
results.append(result)
return results
async def test_orphan_validation_deep() -> list[OrphanResult]:
"""Deep test orphan validation on edge cases."""
results = []
for case in ORPHAN_TEST_CASES:
        print(f" Testing {case['category']}: {case['orphan']['name']} → {case['candidate']['name']}...")
prompt = PROMPTS["orphan_connection_validation"].format(
orphan_name=case["orphan"]["name"],
orphan_type=case["orphan"]["type"],
orphan_description=case["orphan"]["desc"],
candidate_name=case["candidate"]["name"],
candidate_type=case["candidate"]["type"],
candidate_description=case["candidate"]["desc"],
similarity_score=0.75,
)
output = await call_llm(prompt)
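        # Expected JSON keys: "should_connect" (bool), "confidence" (numeric score or a
        # HIGH/MEDIUM/LOW/NONE label), and "reasoning" (free text).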
try:
            clean = output.strip()
            if clean.startswith("```"):
                # Strip the Markdown fence and an optional leading "json" tag.
                clean = clean.split("```")[1]
                clean = clean.removeprefix("json").strip()
parsed = json.loads(clean)
actual = parsed.get("should_connect", False)
# Handle discrete labels (HIGH/MEDIUM/LOW/NONE) or numeric
raw_confidence = parsed.get("confidence", 0.0)
if isinstance(raw_confidence, str):
confidence_map = {"HIGH": 0.95, "MEDIUM": 0.85, "LOW": 0.60, "NONE": 0.20}
confidence = confidence_map.get(raw_confidence.upper(), 0.75)
else:
confidence = raw_confidence
reasoning = parsed.get("reasoning", "")
results.append(OrphanResult(
category=case["category"],
orphan=case["orphan"]["name"],
candidate=case["candidate"]["name"],
expected=case["expected"],
actual=actual,
confidence=confidence,
correct=(actual == case["expected"]),
reasoning=reasoning,
))
except json.JSONDecodeError:
results.append(OrphanResult(
category=case["category"],
orphan=case["orphan"]["name"],
candidate=case["candidate"]["name"],
expected=case["expected"],
actual=False,
confidence=0.0,
correct=False,
reasoning="PARSE ERROR",
))
return results
# =============================================================================
# Main
# =============================================================================
async def main() -> None:
"""Run deep quality analysis."""
print("\n" + "=" * 70)
print(" DEEP PROMPT QUALITY ANALYSIS")
print("=" * 70)
# Entity Extraction
print(f"\n📊 ENTITY EXTRACTION ({len(ENTITY_TEST_TEXTS)} domains)")
print("-" * 50)
entity_results = await test_entity_extraction_deep()
for r in entity_results:
        status = "✅" if not r.issues else "⚠️"
print(f"\n{status} {r.domain}")
print(f" Entities: {len(r.entities)} | Relations: {len(r.relations)}")
print(f" Recall: {r.recall_notes}")
if r.issues:
for issue in r.issues:
                print(f"      ⚠️ {issue}")
# Show extracted entities
print(f" Extracted: {[e['name'] for e in r.entities[:8]]}{'...' if len(r.entities) > 8 else ''}")
# Keywords Extraction
print(f"\n\n📊 KEYWORDS EXTRACTION ({len(KEYWORD_TEST_QUERIES)} query types)")
print("-" * 50)
keyword_results = await test_keywords_extraction_deep()
for r in keyword_results:
        status = "✅" if not r.issues else "⚠️"
print(f"\n{status} {r.query_type}: \"{r.query[:50]}...\"")
print(f" High: {r.high_keywords}")
print(f" Low: {r.low_keywords}")
if r.issues:
for issue in r.issues:
                print(f"      ⚠️ {issue}")
# Orphan Validation
print(f"\n\n📊 ORPHAN VALIDATION ({len(ORPHAN_TEST_CASES)} edge cases)")
print("-" * 50)
orphan_results = await test_orphan_validation_deep()
correct = sum(1 for r in orphan_results if r.correct)
print(f"\nAccuracy: {correct}/{len(orphan_results)} ({100*correct/len(orphan_results):.0f}%)")
for r in orphan_results:
        status = "✅" if r.correct else "❌"
decision = "CONNECT" if r.actual else "REJECT"
expected = "CONNECT" if r.expected else "REJECT"
        print(f"\n{status} [{r.category}] {r.orphan} → {r.candidate}")
print(f" Decision: {decision} (conf={r.confidence:.2f}) | Expected: {expected}")
if not r.correct:
print(f" ❌ WRONG: {r.reasoning[:80]}...")
# Summary
print("\n" + "=" * 70)
print(" SUMMARY")
print("=" * 70)
entity_issues = sum(len(r.issues) for r in entity_results)
keyword_issues = sum(len(r.issues) for r in keyword_results)
orphan_accuracy = 100 * correct / len(orphan_results)
print(f"\nEntity Extraction: {entity_issues} issues across {len(entity_results)} domains")
print(f"Keywords Extraction: {keyword_issues} issues across {len(keyword_results)} query types")
print(f"Orphan Validation: {orphan_accuracy:.0f}% accuracy ({correct}/{len(orphan_results)})")
# Recommendations
print("\n📋 RECOMMENDATIONS:")
if entity_issues > 3:
print(" • Entity extraction needs improvement (recall or description quality)")
if keyword_issues > 2:
print(" • Keywords extraction needs clearer guidance")
if orphan_accuracy < 80:
print(" • Orphan validation needs better examples or criteria")
if entity_issues <= 3 and keyword_issues <= 2 and orphan_accuracy >= 80:
print(" • All prompts performing well! Minor tuning may help but not critical.")
if __name__ == "__main__":
asyncio.run(main())