From 1bdd906753aabd835fe501e23e8c850336e16b00 Mon Sep 17 00:00:00 2001
From: clssck
Date: Mon, 1 Dec 2025 21:02:44 +0100
Subject: [PATCH] chore(lightrag): remove legacy prompts and clean up prompt.py

Remove unused LLM-generated citation prompts that were kept for backward
compatibility but never referenced in the codebase. Consolidate duplicate
instructions in the entity summarization prompt and fix minor typos.

- Remove rag_response_with_llm_citations prompt (dead code)
- Remove naive_rag_response_with_llm_citations prompt (dead code)
- Remove unused cite_ready_* backward compatibility aliases
- Consolidate duplicate context/objectivity instructions in summarize prompt
- Fix typo in example (extra parenthesis)
- Clarify delimiter documentation comment
---
 lightrag/prompt.py | 141 +++------------------------------------------
 1 file changed, 9 insertions(+), 132 deletions(-)

diff --git a/lightrag/prompt.py b/lightrag/prompt.py
index 2044caad..a3424a3c 100644
--- a/lightrag/prompt.py
+++ b/lightrag/prompt.py
@@ -4,7 +4,7 @@ from typing import Any

 PROMPTS: dict[str, Any] = {}

-# All delimiters must be formatted as "<|UPPER_CASE_STRING|>"
+# All delimiters must be formatted as "<|TOKEN|>" style markers (e.g., "<|#|>" or "<|COMPLETE|>")
 PROMPTS["DEFAULT_TUPLE_DELIMITER"] = "<|#|>"
 PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|COMPLETE|>"

@@ -16,7 +16,7 @@ You are a Knowledge Graph Specialist responsible for extracting entities and rel
 * **Identification:** Identify clearly defined and meaningful entities in the input text.
 * **Entity Details:** For each identified entity, extract the following information:
     * `entity_name`: The name of the entity. If the entity name is case-insensitive, capitalize the first letter of each significant word (title case). Ensure **consistent naming** across the entire extraction process.
-    * `entity_type`: Categorize the entity using one of the following types: `{entity_types}`. If none of the provided entity types apply, do not add new entity type and classify it as `Other`.
+    * `entity_type`: Categorize the entity using one of the following types: `{entity_types}`. If none of the provided types apply, do not invent a new type; classify it as `Other`.
     * `entity_description`: Provide a concise yet comprehensive description of the entity's attributes and activities, based *solely* on the information present in the input text.
 * **Output Format - Entities:** Output a total of 4 fields for each entity, delimited by `{tuple_delimiter}`, on a single line. The first field *must* be the literal string `entity`.
     * Format: `entity{tuple_delimiter}entity_name{tuple_delimiter}entity_type{tuple_delimiter}entity_description`
@@ -131,7 +131,7 @@ entity{tuple_delimiter}Jordan{tuple_delimiter}person{tuple_delimiter}Jordan shar
 entity{tuple_delimiter}Cruz{tuple_delimiter}person{tuple_delimiter}Cruz is associated with a vision of control and order, influencing the dynamics among other characters.
 entity{tuple_delimiter}The Device{tuple_delimiter}equipment{tuple_delimiter}The Device is central to the story, with potential game-changing implications, and is revered by Taylor.
 relation{tuple_delimiter}Alex{tuple_delimiter}Taylor{tuple_delimiter}power dynamics, observation{tuple_delimiter}Alex observes Taylor's authoritarian behavior and notes changes in Taylor's attitude toward the device.
-relation{tuple_delimiter}Alex{tuple_delimiter}Jordan{tuple_delimiter}shared goals, rebellion{tuple_delimiter}Alex and Jordan share a commitment to discovery, which contrasts with Cruz's vision.)
+relation{tuple_delimiter}Alex{tuple_delimiter}Jordan{tuple_delimiter}shared goals, rebellion{tuple_delimiter}Alex and Jordan share a commitment to discovery, which contrasts with Cruz's vision. relation{tuple_delimiter}Taylor{tuple_delimiter}Jordan{tuple_delimiter}conflict resolution, mutual respect{tuple_delimiter}Taylor and Jordan interact directly regarding the device, leading to a moment of mutual respect and an uneasy truce. relation{tuple_delimiter}Jordan{tuple_delimiter}Cruz{tuple_delimiter}ideological conflict, rebellion{tuple_delimiter}Jordan's commitment to discovery is in rebellion against Cruz's vision of control and order. relation{tuple_delimiter}Taylor{tuple_delimiter}The Device{tuple_delimiter}reverence, technological significance{tuple_delimiter}Taylor shows reverence towards the device, indicating its importance and potential impact. @@ -199,18 +199,13 @@ Your task is to synthesize a list of descriptions of a given entity or relation 1. Input Format: The description list is provided in JSON format. Each JSON object (representing a single description) appears on a new line within the `Description List` section. 2. Output Format: The merged description will be returned as plain text, presented in multiple paragraphs, without any additional formatting or extraneous comments before or after the summary. 3. Comprehensiveness: The summary must integrate all key information from *every* provided description. Do not omit any important facts or details. -4. Context: Ensure the summary is written from an objective, third-person perspective; explicitly mention the name of the entity or relation for full clarity and context. -5. Context & Objectivity: - - Write the summary from an objective, third-person perspective. - - Explicitly mention the full name of the entity or relation at the beginning of the summary to ensure immediate clarity and context. -6. Conflict Handling: +4. Clarity: Write from an objective, third-person perspective and explicitly mention the full name of the entity or relation at the beginning for immediate context. +5. Conflict Handling: - In cases of conflicting or inconsistent descriptions, first determine if these conflicts arise from multiple, distinct entities or relationships that share the same name. - If distinct entities/relations are identified, summarize each one *separately* within the overall output. - If conflicts within a single entity/relation (e.g., historical discrepancies) exist, attempt to reconcile them or present both viewpoints with noted uncertainty. -7. Length Constraint:The summary's total length must not exceed {summary_length} tokens, while still maintaining depth and completeness. -8. Language: The entire output must be written in {language}. Proper nouns (e.g., personal names, place names, organization names) may in their original language if proper translation is not available. - - The entire output must be written in {language}. - - Proper nouns (e.g., personal names, place names, organization names) should be retained in their original language if a proper, widely accepted translation is not available or would cause ambiguity. +6. Length Constraint: The summary's total length must not exceed {summary_length} tokens while still maintaining depth and completeness. +7. Language: Write the entire output in {language}. Retain proper nouns (e.g., personal names, place names, organization names) in their original language if a clear, widely accepted translation is unavailable. 
---Input--- {description_type} Name: {description_name} @@ -263,63 +258,6 @@ STRICT GROUNDING: - Entity summaries for overview, Source Excerpts for precision """ -# Legacy prompt with LLM-generated citations (for backward compatibility) -PROMPTS["rag_response_with_llm_citations"] = """---Role--- - -You are an expert AI assistant specializing in synthesizing information from a provided knowledge base. Your primary function is to answer user queries accurately by ONLY using the information within the provided **Context**. - ----Goal--- - -Generate a comprehensive, well-structured answer to the user query. -The answer must integrate relevant facts from the Knowledge Graph and Document Chunks found in the **Context**. -Consider the conversation history if provided to maintain conversational flow and avoid repeating information. - ----Instructions--- - -1. Step-by-Step Instruction: - - Carefully determine the user's query intent in the context of the conversation history to fully understand the user's information need. - - Scrutinize both `Knowledge Graph Data` and `Document Chunks` in the **Context**. Identify and extract all pieces of information that are directly relevant to answering the user query. - - Weave the extracted facts into a coherent and logical response. Your own knowledge must ONLY be used to formulate fluent sentences and connect ideas, NOT to introduce any external information. - - Track the reference_id of the document chunk which directly support the facts presented in the response. Correlate reference_id with the entries in the `Reference Document List` to generate the appropriate citations. - - Generate a references section at the end of the response. Each reference document must directly support the facts presented in the response. - - Do not generate anything after the reference section. - -2. Content & Grounding: - - Strictly adhere to the provided context from the **Context**; DO NOT invent, assume, or infer any information not explicitly stated. - - If the answer cannot be found in the **Context**, state that you do not have enough information to answer. Do not attempt to guess. - - CRITICAL FOR FACTS: When stating specific facts (dates, numbers, names, statistics), you MUST verify each fact appears EXACTLY in the provided context. If a specific date or number is not explicitly stated in the context, say "the exact [year/number/date] is not specified in the available information" rather than guessing. - - When the question asks "which" or "who" or "how many", provide ONLY the direct answer with facts from context. Do not elaborate with information not explicitly in the context. - -3. Formatting & Language: - - The response MUST be in the same language as the user query. - - The response MUST utilize Markdown formatting for enhanced clarity and structure (e.g., headings, bold text, bullet points). - - The response should be presented in {response_type}. - -4. References Section Format: - - The References section should be under heading: `### References` - - Reference list entries should adhere to the format: `* [n] Document Title`. Do not include a caret (`^`) after opening square bracket (`[`). - - The Document Title in the citation must retain its original language. - - Output each citation on an individual line - - Provide maximum of 5 most relevant citations. - - Do not generate footnotes section or any comment, summary, or explanation after the references. - -5. 
Reference Section Example: -``` -### References - -- [1] Document Title One -- [2] Document Title Two -- [3] Document Title Three -``` - -6. Additional Instructions: {user_prompt} - - ----Context--- - -{context_data} -""" - # Default naive RAG response prompt - cite-ready (no LLM-generated citations) PROMPTS["naive_rag_response"] = """---Role--- @@ -356,67 +294,6 @@ Generate a comprehensive, well-structured answer to the user query using ONLY in {content_data} """ -# Legacy naive RAG prompt with LLM-generated citations (for backward compatibility) -PROMPTS["naive_rag_response_with_llm_citations"] = """---Role--- - -You are an expert AI assistant specializing in synthesizing information from a provided knowledge base. Your primary function is to answer user queries accurately by ONLY using the information within the provided **Context**. - ----Goal--- - -Generate a comprehensive, well-structured answer to the user query. -The answer must integrate relevant facts from the Document Chunks found in the **Context**. -Consider the conversation history if provided to maintain conversational flow and avoid repeating information. - ----Instructions--- - -1. Step-by-Step Instruction: - - Carefully determine the user's query intent in the context of the conversation history to fully understand the user's information need. - - Scrutinize `Document Chunks` in the **Context**. Identify and extract all pieces of information that are directly relevant to answering the user query. - - Weave the extracted facts into a coherent and logical response. Your own knowledge must ONLY be used to formulate fluent sentences and connect ideas, NOT to introduce any external information. - - Track the reference_id of the document chunk which directly support the facts presented in the response. Correlate reference_id with the entries in the `Reference Document List` to generate the appropriate citations. - - Generate a **References** section at the end of the response. Each reference document must directly support the facts presented in the response. - - Do not generate anything after the reference section. - -2. Content & Grounding: - - Strictly adhere to the provided context from the **Context**; DO NOT invent, assume, or infer any information not explicitly stated. - - If the answer cannot be found in the **Context**, state that you do not have enough information to answer. Do not attempt to guess. - - CRITICAL FOR FACTS: When stating specific facts (dates, numbers, names, statistics), you MUST verify each fact appears EXACTLY in the provided context. If a specific date or number is not explicitly stated in the context, say "the exact [year/number/date] is not specified in the available information" rather than guessing. - - When the question asks "which" or "who" or "how many", provide ONLY the direct answer with facts from context. Do not elaborate with information not explicitly in the context. - -3. Formatting & Language: - - The response MUST be in the same language as the user query. - - The response MUST utilize Markdown formatting for enhanced clarity and structure (e.g., headings, bold text, bullet points). - - The response should be presented in {response_type}. - -4. References Section Format: - - The References section should be under heading: `### References` - - Reference list entries should adhere to the format: `* [n] Document Title`. Do not include a caret (`^`) after opening square bracket (`[`). - - The Document Title in the citation must retain its original language. 
- - Output each citation on an individual line - - Provide maximum of 5 most relevant citations. - - Do not generate footnotes section or any comment, summary, or explanation after the references. - -5. Reference Section Example: -``` -### References - -- [1] Document Title One -- [2] Document Title Two -- [3] Document Title Three -``` - -6. Additional Instructions: {user_prompt} - - ----Context--- - -{content_data} -""" - -# Backward compatibility aliases - the default prompts are now cite-ready -PROMPTS["cite_ready_rag_response"] = PROMPTS["rag_response"] -PROMPTS["cite_ready_naive_rag_response"] = PROMPTS["naive_rag_response"] - PROMPTS["kg_query_context"] = """ ## Entity Summaries (use for definitions and general facts) @@ -442,7 +319,7 @@ PROMPTS["kg_query_context"] = """ """ PROMPTS["naive_query_context"] = """ -Document Chunks (Each entry has a reference_id refer to the `Reference Document List`): +Document Chunks (Each entry includes a reference_id that refers to the `Reference Document List`): ```json {text_chunks_str} @@ -466,7 +343,7 @@ Given a user query, your task is to extract two distinct types of keywords: ---Instructions & Constraints--- 1. **Output Format**: Your output MUST be a valid JSON object and nothing else. Do not include any explanatory text, markdown code fences (like ```json), or any other text before or after the JSON. It will be parsed directly by a JSON parser. -2. **Source of Truth**: All keywords must be explicitly derived from the user query, with both high-level and low-level keyword categories are required to contain content. +2. **Source of Truth**: Derive all keywords explicitly from the user query. Populate both keyword lists when the query contains meaningful content; if the query is trivial or nonsensical, return empty lists (see edge cases). 3. **Concise & Meaningful**: Keywords should be concise words or meaningful phrases. Prioritize multi-word phrases when they represent a single concept. For example, from "latest financial report of Apple Inc.", you should extract "latest financial report" and "Apple Inc." rather than "latest", "financial", "report", and "Apple". 4. **Handle Edge Cases**: For queries that are too simple, vague, or nonsensical (e.g., "hello", "ok", "asdfghjkl"), you must return a JSON object with empty lists for both keyword types.
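
Reviewer note (editor addition, kept after the diff so the patch above still applies): the delimiter comment and the output-format bullets in the first two hunks define a small line-oriented record syntax. The sketch below shows how such records might be split back into fields. It is a minimal illustration only; `parse_extraction_line` is a hypothetical helper, not a function that exists in lightrag. The entity field layout comes from the quoted "Output Format" bullet; the 5-field relation layout is inferred from the example lines.

```python
from typing import Optional

TUPLE_DELIMITER = "<|#|>"              # mirrors PROMPTS["DEFAULT_TUPLE_DELIMITER"]
COMPLETION_DELIMITER = "<|COMPLETE|>"  # mirrors PROMPTS["DEFAULT_COMPLETION_DELIMITER"]

def parse_extraction_line(line: str) -> Optional[dict]:
    """Split one extraction-output line into a record, or return None if malformed."""
    line = line.strip()
    if not line or line == COMPLETION_DELIMITER:
        return None
    fields = line.split(TUPLE_DELIMITER)
    # Entity records carry 4 fields (per the quoted "Output Format" bullet);
    # relation records in the examples carry 5.
    if fields[0] == "entity" and len(fields) == 4:
        return {"kind": "entity", "name": fields[1],
                "type": fields[2], "description": fields[3]}
    if fields[0] == "relation" and len(fields) == 5:
        return {"kind": "relation", "source": fields[1], "target": fields[2],
                "keywords": fields[3], "description": fields[4]}
    return None

# The relation line corrected in the third hunk splits into five fields:
record = parse_extraction_line(
    "relation<|#|>Alex<|#|>Jordan<|#|>shared goals, rebellion<|#|>"
    "Alex and Jordan share a commitment to discovery, which contrasts "
    "with Cruz's vision."
)
assert record is not None and record["kind"] == "relation"
assert record["keywords"] == "shared goals, rebellion"
```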
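On the removed `cite_ready_*` aliases: they pointed at the same objects as the default prompts, so any out-of-tree caller that still reads them migrates with a one-line key change. Hypothetical caller code, assuming direct access to the `PROMPTS` dict:

```python
from lightrag.prompt import PROMPTS

# was: PROMPTS["cite_ready_rag_response"]
rag_prompt = PROMPTS["rag_response"]
# was: PROMPTS["cite_ready_naive_rag_response"]
naive_prompt = PROMPTS["naive_rag_response"]
```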
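Finally, the last hunk tightens the keyword-extraction contract: a bare JSON object (no code fences or commentary), both keyword lists populated for meaningful queries, and empty lists for trivial or nonsensical ones. A defensive consumer might look like the sketch below; the key names `high_level_keywords` and `low_level_keywords` are assumptions for illustration, since the excerpt does not show the prompt's exact JSON schema.

```python
import json

def parse_keywords(raw: str) -> tuple[list[str], list[str]]:
    """Return (high_level, low_level) keyword lists; fail closed to empty lists."""
    try:
        data = json.loads(raw)
    except json.JSONDecodeError:
        # The prompt forbids fences and extra text, but a consumer should
        # still degrade gracefully rather than crash on a bad generation.
        return [], []
    if not isinstance(data, dict):
        return [], []
    high = data.get("high_level_keywords", [])  # assumed key name
    low = data.get("low_level_keywords", [])    # assumed key name
    if not isinstance(high, list) or not isinstance(low, list):
        return [], []
    return [str(k) for k in high], [str(k) for k in low]

# Edge case from instruction 4: trivial queries yield two empty lists.
assert parse_keywords('{"high_level_keywords": [], "low_level_keywords": []}') == ([], [])
```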