fix(api): Change content field from string to list in query responses

BREAKING CHANGE: The `content` field in query response references is now an array of strings instead of a single concatenated string. This preserves individual chunk boundaries when a single file has multiple chunks.

Changes:
- Update QueryResponse Pydantic model to accept List[str] for content
- Modify query_text endpoint to return content as list (query_routes.py:425)
- Modify query_text_stream endpoint to support chunk content enrichment
- Update OpenAPI schema and examples to reflect array structure
- Update API README with breaking change notice and migration guide
- Fix RAGAS evaluation to flatten chunk content lists
2025-11-03 04:37:09 +01:00 · 2025-11-03 04:37:09 +01:00 · 9d69e8d776
commit 9d69e8d776
parent 363f3051b1
3 changed files with 77 additions and 16 deletions
--- a/lightrag/api/README.md
+++ b/lightrag/api/README.md
@ -474,6 +474,8 @@ The `include_chunk_content` parameter (default: `false`) controls whether the ac
 - **Citation Display**: Showing users the exact text passages that support the response
 - **Transparency**: Providing full visibility into the RAG retrieval process

+**Important**: When chunk content is included, the `content` field of each reference is an **array of strings**, where each string is one chunk retrieved from that file. A single file may correspond to multiple chunks, so the content is returned as a list to preserve chunk boundaries.
+
 **Example API Request:**

 ```json
@ -494,18 +496,25 @@ The `include_chunk_content` parameter (default: `false`) controls whether the ac
    {
      "reference_id": "1",
      "file_path": "/documents/intro.md",
-      "content": "LightRAG is a retrieval-augmented generation system that combines knowledge graphs with vector similarity search..."
+      "content": [
+        "LightRAG is a retrieval-augmented generation system that combines knowledge graphs with vector similarity search...",
+        "The system uses a dual-indexing approach with both vector embeddings and graph structures for enhanced retrieval..."
+      ]
    },
    {
      "reference_id": "2",
      "file_path": "/documents/features.md",
-      "content": "The system provides multiple query modes including local, global, hybrid, and mix modes..."
+      "content": [
+        "The system provides multiple query modes including local, global, hybrid, and mix modes..."
+      ]
    }
  ]
 }
 ```

-**Note**: This parameter only works when `include_references=true`. Setting `include_chunk_content=true` without including references has no effect.
+**Notes**:
+- This parameter only works when `include_references=true`. Setting `include_chunk_content=true` without including references has no effect.
+- **Breaking Change**: Prior versions returned `content` as a single string with all chunks of a file concatenated. It is now an array of strings, one per chunk, to preserve individual chunk boundaries. If you need a single string, join the array elements with your preferred separator; joining with a blank line reproduces the previous output (e.g., in Python, `"\n\n".join(content)`).

 ### .env Examples

--- a/lightrag/api/routers/query_routes.py
+++ b/lightrag/api/routers/query_routes.py
@ -4,7 +4,7 @@ This module contains all query-related routes for the LightRAG API.

 import json
 import logging
-from typing import Any, Dict, List, Literal, Optional
+from typing import Any, Dict, List, Literal, Optional, Union

 from fastapi import APIRouter, Depends, HTTPException
 from lightrag.base import QueryParam
@ -150,9 +150,9 @@ class QueryResponse(BaseModel):
    response: str = Field(
        description="The generated response",
    )
-    references: Optional[List[Dict[str, str]]] = Field(
+    references: Optional[List[Dict[str, Union[str, List[str]]]]] = Field(
        default=None,
-        description="Reference list (Disabled when include_references=False, /query/data always includes references.)",
+        description="Reference list (Disabled when include_references=False, /query/data always includes references.). The 'content' field in each reference is a list of strings when include_chunk_content=True.",
    )


@ -208,6 +208,11 @@ def create_query_routes(rag, api_key: Optional[str] = None, top_k: int = 60):
                                        "properties": {
                                            "reference_id": {"type": "string"},
                                            "file_path": {"type": "string"},
+                                            "content": {
+                                                "type": "array",
+                                                "items": {"type": "string"},
+                                                "description": "List of chunk contents from this file (only included when include_chunk_content=True)",
+                                            },
                                        },
                                    },
                                    "description": "Reference list (only included when include_references=True)",
@ -235,19 +240,24 @@ def create_query_routes(rag, api_key: Optional[str] = None, top_k: int = 60):
                            },
                            "with_chunk_content": {
                                "summary": "Response with chunk content",
-                                "description": "Example response when include_references=True and include_chunk_content=True",
+                                "description": "Example response when include_references=True and include_chunk_content=True. Note: content is an array of chunks from the same file.",
                                "value": {
                                    "response": "Artificial Intelligence (AI) is a branch of computer science that aims to create intelligent machines capable of performing tasks that typically require human intelligence, such as learning, reasoning, and problem-solving.",
                                    "references": [
                                        {
                                            "reference_id": "1",
                                            "file_path": "/documents/ai_overview.pdf",
-                                            "content": "Artificial Intelligence (AI) represents a transformative field in computer science focused on creating systems that can perform tasks requiring human-like intelligence. These tasks include learning from experience, understanding natural language, recognizing patterns, and making decisions.",
+                                            "content": [
+                                                "Artificial Intelligence (AI) represents a transformative field in computer science focused on creating systems that can perform tasks requiring human-like intelligence. These tasks include learning from experience, understanding natural language, recognizing patterns, and making decisions.",
+                                                "AI systems can be categorized into narrow AI, which is designed for specific tasks, and general AI, which aims to match human cognitive abilities across a wide range of domains.",
+                                            ],
                                        },
                                        {
                                            "reference_id": "2",
                                            "file_path": "/documents/machine_learning.txt",
-                                            "content": "Machine learning is a subset of AI that enables computers to learn and improve from experience without being explicitly programmed. It focuses on the development of algorithms that can access data and use it to learn for themselves.",
+                                            "content": [
+                                                "Machine learning is a subset of AI that enables computers to learn and improve from experience without being explicitly programmed. It focuses on the development of algorithms that can access data and use it to learn for themselves."
+                                            ],
                                        },
                                    ],
                                },
@ -421,7 +431,8 @@ def create_query_routes(rag, api_key: Optional[str] = None, top_k: int = 60):
                    ref_copy = ref.copy()
                    ref_id = ref.get("reference_id", "")
                    if ref_id in ref_id_to_content:
-                        ref_copy["content"] = "\n\n".join(ref_id_to_content[ref_id])
+                        # Keep content as a list of chunks (one file may have multiple chunks)
+                        ref_copy["content"] = ref_id_to_content[ref_id]
                    enriched_references.append(ref_copy)
                references = enriched_references

@ -454,6 +465,11 @@ def create_query_routes(rag, api_key: Optional[str] = None, top_k: int = 60):
                                "description": "Multiple NDJSON lines when stream=True and include_references=True. First line contains references, subsequent lines contain response chunks.",
                                "value": '{"references": [{"reference_id": "1", "file_path": "/documents/ai_overview.pdf"}, {"reference_id": "2", "file_path": "/documents/ml_basics.txt"}]}\n{"response": "Artificial Intelligence (AI) is a branch of computer science"}\n{"response": " that aims to create intelligent machines capable of performing"}\n{"response": " tasks that typically require human intelligence, such as learning,"}\n{"response": " reasoning, and problem-solving."}',
                            },
+                            "streaming_with_chunk_content": {
+                                "summary": "Streaming mode with chunk content (stream=true, include_chunk_content=true)",
+                                "description": "Multiple NDJSON lines when stream=True, include_references=True, and include_chunk_content=True. First line contains references with content arrays (one file may have multiple chunks), subsequent lines contain response chunks.",
+                                "value": '{"references": [{"reference_id": "1", "file_path": "/documents/ai_overview.pdf", "content": ["Artificial Intelligence (AI) represents a transformative field...", "AI systems can be categorized into narrow AI and general AI..."]}, {"reference_id": "2", "file_path": "/documents/ml_basics.txt", "content": ["Machine learning is a subset of AI that enables computers to learn..."]}]}\n{"response": "Artificial Intelligence (AI) is a branch of computer science"}\n{"response": " that aims to create intelligent machines capable of performing"}\n{"response": " tasks that typically require human intelligence."}',
+                            },
                            "streaming_without_references": {
                                "summary": "Streaming mode without references (stream=true)",
                                "description": "Multiple NDJSON lines when stream=True and include_references=False. Only response chunks are sent.",
@ -650,6 +666,30 @@ def create_query_routes(rag, api_key: Optional[str] = None, top_k: int = 60):
                references = result.get("data", {}).get("references", [])
                llm_response = result.get("llm_response", {})

+                # Enrich references with chunk content if requested
+                if request.include_references and request.include_chunk_content:
+                    data = result.get("data", {})
+                    chunks = data.get("chunks", [])
+                    # Create a mapping from reference_id to chunk content
+                    ref_id_to_content = {}
+                    for chunk in chunks:
+                        ref_id = chunk.get("reference_id", "")
+                        content = chunk.get("content", "")
+                        if ref_id and content:
+                            # Collect chunk content
+                            ref_id_to_content.setdefault(ref_id, []).append(content)
+
+                    # Add content to references
+                    enriched_references = []
+                    for ref in references:
+                        ref_copy = ref.copy()
+                        ref_id = ref.get("reference_id", "")
+                        if ref_id in ref_id_to_content:
+                            # Keep content as a list of chunks (one file may have multiple chunks)
+                            ref_copy["content"] = ref_id_to_content[ref_id]
+                        enriched_references.append(ref_copy)
+                    references = enriched_references
+
                if llm_response.get("is_streaming"):
                    # Streaming mode: send references first, then stream response chunks
                    if request.include_references:
--- a/lightrag/evaluation/eval_rag_quality.py
+++ b/lightrag/evaluation/eval_rag_quality.py
@ -170,14 +170,26 @@ class RAGEvaluator:
                first_ref = references[0]
                logger.debug("🔍 First Reference Keys: %s", list(first_ref.keys()))
                if "content" in first_ref:
-                    logger.debug(
-                        "🔍 Content Preview: %s...", first_ref["content"][:100]
-                    )
+                    content_preview = first_ref["content"]
+                    if isinstance(content_preview, list) and content_preview:
+                        logger.debug(
+                            "🔍 Content Preview (first chunk): %s...",
+                            content_preview[0][:100],
+                        )
+                    elif isinstance(content_preview, str):
+                        logger.debug("🔍 Content Preview: %s...", content_preview[:100])

            # Extract chunk content from enriched references
-            contexts = [
-                ref.get("content", "") for ref in references if ref.get("content")
-            ]
+            # Note: content is now a list of chunks per reference (one file may have multiple chunks)
+            contexts = []
+            for ref in references:
+                content = ref.get("content", [])
+                if isinstance(content, list):
+                    # Flatten the list: each chunk becomes a separate context
+                    contexts.extend(content)
+                elif isinstance(content, str):
+                    # Backward compatibility: if content is still a string (shouldn't happen)
+                    contexts.append(content)

            return {
                "answer": answer,