Add comprehensive documentation covering 6 modules:
- 01-API-LAYER: Authentication, routing, SSE streaming
- 02-SERVICE-LAYER: Dialog, Task, LLM service analysis
- 03-RAG-ENGINE: Hybrid search, embedding, reranking
- 04-AGENT-SYSTEM: Canvas engine, components, tools
- 05-DOCUMENT-PROCESSING: Task executor, PDF parsing
- 06-ALGORITHMS: BM25, fusion, RAPTOR

Total: 28 documentation files with code analysis, diagrams, and formulas.
11 KiB
11 KiB
RAPTOR Algorithm
Tổng Quan
RAPTOR (Recursive Abstractive Processing for Tree-Organized Retrieval) xây dựng các bản tóm tắt phân cấp (hierarchical summaries) để cải thiện chất lượng truy xuất (retrieval).
File Location
/rag/raptor.py
Algorithm Overview
┌─────────────────────────────────────────────────────────────────┐
│ RAPTOR ALGORITHM │
└─────────────────────────────────────────────────────────────────┘
Original Chunks
[C1, C2, C3, C4, C5, C6, ...]
│
▼
┌─────────────────────────────────────────────────────────────────┐
│ 1. EMBEDDING │
│ Generate embeddings for all chunks │
└──────────────────────────┬──────────────────────────────────────┘
│
▼
┌─────────────────────────────────────────────────────────────────┐
│ 2. UMAP DIMENSIONALITY REDUCTION │
│ Reduce to ~12 dimensions for clustering │
└──────────────────────────┬──────────────────────────────────────┘
│
▼
┌─────────────────────────────────────────────────────────────────┐
│ 3. GAUSSIAN MIXTURE MODEL (GMM) │
│ Find optimal clusters using BIC │
└──────────────────────────┬──────────────────────────────────────┘
│
▼
┌─────────────────────────────────────────────────────────────────┐
│ 4. HIERARCHICAL SUMMARIZATION │
│ LLM summarizes each cluster │
│ Summaries become new "chunks" │
└──────────────────────────┬──────────────────────────────────────┘
│
▼
┌─────────────────────────────────────────────────────────────────┐
│ 5. RECURSIVE ITERATION │
│ Repeat steps 2-4 on summaries │
│ Until single summary or max depth │
└─────────────────────────────────────────────────────────────────┘
Result: Multi-level tree of summaries
Level 0: Original chunks
Level 1: Cluster summaries
Level 2: Meta-summaries
...
Implementation
class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval:
    """RAPTOR: builds a hierarchical tree of LLM summaries over text chunks.

    Holds the models and limits shared by the clustering and
    summarization passes.
    """

    def __init__(self, max_cluster, llm_model, embd_model, max_token=512):
        # Upper bound on the number of clusters tried per tree level.
        self._max_cluster = max_cluster
        # Chat model used to summarize each cluster.
        self._llm_model = llm_model
        # Embedding model used for chunks and generated summaries.
        self._embd_model = embd_model
        # Token budget for each generated summary.
        self._max_token = max_token
        # Prompt template; {cluster_content} receives the joined chunk texts.
        self._prompt = """
Summarize the following text, focusing on key information:
{cluster_content}
Summary:
"""
async def build_tree(self, chunks: list[str]) -> list[tuple[str, np.ndarray]]:
    """Build the RAPTOR tree: original chunks plus every level of summaries.

    Args:
        chunks: Raw text chunks (tree level 0).

    Returns:
        (text, embedding) pairs for the originals followed by all
        generated summaries, in production order.
    """
    vectors = await self._embedding_encode(chunks)
    tree: list[tuple[str, np.ndarray]] = list(zip(chunks, vectors))

    # Each pass clusters the previous level and appends its summaries;
    # stop once a level collapses to one node or produces nothing new.
    level = list(tree)
    while len(level) > 1:
        next_level = await self._cluster_and_summarize(level)
        if not next_level:
            break
        tree.extend(next_level)
        level = next_level
    return tree
UMAP Dimensionality Reduction
def _reduce_dimensions(self, embeddings: np.ndarray) -> np.ndarray:
    """Project embeddings down to at most 12 dims with UMAP (cosine metric).

    GMM clustering works poorly in the raw high-dimensional embedding
    space, so embeddings are reduced first.

    NOTE(review): assumes len(embeddings) >= 3, otherwise n_components
    would be <= 0 — callers appear to guard this upstream; confirm.
    """
    import umap

    sample_count = len(embeddings)
    # Neighborhood size grows sublinearly with the sample count,
    # floored at 2 (UMAP's minimum).
    neighborhood = max(2, int((sample_count - 1) ** 0.8))
    # Target at most 12 components while staying below the sample count.
    target_dims = min(12, sample_count - 2)

    projector = umap.UMAP(
        n_neighbors=neighborhood,
        n_components=target_dims,
        metric="cosine",
        random_state=42,  # reproducible projections
    )
    return projector.fit_transform(embeddings)
Optimal Cluster Selection (BIC)
def _get_optimal_clusters(self, embeddings: np.ndarray) -> int:
    """Pick the GMM component count that minimizes BIC.

    BIC = k*ln(n) - 2*ln(L); lower is better (penalizes model size).

    Args:
        embeddings: (n_samples, dims) reduced embedding matrix.

    Returns:
        Cluster count in [1, min(self._max_cluster, n_samples)].
    """
    from sklearn.mixture import GaussianMixture

    max_clusters = min(self._max_cluster, len(embeddings))
    # Guard: with <= 1 candidate there is nothing to score. The original
    # np.arange(1, max_clusters) was empty here and np.argmin raised.
    if max_clusters <= 1:
        return 1

    # Inclusive upper bound: the half-open arange(1, max_clusters)
    # previously skipped the largest allowed cluster count (off-by-one).
    candidates = np.arange(1, max_clusters + 1)
    bics = []
    for n in candidates:
        gm = GaussianMixture(
            n_components=n,
            random_state=42,
            covariance_type='full'
        )
        gm.fit(embeddings)
        bics.append(gm.bic(embeddings))
    # Lowest BIC wins.
    return int(candidates[np.argmin(bics)])
Clustering with GMM
def _cluster_chunks(self, chunks: list[tuple], embeddings: np.ndarray) -> list[list[int]]:
    """Soft-cluster chunks with a GMM over UMAP-reduced embeddings.

    A chunk may belong to several clusters: any cluster whose posterior
    probability exceeds 0.1 claims it. (``chunks`` itself is unused here;
    only its parallel ``embeddings`` matrix drives the clustering.)

    Returns:
        Non-empty clusters, each a list of indices into ``chunks``.
    """
    from sklearn.mixture import GaussianMixture

    # Cluster in the reduced space with a BIC-selected component count.
    reduced = self._reduce_dimensions(embeddings)
    k = self._get_optimal_clusters(reduced)

    model = GaussianMixture(n_components=k, random_state=42)
    model.fit(reduced)
    posteriors = model.predict_proba(reduced)

    # Soft assignment: posterior > 0.1 places the chunk in that cluster.
    groups: list[list[int]] = [[] for _ in range(k)]
    for idx, row in enumerate(posteriors):
        for cluster_id, p in enumerate(row):
            if p > 0.1:  # Threshold
                groups[cluster_id].append(idx)

    # Drop clusters that claimed no chunks.
    return [g for g in groups if g]
LLM Summarization
async def _summarize_cluster(self, chunk_indices: list[int],
                             chunks: list[tuple]) -> tuple[str, np.ndarray]:
    """Summarize one cluster's chunks with the LLM and embed the summary.

    Args:
        chunk_indices: Indices into ``chunks`` belonging to this cluster.
        chunks: (text, embedding) pairs of the current tree level.

    Returns:
        (summary_text, summary_embedding).
    """
    # Concatenate the member texts, blank-line separated.
    combined = "\n\n".join(chunks[i][0] for i in chunk_indices)

    # Rough length cap: if the token count exceeds 4x the summary budget,
    # cut to that many *characters* (coarse chars-per-token heuristic).
    limit = self._max_token * 4
    if num_tokens_from_string(combined) > limit:
        combined = combined[:limit]

    # Ask the LLM for the cluster summary.
    summary = await self._chat(
        "You're a helpful assistant that summarizes text.",
        [{"role": "user", "content": self._prompt.format(cluster_content=combined)}],
        {"max_tokens": max(self._max_token, 512)}
    )

    # Embed the summary so it can join the next tree level.
    vectors = await self._embedding_encode([summary])
    return summary, vectors[0]
Main Loop
async def _cluster_and_summarize(self, chunks: list[tuple]) -> list[tuple]:
"""
One level of clustering and summarization.
"""
if len(chunks) <= 2:
return []
# Extract embeddings
embeddings = np.array([c[1] for c in chunks])
# Cluster
clusters = self._cluster_chunks(chunks, embeddings)
if len(clusters) <= 1:
return []
# Summarize each cluster
summaries = []
for cluster_indices in clusters:
if len(cluster_indices) < 2:
continue
summary, emb = await self._summarize_cluster(cluster_indices, chunks)
summaries.append((summary, emb))
return summaries
Tree Structure Output
# Final output structure (illustrative — embedding_* names are placeholders):
tree = [
    # Level 0: Original chunks
    ("Original chunk 1 content...", embedding_1),
    ("Original chunk 2 content...", embedding_2),
    ...
    # Level 1: Cluster summaries
    ("Summary of chunks 1-3...", summary_emb_1),
    ("Summary of chunks 4-6...", summary_emb_2),
    ...
    # Level 2: Meta-summaries
    ("High-level summary of summaries...", meta_emb_1),
    ...
]
# All entries (originals and summaries) are indexed in the vector store,
# so a search can retrieve from any abstraction level of the tree.
Configuration
# RAPTOR configuration (defaults used by the algorithm)
{
    "max_cluster": 50,  # Maximum clusters tried per level (BIC search bound)
    "max_token": 512,   # Token budget for each generated summary
    "threshold": 0.1,   # GMM posterior-probability threshold for soft assignment
}
# In parser_config (per-knowledge-base override):
{
    "raptor": {
        "enabled": True,     # Turn hierarchical summarization on/off
        "max_cluster": 30,   # Override of the cluster bound above
        "max_depth": 3       # Maximum recursion depth of the tree
    }
}
Benefits
- Multi-level Retrieval: Search across different abstraction levels
- Improved Recall: Summaries capture themes missed by individual chunks
- Scalability: Reduces search space through hierarchy
- Context: Summaries provide broader context for questions
Related Files
/rag/raptor.py — RAPTOR implementation
/rag/svr/task_executor.py — RAPTOR task handling
/api/db/services/task_service.py — Task types