Embedding Generation
Overview
Embedding generation converts text into dense vectors so that semantic search can be performed over chunks.
File Location
/rag/llm/embedding_model.py
Supported Embedding Models
| Provider | Class | Max Tokens | Dimensions |
|---|---|---|---|
| OpenAI | OpenAIEmbed | 8191 | 1536/3072 |
| Azure OpenAI | AzureEmbed | Custom | 1536/3072 |
| Builtin | BuiltinEmbed | 8000 | Varies |
| Qwen | QWenEmbed | 2048 | 1024 |
| ZHIPU-AI | ZhipuEmbed | 512-3072 | 1024 |
| Jina | JinaEmbed | 8196 | 768/1024 |
| Mistral | MistralEmbed | 8196 | 1024 |
| Voyage AI | VoyageEmbed | Custom | 1024 |
| Cohere | CoHereEmbed | Custom | 1024 |
| NVIDIA | NvidiaEmbed | Custom | 1024 |
Architecture
┌─────────────────────────────────────────────────────────────────┐
│ TEXT INPUT │
│ ["chunk1", "chunk2", "chunk3", ...] │
└──────────────────────────┬──────────────────────────────────────┘
│
▼
┌─────────────────────────────────────────────────────────────────┐
│ TEXT PREPROCESSING │
│ ┌─────────────────────────────────────────────────────────┐ │
│ │ 1. Token counting │ │
│ │ 2. Truncation to max_tokens │ │
│ │ 3. Batch splitting (16 texts/batch) │ │
│ └─────────────────────────────────────────────────────────┘ │
└──────────────────────────┬──────────────────────────────────────┘
│
▼
┌─────────────────────────────────────────────────────────────────┐
│ EMBEDDING MODEL │
│ ┌─────────────────────────────────────────────────────────┐ │
│ │ OpenAI / Jina / Cohere / Local Model │ │
│ │ → API call with batch │ │
│ │ → Return vectors + token count │ │
│ └─────────────────────────────────────────────────────────┘ │
└──────────────────────────┬──────────────────────────────────────┘
│
▼
┌─────────────────────────────────────────────────────────────────┐
│ OUTPUT VECTORS │
│ np.ndarray (N x D) where N=texts, D=dimensions │
│ + total_token_count │
└─────────────────────────────────────────────────────────────────┘
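A minimal end-to-end sketch of this pipeline, assuming the OpenAIEmbed class shown later on this page and a hypothetical list of chunk texts:

```python
import numpy as np

# Hypothetical input chunks (any list of strings works)
chunks = ["RAGFlow parses PDFs into chunks.", "Each chunk is embedded for retrieval."]

# Instantiate a concrete model (OpenAIEmbed is defined below)
embd_mdl = OpenAIEmbed("sk-...", "text-embedding-ada-002")

# encode() handles truncation and batching internally
vectors, token_count = embd_mdl.encode(chunks)

assert isinstance(vectors, np.ndarray)
print(vectors.shape)   # (N, D), e.g. (2, 1536)
print(token_count)     # total tokens consumed for the batch
```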
Base Implementation
```python
from abc import ABC

import numpy as np


class Base(ABC):
    def __init__(self, key, model_name, **kwargs):
        """Abstract base for all embedding models."""
        pass

    def encode(self, texts: list) -> tuple[np.ndarray, int]:
        """
        Encode texts to embeddings.

        Args:
            texts: List of strings to embed
        Returns:
            (embeddings, token_count): NumPy array and total tokens used
        """
        raise NotImplementedError()

    def encode_queries(self, text: str) -> tuple[np.ndarray, int]:
        """Encode a single query text."""
        raise NotImplementedError()
```
OpenAI Embedding
```python
import numpy as np
from openai import OpenAI


class OpenAIEmbed(Base):
    def __init__(self, key, model_name, **kwargs):
        self.client = OpenAI(
            api_key=key,
            base_url=kwargs.get("base_url", "https://api.openai.com/v1"),
        )
        self.model_name = model_name

    def encode(self, texts: list):
        batch_size = 16  # OpenAI batch limit
        texts = [truncate(t, 8191) for t in texts]  # per-input token limit
        ress = []
        total_tokens = 0
        for i in range(0, len(texts), batch_size):
            res = self.client.embeddings.create(
                input=texts[i : i + batch_size],
                model=self.model_name,
                encoding_format="float",
                extra_body={"drop_params": True},
            )
            ress.extend([d.embedding for d in res.data])
            total_tokens += res.usage.total_tokens
        return np.array(ress), total_tokens
```
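The excerpt omits encode_queries; a minimal sketch of that method, assuming a query is embedded by reusing the batch path above:

```python
    # Added to OpenAIEmbed for illustration; not shown in the excerpt above
    def encode_queries(self, text: str):
        embeddings, token_count = self.encode([text])
        return embeddings[0], token_count
```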
Builtin Embedding (HuggingFace TEI)
```python
import os
import threading

# HuggingFaceEmbed, embedding_cfg, and settings come from the surrounding module


class BuiltinEmbed(Base):
    _model = None
    _model_name = ""
    _model_lock = threading.Lock()  # Thread-safe initialization

    MAX_TOKENS = {
        "BAAI/bge-large-zh-v1.5": 500,
        "BAAI/bge-m3": 8000,
        "maidalun1020/bce-embedding-base_v1": 500,
        "jina-embeddings-v3": 30000,
    }

    def __init__(self, key, model_name, **kwargs):
        if not BuiltinEmbed._model and "tei-" in os.getenv("COMPOSE_PROFILES", ""):
            with BuiltinEmbed._model_lock:
                # Re-check under the lock so only one thread loads the model
                if not BuiltinEmbed._model:
                    # Lazy-load the HuggingFace TEI-backed model
                    BuiltinEmbed._model = HuggingFaceEmbed(
                        embedding_cfg["api_key"],
                        settings.EMBEDDING_MDL,
                        base_url=embedding_cfg["base_url"],
                    )
        self._model = BuiltinEmbed._model

    def encode(self, texts: list):
        return self._model.encode(texts)
```
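A minimal sketch of how the MAX_TOKENS table can gate truncation before encoding; the helper and its 512-token fallback are assumptions for illustration, not taken from the codebase:

```python
def truncate_for_builtin(texts: list[str], model_name: str) -> list[str]:
    # Fall back to a conservative 512-token budget for unknown models (assumption)
    max_tokens = BuiltinEmbed.MAX_TOKENS.get(model_name, 512)
    return [truncate(t, max_tokens) for t in texts]

texts = truncate_for_builtin(["a very long chunk of text ..."], "BAAI/bge-m3")
```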
Qwen Embedding with Retry
```python
import time

import dashscope
import numpy as np


class QWenEmbed(Base):
    # self.key and self.model_name are set in __init__ (omitted in this excerpt)
    def encode(self, texts: list):
        batch_size = 4
        texts = [truncate(t, 2048) for t in texts]
        res = []
        total_tokens = 0
        for i in range(0, len(texts), batch_size):
            retry_max = 5
            resp = dashscope.TextEmbedding.call(
                model=self.model_name,
                input=texts[i : i + batch_size],
                api_key=self.key,
                text_type="document",
            )
            # Retry on empty responses, waiting 10s between attempts
            while resp["output"] is None and retry_max > 0:
                time.sleep(10)
                resp = dashscope.TextEmbedding.call(
                    model=self.model_name,
                    input=texts[i : i + batch_size],
                    api_key=self.key,
                    text_type="document",
                )
                retry_max -= 1
            if resp["output"] is None:
                raise LookupError("Retry_max reached, embedding response is empty")
            res.extend([d["embedding"] for d in resp["output"]["embeddings"]])
            total_tokens += resp["usage"]["total_tokens"]
        return np.array(res), total_tokens
```
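The same pattern generalizes to any flaky embedding endpoint; a minimal sketch with exponential backoff (the helper and its parameters are illustrative, not from the codebase):

```python
import time

def call_with_retry(fn, retries: int = 5, base_delay: float = 2.0):
    """Call fn() until it returns a non-None result, backing off exponentially."""
    for attempt in range(retries):
        result = fn()
        if result is not None:
            return result
        time.sleep(base_delay * (2 ** attempt))
    raise LookupError(f"No response after {retries} attempts")
```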
Embedding Workflow in RAG
```python
# In task_executor.py - build_chunks()
async def embedding(chunks, embd_mdl, parser_config):
    """Generate embeddings for chunks."""
    # Prepare the text that will be embedded for each chunk
    texts_to_embed = []
    for chunk in chunks:
        # Option 1: title + content weighted embedding
        if parser_config.get("title_weight", 0) > 0:
            text = chunk["title"] + " " + chunk["content"]
        # Option 2: question-based embedding
        elif parser_config.get("question_kwd"):
            text = chunk["question_kwd"]
        else:
            text = chunk["content_with_weight"]
        texts_to_embed.append(text)

    # Batch embedding
    batch_size = 16
    total_tokens = 0
    for i in range(0, len(texts_to_embed), batch_size):
        batch = texts_to_embed[i : i + batch_size]
        embeddings, tokens = embd_mdl.encode(batch)
        total_tokens += tokens
        # Store vectors in chunks under q_{dimension}_vec
        for j, emb in enumerate(embeddings):
            chunk_idx = i + j
            chunks[chunk_idx][f"q_{len(emb)}_vec"] = emb.tolist()
    return total_tokens
```
Title-Content Weighted Embedding
```python
def weighted_embedding(title_emb, content_emb, title_weight=0.1):
    """
    Combine title and content embeddings.

    Formula:
        weighted_vec = title_weight * title_emb + (1 - title_weight) * content_emb
    """
    return title_weight * title_emb + (1 - title_weight) * content_emb
```
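A worked example with title_weight = 0.1: each output component is 0.1 of the title vector plus 0.9 of the content vector.

```python
import numpy as np

title_emb = np.array([1.0, 0.0, 0.0])
content_emb = np.array([0.0, 1.0, 0.0])

vec = weighted_embedding(title_emb, content_emb, title_weight=0.1)
print(vec)  # [0.1 0.9 0. ]
```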
Vector Storage
Elasticsearch mapping for a 1024-dimension vector field:

```json
{
  "q_1024_vec": {
    "type": "dense_vector",
    "dims": 1024,
    "index": true,
    "similarity": "cosine"
  }
}
```

Vector fields follow the naming convention q_{dimension}_vec, for example q_768_vec, q_1024_vec, q_1536_vec, and q_3072_vec.
Performance Optimization
Batch Processing
```python
# OpenAI: 16 texts per batch
# Qwen:   4 texts per batch
# Others: 8-16 texts per batch
```
Text Truncation
```python
def truncate(text: str, max_tokens: int) -> str:
    """Truncate text to fit within a token limit."""
    token_count = num_tokens_from_string(text)
    if token_count <= max_tokens:
        return text
    # Cut proportionally by characters, with a 5% safety margin
    target_len = int(len(text) * max_tokens / token_count * 0.95)
    return text[:target_len]
```
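For example, a 1,000-character text measured at 2,000 tokens against a 500-token limit is cut to int(1000 × 500 / 2000 × 0.95) = 237 characters; the 0.95 margin absorbs the fact that character-proportional cutting only approximates token boundaries.

```python
# 1000 chars, 2000 tokens, 500-token limit
target_len = int(1000 * 500 / 2000 * 0.95)
print(target_len)  # 237
```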
Caching
```python
# Model instances are not cached between requests,
# but configuration is cached in the database.
# Builtin models use thread-safe lazy initialization (see BuiltinEmbed above)
```
Configuration
```python
import os

EMBEDDING_CFG = {
    "factory": "OpenAI",
    "api_key": os.getenv("OPENAI_API_KEY"),
    "base_url": "https://api.openai.com/v1",
    "model": "text-embedding-ada-002",
}

# Parser config controlling embedding behavior
PARSER_CONFIG = {
    "title_weight": 0.1,    # Weight for the title embedding
    "question_kwd": False,  # Use generated questions for embedding
}
```
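A minimal sketch of turning this config into a model instance; the EmbeddingModel registry is an assumption about how the factory lookup might be wired, not a confirmed API:

```python
# Hypothetical registry mapping factory names to classes
EmbeddingModel = {
    "OpenAI": OpenAIEmbed,
    "Qwen": QWenEmbed,
    "Builtin": BuiltinEmbed,
}

cfg = EMBEDDING_CFG
embd_mdl = EmbeddingModel[cfg["factory"]](
    cfg["api_key"], cfg["model"], base_url=cfg["base_url"]
)
```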
Related Files
- `/rag/llm/embedding_model.py` - All embedding implementations
- `/rag/svr/task_executor.py` - Embedding generation in the pipeline
- `/rag/utils/es_conn.py` - Vector storage in Elasticsearch