Add comprehensive documentation covering 6 modules: - 01-API-LAYER: Authentication, routing, SSE streaming - 02-SERVICE-LAYER: Dialog, Task, LLM service analysis - 03-RAG-ENGINE: Hybrid search, embedding, reranking - 04-AGENT-SYSTEM: Canvas engine, components, tools - 05-DOCUMENT-PROCESSING: Task executor, PDF parsing - 06-ALGORITHMS: BM25, fusion, RAPTOR Total 28 documentation files with code analysis, diagrams, and formulas.
16 KiB
16 KiB
Hybrid Search Algorithm
Tổng Quan
Hybrid Search kết hợp Vector Search (semantic) với BM25 (lexical) để đạt được kết quả tốt nhất.
Algorithm Flow
┌─────────────────────────────────────────────────────────────────────────┐
│ HYBRID SEARCH ALGORITHM │
└─────────────────────────────────────────────────────────────────────────┘
User Query
│
▼
┌─────────────────┐
│ Query Processing │
│ │
│ 1. Tokenize │
│ 2. TF-IDF │
│ 3. Synonyms │
└────────┬────────┘
│
┌──────────────┴──────────────┐
│ │
▼ ▼
┌─────────────────────┐ ┌─────────────────────┐
│ VECTOR SEARCH │ │ BM25 SEARCH │
│ │ │ │
│ 1. Embed query │ │ 1. Build query │
│ 2. ANN search │ │ 2. Full-text match │
│ 3. Cosine score │ │ 3. BM25 score │
│ │ │ │
│ Weight: 95% │ │ Weight: 5% │
└──────────┬──────────┘ └──────────┬──────────┘
│ │
└────────────┬───────────────┘
│
▼
┌─────────────────┐
│ SCORE FUSION │
│ │
│ final_score = │
│ 0.05 * bm25 + │
│ 0.95 * vector │
└────────┬────────┘
│
▼
┌─────────────────┐
│ THRESHOLD │
│ FILTERING │
│ │
│ score > 0.2 │
└────────┬────────┘
│
▼
┌─────────────────┐
│ TOP-K RESULTS │
│ │
│ Return top 6 │
│ chunks │
└─────────────────┘
Code Implementation
Main Search Function
# /rag/nlp/search.py
class Dealer:
"""Main search handler."""
def search(self, kb_ids, question, embd_mdl, tenant_ids,
highlight=True, top=1024, **kwargs):
"""
Execute hybrid search.
Args:
kb_ids: Knowledge base IDs
question: User query
embd_mdl: Embedding model
tenant_ids: Tenant IDs for access control
highlight: Enable highlighting
top: Initial candidate count
Returns:
Search results with scores
"""
# 1. Process query
query_tokens = self.qryr.question(question)
# 2. Generate query embedding
query_vector = embd_mdl.encode([question])[0]
# 3. Build Elasticsearch query
es_query = self._build_hybrid_query(
query_tokens,
query_vector,
kb_ids,
tenant_ids,
top
)
# 4. Execute search
results = self.es.search(
index=",".join([f"ragflow_{kb_id}" for kb_id in kb_ids]),
body=es_query
)
# 5. Process results
return self._process_results(results, highlight)
Query Building
def _build_hybrid_query(self, query_tokens, query_vector, kb_ids, tenant_ids, top):
"""
Build Elasticsearch hybrid query.
Combines:
- script_score for vector similarity
- bool query for BM25
"""
# Build BM25 query from tokens
bm25_query = self._build_bm25_query(query_tokens)
return {
"query": {
"script_score": {
"query": {
"bool": {
"must": bm25_query,
"filter": [
{"terms": {"kb_id": kb_ids}},
{"terms": {"tenant_id": tenant_ids}}
]
}
},
"script": {
"source": """
double vector_score = cosineSimilarity(
params.query_vector,
'q_1024_vec'
) + 1.0; // Shift to [0, 2]
return 0.05 * _score + 0.95 * vector_score;
""",
"params": {
"query_vector": query_vector.tolist()
}
}
}
},
"size": top,
"_source": ["content_with_weight", "docnm_kwd", "kb_id", "..."],
"highlight": {
"fields": {
"content_with_weight": {
"fragment_size": 120,
"number_of_fragments": 5
}
}
}
}
BM25 Query Construction
def _build_bm25_query(self, query_tokens):
"""
Build BM25 query with weighted terms.
Args:
query_tokens: Dict of token -> weight from qryr.question()
Returns:
Elasticsearch bool query
"""
should_clauses = []
for token, weight in query_tokens.items():
# Single term query with boost
should_clauses.append({
"match": {
"content_with_weight": {
"query": token,
"boost": weight
}
}
})
# Phrase query for compound terms (higher boost)
if len(token.split()) > 1:
should_clauses.append({
"match_phrase": {
"content_with_weight": {
"query": token,
"boost": weight * 1.5,
"slop": 2
}
}
})
return {
"bool": {
"should": should_clauses,
"minimum_should_match": "30%"
}
}
Query Processing
TF-IDF Weighting
# /rag/nlp/query.py
class RagQuery:
"""Query processor with TF-IDF weighting."""
def question(self, query: str) -> dict:
"""
Process query and return weighted tokens.
Returns:
Dict[token, weight] where weight is TF-IDF normalized
"""
# 1. Tokenize query
tokens = self.tokenize(query)
# 2. Calculate term frequencies
tf = Counter(tokens)
# 3. Apply TF-IDF weights
weighted = {}
for token, freq in tf.items():
# Log-normalized TF
tf_score = 1 + math.log(freq)
# IDF from corpus (pre-computed)
idf_score = self.idf.get(token, 1.0)
# Combined weight
weighted[token] = round(tf_score * idf_score, 2)
# 4. Normalize weights to [0, 1]
max_weight = max(weighted.values()) if weighted else 1
return {k: v / max_weight for k, v in weighted.items()}
Synonym Expansion
def expand_synonyms(self, tokens: dict) -> dict:
"""
Expand query with synonyms.
Example:
"machine learning" → ["machine learning", "ML", "AI"]
"""
expanded = dict(tokens)
for token in list(tokens.keys()):
synonyms = self.synonym_dict.get(token, [])
for syn in synonyms:
# Add synonym with reduced weight
expanded[syn] = tokens[token] * 0.8
return expanded
Score Fusion Formula
┌─────────────────────────────────────────────────────────────────────────┐
│ SCORE FUSION FORMULA │
└─────────────────────────────────────────────────────────────────────────┘
Given:
- BM25_Score: Full-text relevance score
- Vector_Score: Cosine similarity (shifted to [0, 2])
- α: vector_similarity_weight (default: 0.3 in config, 0.95 in ES)
Final Score Calculation:
In Elasticsearch script:
┌───────────────────────────────────────────────────────────────┐
│ final_score = 0.05 × _score + 0.95 × (cosine + 1.0) │
│ │
│ where: │
│ _score = BM25 score from bool query │
│ cosine = cosineSimilarity(query_vec, doc_vec) ∈ [-1, 1] │
│ cosine + 1.0 shifts to [0, 2] for positive scores │
└───────────────────────────────────────────────────────────────┘
Why 95:5 ratio?
- Semantic understanding is prioritized
- BM25 helps with exact keyword matches
- Prevents semantic drift while maintaining precision
Alternative weights (configurable):
┌───────────────────────────────────────────────────────────────┐
│ 70:30 - More emphasis on keywords (technical docs) │
│ 80:20 - Balanced (general use) │
│ 95:5 - Semantic-first (conversational queries) │
└───────────────────────────────────────────────────────────────┘
Threshold Filtering
def filter_by_threshold(results, threshold=0.2):
"""
Filter results by similarity threshold.
Args:
results: Search results with scores
threshold: Minimum score (default: 0.2)
Returns:
Filtered results above threshold
"""
# Normalize scores to [0, 1]
max_score = max(r["_score"] for r in results) if results else 1
normalized = [
{**r, "similarity": r["_score"] / max_score}
for r in results
]
# Filter by threshold
return [r for r in normalized if r["similarity"] >= threshold]
Result Processing
def _process_results(self, es_response, highlight=True):
"""
Process Elasticsearch response into structured results.
Returns:
{
"chunks": [
{
"chunk_id": "...",
"content": "...",
"similarity": 0.85,
"vector_similarity": 0.90,
"term_similarity": 0.60,
"docnm_kwd": "Document Name",
"positions": [[x0, x1, top, bottom]],
"highlight": "<em>matched</em> text..."
}
],
"doc_aggs": [
{"doc_id": "...", "doc_name": "...", "count": 3}
],
"total": 100
}
"""
chunks = []
doc_counts = Counter()
for hit in es_response["hits"]["hits"]:
source = hit["_source"]
chunk = {
"chunk_id": hit["_id"],
"content": source["content_with_weight"],
"similarity": hit["_score"] / max_score,
"docnm_kwd": source.get("docnm_kwd", ""),
"kb_id": source["kb_id"],
"doc_id": source["doc_id"],
"positions": self._parse_positions(source.get("position_int", []))
}
# Add highlight if available
if highlight and "highlight" in hit:
chunk["highlight"] = hit["highlight"]["content_with_weight"][0]
chunks.append(chunk)
doc_counts[source["doc_id"]] += 1
# Build document aggregations
doc_aggs = [
{"doc_id": doc_id, "count": count}
for doc_id, count in doc_counts.most_common()
]
return {
"chunks": chunks,
"doc_aggs": doc_aggs,
"total": es_response["hits"]["total"]["value"]
}
Elasticsearch Index Mapping
{
"mappings": {
"properties": {
"content_with_weight": {
"type": "text",
"analyzer": "ik_max_word",
"search_analyzer": "ik_smart"
},
"q_1024_vec": {
"type": "dense_vector",
"dims": 1024,
"index": true,
"similarity": "cosine"
},
"docnm_kwd": {
"type": "keyword"
},
"kb_id": {
"type": "keyword"
},
"doc_id": {
"type": "keyword"
},
"position_int": {
"type": "integer"
}
}
},
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0,
"analysis": {
"analyzer": {
"ik_smart": {"type": "ik_smart"},
"ik_max_word": {"type": "ik_max_word"}
}
}
}
}
Performance Optimization
1. Pre-filtering
# Filter by tenant/KB before scoring
"filter": [
{"terms": {"kb_id": kb_ids}},
{"terms": {"tenant_id": tenant_ids}}
]
2. Top-K Limiting
# Get more candidates, then rerank
initial_candidates = 1024 # top
final_results = 6 # top_n after reranking
3. Approximate Nearest Neighbor (ANN)
# Elasticsearch uses HNSW for fast vector search
"index": true,
"similarity": "cosine"
4. Result Caching
@cache_result(ttl=300)
def search(self, kb_ids, question, ...):
# Cache results for 5 minutes
pass
Configuration
# Default search configuration
SEARCH_CONFIG = {
"vector_weight": 0.95, # Vector score weight
"bm25_weight": 0.05, # BM25 score weight
"similarity_threshold": 0.2, # Minimum similarity
"top_k": 1024, # Initial candidates
"top_n": 6, # Final results
"minimum_should_match": "30%" # BM25 match requirement
}
Related Files
/rag/nlp/search.py- Main search implementation/rag/nlp/query.py- Query processing/rag/utils/es_conn.py- Elasticsearch connection