# Query Processing

## Overview

Query processing converts user queries into optimized search queries through term weighting and query expansion.

## File Location

```
/rag/nlp/query.py
/rag/nlp/term_weight.py
```

## Query Processing Flow

```
┌─────────────────────────────────────────────────────────────────┐
│                           USER QUERY                            │
│                   "What is machine learning?"                   │
└──────────────────────────┬──────────────────────────────────────┘
                           │
                           ▼
┌─────────────────────────────────────────────────────────────────┐
│                       QUERY NORMALIZATION                       │
│  ┌───────────────────────────────────────────────────────────┐  │
│  │ 1. Lowercase                                              │  │
│  │ 2. Traditional → Simplified Chinese                       │  │
│  │ 3. Full-width → Half-width characters                     │  │
│  │ 4. Remove question words (what, how, why)                 │  │
│  └───────────────────────────────────────────────────────────┘  │
└──────────────────────────┬──────────────────────────────────────┘
                           │
                           ▼
┌─────────────────────────────────────────────────────────────────┐
│                          TOKENIZATION                           │
│  ┌───────────────────────────────────────────────────────────┐  │
│  │ RAGFlowTokenizer:                                         │  │
│  │ - Fine-grained tokenization                               │  │
│  │ - Semantic tokenization                                   │  │
│  │ - Multi-language support                                  │  │
│  └───────────────────────────────────────────────────────────┘  │
└──────────────────────────┬──────────────────────────────────────┘
                           │
                           ▼
┌─────────────────────────────────────────────────────────────────┐
│                        TF-IDF WEIGHTING                         │
│  ┌───────────────────────────────────────────────────────────┐  │
│  │ weight = (0.3 × IDF1 + 0.7 × IDF2) × NER × PoS            │  │
│  └───────────────────────────────────────────────────────────┘  │
└──────────────────────────┬──────────────────────────────────────┘
                           │
                           ▼
┌─────────────────────────────────────────────────────────────────┐
│                         QUERY EXPANSION                         │
│  ┌───────────────────────────────────────────────────────────┐  │
│  │ 1. Synonym expansion (0.25x weight)                       │  │
│  │ 2. Phrase queries (2x boost for bigrams)                  │  │
│  │ 3. Field boosting                                         │  │
│  └───────────────────────────────────────────────────────────┘  │
└──────────────────────────┬──────────────────────────────────────┘
                           │
                           ▼
┌─────────────────────────────────────────────────────────────────┐
│                       ELASTICSEARCH QUERY                       │
│         (term^weight synonym) AND "bigram phrase"^boost         │
└─────────────────────────────────────────────────────────────────┘
```

## Query Normalization

```python
import re

from rag.nlp import rag_tokenizer  # provides tradi2simp, strQ2B, tokenize


class FulltextQueryer:
    @staticmethod
    def add_space_between_eng_zh(txt):
        """Add spaces between English and Chinese characters."""
        # "hello你好" → "hello 你好"
        txt = re.sub(r'([a-zA-Z])([\u4e00-\u9fa5])', r'\1 \2', txt)
        # "你好hello" → "你好 hello"
        return re.sub(r'([\u4e00-\u9fa5])([a-zA-Z])', r'\1 \2', txt)

    @staticmethod
    def rmWWW(txt):
        """Remove question words."""
        # Word boundaries keep words like "show" or "whatever" intact
        txt = re.sub(r"\b(what|how|why|when|where|who)\b", " ", txt)
        for w in ["什么", "如何", "为什么", "哪里", "谁"]:
            txt = txt.replace(w, "")
        return txt

    def question(self, txt, tbl="qa", min_match=0.6):
        """Process query text."""
        # Normalize: lowercase, full-width → half-width,
        # traditional → simplified, strip punctuation
        txt = self.add_space_between_eng_zh(txt)
        txt = re.sub(r"[ :|\r\n\t,,。??/`!!&^%%()\[\]{}<>]+", " ",
                     rag_tokenizer.tradi2simp(
                         rag_tokenizer.strQ2B(txt.lower())
                     )).strip()

        # Remove question words
        txt = self.rmWWW(txt)

        # Tokenize
        tks = rag_tokenizer.tokenize(txt)

        # Weight tokens (term_weight.Dealer, see TF-IDF Weighting below)
        tks_w = self.tw.weights(tks)

        return self._build_query(tks_w, min_match)
```

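The helpers `strQ2B` (full-width → half-width) and `tradi2simp` (traditional → simplified) come from `rag_tokenizer`. For reference, a minimal sketch of what the full-width → half-width step does, assuming the standard Unicode offset mapping (`str_q2b` is an illustrative name, not the project's function):

```python
def str_q2b(text: str) -> str:
    """Illustrative full-width → half-width conversion.

    Full-width forms (U+FF01..U+FF5E) sit at a fixed offset of 0xFEE0
    above their ASCII counterparts; the ideographic space (U+3000)
    maps to a plain space.
    """
    out = []
    for ch in text:
        code = ord(ch)
        if code == 0x3000:               # ideographic space → space
            code = 0x20
        elif 0xFF01 <= code <= 0xFF5E:   # full-width → half-width
            code -= 0xFEE0
        out.append(chr(code))
    return "".join(out)


print(str_q2b("ＷＨＡＴ　ｉｓ　ＭＬ？"))  # → "WHAT is ML?"
```
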
## TF-IDF Weighting

```python
import math

import numpy as np


class Dealer:
    def weights(self, tks, preprocess=True):
        """
        Calculate TF-IDF weights for tokens.

        Formula:
            IDF(term) = log10(10 + (N - df(term) + 0.5) / (df(term) + 0.5))
            Weight = (0.3 × IDF1 + 0.7 × IDF2) × NER × PoS

        Where:
            IDF1: based on term frequency
            IDF2: based on document frequency
            NER:  Named Entity Recognition weight
            PoS:  Part-of-Speech tag weight
        """

        def idf(s, N):
            return math.log10(10 + ((N - s + 0.5) / (s + 0.5)))

        # freq(), df(), ner(), postag() are lookup helpers defined
        # elsewhere in the module

        # IDF1: based on term frequency
        idf1 = np.array([idf(freq(t), 10000000) for t in tks])

        # IDF2: based on document frequency
        idf2 = np.array([idf(df(t), 1000000000) for t in tks])

        # Composite weight per token
        wts = (0.3 * idf1 + 0.7 * idf2) * np.array([
            ner(t) * postag(t) for t in tks
        ])
        tw = list(zip(tks, wts))

        # Normalize so the weights sum to 1
        S = np.sum([s for _, s in tw])
        return [(t, s / S) for t, s in tw]
```

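To get a feel for the formula, the document-frequency variant (N = 10⁹) can be evaluated directly; rare terms land near 7 while very common terms fall near 1:

```python
import math


def idf(s, N):
    return math.log10(10 + ((N - s + 0.5) / (s + 0.5)))


N = 1_000_000_000                     # corpus size used for IDF2 above
print(round(idf(100, N), 2))          # rare term (df = 100)   → 7.0
print(round(idf(100_000_000, N), 2))  # common term (df = 1e8) → 1.28
```
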
## NER and PoS Weights

```python
# Named Entity Recognition weights
NER_WEIGHTS = {
    "toxic": 2,    # Toxic/sensitive words
    "func": 1,     # Functional words
    "corp": 3,     # Corporation names
    "loca": 3,     # Location names
    "sch": 3,      # School names
    "stock": 3,    # Stock symbols
    "firstnm": 1,  # First names
}

# Part-of-Speech weights
POS_WEIGHTS = {
    "r": 0.3,   # Pronoun
    "c": 0.3,   # Conjunction
    "d": 0.3,   # Adverb
    "ns": 3,    # Location noun
    "nt": 3,    # Organization noun
    "n": 2,     # Common noun
}


def ner(token):
    """Get NER weight for token."""
    # NER_DICT maps entity type → set of known terms (loaded elsewhere)
    for entity_type, weight in NER_WEIGHTS.items():
        if token in NER_DICT.get(entity_type, set()):
            return weight
    return 1


def postag(token):
    """Get PoS weight for token."""
    pos = get_pos_tag(token)  # PoS tagger lookup, defined elsewhere
    return POS_WEIGHTS.get(pos, 1)
```

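Combined with the tables above, these multipliers separate content-bearing tokens from filler. A quick calculation with an illustrative blended IDF of 4.3:

```python
idf_mix = 0.3 * 5.0 + 0.7 * 4.0  # 0.3 × IDF1 + 0.7 × IDF2 = 4.3 (illustrative)

# Location noun that is also a known location entity: NER=3, PoS=3
print(idf_mix * 3 * 3)    # ≈ 38.7 — strongly boosted

# Adverb with no entity match: NER=1, PoS=0.3
print(idf_mix * 1 * 0.3)  # ≈ 1.29 — heavily damped
```
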
## Query Expansion

```python
def _build_query(self, tks_w, min_match=0.6):
    """
    Build expanded Elasticsearch query.

    Expansion strategies:
    1. Synonym expansion with 0.25x weight
    2. Bigram phrase queries with 2x boost
    3. Field boosting
    """
    # Sort by weight, descending
    tks_w = sorted(tks_w, key=lambda x: x[1] * -1)

    # Look up synonyms for the top-weighted terms
    # (self.syn: SynonymLookup, see Synonym Expansion below)
    syns = [self.syn.lookup(tk) for tk, _ in tks_w[:256]]

    q = []
    for (tk, w), syn in zip(tks_w[:256], syns):
        # Group term with its synonyms; synonyms get 0.25x the term weight
        if syn:
            syn = " ".join(f"{s}^{w * 0.25:.4f}" for s in syn.split())
            q.append(f"({tk}^{w:.4f} {syn})")
        else:
            q.append(f"{tk}^{w:.4f}")

    # Add phrase queries (bigrams) with 2x boost
    for i in range(1, len(tks_w)):
        left, right = tks_w[i - 1][0], tks_w[i][0]
        weight = max(tks_w[i - 1][1], tks_w[i][1]) * 2
        q.append(f'"{left} {right}"^{weight:.4f}')

    query = " ".join(q)

    # Build match expression with minimum_should_match
    return MatchTextExpr(
        query,
        fields=self.query_fields,
        min_match=f"{int(min_match * 100)}%"
    )
```

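A self-contained sketch of the same expansion logic, stripped of the class and the `MatchTextExpr` wrapper, shows the resulting query string (the weights and synonym table are made-up inputs):

```python
def build_query_string(tks_w, syns=None):
    """Minimal standalone version of the expansion above."""
    syns = syns or {}
    tks_w = sorted(tks_w, key=lambda x: -x[1])

    parts = []
    for tk, w in tks_w:
        syn = syns.get(tk, "")
        if syn:
            weighted = " ".join(f"{s}^{w * 0.25:.4f}" for s in syn.split())
            parts.append(f"({tk}^{w:.4f} {weighted})")
        else:
            parts.append(f"{tk}^{w:.4f}")

    # Bigram phrases boosted to 2x the stronger term
    for i in range(1, len(tks_w)):
        w = max(tks_w[i - 1][1], tks_w[i][1]) * 2
        parts.append(f'"{tks_w[i - 1][0]} {tks_w[i][0]}"^{w:.4f}')

    return " ".join(parts)


print(build_query_string([("machine", 0.8), ("learning", 0.6)],
                         {"machine": "ML AI"}))
# (machine^0.8000 ML^0.2000 AI^0.2000) learning^0.6000 "machine learning"^1.6000
```
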
## Field Boosting

```python
# Query fields with boost factors
query_fields = [
    "title_tks^10",      # Title tokens: 10x boost
    "title_sm_tks^5",    # Small title tokens: 5x boost
    "important_kwd^30",  # Important keywords: 30x boost
    "important_tks^20",  # Important tokens: 20x boost
    "question_tks^20",   # Question tokens: 20x boost
    "content_ltks^2",    # Content tokens: 2x boost
    "content_sm_ltks",   # Small content tokens: 1x boost
]
```

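These boosts use Lucene's `field^N` syntax: a hit in `important_kwd` contributes 30x as much to the score as a hit in an unboosted field. A sketch of how the list feeds an Elasticsearch `query_string` request (illustrative body, not the project's exact client code):

```python
es_body = {
    "query": {
        "query_string": {
            "query": '(machine^0.8542 ML^0.2136) "machine learning"^1.7084',
            "fields": query_fields,  # the boosted field list above
            "minimum_should_match": "60%",
        }
    }
}
```
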
## Synonym Expansion

```python
class SynonymLookup:
    def lookup(self, token):
        """
        Find synonyms for token.

        Returns:
            Space-separated synonym string; the 0.25x weight is
            applied when the query is built (see _build_query)
        """
        synonyms = self.synonym_dict.get(token, [])

        if not synonyms:
            return ""

        return " ".join(synonyms)


# Example:
# "machine learning" → "ML AI 机器学习"
```

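Usage with a hypothetical in-memory dictionary:

```python
lookup = SynonymLookup()
lookup.synonym_dict = {"machine": ["ML", "AI", "机器学习"]}

print(lookup.lookup("machine"))  # "ML AI 机器学习"
print(lookup.lookup("banana"))   # "" (no synonyms known)
```
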
## Final Query Example

```
Input: "What is machine learning?"

After processing (synonyms at 0.25x the term weight):
(machine^0.8542 ML^0.2136 AI^0.2136) (learning^0.7231 教育^0.1808) "machine learning"^1.7084

With field boosting:
{
  "query_string": {
    "query": "(machine^0.8542 ML^0.2136 AI^0.2136) (learning^0.7231 教育^0.1808) \"machine learning\"^1.7084",
    "fields": ["title_tks^10", "important_kwd^30", "content_ltks^2"],
    "minimum_should_match": "60%"
  }
}
```

## Tokenization

```python
# /rag/nlp/rag_tokenizer.py

class RAGFlowTokenizer:
    def tokenize(self, text, fine_grained=True):
        """
        Tokenize text with multi-granularity.

        Args:
            text: Input text
            fine_grained: Use fine-grained tokenization

        Returns:
            List of tokens
        """
        if fine_grained:
            # Fine-grained: "机器学习" → ["机器", "学习", "机", "器", "学", "习"]
            return self.fine_grained_tokenize(text)
        else:
            # Semantic: "机器学习" → ["机器学习"]
            return self.semantic_tokenize(text)

    def fine_grained_tokenize(self, text):
        """Break into smallest meaningful units."""
        tokens = []
        # ... tokenization logic
        return tokens

    def semantic_tokenize(self, text):
        """Keep semantic units intact."""
        tokens = []
        # ... tokenization logic
        return tokens
```

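The real tokenizer is dictionary-driven; the elided logic can be approximated with forward maximum matching. A toy sketch, with a hypothetical three-entry vocabulary, reproduces the two granularities from the comments above:

```python
# Hypothetical vocabulary; the real tokenizer loads a large dictionary
VOCAB = {"机器", "学习", "机器学习"}


def max_match(text, max_len):
    """Forward maximum matching, capped at max_len characters per token."""
    tokens, i = [], 0
    while i < len(text):
        j = min(len(text), i + max_len)
        while j > i + 1 and text[i:j] not in VOCAB:
            j -= 1
        tokens.append(text[i:j])
        i = j
    return tokens


def semantic(text):
    """Keep the longest dictionary units intact."""
    return max_match(text, max_len=8)


def fine_grained(text):
    """Split into smaller dictionary words, then add single characters."""
    words = max_match(text, max_len=2)
    return words + [ch for w in words if len(w) > 1 for ch in w]


print(semantic("机器学习"))      # ['机器学习']
print(fine_grained("机器学习"))  # ['机器', '学习', '机', '器', '学', '习']
```
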
## Configuration

```python
# Search configuration
{
    "min_match": 0.3,           # Minimum term match percentage
    "query_fields": [...],      # Fields with boost factors
    "synonym_expansion": True,  # Enable synonym expansion
}

# Tokenizer configuration
{
    "fine_grained": True,  # Fine-grained tokenization
    "semantic": True,      # Also use semantic tokenization
}
```

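Putting the pieces together, a hedged sketch of how a search call flows through the classes documented above (class and method names as used in this document; setup and error handling omitted):

```python
queryer = FulltextQueryer()

# Normalization → tokenization → TF-IDF weighting → expansion,
# ending in a MatchTextExpr that carries the boosted query string
# and minimum_should_match = "60%"
match_expr = queryer.question("What is machine learning?", min_match=0.6)
```
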
## Related Files

- `/rag/nlp/query.py` - FulltextQueryer class
- `/rag/nlp/term_weight.py` - TF-IDF weighting
- `/rag/nlp/rag_tokenizer.py` - RAGFlow tokenizer
- `/rag/nlp/search.py` - Query integration in search