From 566bce428b46c380d2e6dfa6ed8310e486d61909 Mon Sep 17 00:00:00 2001
From: Claude
Date: Thu, 27 Nov 2025 03:34:49 +0000
Subject: [PATCH] docs: Add comprehensive algorithm documentation (50+ algorithms)

- Updated README.md with complete algorithm map across 12 categories
- Added clustering_algorithms.md (K-Means, GMM, UMAP, Silhouette, Node2Vec)
- Added graph_algorithms.md (PageRank, Leiden, Entity Extraction/Resolution)
- Added nlp_algorithms.md (Trie tokenization, TF-IDF, NER, POS, Synonym)
- Added vision_algorithms.md (OCR, Layout Recognition, TSR, NMS, IoU, XGBoost)
- Added similarity_metrics.md (Cosine, Edit Distance, Token, Hybrid)
---
 personal_analyze/06-ALGORITHMS/README.md      | 249 ++++---
 .../06-ALGORITHMS/clustering_algorithms.md    | 365 ++++++++++
 .../06-ALGORITHMS/graph_algorithms.md         | 471 +++++++++++++
 .../06-ALGORITHMS/nlp_algorithms.md           | 571 ++++++++++++++++
 .../06-ALGORITHMS/similarity_metrics.md       | 455 +++++++++++++
 .../06-ALGORITHMS/vision_algorithms.md        | 637 ++++++++++++++++++
 6 files changed, 2654 insertions(+), 94 deletions(-)
 create mode 100644 personal_analyze/06-ALGORITHMS/clustering_algorithms.md
 create mode 100644 personal_analyze/06-ALGORITHMS/graph_algorithms.md
 create mode 100644 personal_analyze/06-ALGORITHMS/nlp_algorithms.md
 create mode 100644 personal_analyze/06-ALGORITHMS/similarity_metrics.md
 create mode 100644 personal_analyze/06-ALGORITHMS/vision_algorithms.md

diff --git a/personal_analyze/06-ALGORITHMS/README.md b/personal_analyze/06-ALGORITHMS/README.md
index c4d41aab5..24bb29d40 100644
--- a/personal_analyze/06-ALGORITHMS/README.md
+++ b/personal_analyze/06-ALGORITHMS/README.md
@@ -2,36 +2,65 @@
 
 ## Overview
 
-This module contains RAGFlow's core algorithms, covering scoring, similarity, chunking, and advanced RAG techniques.
+This module documents ALL algorithms used in RAGFlow: 50+ algorithms organized into 12 categories.
 
-## Algorithm Architecture
+## Algorithm Categories
 
 ```
-┌─────────────────────────────────────────────────────────────────┐
-│                         CORE ALGORITHMS                         │
-└─────────────────────────────────────────────────────────────────┘
+┌─────────────────────────────────────────────────────────────────────────────┐
+│                            RAGFLOW ALGORITHM MAP                            │
+└─────────────────────────────────────────────────────────────────────────────┘
 
-┌─────────────────────────────────────────────────────────────────┐
-│                      RETRIEVAL ALGORITHMS                       │
-│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐    │
-│ │ BM25 Scoring    │ │ Vector Cosine   │ │ Hybrid Fusion   │    │
-│ └─────────────────┘ └─────────────────┘ └─────────────────┘    │
-└─────────────────────────────────────────────────────────────────┘
+┌───────────────────────────────────┬─────────────────────────────────────────┐
+│ 1. CLUSTERING                     │ 2. DIMENSIONALITY REDUCTION             │
+│ ├── K-Means                       │ ├── UMAP                                │
+│ ├── Gaussian Mixture Model (GMM)  │ └── Node2Vec Embedding                  │
+│ └── Silhouette Score              │                                         │
+└───────────────────────────────────┴─────────────────────────────────────────┘
 
-┌─────────────────────────────────────────────────────────────────┐
-│                          ADVANCED RAG                           │
-│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐    │
-│ │ RAPTOR          │ │ GraphRAG        │ │ Cross-Encoder   │    │
-│ │ (Hierarchical)  │ │ (Knowledge G)   │ │ (Reranking)     │    │
-│ └─────────────────┘ └─────────────────┘ └─────────────────┘    │
-└─────────────────────────────────────────────────────────────────┘
+┌───────────────────────────────────┬─────────────────────────────────────────┐
+│ 3. GRAPH ALGORITHMS               │ 4. NLP/TEXT PROCESSING                  │
+│ ├── PageRank                      │ ├── Trie-based Tokenization             │
+│ ├── Leiden Community Detection    │ ├── Max-Forward/Backward Algorithm      │
+│ ├── Entity Extraction (LLM)       │ ├── DFS with Memoization                │
+│ ├── Relation Extraction (LLM)     │ ├── TF-IDF Term Weighting               │
+│ ├── Entity Resolution             │ ├── Named Entity Recognition (NER)      │
+│ └── Largest Connected Component   │ ├── Part-of-Speech Tagging (POS)        │
+│                                   │ ├── Synonym Detection (WordNet)         │
+│                                   │ └── Query Expansion                     │
+└───────────────────────────────────┴─────────────────────────────────────────┘
 
-┌─────────────────────────────────────────────────────────────────┐
-│                         TEXT PROCESSING                         │
-│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐    │
-│ │ TF-IDF Weight   │ │ Tokenization    │ │ Query Expand    │    │
-│ └─────────────────┘ └─────────────────┘ └─────────────────┘    │
-└─────────────────────────────────────────────────────────────────┘
+┌───────────────────────────────────┬─────────────────────────────────────────┐
+│ 5. SIMILARITY/DISTANCE            │ 6. INFORMATION RETRIEVAL                │
+│ ├── Cosine Similarity             │ ├── BM25 Scoring                        │
+│ ├── Edit Distance (Levenshtein)   │ ├── Hybrid Score Fusion                 │
+│ ├── IoU (Intersection over Union) │ ├── Cross-Encoder Reranking             │
+│ ├── Token Similarity              │ └── Weighted Sum Fusion                 │
+│ └── Hybrid Similarity             │                                         │
+└───────────────────────────────────┴─────────────────────────────────────────┘
+
+┌───────────────────────────────────┬─────────────────────────────────────────┐
+│ 7. CHUNKING/MERGING               │ 8. MACHINE LEARNING MODELS              │
+│ ├── Naive Merge (Token-based)     │ ├── XGBoost (Text Concatenation)        │
+│ ├── Hierarchical Merge            │ ├── ONNX Models (Vision)                │
+│ ├── Tree-based Merge              │ └── Reranking Models                    │
+│ └── Binary Search Merge           │                                         │
+└───────────────────────────────────┴─────────────────────────────────────────┘
+
+┌───────────────────────────────────┬─────────────────────────────────────────┐
+│ 9. VISION/IMAGE PROCESSING        │ 10. ADVANCED RAG                        │
+│ ├── OCR (ONNX)                    │ ├── RAPTOR (Hierarchical Summarization) │
+│ ├── Layout Recognition (YOLOv10)  │ ├── GraphRAG                            │
+│ ├── Table Structure Recognition   │ └── Community Reports                   │
+│ └── Non-Maximum Suppression (NMS) │                                         │
+└───────────────────────────────────┴─────────────────────────────────────────┘
+
+┌───────────────────────────────────┬─────────────────────────────────────────┐
+│ 11. OPTIMIZATION                  │ 12. DATA STRUCTURES                     │
+│ ├── BIC (Bayesian Info Criterion) │ ├── Trie Tree                           │
+│ └── Silhouette Score              │ ├── Hierarchical Tree                   │
+│                                   │ └── NetworkX Graph                      │
+└───────────────────────────────────┴─────────────────────────────────────────┘
 ```
 
 ## Files in This Module
@@ -39,90 +68,122 @@ This module contains RAGFlow's core algorithms, covering scoring, simi
 | File | Description |
 |------|-------|
 | [bm25_scoring.md](./bm25_scoring.md) | BM25 ranking algorithm |
-| [vector_similarity.md](./vector_similarity.md) | Cosine similarity calculations |
-| [hybrid_score_fusion.md](./hybrid_score_fusion.md) | Score combination strategies |
-| [tfidf_weighting.md](./tfidf_weighting.md) | TF-IDF term weighting |
+| [hybrid_score_fusion.md](./hybrid_score_fusion.md) | Score combination |
 | [raptor_algorithm.md](./raptor_algorithm.md) | Hierarchical summarization |
-| [graphrag_implementation.md](./graphrag_implementation.md) | Knowledge graph RAG |
+| [clustering_algorithms.md](./clustering_algorithms.md) | KMeans, GMM, UMAP |
+| [graph_algorithms.md](./graph_algorithms.md) | PageRank, Leiden, Entity Resolution |
+| [nlp_algorithms.md](./nlp_algorithms.md) | Tokenization, TF-IDF, NER, POS |
+| [vision_algorithms.md](./vision_algorithms.md) | OCR, Layout, NMS |
+| [similarity_metrics.md](./similarity_metrics.md) | Cosine, Edit Distance, IoU |
 
-## Algorithm Formulas
+## Complete Algorithm Reference
 
-### BM25 Scoring
-```
-BM25(D, Q) = Σ IDF(qi) × (f(qi, D) × (k1 + 1)) / (f(qi, D) + k1 × (1 - b + b × |D|/avgdl))
-
-where:
-  f(qi, D) = term frequency of qi in document D
-  |D|      = document length
-  avgdl    = average document length
-  k1       = 1.2 (term frequency saturation)
-  b        = 0.75 (length normalization)
-```
+### 1. CLUSTERING ALGORITHMS
 
+| Algorithm | File | Description |
+|-----------|------|-------------|
+| K-Means | `/deepdoc/parser/pdf_parser.py:36` | Column detection in PDF layout |
+| GMM | `/rag/raptor.py:22` | RAPTOR cluster selection |
+| Silhouette Score | `/deepdoc/parser/pdf_parser.py:37` | Cluster validation |
 
-### Cosine Similarity
-```
-cos(θ) = (A · B) / (||A|| × ||B||)
-
-where:
-  A, B  = embedding vectors
-  A · B = dot product
-  ||A|| = L2 norm
-```
+### 2. DIMENSIONALITY REDUCTION
 
+| Algorithm | File | Description |
+|-----------|------|-------------|
+| UMAP | `/rag/raptor.py:21` | Pre-clustering dimension reduction |
+| Node2Vec | `/graphrag/general/entity_embedding.py:24` | Graph node embedding |
 
-### Hybrid Score Fusion
-```
-Hybrid_Score = α × BM25_Score + (1-α) × Vector_Score
-
-Default: α = 0.05 (5% BM25, 95% Vector)
-```
+### 3. GRAPH ALGORITHMS
 
+| Algorithm | File | Description |
+|-----------|------|-------------|
+| PageRank | `/graphrag/entity_resolution.py:150` | Entity importance scoring |
+| Leiden | `/graphrag/general/leiden.py:72` | Hierarchical community detection |
+| Entity Extraction | `/graphrag/general/extractor.py` | LLM-based entity extraction |
+| Relation Extraction | `/graphrag/general/extractor.py` | LLM-based relation extraction |
+| Entity Resolution | `/graphrag/entity_resolution.py` | Entity deduplication |
+| LCC | `/graphrag/general/leiden.py:67` | Largest connected component |
 
-### TF-IDF Weighting
-```
-IDF(term) = log10(10 + (N - df(term) + 0.5) / (df(term) + 0.5))
-Weight = (0.3 × IDF1 + 0.7 × IDF2) × NER × PoS
-```
+### 4. NLP/TEXT PROCESSING
 
+| Algorithm | File | Description |
+|-----------|------|-------------|
+| Trie Tokenization | `/rag/nlp/rag_tokenizer.py:72` | Chinese word segmentation |
+| Max-Forward | `/rag/nlp/rag_tokenizer.py:250` | Forward tokenization |
+| Max-Backward | `/rag/nlp/rag_tokenizer.py:273` | Backward tokenization |
+| DFS + Memo | `/rag/nlp/rag_tokenizer.py:120` | Disambiguation |
+| TF-IDF | `/rag/nlp/term_weight.py:223` | Term weighting |
+| NER | `/rag/nlp/term_weight.py:84` | Named entity recognition |
+| POS Tagging | `/rag/nlp/term_weight.py:179` | Part-of-speech tagging |
+| Synonym | `/rag/nlp/synonym.py:71` | Synonym lookup |
+| Query Expansion | `/rag/nlp/query.py:85` | Query rewriting |
+| Porter Stemmer | `/rag/nlp/rag_tokenizer.py:27` | English stemming |
+| WordNet Lemmatizer | `/rag/nlp/rag_tokenizer.py:27` | Lemmatization |
 
-### Cross-Encoder Reranking
-```
-Final_Rank = α × Token_Sim + β × Vector_Sim + γ × Rank_Features
-
-where:
-  α = 0.3 (token weight)
-  β = 0.7 (vector weight)
-  γ = variable (PageRank, tag boost)
-```
+### 5. SIMILARITY/DISTANCE METRICS
 
+| Algorithm | File | Formula |
+|-----------|------|---------|
+| Cosine Similarity | `/rag/nlp/query.py:221` | `cos(θ) = A·B / (‖A‖×‖B‖)` |
+| Edit Distance | `/graphrag/entity_resolution.py:28` | Levenshtein distance |
+| IoU | `/deepdoc/vision/operators.py:702` | `intersection / union` |
+| Token Similarity | `/rag/nlp/query.py:230` | Weighted token overlap |
+| Hybrid Similarity | `/rag/nlp/query.py:220` | `α×token + β×vector` |
 
-## Algorithm Parameters
-
-| Algorithm | Parameter | Default | Range |
-|-----------|-----------|---------|-------|
-| **BM25** | k1 | 1.2 | 0-2.0 |
-| | b | 0.75 | 0-1.0 |
-| **Hybrid** | vector_weight | 0.95 | 0-1.0 |
-| | text_weight | 0.05 | 0-1.0 |
-| **TF-IDF** | IDF1 weight | 0.3 | - |
-| | IDF2 weight | 0.7 | - |
-| **Chunking** | chunk_size | 512 | 128-2048 |
-| | overlap | 0-10% | 0-100% |
-| **RAPTOR** | max_clusters | 10-50 | - |
-| | GMM threshold | 0.1 | - |
-| **GraphRAG** | entity_topN | 6 | 1-100 |
-| | similarity_threshold | 0.3 | 0-1.0 |
+### 6. INFORMATION RETRIEVAL
 
+| Algorithm | File | Formula |
+|-----------|------|---------|
+| BM25 | `/rag/nlp/search.py` | ES native BM25 |
+| Hybrid Fusion | `/rag/nlp/search.py:126` | `0.05×BM25 + 0.95×Vector` |
+| Reranking | `/rag/nlp/search.py:330` | Cross-encoder scoring |
+| Argsort Ranking | `/rag/nlp/search.py:429` | Score-based sorting |
 
-## Key Implementation Files
-
-- `/rag/nlp/search.py` - Search algorithms
-- `/rag/nlp/term_weight.py` - TF-IDF implementation
-- `/rag/nlp/query.py` - Query processing
-- `/rag/raptor.py` - RAPTOR algorithm
-- `/graphrag/search.py` - GraphRAG search
-- `/rag/nlp/__init__.py` - Chunking algorithms
+### 7. CHUNKING/MERGING
 
+| Algorithm | File | Description |
+|-----------|------|-------------|
+| Naive Merge | `/rag/nlp/__init__.py:582` | Token-based chunking |
+| Naive Merge + Images | `/rag/nlp/__init__.py:645` | With image tracking |
+| Hierarchical Merge | `/rag/nlp/__init__.py:487` | Tree-based merging |
+| Binary Search | `/rag/nlp/__init__.py:512` | Efficient section lookup |
+| DFS Tree Traversal | `/rag/flow/hierarchical_merger/` | Document hierarchy |
 
-## Performance Metrics
-
-| Metric | Typical Value |
-|--------|---------------|
-| Vector Search Latency | < 100ms |
-| BM25 Search Latency | < 50ms |
-| Reranking Latency | 200-500ms |
-| Total Retrieval | < 1s |
+### 8. MACHINE LEARNING MODELS
+
+| Model | File | Purpose |
+|-------|------|---------|
+| XGBoost | `/deepdoc/parser/pdf_parser.py:88` | Text concatenation |
+| ONNX OCR | `/deepdoc/vision/ocr.py:32` | Text recognition |
+| ONNX Layout | `/deepdoc/vision/layout_recognizer.py` | Layout detection |
+| ONNX TSR | `/deepdoc/vision/table_structure_recognizer.py` | Table structure |
+| YOLOv10 | `/deepdoc/vision/layout_recognizer.py` | Object detection |
+
+### 9. VISION/IMAGE PROCESSING
+
+| Algorithm | File | Description |
+|-----------|------|-------------|
+| NMS | `/deepdoc/vision/operators.py:702` | Box filtering |
+| IoU Filtering | `/deepdoc/vision/recognizer.py:359` | Overlap detection |
+| Bounding Box Overlap | `/deepdoc/vision/layout_recognizer.py:94` | Spatial analysis |
+
+### 10. ADVANCED RAG
+
+| Algorithm | File | Description |
+|-----------|------|-------------|
+| RAPTOR | `/rag/raptor.py:37` | Hierarchical summarization |
+| GraphRAG | `/graphrag/` | Knowledge graph RAG |
+| Community Reports | `/graphrag/general/community_reports_extractor.py` | Graph summaries |
+
+### 11. OPTIMIZATION CRITERIA
+
+| Algorithm | File | Formula |
+|-----------|------|---------|
+| BIC | `/rag/raptor.py:92` | `k×log(n) - 2×log(L̂)` |
+| Silhouette | `/deepdoc/parser/pdf_parser.py:400` | `(b-a) / max(a,b)` |
+
+## Statistics
+
+- **Total Algorithms**: 50+
+- **Categories**: 12
+- **Key Libraries**: sklearn, UMAP, XGBoost, NetworkX, graspologic, ONNX

diff --git a/personal_analyze/06-ALGORITHMS/clustering_algorithms.md b/personal_analyze/06-ALGORITHMS/clustering_algorithms.md
new file mode 100644
index 000000000..125816f37
--- /dev/null
+++ b/personal_analyze/06-ALGORITHMS/clustering_algorithms.md
@@ -0,0 +1,365 @@
+# Clustering Algorithms
+
+## Overview
+
+RAGFlow uses clustering algorithms for PDF layout analysis and for RAPTOR hierarchical summarization.
+
+## 1. K-Means Clustering
+
+### File Location
+```
+/deepdoc/parser/pdf_parser.py (lines 36, 394, 425, 1047-1055)
+```
+
+### Purpose
+Detects columns in a PDF layout by clustering text boxes on their X-coordinates.
+
+### Implementation
+
+```python
+import numpy as np
+from sklearn.cluster import KMeans
+from sklearn.metrics import silhouette_score
+
+def _assign_column(self):
+    """
+    Detect columns using KMeans clustering on X coordinates.
+    """
+    # Get X coordinates of the text boxes
+    x_coords = np.array([[b["x0"]] for b in self.bxs])
+
+    best_k = 1
+    best_score = -1
+
+    # Find the optimal number of columns (1 to 4)
+    for k in range(1, min(5, len(self.bxs))):
+        km = KMeans(n_clusters=k, random_state=42, n_init="auto")
+        labels = km.fit_predict(x_coords)
+
+        if k > 1:  # silhouette is only defined for k >= 2
+            score = silhouette_score(x_coords, labels)
+            if score > best_score:
+                best_score = score
+                best_k = k
+
+    # Assign columns with the optimal k
+    km = KMeans(n_clusters=best_k, random_state=42, n_init="auto")
+    labels = km.fit_predict(x_coords)
+
+    for i, bx in enumerate(self.bxs):
+        bx["col_id"] = labels[i]
+```
+
+### Algorithm
+
+```
+K-Means Algorithm:
+1. Initialize k centroids randomly
+2. Repeat until convergence:
+   a. Assign each point to the nearest centroid
+   b. Update centroids as the mean of assigned points
+3. Return cluster assignments
+
+Objective: minimize Σ ||xi - μc(i)||²
+where μc(i) is the centroid of the cluster containing xi
+```
+
+### Parameters
+
+| Parameter | Value | Description |
+|-----------|-------|-------------|
+| n_clusters | 1-4 | Number of columns to detect |
+| n_init | "auto" | Initialization runs |
+| random_state | 42 | Reproducibility |
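+
+For intuition, here is a minimal self-contained sketch (not the repository's code) of the same silhouette-guided choice of k on synthetic X-coordinates:
+
+```python
+import numpy as np
+from sklearn.cluster import KMeans
+from sklearn.metrics import silhouette_score
+
+# Synthetic X-coordinates of text boxes from a two-column page
+x_coords = np.array([[48], [52], [50], [348], [352], [350]])
+
+best_k, best_score = 1, -1
+for k in range(2, min(5, len(x_coords))):
+    labels = KMeans(n_clusters=k, random_state=42, n_init="auto").fit_predict(x_coords)
+    score = silhouette_score(x_coords, labels)
+    if score > best_score:
+        best_k, best_score = k, score
+
+print(best_k)  # 2 -- the two columns are recovered
+```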
+
+---
+
+## 2. Gaussian Mixture Model (GMM)
+
+### File Location
+```
+/rag/raptor.py (lines 22, 102-106, 195-199)
+```
+
+### Purpose
+The RAPTOR algorithm uses a GMM to cluster document chunks before summarization.
+
+### Implementation
+
+```python
+import numpy as np
+from sklearn.mixture import GaussianMixture
+
+def _get_optimal_clusters(self, embeddings: np.ndarray, random_state: int):
+    """
+    Find the optimal number of clusters using the BIC criterion.
+    """
+    max_clusters = min(self._max_cluster, len(embeddings))
+    n_clusters = np.arange(1, max_clusters)
+
+    bics = []
+    for n in n_clusters:
+        gm = GaussianMixture(
+            n_components=n,
+            random_state=random_state,
+            covariance_type='full'
+        )
+        gm.fit(embeddings)
+        bics.append(gm.bic(embeddings))
+
+    # Select the cluster count with minimum BIC
+    optimal_clusters = n_clusters[np.argmin(bics)]
+    return optimal_clusters
+
+def _cluster_chunks(self, chunks, embeddings):
+    """
+    Cluster chunks using a GMM with soft assignments.
+    """
+    # Reduce dimensions first
+    reduced = self._reduce_dimensions(embeddings)
+
+    # Find the optimal k
+    n_clusters = self._get_optimal_clusters(reduced, random_state=42)
+
+    # Fit the GMM
+    gm = GaussianMixture(n_components=n_clusters, random_state=42)
+    gm.fit(reduced)
+
+    # Get soft assignments (probabilities)
+    probs = gm.predict_proba(reduced)
+
+    # Assign each chunk to every cluster with probability above the threshold
+    clusters = [[] for _ in range(n_clusters)]
+    for i, prob in enumerate(probs):
+        for j, p in enumerate(prob):
+            if p > 0.1:  # Threshold
+                clusters[j].append(i)
+
+    return clusters
+```
+
+### GMM Formula
+
+```
+GMM Probability Density:
+p(x) = Σ πk × N(x | μk, Σk)
+
+where:
+- πk = mixture weight for component k
+- N(x | μk, Σk) = Gaussian distribution with mean μk and covariance Σk
+
+BIC (Bayesian Information Criterion):
+BIC = k × log(n) - 2 × log(L̂)
+
+where:
+- k = number of parameters
+- n = number of samples
+- L̂ = maximum likelihood
+```
+
+### Soft Assignment
+
+The GMM allows soft assignment (one chunk can belong to several clusters):
+
+```
+Chunk i belongs to Cluster j if P(j|xi) > threshold (0.1)
+```
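+
+A minimal runnable illustration of the same BIC-then-soft-assignment flow on toy 2-D data (illustrative only; the 0.1 threshold mirrors the code above):
+
+```python
+import numpy as np
+from sklearn.mixture import GaussianMixture
+
+rng = np.random.default_rng(42)
+# Two toy blobs standing in for dimensionality-reduced chunk embeddings
+points = np.vstack([rng.normal(0, 0.3, (20, 2)), rng.normal(5, 0.3, (20, 2))])
+
+# BIC-based model selection, as in _get_optimal_clusters()
+bics = []
+for n in range(1, 6):
+    gm = GaussianMixture(n_components=n, random_state=42).fit(points)
+    bics.append(gm.bic(points))
+best_n = int(np.argmin(bics)) + 1  # offset because n started at 1
+
+# Soft assignment with the 0.1 threshold
+gm = GaussianMixture(n_components=best_n, random_state=42).fit(points)
+probs = gm.predict_proba(points)
+clusters = [np.where(probs[:, j] > 0.1)[0] for j in range(best_n)]
+print(best_n, [len(c) for c in clusters])  # expect 2 clusters of ~20 points
+```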
+
+---
+
+## 3. UMAP (Dimensionality Reduction)
+
+### File Location
+```
+/rag/raptor.py (lines 21, 186-190)
+```
+
+### Purpose
+Reduces the dimensionality of embeddings before clustering to improve cluster quality.
+
+### Implementation
+
+```python
+import umap
+
+def _reduce_dimensions(self, embeddings: np.ndarray) -> np.ndarray:
+    """
+    Reduce embedding dimensions using UMAP.
+    """
+    n_samples = len(embeddings)
+
+    # Scale the neighborhood size with the sample count
+    n_neighbors = int((n_samples - 1) ** 0.8)
+
+    # Target dimensions
+    n_components = min(12, n_samples - 2)
+
+    reducer = umap.UMAP(
+        n_neighbors=max(2, n_neighbors),
+        n_components=n_components,
+        metric="cosine",
+        random_state=42
+    )
+
+    return reducer.fit_transform(embeddings)
+```
+
+### UMAP Algorithm
+
+```
+UMAP (Uniform Manifold Approximation and Projection):
+
+1. Build a high-dimensional graph:
+   - Compute k-nearest neighbors
+   - Create weighted edges based on distance
+
+2. Build a low-dimensional representation:
+   - Initialize randomly
+   - Optimize the layout using a cross-entropy loss
+   - Preserve local structure (neighbors stay neighbors)
+
+Key idea: preserve topological structure, not absolute distances
+```
+
+### Parameters
+
+| Parameter | Value | Description |
+|-----------|-------|-------------|
+| n_neighbors | (n-1)^0.8 | Local neighborhood size |
+| n_components | min(12, n-2) | Output dimensions |
+| metric | cosine | Distance metric |
+
+---
+
+## 4. Silhouette Score
+
+### File Location
+```
+/deepdoc/parser/pdf_parser.py (lines 37, 400, 1052)
+```
+
+### Purpose
+Evaluates cluster quality in order to choose the optimal k for K-Means.
+
+### Formula
+
+```
+Silhouette Score:
+s(i) = (b(i) - a(i)) / max(a(i), b(i))
+
+where:
+- a(i) = average distance to points in the same cluster
+- b(i) = average distance to points in the nearest other cluster
+
+Range: [-1, 1]
+- s ≈ 1: point is well-clustered
+- s ≈ 0: point lies on a boundary
+- s < 0: point may be misclassified
+
+Overall score = mean(s(i)) over all points
+```
+
+### Usage
+
+```python
+from sklearn.cluster import KMeans
+from sklearn.metrics import silhouette_score
+
+# Find the optimal k
+best_k = 1
+best_score = -1
+
+for k in range(2, max_clusters):
+    km = KMeans(n_clusters=k)
+    labels = km.fit_predict(data)
+
+    score = silhouette_score(data, labels)
+
+    if score > best_score:
+        best_score = score
+        best_k = k
+```
+
+---
+
+## 5. Node2Vec (Graph Embedding)
+
+### File Location
+```
+/graphrag/general/entity_embedding.py (lines 24-44)
+```
+
+### Purpose
+Generates embeddings for graph nodes in the knowledge graph.
+
+### Implementation
+
+```python
+from graspologic.embed import node2vec_embed
+
+def embed_node2vec(graph, dimensions=1536, num_walks=10,
+                   walk_length=40, window_size=2, iterations=3):
+    """
+    Generate node embeddings using the Node2Vec algorithm.
+    """
+    lcc_tensors, embedding = node2vec_embed(
+        graph=graph,
+        dimensions=dimensions,
+        num_walks=num_walks,
+        walk_length=walk_length,
+        window_size=window_size,
+        iterations=iterations,
+        random_seed=86
+    )
+
+    return embedding
+```
+
+### Node2Vec Algorithm
+
+```
+Node2Vec Algorithm:
+
+1. Random Walk Generation:
+   - For each node, perform biased random walks
+   - The walk strategy is controlled by p (return) and q (in-out)
+
+2. Skip-gram Training:
+   - Treat walks as sentences
+   - Train a Word2Vec Skip-gram model
+   - Node → embedding vector
+
+Walk probabilities:
+- p: return parameter (go back to the previous node)
+- q: in-out parameter (explore vs. exploit)
+
+Low p, high q → BFS-like (local structure)
+High p, low q → DFS-like (global structure)
+```
+
+### Parameters
+
+| Parameter | Value | Description |
+|-----------|-------|-------------|
+| dimensions | 1536 | Embedding size |
+| num_walks | 10 | Walks per node |
+| walk_length | 40 | Steps per walk |
+| window_size | 2 | Skip-gram window |
+| iterations | 3 | Training iterations |
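+
+To make the p/q bias concrete, here is a small illustrative sketch of a single biased walk step (simplified; graspologic performs the walks and the Skip-gram training internally):
+
+```python
+import random
+import networkx as nx
+
+def biased_next_step(graph, prev, curr, p=1.0, q=1.0):
+    """Pick the next node of a Node2Vec walk from `curr`, given `prev`.
+
+    Unnormalized weights: 1/p to return to prev, 1 for neighbors of prev
+    (stays local, BFS-like), 1/q for nodes farther away (DFS-like).
+    """
+    neighbors = list(graph.neighbors(curr))
+    weights = []
+    for candidate in neighbors:
+        if candidate == prev:
+            weights.append(1.0 / p)      # return to the previous node
+        elif graph.has_edge(candidate, prev):
+            weights.append(1.0)          # stays close to prev
+        else:
+            weights.append(1.0 / q)      # moves away from prev
+    return random.choices(neighbors, weights=weights, k=1)[0]
+
+g = nx.karate_club_graph()
+print(biased_next_step(g, prev=0, curr=1, p=0.5, q=2.0))
+```
+
+With p=0.5 and q=2.0 the walk prefers staying near the previous node, which biases the embedding toward local (BFS-like) structure.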
+
+---
+
+## Summary
+
+| Algorithm | Purpose | Library |
+|-----------|---------|---------|
+| K-Means | PDF column detection | sklearn |
+| GMM | RAPTOR clustering | sklearn |
+| UMAP | Dimension reduction | umap-learn |
+| Silhouette | Cluster validation | sklearn |
+| Node2Vec | Graph embedding | graspologic |
+
+## Related Files
+
+- `/deepdoc/parser/pdf_parser.py` - K-Means, Silhouette
+- `/rag/raptor.py` - GMM, UMAP
+- `/graphrag/general/entity_embedding.py` - Node2Vec

diff --git a/personal_analyze/06-ALGORITHMS/graph_algorithms.md b/personal_analyze/06-ALGORITHMS/graph_algorithms.md
new file mode 100644
index 000000000..1817a73e4
--- /dev/null
+++ b/personal_analyze/06-ALGORITHMS/graph_algorithms.md
@@ -0,0 +1,471 @@
+# Graph Algorithms
+
+## Overview
+
+RAGFlow uses graph algorithms for knowledge graph construction and for GraphRAG retrieval.
+
+## 1. PageRank Algorithm
+
+### File Location
+```
+/graphrag/entity_resolution.py (line 150)
+/graphrag/general/index.py (line 460)
+/graphrag/search.py (line 83)
+```
+
+### Purpose
+Computes an importance score for entities in the knowledge graph.
+
+### Implementation
+
+```python
+import networkx as nx
+
+def compute_pagerank(graph):
+    """
+    Compute PageRank scores for all nodes.
+    """
+    pagerank = nx.pagerank(graph)
+    return pagerank
+
+# Usage in search ranking
+def rank_entities(entities, pagerank_scores):
+    """
+    Rank entities by similarity * pagerank.
+    """
+    ranked = sorted(
+        entities.items(),
+        key=lambda x: x[1]["sim"] * x[1]["pagerank"],
+        reverse=True
+    )
+    return ranked
+```
+
+### PageRank Formula
+
+```
+PageRank Algorithm:
+
+PR(u) = (1-d)/N + d × Σ PR(v)/L(v)
+        for all v linking to u
+
+where:
+- d = damping factor (typically 0.85)
+- N = total number of nodes
+- L(v) = number of outbound links from v
+
+Iterative computation until convergence:
+PR^(t+1)(u) = (1-d)/N + d × Σ PR^(t)(v)/L(v)
+```
+
+### Usage in RAGFlow
+
+```python
+# In GraphRAG search
+def get_relevant_entities(query, graph):
+    # 1. Get candidate entities by similarity
+    candidates = vector_search(query)
+
+    # 2. Compute PageRank
+    pagerank = nx.pagerank(graph)
+
+    # 3. Combine scores
+    for entity in candidates:
+        entity["final_score"] = entity["similarity"] * pagerank[entity["id"]]
+
+    return sorted(candidates, key=lambda x: x["final_score"], reverse=True)
+```
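+
+The iterative update above translates directly into code. A minimal power-iteration sketch (illustrative only; `nx.pagerank` additionally handles convergence tests and dangling nodes):
+
+```python
+import networkx as nx
+
+def pagerank_power_iteration(graph, d=0.85, iterations=50):
+    """Plain power iteration of PR(u) = (1-d)/N + d * sum(PR(v)/L(v))."""
+    nodes = list(graph.nodes())
+    n = len(nodes)
+    pr = {u: 1.0 / n for u in nodes}       # uniform start
+    for _ in range(iterations):
+        nxt = {}
+        for u in nodes:
+            rank_sum = sum(
+                pr[v] / graph.out_degree(v)
+                for v in graph.predecessors(u)
+                if graph.out_degree(v) > 0
+            )
+            nxt[u] = (1 - d) / n + d * rank_sum
+        pr = nxt
+    return pr
+
+g = nx.DiGraph([("a", "b"), ("b", "c"), ("c", "a"), ("a", "c")])
+print(pagerank_power_iteration(g))
+```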
+
+---
+
+## 2. Leiden Community Detection
+
+### File Location
+```
+/graphrag/general/leiden.py (lines 72-141)
+```
+
+### Purpose
+Detects communities in the knowledge graph so that entities can be organized into groups.
+
+### Implementation
+
+```python
+from graspologic.partition import hierarchical_leiden
+from graspologic.utils import largest_connected_component
+
+def _compute_leiden_communities(graph, max_cluster_size=12, seed=0xDEADBEEF):
+    """
+    Compute hierarchical communities using the Leiden algorithm.
+    """
+    # Extract the largest connected component
+    lcc = largest_connected_component(graph)
+
+    # Run hierarchical Leiden
+    community_mapping = hierarchical_leiden(
+        lcc,
+        max_cluster_size=max_cluster_size,
+        random_seed=seed
+    )
+
+    # Process the results by level
+    results = {}
+    for level, communities in community_mapping.items():
+        for community_id, nodes in communities.items():
+            # Weight a community by the rank and weight of its nodes
+            weight = sum(
+                graph.nodes[n].get("rank", 1) *
+                graph.nodes[n].get("weight", 1)
+                for n in nodes
+            )
+            results[(level, community_id)] = {
+                "nodes": nodes,
+                "weight": weight
+            }
+
+    return results
+```
+
+### Leiden Algorithm
+
+```
+Leiden Algorithm (improvement over Louvain):
+
+1. Local Moving Phase:
+   - Move nodes between communities to improve modularity
+   - Refined node movement avoids poorly connected communities
+
+2. Refinement Phase:
+   - Partition communities into smaller subcommunities
+   - Ensures well-connected communities
+
+3. Aggregation Phase:
+   - Create an aggregate graph with communities as nodes
+   - Repeat from step 1 until no improvement
+
+Modularity:
+Q = (1/2m) × Σ [Aij - (ki×kj)/(2m)] × δ(ci, cj)
+
+where:
+- Aij = edge weight between i and j
+- ki = degree of node i
+- m = total edge weight
+- δ(ci, cj) = 1 if same community, 0 otherwise
+```
+
+### Hierarchical Leiden
+
+```
+Hierarchical Leiden:
+- Recursively applies Leiden to each community
+- Creates a multi-level community structure
+- Controlled by the max_cluster_size parameter
+
+Level 0: Root community (all nodes)
+Level 1: First-level subcommunities
+Level 2: Second-level subcommunities
+...
+```
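+
+To connect the modularity formula Q to code, NetworkX can evaluate Q for any candidate partition. A quick illustrative check (not part of the Leiden implementation above):
+
+```python
+import networkx as nx
+from networkx.algorithms.community import modularity
+
+# Two obvious communities: a triangle and a square, joined by one bridge edge
+g = nx.Graph()
+g.add_edges_from([(0, 1), (1, 2), (0, 2),          # triangle
+                  (3, 4), (4, 5), (5, 6), (6, 3),  # square
+                  (2, 3)])                          # bridge
+
+good_split = [{0, 1, 2}, {3, 4, 5, 6}]
+bad_split = [{0, 3, 5}, {1, 2, 4, 6}]
+
+print(modularity(g, good_split))  # high Q (about 0.37)
+print(modularity(g, bad_split))   # negative Q
+```
+
+Leiden (and Louvain) search for the partition that maximizes exactly this quantity.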
+
+---
+
+## 3. Entity Extraction (LLM-based)
+
+### File Location
+```
+/graphrag/general/extractor.py
+/graphrag/light/graph_extractor.py
+```
+
+### Purpose
+Extracts entities and relationships from text using an LLM.
+
+### Implementation
+
+```python
+class GraphExtractor:
+    DEFAULT_ENTITY_TYPES = [
+        "organization", "person", "geo", "event", "category"
+    ]
+
+    async def _process_single_content(self, content, entity_types):
+        """
+        Extract entities from text using an LLM with iterative gleaning.
+        """
+        # Initial extraction
+        prompt = self._build_extraction_prompt(content, entity_types)
+        result = await self._llm_chat(prompt)
+
+        entities, relations = self._parse_result(result)
+
+        # Iterative gleaning (up to 2 times)
+        for _ in range(2):  # ENTITY_EXTRACTION_MAX_GLEANINGS
+            glean_prompt = self._build_glean_prompt(result)
+            glean_result = await self._llm_chat(glean_prompt)
+
+            # Check whether more entities were found
+            if "NO" in glean_result.upper():
+                break
+
+            new_entities, new_relations = self._parse_result(glean_result)
+            entities.extend(new_entities)
+            relations.extend(new_relations)
+
+        return entities, relations
+
+    def _parse_result(self, result):
+        """
+        Parse LLM output into structured entities/relations.
+
+        Entity format:   (entity_type, entity_name, description)
+        Relation format: (source, target, relation_type, description)
+        """
+        entities = []
+        relations = []
+
+        for line in result.split("\n"):
+            if line.startswith("(") and line.endswith(")"):
+                parts = line[1:-1].split(TUPLE_DELIMITER)
+                if len(parts) == 3:  # Entity
+                    entities.append({
+                        "type": parts[0],
+                        "name": parts[1],
+                        "description": parts[2]
+                    })
+                elif len(parts) == 4:  # Relation
+                    relations.append({
+                        "source": parts[0],
+                        "target": parts[1],
+                        "type": parts[2],
+                        "description": parts[3]
+                    })
+
+        return entities, relations
+```
+
+### Extraction Pipeline
+
+```
+Entity Extraction Pipeline:
+
+1. Initial Extraction
+   └── LLM extracts entities using a structured prompt
+
+2. Iterative Gleaning (max 2 iterations)
+   ├── Ask the LLM whether more entities exist
+   ├── If YES: extract the additional entities
+   └── If NO: stop gleaning
+
+3. Relation Extraction
+   └── Extract relationships between entities
+
+4. Graph Construction
+   └── Build a NetworkX graph from the entities/relations
+```
+
+---
+
+## 4. Entity Resolution
+
+### File Location
+```
+/graphrag/entity_resolution.py
+```
+
+### Purpose
+Merges duplicate entities in the knowledge graph.
+
+### Implementation
+
+```python
+import editdistance
+import networkx as nx
+
+class EntityResolution:
+    def is_similarity(self, a: str, b: str) -> bool:
+        """
+        Check whether two entity names are similar.
+        """
+        a, b = a.lower(), b.lower()
+
+        # Chinese: character-set intersection
+        if self._is_chinese(a):
+            a_set, b_set = set(a), set(b)
+            max_len = max(len(a_set), len(b_set))
+            overlap = len(a_set & b_set)
+            return overlap / max_len >= 0.8
+
+        # English: edit distance
+        else:
+            threshold = min(len(a), len(b)) // 2
+            distance = editdistance.eval(a, b)
+            return distance <= threshold
+
+    async def resolve(self, graph):
+        """
+        Resolve duplicate entities in the graph.
+        """
+        # 1. Find candidate pairs
+        nodes = list(graph.nodes())
+        candidates = []
+
+        for i, a in enumerate(nodes):
+            for b in nodes[i+1:]:
+                if self.is_similarity(a, b):
+                    candidates.append((a, b))
+
+        # 2. LLM verification (batched)
+        confirmed_pairs = []
+        for batch in self._batch(candidates, size=100):
+            results = await self._llm_verify_batch(batch)
+            confirmed_pairs.extend([
+                pair for pair, is_same in zip(batch, results)
+                if is_same
+            ])
+
+        # 3. Merge the confirmed pairs
+        for a, b in confirmed_pairs:
+            self._merge_nodes(graph, a, b)
+
+        # 4. Update PageRank
+        pagerank = nx.pagerank(graph)
+        for node in graph.nodes():
+            graph.nodes[node]["pagerank"] = pagerank[node]
+
+        return graph
+```
+
+### Similarity Metrics
+
+```
+English Similarity (Edit Distance):
+distance(a, b) ≤ min(len(a), len(b)) // 2
+
+Example:
+- "microsoft" vs "microsft" → distance=1 ≤ 4 → Similar
+- "google" vs "apple"       → distance=4 > 2 → Not similar
+
+Chinese Similarity (Character Set):
+|a ∩ b| / max(|a|, |b|) ≥ 0.8
+
+Example:
+- "北京大学" vs "北京大" → 3/4 = 0.75 → Not similar
+- "清华大学" vs "清华"   → 2/4 = 0.5  → Not similar
+```
+
+---
+
+## 5. Largest Connected Component (LCC)
+
+### File Location
+```
+/graphrag/general/leiden.py (line 67)
+```
+
+### Purpose
+Extracts the largest connected subgraph before community detection.
+
+### Implementation
+
+```python
+from graspologic.utils import largest_connected_component
+
+def _stabilize_graph(graph):
+    """
+    Extract and stabilize the largest connected component.
+    """
+    # Get the LCC
+    lcc = largest_connected_component(graph)
+
+    # Sort the nodes for reproducibility
+    sorted_nodes = sorted(lcc.nodes())
+    sorted_graph = lcc.subgraph(sorted_nodes).copy()
+
+    return sorted_graph
+```
+
+### LCC Algorithm
+
+```
+LCC Algorithm:
+
+1. Find all connected components using BFS/DFS
+2. Select the component with the maximum number of nodes
+3. Return the subgraph of that component
+
+Complexity: O(V + E)
+where V = vertices, E = edges
+```
+
+---
+
+## 6. N-hop Path Scoring
+
+### File Location
+```
+/graphrag/search.py (lines 181-187)
+```
+
+### Purpose
+Scores entities based on path distance in the graph.
+
+### Implementation
+
+```python
+def compute_nhop_scores(entity, neighbors, n_hops=2):
+    """
+    Score entities based on graph distance.
+    """
+    nhop_scores = {}
+
+    for neighbor in neighbors:
+        path = neighbor["path"]
+        weights = neighbor["weights"]
+
+        for i in range(len(path) - 1):
+            source, target = path[i], path[i + 1]
+
+            # Decay the score with distance
+            score = entity["sim"] / (2 + i)
+
+            if (source, target) in nhop_scores:
+                nhop_scores[(source, target)]["sim"] += score
+            else:
+                nhop_scores[(source, target)] = {"sim": score}
+
+    return nhop_scores
+```
+
+### Scoring Formula
+
+```
+N-hop Score with Decay:
+
+score(e, path_i) = similarity(e) / (2 + distance_i)
+
+where:
+- distance_i = number of hops from the source entity
+- 2 = base constant to prevent division issues
+
+Total score = Σ score(e, path_i) over all paths
+```
+
+---
+
+## Summary
+
+| Algorithm | Purpose | Library |
+|-----------|---------|---------|
+| PageRank | Entity importance | NetworkX |
+| Leiden | Community detection | graspologic |
+| Entity Extraction | KG construction | LLM |
+| Entity Resolution | Deduplication | editdistance + LLM |
+| LCC | Graph preprocessing | graspologic |
+| N-hop Scoring | Path-based ranking | Custom |
+
+## Related Files
+
+- `/graphrag/entity_resolution.py` - Entity resolution
+- `/graphrag/general/leiden.py` - Community detection
+- `/graphrag/general/extractor.py` - Entity extraction
+- `/graphrag/search.py` - Graph search

diff --git a/personal_analyze/06-ALGORITHMS/nlp_algorithms.md b/personal_analyze/06-ALGORITHMS/nlp_algorithms.md
new file mode 100644
index 000000000..f0c7e4911
--- /dev/null
+++ b/personal_analyze/06-ALGORITHMS/nlp_algorithms.md
@@ -0,0 +1,571 @@
+# NLP Algorithms
+
+## Overview
+
+RAGFlow uses multiple NLP algorithms for tokenization, term weighting, and query processing.
+
+## 1. Trie-based Tokenization
+
+### File Location
+```
+/rag/nlp/rag_tokenizer.py (lines 72-90, 120-240)
+```
+
+### Purpose
+Chinese word segmentation using a Trie data structure.
+
+### Implementation
+
+```python
+import string
+
+import datrie
+
+class RagTokenizer:
+    def __init__(self):
+        # Build a Trie over printable ASCII plus the CJK character range
+        self.trie = datrie.Trie(string.printable + "".join(
+            chr(i) for i in range(0x4E00, 0x9FFF)
+        ))
+
+        # Load the dictionary from huqie.txt
+        for line in open("rag/res/huqie.txt"):
+            word, freq, pos = line.strip().split("\t")
+            self.trie[word] = (int(freq), pos)
+
+    def _max_forward(self, text, start):
+        """
+        Max-forward matching: longest dictionary word starting at `start`.
+        """
+        end = len(text)
+        while end > start:
+            substr = text[start:end]
+            if substr in self.trie:
+                return substr, end
+            end -= 1
+        return text[start], start + 1
+
+    def _max_backward(self, text, end):
+        """
+        Max-backward matching: longest dictionary word ending at `end`.
+        """
+        start = 0
+        while start < end:
+            substr = text[start:end]
+            if substr in self.trie:
+                return substr, start
+            start += 1
+        return text[end-1], end - 1
+```
+
+### Trie Structure
+
+```
+Trie for the words: 机器, 机器学习, 机器人, 学习
+
+root
+├── 机
+│   └── 器         ← word: 机器
+│       ├── 学
+│       │   └── 习 ← word: 机器学习
+│       └── 人     ← word: 机器人
+└── 学
+    └── 习         ← word: 学习
+
+Lookup: O(m) where m = word length
+Insert: O(m)
+Space:  O(n × m) where n = number of words
+```
+
+### Max-Forward/Backward Algorithm
+
+```
+Max-Forward Matching:
+Input: "机器学习是人工智能"
+
+Step 1: Try "机器学习是人工智能" → Not found
+Step 2: Try "机器学习是人工"     → Not found
+...
+Step n: Try "机器学习"           → Found!
+Output: ["机器学习", ...]
+
+Max-Backward Matching:
+Input: "机器学习"
+
+Step 1: Try "机器学习" from the end → Found!
+Output: ["机器学习"]
+```
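+
+A tiny self-contained demo of max-forward matching, with a plain Python set standing in for the Trie (illustrative; the real implementation uses datrie for fast prefix lookups):
+
+```python
+dictionary = {"机器", "机器学习", "学习", "人工智能"}
+MAX_WORD_LEN = 4
+
+def max_forward_tokenize(text):
+    """Greedy longest-match segmentation, scanning left to right."""
+    tokens, start = [], 0
+    while start < len(text):
+        matched = text[start]  # fall back to a single character
+        for end in range(min(len(text), start + MAX_WORD_LEN), start, -1):
+            if text[start:end] in dictionary:
+                matched = text[start:end]
+                break
+        tokens.append(matched)
+        start += len(matched)
+    return tokens
+
+print(max_forward_tokenize("机器学习是人工智能"))
+# ['机器学习', '是', '人工智能']
+```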
+
+---
+
+## 2. DFS with Memoization (Disambiguation)
+
+### File Location
+```
+/rag/nlp/rag_tokenizer.py (lines 120-210)
+```
+
+### Purpose
+Resolves ambiguity when a string has more than one possible tokenization.
+
+### Implementation
+
+```python
+def dfs_(self, text, memo=None):
+    """
+    DFS with memoization for tokenization disambiguation.
+    """
+    if memo is None:  # avoid the mutable-default-argument pitfall
+        memo = {}
+    if text in memo:
+        return memo[text]
+
+    if not text:
+        return [[]]
+
+    results = []
+    for end in range(1, len(text) + 1):
+        prefix = text[:end]
+        if prefix in self.trie or len(prefix) == 1:
+            for suffix in self.dfs_(text[end:], memo):
+                results.append([prefix] + suffix)
+
+    # Score the candidates and keep only the best tokenization
+    best = max(results, key=lambda toks: self.score_(text, toks))
+    memo[text] = [best]
+    return [best]
+
+def score_(self, text, tokens):
+    """
+    Score tokenization quality.
+
+    Formula: score = B/len(tokens) + L + F
+    where:
+      B = 30 (bonus for fewer tokens)
+      L = combined token length / original text length (coverage)
+      F = sum of dictionary frequencies
+    """
+    B = 30
+    L = sum(len(t) for t in tokens) / max(1, len(text))
+    F = sum(self.trie.get(t, (1, ''))[0] for t in tokens)
+
+    return B / max(1, len(tokens)) + L + F
+```
+
+### Scoring Function
+
+```
+Tokenization Scoring:
+
+score(tokens) = B/n + L + F
+
+where:
+- B = 30 (base bonus)
+- n = number of tokens (fewer is better)
+- L = coverage ratio
+- F = sum of word frequencies (common words are preferred)
+
+Example:
+"北京大学" →
+  Option 1: ["北京", "大学"]        → score = 30/2 + 1.0 + (1000+500)    = 1516
+  Option 2: ["北", "京", "大", "学"] → score = 30/4 + 1.0 + (10+10+10+10) = 48.5
+
+Winner: Option 1
+```
+
+---
+
+## 3. TF-IDF Term Weighting
+
+### File Location
+```
+/rag/nlp/term_weight.py (lines 162-244)
+```
+
+### Purpose
+Computes an importance weight for each term in the query.
+
+### Implementation
+
+```python
+import math
+
+import numpy as np
+
+class Dealer:
+    def weights(self, tokens, preprocess=True):
+        """
+        Calculate TF-IDF based weights for tokens.
+        """
+        def idf(s, N):
+            return math.log10(10 + ((N - s + 0.5) / (s + 0.5)))
+
+        # IDF based on term frequency in the corpus
+        idf1 = np.array([idf(self.freq(t), 10000000) for t in tokens])
+
+        # IDF based on document frequency
+        idf2 = np.array([idf(self.df(t), 1000000000) for t in tokens])
+
+        # NER and POS weights
+        ner_weights = np.array([self.ner(t) for t in tokens])
+        pos_weights = np.array([self.postag(t) for t in tokens])
+
+        # Combined weight
+        weights = (0.3 * idf1 + 0.7 * idf2) * ner_weights * pos_weights
+
+        # Normalize
+        total = np.sum(weights)
+        return [(t, w / total) for t, w in zip(tokens, weights)]
+```
+
+### Formula
+
+```
+TF-IDF Variant:
+
+IDF(term) = log₁₀(10 + (N - df + 0.5) / (df + 0.5))
+
+where:
+- N = total documents (10⁹ for df, 10⁷ for freq)
+- df = document frequency of the term
+
+Combined Weight:
+weight(term) = (0.3 × IDF_freq + 0.7 × IDF_df) × NER × POS
+
+Normalization:
+normalized_weight(term) = weight(term) / Σ weight(all_terms)
+```
+
+---
+
+## 4. Named Entity Recognition (NER)
+
+### File Location
+```
+/rag/nlp/term_weight.py (lines 84-86, 144-149)
+```
+
+### Purpose
+Dictionary-based entity type detection with weight assignment.
+
+### Implementation
+
+```python
+import json
+
+class Dealer:
+    def __init__(self):
+        # Load the NER dictionary
+        self.ner_dict = json.load(open("rag/res/ner.json"))
+
+    def ner(self, token):
+        """
+        Get the NER weight for a token.
+        """
+        NER_WEIGHTS = {
+            "toxic": 2.0,    # Toxic/sensitive words
+            "func": 1.0,     # Functional words
+            "corp": 3.0,     # Corporation names
+            "loca": 3.0,     # Location names
+            "sch": 3.0,      # School names
+            "stock": 3.0,    # Stock symbols
+            "firstnm": 1.0,  # First names
+        }
+
+        for entity_type, weight in NER_WEIGHTS.items():
+            if token in self.ner_dict.get(entity_type, set()):
+                return weight
+
+        return 1.0  # Default
+```
+
+### Entity Types
+
+```
+NER Categories:
+┌──────────┬────────┬─────────────────────────────┐
+│ Type     │ Weight │ Examples                    │
+├──────────┼────────┼─────────────────────────────┤
+│ corp     │ 3.0    │ Microsoft, Google, Apple    │
+│ loca     │ 3.0    │ Beijing, New York           │
+│ sch      │ 3.0    │ MIT, Stanford               │
+│ stock    │ 3.0    │ AAPL, GOOG                  │
+│ toxic    │ 2.0    │ (sensitive words)           │
+│ func     │ 1.0    │ the, is, are                │
+│ firstnm  │ 1.0    │ John, Mary                  │
+└──────────┴────────┴─────────────────────────────┘
+```
+
+---
+
+## 5. Part-of-Speech (POS) Tagging
+
+### File Location
+```
+/rag/nlp/term_weight.py (lines 179-189)
+```
+
+### Purpose
+Assigns weights based on grammatical category.
+
+### Implementation
+
+```python
+import re
+
+def postag(self, token):
+    """
+    Get the POS weight for a token.
+    """
+ """ + POS_WEIGHTS = { + "r": 0.3, # Pronoun + "c": 0.3, # Conjunction + "d": 0.3, # Adverb + "ns": 3.0, # Place noun + "nt": 3.0, # Organization noun + "n": 2.0, # Common noun + } + + # Get POS tag from tokenizer + pos = self.tokenizer.tag(token) + + # Check for numeric patterns + if re.match(r"^[\d.]+$", token): + return 2.0 + + return POS_WEIGHTS.get(pos, 1.0) +``` + +### POS Weight Table + +``` +POS Weight Assignments: +┌───────┬────────┬─────────────────────┐ +│ Tag │ Weight │ Description │ +├───────┼────────┼─────────────────────┤ +│ n │ 2.0 │ Common noun │ +│ ns │ 3.0 │ Place noun │ +│ nt │ 3.0 │ Organization noun │ +│ v │ 1.0 │ Verb │ +│ a │ 1.0 │ Adjective │ +│ r │ 0.3 │ Pronoun │ +│ c │ 0.3 │ Conjunction │ +│ d │ 0.3 │ Adverb │ +│ num │ 2.0 │ Number │ +└───────┴────────┴─────────────────────┘ +``` + +--- + +## 6. Synonym Detection + +### File Location +``` +/rag/nlp/synonym.py (lines 71-93) +``` + +### Purpose +Query expansion qua synonym lookup. + +### Implementation + +```python +from nltk.corpus import wordnet + +class SynonymLookup: + def __init__(self): + # Load custom dictionary + self.custom_dict = json.load(open("rag/res/synonym.json")) + + def lookup(self, token, top_n=8): + """ + Find synonyms for token. + + Strategy: + 1. Check custom dictionary first + 2. Fall back to WordNet for English + """ + # Custom dictionary + if token in self.custom_dict: + return self.custom_dict[token][:top_n] + + # WordNet for English words + if re.match(r"^[a-z]+$", token.lower()): + synonyms = set() + for syn in wordnet.synsets(token): + for lemma in syn.lemmas(): + name = lemma.name().replace("_", " ") + if name.lower() != token.lower(): + synonyms.add(name) + + return list(synonyms)[:top_n] + + return [] +``` + +### Synonym Sources + +``` +Synonym Lookup Strategy: + +1. Custom Dictionary (highest priority) + - Domain-specific synonyms + - Chinese synonyms + - Technical terms + +2. WordNet (English only) + - General synonyms + - Lemma extraction from synsets + +Example: +"computer" → WordNet → ["machine", "calculator", "computing device"] +"机器学习" → Custom → ["ML", "machine learning", "深度学习"] +``` + +--- + +## 7. Query Expansion + +### File Location +``` +/rag/nlp/query.py (lines 85-218) +``` + +### Purpose +Build expanded query với weighted terms và synonyms. + +### Implementation + +```python +class FulltextQueryer: + QUERY_FIELDS = [ + "title_tks^10", # Title: 10x boost + "title_sm_tks^5", # Title sub-tokens: 5x + "important_kwd^30", # Keywords: 30x + "important_tks^20", # Keyword tokens: 20x + "question_tks^20", # Question tokens: 20x + "content_ltks^2", # Content: 2x + "content_sm_ltks^1", # Content sub-tokens: 1x + ] + + def question(self, text, min_match=0.6): + """ + Build expanded query. + """ + # 1. Tokenize + tokens = self.tokenizer.tokenize(text) + + # 2. Get weights + weighted_tokens = self.term_weight.weights(tokens) + + # 3. Get synonyms + synonyms = [self.synonym.lookup(t) for t, _ in weighted_tokens] + + # 4. Build query string + query_parts = [] + for (token, weight), syns in zip(weighted_tokens, synonyms): + if syns: + # Token with synonyms + syn_str = " ".join(syns) + query_parts.append(f"({token}^{weight:.4f} OR ({syn_str})^0.2)") + else: + query_parts.append(f"{token}^{weight:.4f}") + + # 5. 
+        # 5. Add phrase queries (bigrams)
+        for i in range(1, len(weighted_tokens)):
+            left, _ = weighted_tokens[i-1]
+            right, w = weighted_tokens[i]
+            query_parts.append(f'"{left} {right}"^{w*2:.4f}')
+
+        return MatchTextExpr(
+            query=" ".join(query_parts),
+            fields=self.QUERY_FIELDS,
+            min_match=f"{int(min_match * 100)}%"
+        )
+```
+
+### Query Expansion Example
+
+```
+Input: "machine learning tutorial"
+
+After expansion:
+(machine^0.35 OR (computer device)^0.2)
+(learning^0.40 OR (study education)^0.2)
+(tutorial^0.25 OR (guide lesson)^0.2)
+"machine learning"^0.80
+"learning tutorial"^0.50
+
+With field boosting:
+{
+  "query_string": {
+    "query": "(machine^0.35 learning^0.40 tutorial^0.25)",
+    "fields": ["title_tks^10", "important_kwd^30", "content_ltks^2"],
+    "minimum_should_match": "60%"
+  }
+}
+```
+
+---
+
+## 8. Fine-Grained Tokenization
+
+### File Location
+```
+/rag/nlp/rag_tokenizer.py (lines 395-420)
+```
+
+### Purpose
+Secondary tokenization for compound words.
+
+### Implementation
+
+```python
+def fine_grained_tokenize(self, text):
+    """
+    Break compound words into sub-tokens.
+    """
+    # First pass: standard tokenization
+    tokens = self.tokenize(text)
+
+    fine_tokens = []
+    for token in tokens:
+        # Skip short tokens
+        if len(token) < 3:
+            fine_tokens.append(token)
+            continue
+
+        # Try to break the token into sub-tokens
+        sub_tokens = self.dfs_(token)
+        if len(sub_tokens[0]) > 1:
+            fine_tokens.extend(sub_tokens[0])
+        else:
+            fine_tokens.append(token)
+
+    return fine_tokens
+```
+
+### Example
+
+```
+Standard:     "机器学习" → ["机器学习"]
+Fine-grained: "机器学习" → ["机器", "学习"]
+
+Standard:     "人工智能" → ["人工智能"]
+Fine-grained: "人工智能" → ["人工", "智能"]
+```
+
+---
+
+## Summary
+
+| Algorithm | Purpose | File |
+|-----------|---------|------|
+| Trie Tokenization | Word segmentation | rag_tokenizer.py |
+| Max-Forward/Backward | Matching strategy | rag_tokenizer.py |
+| DFS + Memo | Disambiguation | rag_tokenizer.py |
+| TF-IDF | Term weighting | term_weight.py |
+| NER | Entity detection | term_weight.py |
+| POS Tagging | Grammatical analysis | term_weight.py |
+| Synonym | Query expansion | synonym.py |
+| Query Expansion | Search enhancement | query.py |
+| Fine-grained | Sub-tokenization | rag_tokenizer.py |
+
+## Related Files
+
+- `/rag/nlp/rag_tokenizer.py` - Tokenization
+- `/rag/nlp/term_weight.py` - TF-IDF, NER, POS
+- `/rag/nlp/synonym.py` - Synonym lookup
+- `/rag/nlp/query.py` - Query processing

diff --git a/personal_analyze/06-ALGORITHMS/similarity_metrics.md b/personal_analyze/06-ALGORITHMS/similarity_metrics.md
new file mode 100644
index 000000000..9ad32dc41
--- /dev/null
+++ b/personal_analyze/06-ALGORITHMS/similarity_metrics.md
@@ -0,0 +1,455 @@
+# Similarity & Distance Metrics
+
+## Overview
+
+RAGFlow uses multiple similarity metrics for search, ranking, and entity resolution.
+
+## 1. Cosine Similarity
+
+### File Location
+```
+/rag/nlp/query.py (line 221)
+/rag/raptor.py (line 189)
+/rag/nlp/search.py (line 60)
+```
+
+### Purpose
+Measures the similarity between two vectors (embeddings).
+
+### Formula
+
+```
+Cosine Similarity:
+
+cos(θ) = (A · B) / (||A|| × ||B||)
+
+       = Σ(Ai × Bi) / (√Σ(Ai²) × √Σ(Bi²))
+
+Range: [-1, 1]
+- cos = 1:  identical direction
+- cos = 0:  orthogonal
+- cos = -1: opposite direction
+
+For normalized vectors:
+cos(θ) = A · B (dot product only)
+```
+
+### Implementation
+
+```python
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+
+def compute_cosine_similarity(vec1, vec2):
+    """
+    Compute cosine similarity between two vectors.
+    """
+    # Using sklearn
+    sim = cosine_similarity([vec1], [vec2])[0][0]
+    return sim
+
+def compute_batch_similarity(query_vec, doc_vecs):
+    """
+    Compute similarity between a query and multiple documents.
+    """
+    # Returns an array of similarities
+    sims = cosine_similarity([query_vec], doc_vecs)[0]
+    return sims
+
+# Manual implementation
+def cosine_sim_manual(a, b):
+    dot_product = np.dot(a, b)
+    norm_a = np.linalg.norm(a)
+    norm_b = np.linalg.norm(b)
+    return dot_product / (norm_a * norm_b)
+```
+
+### Usage in RAGFlow
+
+```python
+# Vector search scoring
+def hybrid_similarity(self, query_vec, doc_vecs, tkweight=0.3, vtweight=0.7):
+    # Cosine similarity for vectors
+    vsim = cosine_similarity([query_vec], doc_vecs)[0]
+
+    # Token similarity (query_tokens / doc_tokens are prepared by the caller)
+    tksim = self.token_similarity(query_tokens, doc_tokens)
+
+    # Weighted combination
+    combined = vsim * vtweight + tksim * tkweight
+
+    return combined
+```
+ """ + # Using sklearn + sim = cosine_similarity([vec1], [vec2])[0][0] + return sim + +def compute_batch_similarity(query_vec, doc_vecs): + """ + Compute similarity between query and multiple documents. + """ + # Returns array of similarities + sims = cosine_similarity([query_vec], doc_vecs)[0] + return sims + +# Manual implementation +def cosine_sim_manual(a, b): + dot_product = np.dot(a, b) + norm_a = np.linalg.norm(a) + norm_b = np.linalg.norm(b) + return dot_product / (norm_a * norm_b) +``` + +### Usage in RAGFlow + +```python +# Vector search scoring +def hybrid_similarity(self, query_vec, doc_vecs, tkweight=0.3, vtweight=0.7): + # Cosine similarity for vectors + vsim = cosine_similarity([query_vec], doc_vecs)[0] + + # Token similarity + tksim = self.token_similarity(query_tokens, doc_tokens) + + # Weighted combination + combined = vsim * vtweight + tksim * tkweight + + return combined +``` + +--- + +## 2. Edit Distance (Levenshtein) + +### File Location +``` +/graphrag/entity_resolution.py (line 28, 246) +``` + +### Purpose +Measure string similarity cho entity resolution. + +### Formula + +``` +Edit Distance (Levenshtein): + +d(a, b) = minimum number of single-character edits + (insertions, deletions, substitutions) + +Dynamic Programming: +d[i][j] = min( + d[i-1][j] + 1, # deletion + d[i][j-1] + 1, # insertion + d[i-1][j-1] + c # substitution (c=0 if same, 1 if different) +) + +Base cases: +d[i][0] = i +d[0][j] = j +``` + +### Implementation + +```python +import editdistance + +def is_similar_by_edit_distance(a: str, b: str) -> bool: + """ + Check if two strings are similar using edit distance. + + Threshold: distance ≤ min(len(a), len(b)) // 2 + """ + a, b = a.lower(), b.lower() + threshold = min(len(a), len(b)) // 2 + distance = editdistance.eval(a, b) + return distance <= threshold + +# Examples: +# "microsoft" vs "microsft" → distance=1, threshold=4 → Similar +# "google" vs "apple" → distance=5, threshold=2 → Not similar +``` + +### Similarity Threshold + +``` +Edit Distance Threshold Strategy: + +threshold = min(len(a), len(b)) // 2 + +Rationale: +- Allows ~50% character differences +- Handles typos and minor variations +- Stricter for short strings + +Examples: +| String A | String B | Distance | Threshold | Similar? | +|-------------|-------------|----------|-----------|----------| +| microsoft | microsft | 1 | 4 | Yes | +| google | googl | 1 | 2 | Yes | +| amazon | apple | 5 | 2 | No | +| ibm | ibm | 0 | 1 | Yes | +``` + +--- + +## 3. Chinese Character Similarity + +### File Location +``` +/graphrag/entity_resolution.py (lines 250-255) +``` + +### Purpose +Similarity measure cho Chinese entity names. + +### Formula + +``` +Chinese Character Similarity: + +sim(a, b) = |set(a) ∩ set(b)| / max(|set(a)|, |set(b)|) + +Threshold: sim ≥ 0.8 + +Example: +a = "北京大学" → set = {北, 京, 大, 学} +b = "北京大" → set = {北, 京, 大} +intersection = {北, 京, 大} +sim = 3 / max(4, 3) = 3/4 = 0.75 < 0.8 → Not similar +``` + +### Implementation + +```python +def is_similar_chinese(a: str, b: str) -> bool: + """ + Check if two Chinese strings are similar. + Uses character set intersection. + """ + a_set = set(a) + b_set = set(b) + + max_len = max(len(a_set), len(b_set)) + intersection = len(a_set & b_set) + + similarity = intersection / max_len + + return similarity >= 0.8 + +# Examples: +# "清华大学" vs "清华" → 2/4 = 0.5 → Not similar +# "人工智能" vs "人工智慧" → 3/4 = 0.75 → Not similar +# "机器学习" vs "机器学习研究" → 4/6 = 0.67 → Not similar +``` + +--- + +## 4. 
+
+---
+
+## 3. Chinese Character Similarity
+
+### File Location
+```
+/graphrag/entity_resolution.py (lines 250-255)
+```
+
+### Purpose
+A similarity measure for Chinese entity names.
+
+### Formula
+
+```
+Chinese Character Similarity:
+
+sim(a, b) = |set(a) ∩ set(b)| / max(|set(a)|, |set(b)|)
+
+Threshold: sim ≥ 0.8
+
+Example:
+a = "北京大学" → set = {北, 京, 大, 学}
+b = "北京大"   → set = {北, 京, 大}
+intersection = {北, 京, 大}
+sim = 3 / max(4, 3) = 3/4 = 0.75 < 0.8 → Not similar
+```
+
+### Implementation
+
+```python
+def is_similar_chinese(a: str, b: str) -> bool:
+    """
+    Check whether two Chinese strings are similar.
+    Uses character-set intersection.
+    """
+    a_set = set(a)
+    b_set = set(b)
+
+    max_len = max(len(a_set), len(b_set))
+    intersection = len(a_set & b_set)
+
+    similarity = intersection / max_len
+
+    return similarity >= 0.8
+
+# Examples:
+# "清华大学" vs "清华"         → 2/4 = 0.5  → Not similar
+# "人工智能" vs "人工智慧"     → 3/4 = 0.75 → Not similar
+# "机器学习" vs "机器学习研究" → 4/6 = 0.67 → Not similar
+```
+
+---
+
+## 4. Token Similarity (Weighted)
+
+### File Location
+```
+/rag/nlp/query.py (lines 230-242)
+```
+
+### Purpose
+Measures similarity based on weighted token overlap.
+
+### Formula
+
+```
+Token Similarity:
+
+sim(query, doc) = Σ weight(t) for t ∈ (query ∩ doc)
+                  ────────────────────────────────
+                  Σ weight(t) for t ∈ query
+
+where weight(t) = TF-IDF weight of token t
+
+Range: [0, 1]
+- 0: no token overlap
+- 1: all query tokens appear in the document
+```
+
+### Implementation
+
+```python
+def token_similarity(self, query_tokens_weighted, doc_tokens):
+    """
+    Compute weighted token similarity.
+
+    Args:
+        query_tokens_weighted: [(token, weight), ...]
+        doc_tokens: set of document tokens
+
+    Returns:
+        Similarity score in [0, 1]
+    """
+    doc_set = set(doc_tokens)
+
+    matched_weight = 0
+    total_weight = 0
+
+    for token, weight in query_tokens_weighted:
+        total_weight += weight
+        if token in doc_set:
+            matched_weight += weight
+
+    if total_weight == 0:
+        return 0
+
+    return matched_weight / total_weight
+
+# Example:
+# query = [("machine", 0.4), ("learning", 0.35), ("tutorial", 0.25)]
+# doc = {"machine", "learning", "introduction"}
+# matched = 0.4 + 0.35 = 0.75
+# total = 1.0
+# similarity = 0.75
+```
+
+---
+
+## 5. Hybrid Similarity
+
+### File Location
+```
+/rag/nlp/query.py (lines 220-228)
+```
+
+### Purpose
+Combines token and vector similarity.
+
+### Formula
+
+```
+Hybrid Similarity:
+
+hybrid = α × token_sim + β × vector_sim
+
+where:
+- α = text weight (default: 0.3)
+- β = vector weight (default: 0.7)
+- α + β = 1.0
+
+Alternative with rank features:
+hybrid = (α × token_sim + β × vector_sim) × (1 + γ × pagerank)
+```
+
+### Implementation
+
+```python
+import numpy as np
+
+def hybrid_similarity(self, query_vec, doc_vecs,
+                      query_tokens, doc_tokens_list,
+                      tkweight=0.3, vtweight=0.7):
+    """
+    Compute hybrid similarity combining token and vector similarity.
+    """
+    from sklearn.metrics.pairwise import cosine_similarity
+
+    # Vector similarity (cosine)
+    vsim = cosine_similarity([query_vec], doc_vecs)[0]
+
+    # Token similarity
+    tksim = []
+    for doc_tokens in doc_tokens_list:
+        sim = self.token_similarity(query_tokens, doc_tokens)
+        tksim.append(sim)
+
+    tksim = np.array(tksim)
+
+    # Handle the edge case of all-zero vector similarity
+    if np.sum(vsim) == 0:
+        return tksim, tksim, vsim
+
+    # Weighted combination
+    combined = vsim * vtweight + tksim * tkweight
+
+    return combined, tksim, vsim
+```
+
+### Weight Recommendations
+
+```
+Hybrid Weights by Use Case:
+┌─────────────────────────┬────────┬────────┐
+│ Use Case                │ Token  │ Vector │
+├─────────────────────────┼────────┼────────┤
+│ Conversational/Semantic │ 0.05   │ 0.95   │
+│ Technical Documentation │ 0.30   │ 0.70   │
+│ Legal/Exact Match       │ 0.40   │ 0.60   │
+│ Code Search             │ 0.50   │ 0.50   │
+│ Default                 │ 0.30   │ 0.70   │
+└─────────────────────────┴────────┴────────┘
+```
+ """ + # Intersection + x1 = max(box1[0], box2[0]) + y1 = max(box1[1], box2[1]) + x2 = min(box1[2], box2[2]) + y2 = min(box1[3], box2[3]) + + intersection = max(0, x2 - x1) * max(0, y2 - y1) + + # Union + area1 = (box1[2] - box1[0]) * (box1[3] - box1[1]) + area2 = (box2[2] - box2[0]) * (box2[3] - box2[1]) + union = area1 + area2 - intersection + + return intersection / union if union > 0 else 0 +``` + +--- + +## 7. N-gram Similarity + +### File Location +``` +/graphrag/entity_resolution.py (2-gram analysis) +``` + +### Purpose +Check digit differences trong entity names. + +### Implementation + +```python +def check_2gram_digit_difference(a: str, b: str) -> bool: + """ + Check if strings differ only in digit 2-grams. + """ + def get_2grams(s): + return [s[i:i+2] for i in range(len(s)-1)] + + a_grams = get_2grams(a) + b_grams = get_2grams(b) + + # Find different 2-grams + diff_grams = set(a_grams) ^ set(b_grams) + + # Check if all differences are digit-only + for gram in diff_grams: + if not gram.isdigit(): + return False + + return True + +# Example: +# "product2023" vs "product2024" → True (only digit diff) +# "productA" vs "productB" → False (letter diff) +``` + +--- + +## Summary Table + +| Metric | Formula | Range | Use Case | +|--------|---------|-------|----------| +| Cosine | A·B / (‖A‖×‖B‖) | [-1, 1] | Vector search | +| Edit Distance | min edits | [0, ∞) | String matching | +| Chinese Char | \|A∩B\| / max(\|A\|,\|B\|) | [0, 1] | Chinese entities | +| Token | Σw(matched) / Σw(all) | [0, 1] | Keyword matching | +| Hybrid | α×token + β×vector | [0, 1] | Combined search | +| IoU | intersection / union | [0, 1] | Box overlap | + +## Related Files + +- `/rag/nlp/query.py` - Similarity calculations +- `/rag/nlp/search.py` - Search ranking +- `/graphrag/entity_resolution.py` - Entity matching +- `/deepdoc/vision/operators.py` - Box metrics diff --git a/personal_analyze/06-ALGORITHMS/vision_algorithms.md b/personal_analyze/06-ALGORITHMS/vision_algorithms.md new file mode 100644 index 000000000..a72be4b8e --- /dev/null +++ b/personal_analyze/06-ALGORITHMS/vision_algorithms.md @@ -0,0 +1,637 @@ +# Vision Algorithms + +## Tong Quan + +RAGFlow sử dụng computer vision algorithms cho document understanding, OCR, và layout analysis. + +## 1. OCR (Optical Character Recognition) + +### File Location +``` +/deepdoc/vision/ocr.py (lines 30-120) +``` + +### Purpose +Text detection và recognition từ document images. + +### Implementation + +```python +import onnxruntime as ort + +class OCR: + def __init__(self): + # Load ONNX models + self.det_model = ort.InferenceSession("ocr_det.onnx") + self.rec_model = ort.InferenceSession("ocr_rec.onnx") + + def detect(self, image, device_id=0): + """ + Detect text regions in image. + + Returns: + List of bounding boxes with confidence scores + """ + # Preprocess + img = self._preprocess_det(image) + + # Run detection + outputs = self.det_model.run(None, {"input": img}) + + # Post-process to get boxes + boxes = self._postprocess_det(outputs[0]) + + return boxes + + def recognize(self, image, boxes): + """ + Recognize text in detected regions. 
+
+---
+
+## 2. Layout Recognition (YOLOv10)
+
+### File Location
+```
+/deepdoc/vision/layout_recognizer.py (lines 33-100)
+```
+
+### Purpose
+Detects document layout elements (text, title, table, figure, etc.).
+
+### Implementation
+
+```python
+import onnxruntime as ort
+
+class LayoutRecognizer:
+    LABELS = [
+        "text", "title", "figure", "figure caption",
+        "table", "table caption", "header", "footer",
+        "reference", "equation"
+    ]
+
+    def __init__(self):
+        self.model = ort.InferenceSession("layout_yolov10.onnx")
+
+    def detect(self, image):
+        """
+        Detect layout elements in a document image.
+        """
+        # Preprocess (resize, normalize)
+        img = self._preprocess(image)
+
+        # Run inference
+        outputs = self.model.run(None, {"images": img})
+
+        # Post-process
+        boxes, labels, scores = self._postprocess(outputs[0])
+
+        # Filter by confidence
+        results = []
+        for box, label, score in zip(boxes, labels, scores):
+            if score > 0.4:  # Confidence threshold
+                results.append({
+                    "box": box,
+                    "type": self.LABELS[label],
+                    "confidence": score
+                })
+
+        return results
+```
+
+### Layout Types
+
+```
+Document Layout Categories:
+┌──────────────────┬────────────────────────────────────┐
+│ Type             │ Description                        │
+├──────────────────┼────────────────────────────────────┤
+│ text             │ Body text paragraphs               │
+│ title            │ Section/document titles            │
+│ figure           │ Images, diagrams, charts           │
+│ figure caption   │ Text describing figures            │
+│ table            │ Data tables                        │
+│ table caption    │ Text describing tables             │
+│ header           │ Page headers                       │
+│ footer           │ Page footers                       │
+│ reference        │ Bibliography, citations            │
+│ equation         │ Mathematical equations             │
+└──────────────────┴────────────────────────────────────┘
+```
+
+### YOLO Detection
+
+```
+YOLOv10 Detection:
+
+1. Backbone: feature extraction (CSPDarknet)
+2. Neck: feature pyramid (PANet)
+3. Head: prediction heads for different scales
+
+Output format:
+[x_center, y_center, width, height, confidence, class_probs...]
+
+Post-processing:
+1. Apply sigmoid to the confidence
+2. Multiply conf × class_prob for class scores
+3. Filter by the score threshold
+4. Apply NMS
+```
+
+### Layout Types
+
+```
+Document Layout Categories:
+┌──────────────────┬────────────────────────────────────┐
+│ Type             │ Description                        │
+├──────────────────┼────────────────────────────────────┤
+│ text             │ Body text paragraphs               │
+│ title            │ Section/document titles            │
+│ figure           │ Images, diagrams, charts           │
+│ figure caption   │ Text describing figures            │
+│ table            │ Data tables                        │
+│ table caption    │ Text describing tables             │
+│ header           │ Page headers                       │
+│ footer           │ Page footers                       │
+│ reference        │ Bibliography, citations            │
+│ equation         │ Mathematical equations             │
+└──────────────────┴────────────────────────────────────┘
+```
+
+### YOLO Detection
+
+```
+YOLOv10 Detection:
+
+1. Backbone: Feature extraction (CSPDarknet)
+2. Neck: Feature pyramid (PANet)
+3. Head: Prediction heads for different scales
+
+Output format:
+[x_center, y_center, width, height, confidence, class_probs...]
+
+Post-processing:
+1. Apply sigmoid to confidence
+2. Multiply conf × class_prob for class scores
+3. Filter by score threshold
+4. Apply NMS
+```
+
+---
+
+## 3. Table Structure Recognition (TSR)
+
+### File Location
+```
+/deepdoc/vision/table_structure_recognizer.py (lines 30-100)
+```
+
+### Purpose
+Detect table structure (rows, columns, cells, headers).
+
+### Implementation
+
+```python
+class TableStructureRecognizer:
+    LABELS = [
+        "table", "table column", "table row",
+        "table column header", "projected row header",
+        "spanning cell"
+    ]
+
+    def __init__(self):
+        self.model = ort.InferenceSession("table_structure.onnx")
+
+    def recognize(self, table_image):
+        """
+        Recognize structure of a table image.
+        """
+        # Preprocess
+        img = self._preprocess(table_image)
+
+        # Run inference
+        outputs = self.model.run(None, {"input": img})
+
+        # Parse structure
+        structure = self._parse_structure(outputs)
+
+        return structure
+
+    def _parse_structure(self, outputs):
+        """
+        Parse model output into table structure.
+        """
+        rows = []
+        columns = []
+        cells = []
+
+        for detection in outputs:
+            label = self.LABELS[detection["class"]]
+
+            if label == "table row":
+                rows.append(detection["box"])
+            elif label == "table column":
+                columns.append(detection["box"])
+            elif label == "spanning cell":
+                cells.append({
+                    "box": detection["box"],
+                    "colspan": self._estimate_colspan(detection, columns),
+                    "rowspan": self._estimate_rowspan(detection, rows)
+                })
+
+        return {
+            "rows": sorted(rows, key=lambda x: x[1]),        # Sort by Y
+            "columns": sorted(columns, key=lambda x: x[0]),  # Sort by X
+            "cells": cells
+        }
+```
+
+### TSR Output
+
+```
+Table Structure Output:
+
+{
+    "rows": [
+        {"y": 10, "height": 30},   # Row 1
+        {"y": 40, "height": 30},   # Row 2
+        ...
+    ],
+    "columns": [
+        {"x": 0, "width": 100},    # Col 1
+        {"x": 100, "width": 150},  # Col 2
+        ...
+    ],
+    "cells": [
+        {"row": 0, "col": 0, "text": "Header 1"},
+        {"row": 0, "col": 1, "text": "Header 2"},
+        {"row": 1, "col": 0, "text": "Data 1", "colspan": 2},
+        ...
+    ]
+}
+```
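+
+To go from detected row and column boxes to the cell grid above, the boxes can be intersected pairwise. The sketch below is a minimal illustration of that step (independent of the actual `_parse_structure` internals); box format and sort order follow the code above:
+
+```python
+def boxes_to_grid(rows, columns):
+    """
+    Intersect row boxes with column boxes to enumerate grid cells.
+    rows, columns: lists of [x1, y1, x2, y2], sorted by Y and X.
+    """
+    cells = []
+    for r, row in enumerate(rows):
+        for c, col in enumerate(columns):
+            x1, y1 = max(row[0], col[0]), max(row[1], col[1])
+            x2, y2 = min(row[2], col[2]), min(row[3], col[3])
+            if x2 > x1 and y2 > y1:  # non-empty intersection
+                cells.append({"row": r, "col": c, "box": [x1, y1, x2, y2]})
+    return cells
+
+# Example: a 2-row × 2-column table → 4 cells
+rows = [[0, 10, 250, 40], [0, 40, 250, 70]]
+columns = [[0, 0, 100, 70], [100, 0, 250, 70]]
+assert len(boxes_to_grid(rows, columns)) == 4
+```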
+
+---
+
+## 4. Non-Maximum Suppression (NMS)
+
+### File Location
+```
+/deepdoc/vision/operators.py (lines 702-725)
+```
+
+### Purpose
+Filter overlapping bounding boxes in object detection.
+
+### Implementation
+
+```python
+import numpy as np
+
+def nms(boxes, scores, iou_threshold=0.5):
+    """
+    Non-Maximum Suppression algorithm.
+
+    Args:
+        boxes: Array of [x1, y1, x2, y2] rows
+        scores: Confidence scores
+        iou_threshold: IoU threshold for suppression
+
+    Returns:
+        Indices of kept boxes
+    """
+    # Sort by score (descending)
+    indices = np.argsort(scores)[::-1]
+
+    keep = []
+    while len(indices) > 0:
+        # Keep highest scoring box
+        current = indices[0]
+        keep.append(current)
+
+        if len(indices) == 1:
+            break
+
+        # Compute IoU with remaining boxes
+        # (expects an IoU helper that broadcasts over the second
+        #  argument; see the vectorized sketch at the end of section 5)
+        remaining = indices[1:]
+        ious = compute_iou(boxes[current], boxes[remaining])
+
+        # Keep boxes with IoU below threshold
+        indices = remaining[ious < iou_threshold]
+
+    return keep
+```
+
+### NMS Algorithm
+
+```
+NMS (Non-Maximum Suppression):
+
+Input: Boxes B, Scores S, Threshold θ
+Output: Filtered boxes
+
+Algorithm:
+1. Sort boxes by score (descending)
+2. Select box with highest score → add to results
+3. Remove boxes with IoU > θ with selected box
+4. Repeat until no boxes remain
+
+Example:
+Boxes: [A(0.9), B(0.8), C(0.7)]
+IoU(A,B) = 0.7 > 0.5 → Remove B
+IoU(A,C) = 0.3 < 0.5 → Keep C
+Result: [A, C]
+```
+
+---
+
+## 5. Intersection over Union (IoU)
+
+### File Location
+```
+/deepdoc/vision/operators.py (lines 702-725)
+/deepdoc/vision/recognizer.py (lines 339-357)
+```
+
+### Purpose
+Measure overlap between bounding boxes.
+
+### Implementation
+
+```python
+def compute_iou(box1, box2):
+    """
+    Compute Intersection over Union for a single pair of boxes.
+
+    Args:
+        box1, box2: [x1, y1, x2, y2] format
+
+    Returns:
+        IoU value in [0, 1]
+    """
+    # Intersection coordinates
+    x1 = max(box1[0], box2[0])
+    y1 = max(box1[1], box2[1])
+    x2 = min(box1[2], box2[2])
+    y2 = min(box1[3], box2[3])
+
+    # Intersection area
+    intersection = max(0, x2 - x1) * max(0, y2 - y1)
+
+    # Union area
+    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
+    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
+    union = area1 + area2 - intersection
+
+    # IoU
+    if union == 0:
+        return 0
+
+    return intersection / union
+```
+
+### IoU Formula
+
+```
+IoU (Intersection over Union):
+
+IoU = Area(A ∩ B) / Area(A ∪ B)
+
+    = Area(A ∩ B) / (Area(A) + Area(B) - Area(A ∩ B))
+
+Range: [0, 1]
+- IoU = 0: No overlap
+- IoU = 1: Perfect overlap
+
+Threshold Usage:
+- Detection: IoU > 0.5 → Same object
+- NMS: IoU > 0.5 → Suppress duplicate
+```
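+
+Note that the `nms` sketch in section 4 passes a whole batch of boxes as its second argument, while `compute_iou` above handles a single pair. A vectorized variant that broadcasts over the batch (an illustrative assumption, not the code at the cited lines) could look like this:
+
+```python
+import numpy as np
+
+def compute_iou_batch(box, boxes):
+    """IoU of one [x1, y1, x2, y2] box against an (N, 4) array of boxes."""
+    boxes = np.asarray(boxes, dtype=float)
+    x1 = np.maximum(box[0], boxes[:, 0])
+    y1 = np.maximum(box[1], boxes[:, 1])
+    x2 = np.minimum(box[2], boxes[:, 2])
+    y2 = np.minimum(box[3], boxes[:, 3])
+
+    intersection = np.clip(x2 - x1, 0, None) * np.clip(y2 - y1, 0, None)
+    area = (box[2] - box[0]) * (box[3] - box[1])
+    areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
+    union = area + areas - intersection
+
+    # Guard against zero-area unions
+    iou = np.zeros_like(union)
+    np.divide(intersection, union, out=iou, where=union > 0)
+    return iou
+
+# Identical boxes → 1.0, disjoint boxes → 0.0
+print(compute_iou_batch([0, 0, 10, 10], [[0, 0, 10, 10], [20, 20, 30, 30]]))
+```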
+
+---
+
+## 6. Image Preprocessing
+
+### File Location
+```
+/deepdoc/vision/operators.py
+```
+
+### Purpose
+Prepare images for neural network input.
+
+### Implementation
+
+```python
+import cv2
+import numpy as np
+
+class StandardizeImage:
+    """Normalize image to [0, 1] range."""
+
+    def __call__(self, image):
+        return image.astype(np.float32) / 255.0
+
+class NormalizeImage:
+    """Apply mean/std normalization."""
+
+    def __init__(self, mean=[0.485, 0.456, 0.406],
+                 std=[0.229, 0.224, 0.225]):
+        self.mean = np.array(mean)
+        self.std = np.array(std)
+
+    def __call__(self, image):
+        return (image - self.mean) / self.std
+
+class ToCHWImage:
+    """Convert HWC to CHW format."""
+
+    def __call__(self, image):
+        return image.transpose((2, 0, 1))
+
+class LinearResize:
+    """Resize image maintaining aspect ratio."""
+
+    def __init__(self, target_size):
+        self.target = target_size
+
+    def __call__(self, image):
+        h, w = image.shape[:2]
+        scale = self.target / max(h, w)
+        new_h, new_w = int(h * scale), int(w * scale)
+        return cv2.resize(image, (new_w, new_h),
+                          interpolation=cv2.INTER_CUBIC)
+```
+
+### Preprocessing Pipeline
+
+```
+Image Preprocessing Pipeline:
+
+1. Resize (maintain aspect ratio)
+   - Target: 640 or 1280 depending on model
+
+2. Standardize (0-255 → 0-1)
+   - image = image / 255.0
+
+3. Normalize (ImageNet stats)
+   - image = (image - mean) / std
+   - mean = [0.485, 0.456, 0.406]
+   - std = [0.229, 0.224, 0.225]
+
+4. Transpose (HWC → CHW)
+   - PyTorch format: (C, H, W)
+
+5. Pad (to square)
+   - Pad with zeros to square shape
+```
+
+---
+
+## 7. XGBoost Text Concatenation
+
+### File Location
+```
+/deepdoc/parser/pdf_parser.py (lines 88-101, 131-170)
+```
+
+### Purpose
+Predict whether adjacent text boxes should be merged.
+
+### Implementation
+
+```python
+import numpy as np
+import xgboost as xgb
+
+class PDFParser:
+    def __init__(self):
+        # Load pre-trained XGBoost model
+        self.concat_model = xgb.Booster()
+        self.concat_model.load_model("updown_concat_xgb.model")
+
+    def should_concat(self, box1, box2):
+        """
+        Predict if two text boxes should be concatenated.
+        """
+        # Extract features
+        features = self._extract_concat_features(box1, box2)
+
+        # Create DMatrix (XGBoost expects a 2-D array)
+        dmatrix = xgb.DMatrix(np.array([features], dtype=np.float32))
+
+        # Predict probability
+        prob = self.concat_model.predict(dmatrix)[0]
+
+        return prob > 0.5
+
+    def _extract_concat_features(self, box1, box2):
+        """
+        Extract 20+ features for the concatenation decision.
+        """
+        features = []
+
+        # Distance features
+        y_dist = box2["top"] - box1["bottom"]
+        char_height = box1["bottom"] - box1["top"]
+        features.append(y_dist / max(char_height, 1))
+
+        # Alignment features
+        x_overlap = min(box1["x1"], box2["x1"]) - max(box1["x0"], box2["x0"])
+        features.append(x_overlap / max(box1["x1"] - box1["x0"], 1))
+
+        # Text pattern features
+        text1, text2 = box1["text"], box2["text"]
+        features.append(1 if text1.endswith((".", "。", "!", "?")) else 0)
+        features.append(1 if text2[:1].isupper() else 0)  # safe on empty text
+
+        # Layout features
+        features.append(1 if box1.get("layout_num") == box2.get("layout_num") else 0)
+
+        # ... more features
+
+        return features
+```
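+
+A hypothetical call site is sketched below; the box fields follow the feature extractor above, and the model path is the one loaded in `__init__` (the real feature set is larger, per the list that follows):
+
+```python
+# Hypothetical usage of the concatenation predictor sketched above.
+parser = PDFParser()  # loads updown_concat_xgb.model
+
+line_a = {"x0": 50, "x1": 500, "top": 100, "bottom": 112,
+          "text": "The quick brown fox jumps over", "layout_num": 3}
+line_b = {"x0": 50, "x1": 480, "top": 114, "bottom": 126,
+          "text": "the lazy dog.", "layout_num": 3}
+
+if parser.should_concat(line_a, line_b):
+    merged_text = line_a["text"] + " " + line_b["text"]
+```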
+
+### Feature List
+
+```
+XGBoost Concatenation Features:
+
+1. Spatial Features:
+   - Y-distance / char_height
+   - X-alignment overlap ratio
+   - Same page flag
+
+2. Text Pattern Features:
+   - Ends with sentence punctuation
+   - Ends with continuation punctuation
+   - Next starts with uppercase
+   - Next starts with number
+   - Chinese numbering pattern
+
+3. Layout Features:
+   - Same layout_type
+   - Same layout_num
+   - Same column
+
+4. Tokenization Features:
+   - Token count ratio
+   - Last/first token match
+
+Total: 20+ features
+```
+
+---
+
+## Summary
+
+| Algorithm | Purpose | Model Type |
+|-----------|---------|------------|
+| OCR | Text detection + recognition | ONNX (DB + CRNN) |
+| Layout Recognition | Element detection | ONNX (YOLOv10) |
+| TSR | Table structure | ONNX |
+| NMS | Box filtering | Classical |
+| IoU | Overlap measure | Classical |
+| XGBoost | Text concatenation | Gradient Boosting |
+
+## Related Files
+
+- `/deepdoc/vision/ocr.py` - OCR models
+- `/deepdoc/vision/layout_recognizer.py` - Layout detection
+- `/deepdoc/vision/table_structure_recognizer.py` - TSR
+- `/deepdoc/vision/operators.py` - Image processing
+- `/deepdoc/parser/pdf_parser.py` - XGBoost integration