diff --git a/personal_analyze/01_directory_structure.md b/personal_analyze/01_directory_structure.md new file mode 100644 index 000000000..96e03b1de --- /dev/null +++ b/personal_analyze/01_directory_structure.md @@ -0,0 +1,348 @@ +# RAGFlow - Cấu Trúc Thư Mục + +## Tổng Quan + +RAGFlow (v0.22.1) là một RAG (Retrieval-Augmented Generation) engine mã nguồn mở dựa trên deep document understanding. Dự án được xây dựng với kiến trúc full-stack bao gồm Python backend và React/TypeScript frontend. + +## Cấu Trúc Thư Mục Chi Tiết + +``` +ragflow/ +│ +├── api/ # [BACKEND] Flask API Server +│ ├── ragflow_server.py # Entry point chính +│ ├── settings.py # Cấu hình server +│ ├── constants.py # Hằng số API (API_VERSION = "v1") +│ ├── validation.py # Request validation +│ │ +│ ├── apps/ # Flask Blueprints - API endpoints +│ │ ├── kb_app.py # Knowledge Base management +│ │ ├── document_app.py # Document processing +│ │ ├── dialog_app.py # Chat/Dialog handling +│ │ ├── canvas_app.py # Agent workflow canvas +│ │ ├── file_app.py # File upload/management +│ │ ├── chunk_app.py # Document chunking +│ │ ├── conversation_app.py # Conversation management +│ │ ├── search_app.py # Search functionality +│ │ ├── system_app.py # System configuration +│ │ ├── llm_app.py # LLM model management +│ │ ├── connector_app.py # Data source connectors +│ │ ├── mcp_server_app.py # MCP server integration +│ │ ├── langfuse_app.py # Langfuse observability +│ │ ├── api_app.py # API key management +│ │ ├── plugin_app.py # Plugin management +│ │ ├── tenant_app.py # Multi-tenancy +│ │ ├── user_app.py # User management +│ │ │ +│ │ ├── auth/ # Authentication modules +│ │ │ ├── oauth.py # OAuth base +│ │ │ ├── github.py # GitHub OAuth +│ │ │ └── oidc.py # OpenID Connect +│ │ │ +│ │ └── sdk/ # SDK REST API endpoints +│ │ ├── dataset.py # Dataset API +│ │ ├── doc.py # Document API +│ │ ├── chat.py # Chat API +│ │ ├── session.py # Session API +│ │ ├── files.py # File API +│ │ ├── agents.py # Agent API +│ │ └── dify_retrieval.py # Dify integration +│ │ +│ ├── db/ # Database layer +│ │ ├── db_models.py # SQLAlchemy/Peewee models (54KB) +│ │ ├── db_utils.py # Database utilities +│ │ ├── init_data.py # Initial data seeding +│ │ ├── runtime_config.py # Runtime configuration +│ │ │ +│ │ ├── services/ # Business logic services +│ │ │ ├── user_service.py # User operations +│ │ │ ├── dialog_service.py # Dialog logic (37KB) +│ │ │ ├── document_service.py # Document processing (39KB) +│ │ │ ├── file_service.py # File handling (22KB) +│ │ │ ├── knowledgebase_service.py # KB management (21KB) +│ │ │ ├── task_service.py # Task queue (20KB) +│ │ │ ├── canvas_service.py # Canvas logic (12KB) +│ │ │ ├── conversation_service.py # Conversation handling +│ │ │ ├── connector_service.py # Connector management +│ │ │ ├── llm_service.py # LLM operations +│ │ │ ├── search_service.py # Search operations +│ │ │ └── api_service.py # API token service +│ │ │ +│ │ └── joint_services/ # Cross-service operations +│ │ +│ └── utils/ # API utilities +│ ├── api_utils.py # API helpers +│ ├── file_utils.py # File utilities +│ ├── crypt.py # Encryption +│ └── log_utils.py # Logging +│ +├── rag/ # [CORE] RAG Processing Engine +│ ├── settings.py # RAG configuration +│ ├── raptor.py # RAPTOR algorithm +│ ├── benchmark.py # Performance testing +│ │ +│ ├── llm/ # LLM Model Abstractions +│ │ ├── chat_model.py # Chat LLM interface +│ │ ├── embedding_model.py # Embedding models +│ │ ├── rerank_model.py # Reranking models +│ │ ├── cv_model.py # Computer vision +│ │ ├── tts_model.py # Text-to-speech +│ │ └── sequence2txt_model.py # Sequence to text +│ │ +│ ├── flow/ # RAG Pipeline +│ │ ├── pipeline.py # Main pipeline +│ │ ├── file.py # File processing +│ │ │ +│ │ ├── parser/ # Document parsing +│ │ │ ├── parser.py +│ │ │ └── schema.py +│ │ │ +│ │ ├── extractor/ # Information extraction +│ │ │ ├── extractor.py +│ │ │ └── schema.py +│ │ │ +│ │ ├── tokenizer/ # Text tokenization +│ │ │ ├── tokenizer.py +│ │ │ └── schema.py +│ │ │ +│ │ ├── splitter/ # Document chunking +│ │ │ ├── splitter.py +│ │ │ └── schema.py +│ │ │ +│ │ └── hierarchical_merger/ # Hierarchical merging +│ │ ├── hierarchical_merger.py +│ │ └── schema.py +│ │ +│ ├── app/ # RAG application logic +│ ├── nlp/ # NLP utilities +│ ├── utils/ # RAG utilities +│ └── prompts/ # LLM prompt templates +│ +├── deepdoc/ # [DOCUMENT] Deep Document Understanding +│ ├── parser/ # Multi-format parsers +│ │ ├── pdf_parser.py # PDF with layout analysis +│ │ ├── docx_parser.py # Word documents +│ │ ├── ppt_parser.py # PowerPoint +│ │ ├── excel_parser.py # Excel spreadsheets +│ │ ├── html_parser.py # HTML pages +│ │ ├── markdown_parser.py # Markdown files +│ │ ├── json_parser.py # JSON data +│ │ ├── txt_parser.py # Plain text +│ │ ├── figure_parser.py # Image/figure extraction +│ │ │ +│ │ └── resume/ # Resume parsing +│ │ ├── step_one.py +│ │ └── step_two.py +│ │ +│ └── vision/ # Computer vision modules +│ +├── agent/ # [AGENT] Agentic Workflow System +│ ├── canvas.py # Canvas orchestration (25KB) +│ ├── settings.py # Agent configuration +│ │ +│ ├── component/ # Workflow components +│ │ ├── begin.py # Workflow start +│ │ ├── llm.py # LLM invocation +│ │ ├── agent_with_tools.py # Agent with tools +│ │ ├── retrieval.py # Document retrieval +│ │ ├── categorize.py # Message categorization +│ │ ├── message.py # Message handling +│ │ ├── webhook.py # Webhook triggers +│ │ ├── iteration.py # Loop iteration +│ │ └── variable_assigner.py # Variable assignment +│ │ +│ ├── tools/ # External tool integrations +│ │ ├── tavily.py # Web search +│ │ ├── arxiv.py # Academic papers +│ │ ├── github.py # GitHub API +│ │ ├── google.py # Google Search +│ │ ├── wikipedia.py # Wikipedia +│ │ ├── email.py # Email sending +│ │ ├── code_exec.py # Code execution +│ │ └── yahoofinance.py # Financial data +│ │ +│ └── templates/ # Pre-built workflows +│ +├── graphrag/ # [GRAPH] Knowledge Graph RAG +│ ├── entity_resolution.py # Entity linking (12KB) +│ ├── search.py # Graph search (14KB) +│ ├── utils.py # Graph utilities (23KB) +│ ├── general/ # General graph operations +│ └── light/ # Lightweight implementations +│ +├── web/ # [FRONTEND] React/TypeScript +│ ├── package.json # NPM dependencies (172 packages) +│ ├── .umirc.ts # UmiJS configuration +│ ├── tailwind.config.js # Tailwind CSS config +│ │ +│ └── src/ +│ ├── pages/ # UmiJS page routes +│ │ ├── admin/ # Admin dashboard +│ │ ├── dataset/ # Knowledge base management +│ │ ├── datasets/ # Datasets list +│ │ ├── knowledge/ # Knowledge management +│ │ ├── next-chats/ # Chat interface +│ │ ├── next-searches/ # Search interface +│ │ ├── document-viewer/ # Document preview +│ │ ├── login/ # Authentication +│ │ └── register/ # User registration +│ │ +│ ├── components/ # React components +│ │ ├── file-upload-modal/ +│ │ ├── pdf-drawer/ +│ │ ├── prompt-editor/ +│ │ ├── document-preview/ +│ │ └── ui/ # Shadcn/UI components +│ │ +│ ├── services/ # API client services +│ ├── hooks/ # React hooks +│ ├── interfaces/ # TypeScript interfaces +│ ├── utils/ # Utility functions +│ ├── constants/ # Constants +│ └── locales/ # i18n translations +│ +├── common/ # [SHARED] Common Utilities +│ ├── settings.py # Main configuration (11KB) +│ ├── config_utils.py # Config utilities +│ ├── connection_utils.py # Database connections +│ ├── constants.py # Global constants +│ ├── exceptions.py # Exception definitions +│ │ +│ ├── Utilities: +│ │ ├── log_utils.py # Logging setup +│ │ ├── file_utils.py # File operations +│ │ ├── string_utils.py # String utilities +│ │ ├── token_utils.py # Token operations +│ │ └── time_utils.py # Time utilities +│ │ +│ └── data_source/ # Data source connectors +│ ├── confluence_connector.py (81KB) +│ ├── notion_connector.py (25KB) +│ ├── slack_connector.py (22KB) +│ ├── gmail_connector.py +│ ├── discord_connector.py +│ ├── sharepoint_connector.py +│ ├── dropbox_connector.py +│ └── google_drive/ +│ +├── sdk/ # [SDK] Python Client Library +│ └── python/ +│ └── ragflow_sdk/ # SDK implementation +│ +├── mcp/ # [MCP] Model Context Protocol +│ ├── server/ # MCP server +│ │ └── server.py +│ └── client/ # MCP client +│ └── client.py +│ +├── admin/ # [ADMIN] Admin Interface +│ ├── server/ # Admin backend +│ └── client/ # Admin frontend +│ +├── plugin/ # [PLUGIN] Plugin System +│ ├── plugin_manager.py # Plugin management +│ ├── llm_tool_plugin.py # LLM tool plugins +│ └── embedded_plugins/ # Built-in plugins +│ +├── docker/ # [DEPLOYMENT] Docker Configuration +│ ├── docker-compose.yml # Main compose file +│ ├── docker-compose-base.yml # Base services +│ ├── .env # Environment variables +│ ├── entrypoint.sh # Container entry +│ ├── service_conf.yaml.template # Service config +│ ├── nginx/ # Nginx configuration +│ │ └── nginx.conf +│ └── init.sql # Database init +│ +├── conf/ # [CONFIG] Configuration Files +│ ├── llm_factories.json # LLM providers +│ ├── mapping.json # Field mappings +│ ├── service_conf.yaml # Service configuration +│ ├── private.pem # RSA private key +│ └── public.pem # RSA public key +│ +├── test/ # [TEST] Testing Suite +│ ├── unit_test/ # Unit tests +│ │ └── common/ # Common utilities tests +│ │ +│ └── testcases/ # Integration tests +│ ├── test_http_api/ # HTTP API tests +│ ├── test_sdk_api/ # SDK tests +│ └── test_web_api/ # Web API tests +│ +├── example/ # [EXAMPLES] Usage Examples +│ ├── http/ # HTTP API examples +│ └── sdk/ # SDK examples +│ +├── intergrations/ # [INTEGRATIONS] Third-party +│ ├── chatgpt-on-wechat/ # WeChat integration +│ ├── extension_chrome/ # Chrome extension +│ └── firecrawl/ # Web scraping +│ +├── agentic_reasoning/ # [REASONING] Advanced reasoning +├── sandbox/ # [SANDBOX] Code execution +├── helm/ # [K8S] Kubernetes Helm charts +├── docs/ # [DOCS] Documentation +│ +├── pyproject.toml # Python project config +├── CLAUDE.md # Development guidelines +└── README.md # Project overview +``` + +## Mô Tả Chi Tiết Các Thư Mục Chính + +### 1. `/api/` - Backend API Server +- **Vai trò**: Xử lý tất cả HTTP requests, authentication, và business logic +- **Framework**: Flask/Quart (async ASGI) +- **Port mặc định**: 9380 +- **Entry point**: `ragflow_server.py` + +### 2. `/rag/` - RAG Processing Engine +- **Vai trò**: Xử lý pipeline RAG từ document parsing đến retrieval +- **Chức năng chính**: + - Document parsing và extraction + - Text tokenization + - Semantic chunking + - Embedding generation + - Reranking + +### 3. `/deepdoc/` - Document Understanding +- **Vai trò**: Deep document parsing với layout analysis +- **Hỗ trợ formats**: PDF, Word, PPT, Excel, HTML, Markdown, JSON, TXT +- **Đặc biệt**: OCR và layout analysis cho PDF + +### 4. `/agent/` - Agentic Workflow +- **Vai trò**: Hệ thống workflow agent với visual canvas +- **Components**: LLM, Retrieval, Categorize, Webhook, Iteration... +- **Tools**: Tavily, Google, Wikipedia, GitHub, Email... + +### 5. `/graphrag/` - Knowledge Graph +- **Vai trò**: Xây dựng và query knowledge graph +- **Chức năng**: Entity resolution, graph search, relationship extraction + +### 6. `/web/` - Frontend +- **Framework**: React + TypeScript + UmiJS +- **UI**: Ant Design + Shadcn/UI + Tailwind CSS +- **State**: Zustand +- **Port**: 80/443 (qua Nginx) + +### 7. `/common/` - Shared Utilities +- **Vai trò**: Utilities và connectors dùng chung +- **Data sources**: Confluence, Notion, Slack, Gmail, SharePoint... + +### 8. `/docker/` - Deployment +- **Services**: MySQL, Elasticsearch/Infinity, Redis, MinIO, Nginx +- **Modes**: CPU/GPU, single/cluster + +## Tóm Tắt Thống Kê + +| Thư mục | Số files | Mô tả | +|---------|----------|-------| +| api/ | ~100+ | Backend API | +| rag/ | ~50+ | RAG engine | +| deepdoc/ | ~30+ | Document parsers | +| agent/ | ~40+ | Agent system | +| graphrag/ | ~20+ | Knowledge graph | +| web/src/ | ~200+ | Frontend | +| common/ | ~50+ | Shared utilities | +| test/ | ~80+ | Test suite | diff --git a/personal_analyze/02_system_architecture.md b/personal_analyze/02_system_architecture.md new file mode 100644 index 000000000..f212769d3 --- /dev/null +++ b/personal_analyze/02_system_architecture.md @@ -0,0 +1,567 @@ +# RAGFlow - Kiến Trúc Hệ Thống + +## 1. Tổng Quan Kiến Trúc + +RAGFlow sử dụng kiến trúc **Microservices** với các thành phần được container hóa bằng Docker. Hệ thống được thiết kế theo mô hình **3-tier architecture** kết hợp với **event-driven architecture** cho xử lý bất đồng bộ. + +## 2. Sơ Đồ Kiến Trúc Tổng Quan + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ CLIENT LAYER │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ +│ │ Web App │ │ Mobile App │ │ Python SDK │ │ REST API │ │ +│ │ (React/TS) │ │ (Future) │ │ Client │ │ Client │ │ +│ └──────┬───────┘ └──────┬───────┘ └──────┬───────┘ └──────┬───────┘ │ +│ │ │ │ │ │ +└─────────┼─────────────────┼─────────────────┼─────────────────┼──────────────────┘ + │ │ │ │ + └─────────────────┴────────┬────────┴─────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ GATEWAY LAYER │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ NGINX Reverse Proxy │ │ +│ │ (Load Balancing, SSL Termination) │ │ +│ │ Port: 80/443 │ │ +│ └─────────────────────────────────────┬───────────────────────────────────┘ │ +│ │ │ +└────────────────────────────────────────┼─────────────────────────────────────────┘ + │ + ┌──────────────────────────────┼──────────────────────────────┐ + │ │ │ + ▼ ▼ ▼ +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ APPLICATION LAYER │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌───────────────────────┐ ┌───────────────────────┐ ┌───────────────────┐ │ +│ │ RAGFlow Server │ │ Admin Server │ │ MCP Server │ │ +│ │ (Flask/Quart) │ │ (Flask) │ │ (MCP Protocol) │ │ +│ │ Port: 9380 │ │ Port: 9381 │ │ Port: 9382 │ │ +│ │ │ │ │ │ │ │ +│ │ ┌─────────────────┐ │ │ ┌─────────────────┐ │ │ ┌─────────────┐ │ │ +│ │ │ API Blueprints │ │ │ │ Admin APIs │ │ │ │ MCP Handler │ │ │ +│ │ │ - kb_app │ │ │ │ - User Mgmt │ │ │ │ - Tools │ │ │ +│ │ │ - document_app │ │ │ │ - System Cfg │ │ │ │ - Resources │ │ │ +│ │ │ - dialog_app │ │ │ │ - Monitoring │ │ │ └─────────────┘ │ │ +│ │ │ - canvas_app │ │ │ └─────────────────┘ │ │ │ │ +│ │ │ - search_app │ │ │ │ │ │ │ +│ │ │ - file_app │ │ │ │ │ │ │ +│ │ └─────────────────┘ │ │ │ │ │ │ +│ └───────────┬───────────┘ └───────────────────────┘ └───────────────────┘ │ +│ │ │ +└──────────────┼───────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ SERVICE LAYER │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │ +│ │ Business Logic │ │ RAG Pipeline │ │ Agent System │ │ +│ │ Services │ │ Engine │ │ Engine │ │ +│ │ │ │ │ │ │ │ +│ │ - UserService │ │ - Parser │ │ - Canvas │ │ +│ │ - DialogService │ │ - Tokenizer │ │ - Components │ │ +│ │ - DocService │ │ - Splitter │ │ - Tools │ │ +│ │ - KBService │ │ - Embedder │ │ - Workflows │ │ +│ │ - TaskService │ │ - Reranker │ │ │ │ +│ └────────┬────────┘ └────────┬────────┘ └────────┬────────┘ │ +│ │ │ │ │ +│ └────────────────────┼────────────────────┘ │ +│ │ │ +│ ┌─────────────────────────────┼─────────────────────────────────────────────┐ │ +│ │ DeepDoc Processing Engine │ │ +│ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ │ +│ │ │ PDF │ │ DOCX │ │ PPT │ │ Excel │ │ HTML │ │ │ +│ │ │ Parser │ │ Parser │ │ Parser │ │ Parser │ │ Parser │ │ │ +│ │ └──────────┘ └──────────┘ └──────────┘ └──────────┘ └──────────┘ │ │ +│ │ │ │ +│ │ ┌──────────────────────────────────────────────────────────────┐ │ │ +│ │ │ Vision/OCR Processing (Layout Analysis) │ │ │ +│ │ └──────────────────────────────────────────────────────────────┘ │ │ +│ └───────────────────────────────────────────────────────────────────────────┘ │ +│ │ +└──────────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ DATA LAYER │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │ +│ │ MySQL │ │ Redis/Valkey │ │ MinIO │ │ +│ │ (Primary DB) │ │ (Cache) │ │ (Object Store) │ │ +│ │ Port: 5455 │ │ Port: 6379 │ │ Port: 9000/9001 │ │ +│ │ │ │ │ │ │ │ +│ │ - Users │ │ - Sessions │ │ - Documents │ │ +│ │ - Tenants │ │ - Cache │ │ - Files │ │ +│ │ - Knowledgebase │ │ - Rate Limit │ │ - Chunks │ │ +│ │ - Documents │ │ - Task Queue │ │ - Images │ │ +│ │ - Dialogs │ │ │ │ │ │ +│ └─────────────────┘ └─────────────────┘ └─────────────────┘ │ +│ │ +│ ┌──────────────────────────────────────────────────────────────────────────┐ │ +│ │ Vector Database Layer │ │ +│ │ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │ │ +│ │ │ Elasticsearch │ │ Infinity │ │ OpenSearch │ │ │ +│ │ │ (Default) │ │ (Alternative) │ │ (Alternative) │ │ │ +│ │ │ │ │ │ │ │ │ │ +│ │ │ - Vector Search │ │ - Hybrid Search │ │ - Vector Search │ │ │ +│ │ │ - Full-text │ │ - Full-text │ │ - Full-text │ │ │ +│ │ │ - BM25 │ │ - BM25 │ │ - BM25 │ │ │ +│ │ └─────────────────┘ └─────────────────┘ └─────────────────┘ │ │ +│ └──────────────────────────────────────────────────────────────────────────┘ │ +│ │ +└──────────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ EXTERNAL SERVICES │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ LLM Providers │ │ +│ │ ┌────────┐ ┌────────┐ ┌────────┐ ┌────────┐ ┌────────┐ ┌────────┐ │ │ +│ │ │ OpenAI │ │ Claude │ │ Gemini │ │ Qwen │ │ Groq │ │ Ollama │ │ │ +│ │ └────────┘ └────────┘ └────────┘ └────────┘ └────────┘ └────────┘ │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Data Source Connectors │ │ +│ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ │ +│ │ │Confluence│ │ Notion │ │ Slack │ │ Gmail │ │SharePoint│ │ │ +│ │ └──────────┘ └──────────┘ └──────────┘ └──────────┘ └──────────┘ │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Agent Tools & APIs │ │ +│ │ ┌────────┐ ┌─────────┐ ┌────────┐ ┌────────┐ ┌─────────┐ ┌────────┐ │ │ +│ │ │ Tavily │ │ Google │ │ ArXiv │ │ GitHub │ │Wikipedia│ │ Weather│ │ │ +│ │ └────────┘ └─────────┘ └────────┘ └────────┘ └─────────┘ └────────┘ │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +└──────────────────────────────────────────────────────────────────────────────────┘ +``` + +## 3. Kiến Trúc Chi Tiết Các Thành Phần + +### 3.1 API Server Architecture + +``` +┌──────────────────────────────────────────────────────────────┐ +│ RAGFlow API Server │ +│ (ragflow_server.py) │ +├──────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────────────────────────────────────────┐ │ +│ │ Flask/Quart Application │ │ +│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────────┐ │ │ +│ │ │ CORS │ │ Session │ │ JWT Auth │ │ │ +│ │ │ Middleware │ │ Middleware │ │ Middleware │ │ │ +│ │ └─────────────┘ └─────────────┘ └─────────────────┘ │ │ +│ └─────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ┌───────────────────────────┼───────────────────────────────┐ +│ │ API Blueprints │ +│ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ +│ │ │ kb_app │ │ doc_app │ │dialog_app│ │canvas_app│ │ +│ │ └────┬─────┘ └────┬─────┘ └────┬─────┘ └────┬─────┘ │ +│ │ │ │ │ │ │ +│ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ +│ │ │file_app │ │search_app│ │ llm_app │ │ user_app │ │ +│ │ └────┬─────┘ └────┬─────┘ └────┬─────┘ └────┬─────┘ │ +│ └───────┼────────────┼────────────┼────────────┼───────────┘ +│ │ │ │ │ │ +│ ┌───────┴────────────┴────────────┴────────────┴───────────┐ +│ │ Service Layer │ +│ │ ┌────────────────┐ ┌────────────────┐ │ +│ │ │ UserService │ │ DialogService │ │ +│ │ │ - register() │ │ - chat() │ │ +│ │ │ - login() │ │ - stream() │ │ +│ │ │ - get_user() │ │ - completion() │ │ +│ │ └────────────────┘ └────────────────┘ │ +│ │ │ +│ │ ┌────────────────┐ ┌────────────────┐ │ +│ │ │ DocumentService│ │ KBService │ │ +│ │ │ - upload() │ │ - create() │ │ +│ │ │ - parse() │ │ - list() │ │ +│ │ │ - chunk() │ │ - delete() │ │ +│ │ └────────────────┘ └────────────────┘ │ +│ └──────────────────────────────────────────────────────────┘ +│ │ │ +│ ┌───────────────────────────┴───────────────────────────────┐ +│ │ Database Layer (Peewee ORM) │ +│ │ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ +│ │ │ User │ │ Tenant │ │Document │ │ Dialog │ │ +│ │ │ Model │ │ Model │ │ Model │ │ Model │ │ +│ │ └─────────┘ └─────────┘ └─────────┘ └─────────┘ │ +│ └──────────────────────────────────────────────────────────┘ +│ │ +└──────────────────────────────────────────────────────────────┘ +``` + +### 3.2 RAG Pipeline Architecture + +``` +┌──────────────────────────────────────────────────────────────────────────┐ +│ RAG Processing Pipeline │ +├──────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ INGESTION PIPELINE │ │ +│ │ │ │ +│ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ │ +│ │ │ File │───▶│ Parser │───▶│Tokenizer │───▶│ Splitter │ │ │ +│ │ │ Upload │ │ │ │ │ │ (Chunker)│ │ │ +│ │ └──────────┘ └──────────┘ └──────────┘ └────┬─────┘ │ │ +│ │ │ │ │ +│ │ ┌──────────────────────────────────────────────────┘ │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ │ +│ │ │ Embedding│───▶│ Index │───▶│ Store │ │ │ +│ │ │ Model │ │ Creation │ │ (ES/Inf) │ │ │ +│ │ └──────────┘ └──────────┘ └──────────┘ │ │ +│ │ │ │ +│ └───────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ RETRIEVAL PIPELINE │ │ +│ │ │ │ +│ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ │ +│ │ │ Query │───▶│ Query │───▶│ Embedding│───▶│ Hybrid │ │ │ +│ │ │ Input │ │ Analysis │ │ Query │ │ Search │ │ │ +│ │ └──────────┘ └──────────┘ └──────────┘ └────┬─────┘ │ │ +│ │ │ │ │ +│ │ ┌──────────────────────────────────────────────────┘ │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ │ +│ │ │ Candidate│───▶│ Reranker │───▶│ Context │───▶│ LLM │ │ │ +│ │ │ Chunks │ │ │ │ Building │ │ Response │ │ │ +│ │ └──────────┘ └──────────┘ └──────────┘ └──────────┘ │ │ +│ │ │ │ +│ └───────────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────┘ +``` + +### 3.3 Agent Workflow Architecture + +``` +┌──────────────────────────────────────────────────────────────────────────┐ +│ Agent Canvas Architecture │ +├──────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ Canvas Orchestrator │ │ +│ │ (canvas.py) │ │ +│ └──────────────────────────────┬──────────────────────────────────────┘ │ +│ │ │ +│ ┌───────────────────────────┼───────────────────────────┐ │ +│ │ │ │ │ +│ ▼ ▼ ▼ │ +│ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ +│ │ BEGIN │─────────────▶│ LLM │─────────────▶│RETRIEVAL│ │ +│ │Component│ │Component│ │Component│ │ +│ └─────────┘ └─────────┘ └─────────┘ │ +│ │ │ │ │ +│ │ ┌───────────────────┼───────────────────────┘ │ +│ │ │ │ │ +│ ▼ ▼ ▼ │ +│ ┌─────────────┐ ┌─────────┐ ┌─────────────┐ │ +│ │ CATEGORIZE │ │ MESSAGE │ │ WEBHOOK │ │ +│ │ Component │ │Component│ │ Component │ │ +│ └─────────────┘ └─────────┘ └─────────────┘ │ +│ │ │ │ │ +│ └────────────────────────┼────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ TOOLS INTEGRATION │ │ +│ │ ┌────────┐ ┌────────┐ ┌────────┐ ┌────────┐ ┌────────┐ │ │ +│ │ │ Tavily │ │ ArXiv │ │ GitHub │ │ Email │ │Code │ │ │ +│ │ │ Search │ │ Search │ │ API │ │ Send │ │Executor│ │ │ +│ │ └────────┘ └────────┘ └────────┘ └────────┘ └────────┘ │ │ +│ └─────────────────────────────────────────────────────────────────────┘ │ +│ │ +└───────────────────────────────────────────────────────────────────────────┘ +``` + +## 4. Data Flow Architecture + +### 4.1 Document Ingestion Flow + +``` +┌────────────────────────────────────────────────────────────────────────────┐ +│ Document Ingestion Flow │ +├────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ User Upload │ +│ │ │ +│ ▼ │ +│ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ +│ │ API │────▶│ File │────▶│ MinIO │────▶│ Task │ │ +│ │ Endpoint │ │ Service │ │ Storage │ │ Queue │ │ +│ └──────────┘ └──────────┘ └──────────┘ └────┬─────┘ │ +│ │ │ +│ ┌────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────────────────────────────────┐ │ +│ │ Background Task Processor │ │ +│ │ │ │ +│ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ │ +│ │ │ Parser │───▶│Extractor │───▶│ Chunker │───▶│ Embedder │ │ │ +│ │ │ │ │ │ │ │ │ │ │ │ +│ │ │ - PDF │ │ - Text │ │ - Token │ │ - OpenAI │ │ │ +│ │ │ - DOCX │ │ - Table │ │ - Sent │ │ - BGE │ │ │ +│ │ │ - HTML │ │ - Image │ │ - Page │ │ - Cohere │ │ │ +│ │ └──────────┘ └──────────┘ └──────────┘ └────┬─────┘ │ │ +│ │ │ │ │ +│ └────────────────────────────────────────────────────────┼────────┘ │ +│ │ │ +│ ┌─────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────────────────────────────────┐ │ +│ │ Storage Layer │ │ +│ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ │ +│ │ │ MySQL │ │ Elasticsearch│ │ MinIO │ │ │ +│ │ │ (Metadata) │ │ (Vectors) │ │ (Files) │ │ │ +│ │ └──────────────┘ └──────────────┘ └──────────────┘ │ │ +│ └──────────────────────────────────────────────────────────────────┘ │ +│ │ +└───────────────────────────────────────────────────────────────────────────┘ +``` + +### 4.2 Query Processing Flow + +``` +┌────────────────────────────────────────────────────────────────────────────┐ +│ Query Processing Flow │ +├────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ User Query: "What is the revenue for Q3 2024?" │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────────────────────────────────────┐ │ +│ │ 1. QUERY UNDERSTANDING │ │ +│ │ ┌──────────────┐ │ │ +│ │ │ Query Parser │──▶ Extract: entities, intent, keywords │ │ +│ │ └──────────────┘ │ │ +│ └──────────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────────────────────────────────────┐ │ +│ │ 2. RETRIEVAL │ │ +│ │ ┌────────────┐ ┌────────────┐ ┌────────────┐ │ │ +│ │ │ Embedding │───▶│ Hybrid │───▶│ Candidate │ │ │ +│ │ │ Query │ │ Search │ │ Chunks │ │ │ +│ │ └────────────┘ │ │ │ (Top 100) │ │ │ +│ │ │ Vector+BM25│ └────────────┘ │ │ +│ │ └────────────┘ │ │ +│ └──────────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────────────────────────────────────┐ │ +│ │ 3. RERANKING │ │ +│ │ ┌────────────┐ ┌────────────┐ │ │ +│ │ │ Reranker │───▶│ Top-K │ │ │ +│ │ │ Model │ │ Chunks │ │ │ +│ │ │ │ │ (Top 5) │ │ │ +│ │ └────────────┘ └────────────┘ │ │ +│ └──────────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────────────────────────────────────┐ │ +│ │ 4. GENERATION │ │ +│ │ ┌────────────┐ ┌────────────┐ ┌────────────┐ │ │ +│ │ │ Prompt │───▶│ LLM │───▶│ Response │ │ │ +│ │ │ Builder │ │ (GPT-4) │ │ + Sources │ │ │ +│ │ └────────────┘ └────────────┘ └────────────┘ │ │ +│ └──────────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ Response: "The revenue for Q3 2024 was $X million... [source: doc.pdf]" │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +## 5. Deployment Architecture + +### 5.1 Docker Compose Deployment + +``` +┌──────────────────────────────────────────────────────────────────────────────┐ +│ Docker Compose Deployment │ +├──────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Docker Network │ │ +│ │ (ragflow-network) │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ┌───────────────────────────────┼───────────────────────────────┐ │ +│ │ │ │ │ +│ ▼ ▼ ▼ │ +│ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ +│ │ nginx │ ◀──────────────▶│ ragflow- │◀──────────────────▶│ ragflow- │ │ +│ │ :80/443 │ │ server │ │ admin │ │ +│ └──────────┘ │ :9380 │ │ :9381 │ │ +│ │ └────┬─────┘ └──────────┘ │ +│ │ │ │ +│ │ ┌──────────────────┼──────────────────────┐ │ +│ │ │ │ │ │ +│ │ ▼ ▼ ▼ │ +│ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ +│ │ │ mysql │ │ redis │ │elasticsearch│ │ +│ │ │ :5455 │ │ :6379 │ │ :9200 │ │ +│ │ └──────────┘ └──────────┘ └──────────┘ │ +│ │ │ +│ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ +│ │ │ minio │ │ sandbox │ │ tei │ │ +│ │ │:9000/9001│ │ :9385 │ │ :6380 │ │ +│ │ └──────────┘ └──────────┘ └──────────┘ │ +│ │ │ +│ ┌────┴─────────────────────────────────────────────────────────────────┐ │ +│ │ Volumes │ │ +│ │ ┌────────────┐ ┌────────────┐ ┌────────────┐ ┌────────────┐ │ │ +│ │ │mysql_data │ │ es_data │ │minio_data │ │ redis_data │ │ │ +│ │ └────────────┘ └────────────┘ └────────────┘ └────────────┘ │ │ +│ └──────────────────────────────────────────────────────────────────────┘ │ +│ │ +└───────────────────────────────────────────────────────────────────────────────┘ +``` + +## 6. Security Architecture + +``` +┌──────────────────────────────────────────────────────────────────────────────┐ +│ Security Architecture │ +├──────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Authentication Layer │ │ +│ │ │ │ +│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │ +│ │ │ JWT │ │ OAuth │ │ API │ │ │ +│ │ │ Tokens │ │ (GitHub, │ │ Tokens │ │ │ +│ │ │ │ │ OIDC) │ │ │ │ │ +│ │ └─────────────┘ └─────────────┘ └─────────────┘ │ │ +│ │ │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Authorization Layer │ │ +│ │ │ │ +│ │ ┌─────────────────────────────────────────────────────────────────┐ │ │ +│ │ │ Multi-Tenancy Model │ │ │ +│ │ │ │ │ │ +│ │ │ Tenant A Tenant B Tenant C │ │ │ +│ │ │ ┌──────┐ ┌──────┐ ┌──────┐ │ │ │ +│ │ │ │Users │ │Users │ │Users │ │ │ │ +│ │ │ │KBs │ │KBs │ │KBs │ │ │ │ +│ │ │ │Docs │ │Docs │ │Docs │ │ │ │ +│ │ │ └──────┘ └──────┘ └──────┘ │ │ │ +│ │ │ │ │ │ +│ │ └─────────────────────────────────────────────────────────────────┘ │ │ +│ │ │ │ +│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │ +│ │ │ Role-Based │ │ Team │ │ Resource │ │ │ +│ │ │ Access │ │ Permissions│ │ Ownership │ │ │ +│ │ │ Control │ │ │ │ │ │ │ +│ │ └─────────────┘ └─────────────┘ └─────────────┘ │ │ +│ │ │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Encryption Layer │ │ +│ │ │ │ +│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │ +│ │ │ RSA │ │ HTTPS │ │ Password │ │ │ +│ │ │ Key Pair │ │ (TLS) │ │ Bcrypt │ │ │ +│ │ │ (conf/*.pem)│ │ │ │ │ │ │ +│ │ └─────────────┘ └─────────────┘ └─────────────┘ │ │ +│ │ │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +└───────────────────────────────────────────────────────────────────────────────┘ +``` + +## 7. Scalability Architecture + +``` +┌──────────────────────────────────────────────────────────────────────────────┐ +│ Scalability Architecture │ +├──────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Horizontal Scaling │ │ +│ │ │ │ +│ │ Load Balancer (Nginx) │ │ +│ │ │ │ │ +│ │ ┌──────────────────┼──────────────────┐ │ │ +│ │ │ │ │ │ │ +│ │ ▼ ▼ ▼ │ │ +│ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ │ +│ │ │ Server #1 │ │ Server #2 │ │ Server #N │ │ │ +│ │ │ (Instance) │ │ (Instance) │ │ (Instance) │ │ │ +│ │ └──────────────┘ └──────────────┘ └──────────────┘ │ │ +│ │ │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Database Scaling │ │ +│ │ │ │ +│ │ MySQL: Elasticsearch: Redis: │ │ +│ │ - Read Replicas - Cluster Mode - Sentinel │ │ +│ │ - Connection Pool - Sharding - Cluster Mode │ │ +│ │ - Index Partitioning │ │ +│ │ │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Async Processing │ │ +│ │ │ │ +│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │ +│ │ │ Task │───▶│ Redis │───▶│ Worker │ │ │ +│ │ │ Producer │ │ Queue │ │ Consumer │ │ │ +│ │ └─────────────┘ └─────────────┘ └─────────────┘ │ │ +│ │ │ │ +│ │ Tasks: Document parsing, Embedding, Indexing │ │ +│ │ │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +└───────────────────────────────────────────────────────────────────────────────┘ +``` + +## 8. Tóm Tắt Kiến Trúc + +| Layer | Components | Technology | +|-------|------------|------------| +| Client | Web App, SDK, API | React, Python SDK, REST | +| Gateway | Reverse Proxy | Nginx | +| Application | API Server, Admin | Flask/Quart | +| Service | Business Logic | Python Services | +| Processing | RAG, DeepDoc, Agent | Python, ML Models | +| Data | Storage, Cache, Vector | MySQL, Redis, ES, MinIO | +| External | LLM, Connectors, Tools | OpenAI, Claude, APIs | + +### Đặc Điểm Nổi Bật + +1. **Microservices**: Các service độc lập, dễ scale +2. **Event-Driven**: Xử lý async cho document processing +3. **Multi-Tenant**: Hỗ trợ nhiều tenants với data isolation +4. **Hybrid Search**: Kết hợp vector search và full-text search +5. **Pluggable**: Hỗ trợ multiple LLM providers và vector stores +6. **Containerized**: Full Docker deployment với orchestration diff --git a/personal_analyze/03_sequence_diagrams.md b/personal_analyze/03_sequence_diagrams.md new file mode 100644 index 000000000..6f8cdc007 --- /dev/null +++ b/personal_analyze/03_sequence_diagrams.md @@ -0,0 +1,700 @@ +# RAGFlow - Sequence Diagrams + +Tài liệu này mô tả các luồng xử lý chính trong hệ thống RAGFlow thông qua sequence diagrams. + +## 1. User Authentication Flow + +### 1.1 User Registration + +```mermaid +sequenceDiagram + participant U as User + participant W as Web Frontend + participant A as API Server + participant DB as MySQL + participant R as Redis + + U->>W: Click Register + W->>W: Show registration form + U->>W: Enter email, password, nickname + W->>A: POST /api/v1/user/register + + A->>A: Validate input data + A->>DB: Check if email exists + + alt Email exists + DB-->>A: User found + A-->>W: 400 - Email already registered + W-->>U: Show error message + else Email not exists + DB-->>A: No user found + A->>A: Hash password (bcrypt) + A->>A: Generate user ID + A->>DB: INSERT User + A->>DB: CREATE Tenant for user + A->>DB: CREATE UserTenant association + DB-->>A: Success + A->>A: Generate JWT token + A->>R: Store session + A-->>W: 200 - Registration success + token + W->>W: Store token in localStorage + W-->>U: Redirect to dashboard + end +``` + +### 1.2 User Login + +```mermaid +sequenceDiagram + participant U as User + participant W as Web Frontend + participant A as API Server + participant DB as MySQL + participant R as Redis + + U->>W: Enter email/password + W->>A: POST /api/v1/user/login + + A->>DB: SELECT User WHERE email + + alt User not found + DB-->>A: No user + A-->>W: 401 - Invalid credentials + W-->>U: Show error + else User found + DB-->>A: User record + A->>A: Verify password (bcrypt) + + alt Password invalid + A-->>W: 401 - Invalid credentials + W-->>U: Show error + else Password valid + A->>A: Generate JWT (access_token) + A->>A: Generate refresh_token + A->>R: Store session data + A->>DB: Update last_login_time + A-->>W: 200 - Login success + Note over A,W: Response: {access_token, refresh_token, user_info} + W->>W: Store tokens + W-->>U: Redirect to dashboard + end + end +``` + +## 2. Knowledge Base Management + +### 2.1 Create Knowledge Base + +```mermaid +sequenceDiagram + participant U as User + participant W as Web Frontend + participant A as API Server + participant DB as MySQL + participant ES as Elasticsearch + + U->>W: Click "Create Knowledge Base" + W->>W: Show KB creation modal + U->>W: Enter name, description, settings + W->>A: POST /api/v1/kb/create + Note over W,A: Headers: Authorization: Bearer {token} + + A->>A: Validate JWT token + A->>A: Extract tenant_id from token + A->>DB: Check KB name uniqueness in tenant + + alt Name exists + A-->>W: 400 - Name already exists + W-->>U: Show error + else Name unique + A->>A: Generate KB ID + A->>DB: INSERT Knowledgebase + Note over A,DB: {id, name, tenant_id, embd_id, parser_id, ...} + + A->>ES: CREATE Index for KB + Note over A,ES: Index: ragflow_{kb_id} + ES-->>A: Index created + + DB-->>A: KB record saved + A-->>W: 200 - KB created + Note over A,W: {kb_id, name, created_at} + W-->>U: Show success, refresh KB list + end +``` + +### 2.2 List Knowledge Bases + +```mermaid +sequenceDiagram + participant U as User + participant W as Web Frontend + participant A as API Server + participant DB as MySQL + + U->>W: Open Knowledge Base page + W->>A: GET /api/v1/kb/list?page=1&size=10 + + A->>A: Validate JWT, extract tenant_id + A->>DB: SELECT * FROM knowledgebase WHERE tenant_id + A->>DB: COUNT total KBs + + DB-->>A: KB list + count + + loop For each KB + A->>DB: COUNT documents in KB + A->>DB: SUM chunk_num for KB + end + + A->>A: Build response with stats + A-->>W: 200 - KB list with pagination + Note over A,W: {data: [...], total, page, size} + + W->>W: Render KB cards + W-->>U: Display knowledge bases +``` + +## 3. Document Upload & Processing + +### 3.1 Document Upload Flow + +```mermaid +sequenceDiagram + participant U as User + participant W as Web Frontend + participant A as API Server + participant M as MinIO + participant DB as MySQL + participant Q as Task Queue (Redis) + + U->>W: Select files to upload + W->>W: Validate file types/sizes + + loop For each file + W->>A: POST /api/v1/document/upload + Note over W,A: multipart/form-data: file, kb_id + + A->>A: Validate file type + A->>A: Generate file_id, doc_id + + A->>M: Upload file to bucket + Note over A,M: Bucket: ragflow, Key: {tenant_id}/{kb_id}/{file_id} + M-->>A: Upload success, file_key + + A->>DB: INSERT File record + Note over A,DB: {id, name, size, location, tenant_id} + + A->>DB: INSERT Document record + Note over A,DB: {id, kb_id, name, status: 'UNSTART'} + + A->>Q: PUSH parsing task + Note over A,Q: {doc_id, file_location, parser_config} + + A-->>W: 200 - Upload success + Note over A,W: {doc_id, file_id, status} + end + + W-->>U: Show upload progress/success +``` + +### 3.2 Document Parsing Flow (Background Task) + +```mermaid +sequenceDiagram + participant Q as Task Queue + participant W as Worker + participant M as MinIO + participant P as Parser (DeepDoc) + participant E as Embedding Model + participant ES as Elasticsearch + participant DB as MySQL + + Q->>W: POP task from queue + W->>DB: UPDATE doc status = 'RUNNING' + + W->>M: Download file + M-->>W: File content + + W->>P: Parse document + Note over W,P: Based on file type (PDF, DOCX, etc.) + + P->>P: Extract text content + P->>P: Extract tables + P->>P: Extract images (if any) + P->>P: Layout analysis (for PDF) + P-->>W: Parsed content + + W->>W: Apply chunking strategy + Note over W: Token-based, sentence-based, or page-based + + W->>W: Generate chunks + + loop For each chunk batch + W->>E: Generate embeddings + Note over W,E: batch_size typically 32 + E-->>W: Vector embeddings [1536 dim] + + W->>ES: Bulk index chunks + Note over W,ES: {chunk_id, content, embedding, doc_id, kb_id} + ES-->>W: Index success + + W->>DB: INSERT Chunk records + end + + W->>DB: UPDATE Document + Note over W,DB: status='FINISHED', chunk_num, token_num + + W->>DB: UPDATE Task status = 'SUCCESS' +``` + +## 4. Chat/Dialog Flow + +### 4.1 Create Chat Session + +```mermaid +sequenceDiagram + participant U as User + participant W as Web Frontend + participant A as API Server + participant DB as MySQL + + U->>W: Click "New Chat" + W->>A: POST /api/v1/dialog/create + Note over W,A: {name, kb_ids[], llm_id, prompt_config} + + A->>A: Validate KB access + A->>DB: INSERT Dialog record + Note over A,DB: {id, name, tenant_id, kb_ids, llm_id, ...} + + DB-->>A: Dialog created + A-->>W: 200 - Dialog created + Note over A,W: {dialog_id, name, created_at} + + W-->>U: Open chat interface +``` + +### 4.2 Chat Message Flow (RAG) + +```mermaid +sequenceDiagram + participant U as User + participant W as Web Frontend + participant A as API Server + participant ES as Elasticsearch + participant RR as Reranker + participant LLM as LLM Provider + participant DB as MySQL + + U->>W: Type question + W->>A: POST /api/v1/dialog/chat (SSE) + Note over W,A: {dialog_id, conversation_id, question} + + A->>DB: Load dialog config + Note over A,DB: Get kb_ids, llm_config, prompt + + A->>DB: Load conversation history + + rect rgb(200, 220, 240) + Note over A,ES: RETRIEVAL PHASE + A->>A: Query understanding + A->>A: Generate query embedding + + A->>ES: Hybrid search + Note over A,ES: Vector similarity + BM25 full-text + ES-->>A: Top 100 candidates + + A->>RR: Rerank candidates + Note over A,RR: Cross-encoder scoring + RR-->>A: Top K chunks (typically 5-10) + end + + rect rgb(220, 240, 200) + Note over A,LLM: GENERATION PHASE + A->>A: Build prompt with context + Note over A: System prompt + Retrieved chunks + Question + + A->>LLM: Stream completion request + + loop Streaming response + LLM-->>A: Token chunk + A-->>W: SSE: data chunk + W-->>U: Display token + end + + LLM-->>A: [DONE] + end + + A->>DB: Save conversation message + Note over A,DB: {role, content, doc_ids[], conversation_id} + + A-->>W: SSE: [DONE] + sources + W-->>U: Show sources/citations +``` + +### 4.3 Streaming Response Detail + +```mermaid +sequenceDiagram + participant W as Web Frontend + participant A as API Server + participant LLM as LLM Provider + + W->>A: POST /api/v1/dialog/chat + Note over W,A: Accept: text/event-stream + + A->>A: Process retrieval... + + A->>LLM: POST /v1/chat/completions + Note over A,LLM: stream: true + + loop Until complete + LLM-->>A: data: {"choices":[{"delta":{"content":"..."}}]} + A->>A: Extract content + A-->>W: data: {"answer": "...", "reference": {...}} + W->>W: Append to display + end + + LLM-->>A: data: [DONE] + A-->>W: data: {"answer": "", "reference": {...}, "done": true} + W->>W: Show final state +``` + +## 5. Agent Workflow Execution + +### 5.1 Canvas Workflow Execution + +```mermaid +sequenceDiagram + participant U as User + participant W as Web Frontend + participant A as API Server + participant C as Canvas Engine + participant Comp as Components + participant LLM as LLM Provider + participant Tools as External Tools + + U->>W: Run workflow + W->>A: POST /api/v1/canvas/run + Note over W,A: {canvas_id, input_data} + + A->>C: Initialize canvas execution + C->>C: Parse workflow DSL + C->>C: Build execution graph + + rect rgb(240, 220, 200) + Note over C,Comp: BEGIN Component + C->>Comp: Execute BEGIN + Comp->>Comp: Initialize variables + Comp-->>C: {user_input: "..."} + end + + rect rgb(200, 220, 240) + Note over C,Comp: RETRIEVAL Component + C->>Comp: Execute RETRIEVAL + Comp->>A: Search knowledge bases + A-->>Comp: Retrieved chunks + Comp-->>C: {context: [...]} + end + + rect rgb(220, 240, 200) + Note over C,LLM: LLM Component + C->>Comp: Execute LLM + Comp->>Comp: Build prompt with variables + Comp->>LLM: Chat completion + LLM-->>Comp: Response + Comp-->>C: {llm_output: "..."} + end + + rect rgb(240, 240, 200) + Note over C,Tools: TOOL Component (optional) + C->>Comp: Execute TOOL (e.g., Tavily) + Comp->>Tools: API call + Tools-->>Comp: Tool result + Comp-->>C: {tool_output: {...}} + end + + rect rgb(220, 220, 240) + Note over C,Comp: CATEGORIZE Component + C->>Comp: Execute CATEGORIZE + Comp->>Comp: Evaluate conditions + Comp-->>C: {next_node: "node_id"} + end + + C->>C: Continue to next component... + + C-->>A: Workflow complete + A-->>W: SSE: Final output + W-->>U: Display result +``` + +### 5.2 Agent with Tools Flow + +```mermaid +sequenceDiagram + participant U as User + participant A as Agent Engine + participant LLM as LLM Provider + participant T1 as Tavily Search + participant T2 as Wikipedia + participant T3 as Code Executor + + U->>A: Question requiring tools + + A->>LLM: Initial prompt + available tools + Note over A,LLM: Tools: [tavily_search, wikipedia, code_exec] + + loop ReAct Loop + LLM-->>A: Thought + Action + Note over LLM,A: Action: {"tool": "tavily_search", "input": "..."} + + alt Tool: tavily_search + A->>T1: Search query + T1-->>A: Search results + else Tool: wikipedia + A->>T2: Page lookup + T2-->>A: Wikipedia content + else Tool: code_exec + A->>T3: Execute code + T3-->>A: Execution result + end + + A->>LLM: Observation from tool + + alt LLM decides more tools needed + LLM-->>A: Another Action + else LLM ready to answer + LLM-->>A: Final Answer + end + end + + A-->>U: Final response with sources +``` + +## 6. GraphRAG Flow + +### 6.1 Knowledge Graph Construction + +```mermaid +sequenceDiagram + participant D as Document + participant E as Entity Extractor + participant LLM as LLM Provider + participant ER as Entity Resolution + participant G as Graph Store + + D->>E: Document chunks + + loop For each chunk + E->>LLM: Extract entities prompt + Note over E,LLM: "Extract entities and relationships..." + LLM-->>E: Entities + Relations + Note over LLM,E: [{entity, type, properties}, {src, rel, dst}] + end + + E->>ER: All extracted entities + + ER->>ER: Cluster similar entities + ER->>LLM: Entity resolution prompt + Note over ER,LLM: "Are these the same entity?" + LLM-->>ER: Resolution decisions + + ER->>ER: Merge duplicate entities + ER-->>G: Resolved entities + relations + + G->>G: Build graph structure + G->>G: Create entity embeddings + G->>G: Index for search +``` + +### 6.2 GraphRAG Query Flow + +```mermaid +sequenceDiagram + participant U as User + participant Q as Query Analyzer + participant G as Graph Store + participant V as Vector Search + participant LLM as LLM Provider + + U->>Q: Natural language query + + Q->>LLM: Analyze query + Note over Q,LLM: Extract entities, intent, constraints + LLM-->>Q: Query analysis + + par Graph Search + Q->>G: Find related entities + G->>G: Traverse relationships + G-->>Q: Subgraph context + and Vector Search + Q->>V: Semantic search + V-->>Q: Relevant chunks + end + + Q->>Q: Merge graph + vector results + Q->>Q: Build unified context + + Q->>LLM: Generate with context + Note over Q,LLM: Context includes entity relations + LLM-->>Q: Response with graph insights + + Q-->>U: Answer + entity graph visualization +``` + +## 7. File Operations + +### 7.1 File Download Flow + +```mermaid +sequenceDiagram + participant U as User + participant W as Web Frontend + participant A as API Server + participant M as MinIO + participant DB as MySQL + + U->>W: Click download + W->>A: GET /api/v1/file/download/{file_id} + + A->>A: Validate JWT + A->>DB: Get file record + A->>A: Check user permission + + alt No permission + A-->>W: 403 Forbidden + else Has permission + A->>M: Get file from storage + M-->>A: File stream + A-->>W: File stream with headers + Note over A,W: Content-Disposition: attachment + W-->>U: Download starts + end +``` + +## 8. Search Operations + +### 8.1 Hybrid Search Flow + +```mermaid +sequenceDiagram + participant U as User + participant A as API Server + participant E as Embedding Model + participant ES as Elasticsearch + + U->>A: Search query + + A->>E: Embed query text + E-->>A: Query vector [1536] + + A->>ES: Hybrid query + Note over A,ES: script_score (vector) + bool (BM25) + + ES->>ES: Vector similarity search + Note over ES: cosine_similarity on dense_vector + + ES->>ES: BM25 full-text search + Note over ES: match on content field + + ES->>ES: Combine scores + Note over ES: final = vector_score * weight + bm25_score * weight + + ES-->>A: Ranked results + + A->>A: Post-process results + A->>A: Add highlights + A->>A: Group by document + + A-->>U: Search results with snippets +``` + +## 9. Multi-Tenancy Flow + +### 9.1 Tenant Data Isolation + +```mermaid +sequenceDiagram + participant U1 as User (Tenant A) + participant U2 as User (Tenant B) + participant A as API Server + participant DB as MySQL + + U1->>A: GET /api/v1/kb/list + A->>A: Extract tenant_id from JWT + Note over A: tenant_id = "tenant_a" + A->>DB: SELECT * FROM kb WHERE tenant_id = 'tenant_a' + DB-->>A: Tenant A's KBs only + A-->>U1: KBs for Tenant A + + U2->>A: GET /api/v1/kb/list + A->>A: Extract tenant_id from JWT + Note over A: tenant_id = "tenant_b" + A->>DB: SELECT * FROM kb WHERE tenant_id = 'tenant_b' + DB-->>A: Tenant B's KBs only + A-->>U2: KBs for Tenant B + + Note over U1,U2: Data is completely isolated +``` + +## 10. Connector Integration Flow + +### 10.1 Confluence Connector Sync + +```mermaid +sequenceDiagram + participant U as User + participant A as API Server + participant C as Confluence Connector + participant CF as Confluence API + participant DB as MySQL + participant Q as Task Queue + + U->>A: Setup Confluence connector + Note over U,A: {url, username, api_token, space_key} + + A->>C: Initialize connector + C->>CF: Authenticate + CF-->>C: Auth success + + A->>DB: Save connector config + A-->>U: Connector created + + U->>A: Start sync + A->>Q: Queue sync task + + Q->>C: Execute sync + C->>CF: GET /wiki/rest/api/content + CF-->>C: Content list + + loop For each page + C->>CF: GET page content + CF-->>C: Page HTML + C->>C: Convert to markdown + C->>A: Create document + A->>Q: Queue parsing task + end + + C->>DB: Update sync status + C-->>A: Sync complete + A-->>U: Show sync results +``` + +## Tóm Tắt + +| Flow | Thành phần chính | Mô tả | +|------|-----------------|-------| +| Authentication | User, API, DB, Redis | Đăng ký, đăng nhập với JWT | +| Knowledge Base | API, MySQL, ES | CRUD knowledge bases | +| Document Upload | API, MinIO, Queue, ES | Upload và index documents | +| Chat/Dialog | API, ES, Reranker, LLM | RAG-based chat với streaming | +| Agent Workflow | Canvas Engine, Components, LLM, Tools | Visual workflow execution | +| GraphRAG | Entity Extractor, Graph Store, LLM | Knowledge graph queries | +| Search | Embedding, ES | Hybrid vector + BM25 search | +| Connectors | Connector, External API | Sync external data sources | + +### Các Pattern Thiết Kế Sử Dụng + +1. **Event-Driven**: Task queue cho background processing +2. **Streaming**: SSE cho real-time chat responses +3. **Hybrid Search**: Kết hợp vector và text search +4. **ReAct Pattern**: Agent reasoning với tool use +5. **Multi-Tenancy**: Data isolation per tenant diff --git a/personal_analyze/04_modules_analysis.md b/personal_analyze/04_modules_analysis.md new file mode 100644 index 000000000..6be9dd2aa --- /dev/null +++ b/personal_analyze/04_modules_analysis.md @@ -0,0 +1,949 @@ +# RAGFlow - Phân Tích Chi Tiết Các Module + +## 1. Module API (`/api/`) + +### 1.1 Tổng Quan + +Module API là trung tâm xử lý tất cả HTTP requests của hệ thống. Được xây dựng trên Flask/Quart framework với kiến trúc Blueprint. + +### 1.2 Cấu Trúc + +``` +api/ +├── ragflow_server.py # Entry point - Khởi tạo Flask app +├── settings.py # Cấu hình server +├── constants.py # API_VERSION = "v1" +├── validation.py # Request validation +│ +├── apps/ # API Blueprints +├── db/ # Database layer +└── utils/ # Utilities +``` + +### 1.3 Chi Tiết Các Blueprint (API Apps) + +#### 1.3.1 `kb_app.py` - Knowledge Base Management +**Chức năng**: Quản lý Knowledge Base (tạo, xóa, sửa, liệt kê) + +**Endpoints chính**: +| Method | Endpoint | Mô tả | +|--------|----------|-------| +| POST | `/api/v1/kb/create` | Tạo KB mới | +| GET | `/api/v1/kb/list` | Liệt kê KBs | +| PUT | `/api/v1/kb/update` | Cập nhật KB | +| DELETE | `/api/v1/kb/delete` | Xóa KB | +| GET | `/api/v1/kb/{id}` | Chi tiết KB | + +**Logic chính**: +- Validation tenant permissions +- Tạo Elasticsearch index cho mỗi KB +- Quản lý embedding model settings +- Quản lý parser configurations + +#### 1.3.2 `document_app.py` - Document Management +**Chức năng**: Upload, parsing, và quản lý documents + +**Endpoints chính**: +| Method | Endpoint | Mô tả | +|--------|----------|-------| +| POST | `/api/v1/document/upload` | Upload file | +| POST | `/api/v1/document/run` | Trigger parsing | +| GET | `/api/v1/document/list` | Liệt kê docs | +| DELETE | `/api/v1/document/delete` | Xóa document | +| GET | `/api/v1/document/{id}/chunks` | Lấy chunks | + +**Logic chính**: +- File type validation +- MinIO storage integration +- Background task queuing +- Parsing status tracking + +#### 1.3.3 `dialog_app.py` - Chat/Dialog Management +**Chức năng**: Xử lý chat conversations với RAG + +**Endpoints chính**: +| Method | Endpoint | Mô tả | +|--------|----------|-------| +| POST | `/api/v1/dialog/create` | Tạo dialog | +| POST | `/api/v1/dialog/chat` | Chat (SSE streaming) | +| POST | `/api/v1/dialog/completion` | Non-streaming chat | +| GET | `/api/v1/dialog/list` | Liệt kê dialogs | + +**Logic chính**: +- RAG pipeline orchestration +- Streaming response (SSE) +- Conversation history management +- Multi-KB retrieval + +#### 1.3.4 `canvas_app.py` - Agent Workflow +**Chức năng**: Visual workflow builder cho AI agents + +**Endpoints chính**: +| Method | Endpoint | Mô tả | +|--------|----------|-------| +| POST | `/api/v1/canvas/create` | Tạo workflow | +| POST | `/api/v1/canvas/run` | Execute workflow | +| PUT | `/api/v1/canvas/update` | Cập nhật | +| GET | `/api/v1/canvas/list` | Liệt kê | + +**Logic chính**: +- DSL parsing và validation +- Component orchestration +- Tool integration +- Variable passing between nodes + +#### 1.3.5 `file_app.py` - File Management +**Chức năng**: Upload, download, quản lý files + +**Endpoints chính**: +| Method | Endpoint | Mô tả | +|--------|----------|-------| +| POST | `/api/v1/file/upload` | Upload file | +| GET | `/api/v1/file/download/{id}` | Download | +| GET | `/api/v1/file/list` | Liệt kê files | +| DELETE | `/api/v1/file/delete` | Xóa file | + +#### 1.3.6 `search_app.py` - Search Operations +**Chức năng**: Full-text và semantic search + +**Endpoints chính**: +| Method | Endpoint | Mô tả | +|--------|----------|-------| +| POST | `/api/v1/search` | Hybrid search | +| GET | `/api/v1/search/history` | Search history | + +### 1.4 Database Services (`/api/db/services/`) + +#### `dialog_service.py` (37KB - Service phức tạp nhất) +```python +class DialogService: + def chat(dialog_id, question, stream=True): + """ + Main RAG chat function + 1. Load dialog configuration + 2. Get relevant documents (retrieval) + 3. Rerank results + 4. Build prompt with context + 5. Call LLM (streaming) + 6. Save conversation + """ + + def retrieval(dialog, question): + """ + Hybrid retrieval from Elasticsearch + - Vector similarity search + - BM25 full-text search + - Score combination + """ + + def rerank(chunks, question): + """ + Cross-encoder reranking + - Score each chunk against question + - Return top-k + """ +``` + +#### `document_service.py` (39KB) +```python +class DocumentService: + def upload(file, kb_id): + """Upload file to MinIO, create DB record""" + + def parse(doc_id): + """Queue document for background parsing""" + + def chunk(doc_id, chunks): + """Save parsed chunks to ES and DB""" + + def delete(doc_id): + """Remove doc, chunks, and file""" +``` + +#### `knowledgebase_service.py` (21KB) +```python +class KnowledgebaseService: + def create(name, embedding_model, parser_id): + """Create KB with ES index""" + + def update_parser_config(kb_id, config): + """Update chunking/parsing settings""" + + def get_statistics(kb_id): + """Get doc count, chunk count, etc.""" +``` + +### 1.5 Database Models (`/api/db/db_models.py`) + +**25+ Models quan trọng**: + +```python +# User & Tenant +class User(BaseModel): + id, email, password, nickname, avatar, status, login_channel + +class Tenant(BaseModel): + id, name, public_key, llm_id, embd_id, parser_id, credit + +class UserTenant(BaseModel): + user_id, tenant_id, role # owner, admin, member + +# Knowledge Management +class Knowledgebase(BaseModel): + id, tenant_id, name, description, embd_id, parser_id, + similarity_threshold, vector_similarity_weight, ... + +class Document(BaseModel): + id, kb_id, name, location, size, type, parser_id, + status, progress, chunk_num, token_num, process_duation + +class File(BaseModel): + id, tenant_id, name, size, location, type, source_type + +# Chat & Dialog +class Dialog(BaseModel): + id, tenant_id, name, description, kb_ids, llm_id, + prompt_config, similarity_threshold, top_n, top_k + +class Conversation(BaseModel): + id, dialog_id, name, message # JSON array of messages + +# Workflow +class UserCanvas(BaseModel): + id, tenant_id, name, dsl, avatar # DSL is workflow definition + +class CanvasTemplate(BaseModel): + id, name, dsl, avatar # Pre-built templates + +# Integration +class APIToken(BaseModel): + id, tenant_id, token, dialog_id # For external API access + +class MCPServer(BaseModel): + id, tenant_id, name, host, tools # MCP server config +``` + +--- + +## 2. Module RAG (`/rag/`) + +### 2.1 Tổng Quan + +Core RAG processing engine - xử lý từ document parsing đến retrieval. + +### 2.2 LLM Abstractions (`/rag/llm/`) + +#### `chat_model.py` - Chat LLM Interface +```python +class Base: + """Abstract base for all chat models""" + def chat(messages, stream=True, **kwargs): + """Generate chat completion""" + +class OpenAIChat(Base): + """OpenAI GPT models""" + +class ClaudeChat(Base): + """Anthropic Claude models""" + +class QwenChat(Base): + """Alibaba Qwen models""" + +class OllamaChat(Base): + """Local Ollama models""" + +# Factory function +def get_chat_model(model_name, api_key, base_url): + """Return appropriate chat model instance""" +``` + +**Supported Providers** (20+): +- OpenAI (GPT-3.5, GPT-4, GPT-4V) +- Anthropic (Claude 3) +- Google (Gemini) +- Alibaba (Qwen, Qwen-VL) +- Groq +- Mistral +- Cohere +- DeepSeek +- Zhipu (GLM) +- Moonshot +- Ollama (local) +- NVIDIA +- Bedrock (AWS) +- Azure OpenAI +- Hugging Face +- ... + +#### `embedding_model.py` - Embedding Interface +```python +class Base: + """Abstract base for embeddings""" + def encode(texts: List[str]) -> List[List[float]]: + """Generate embeddings for texts""" + +class OpenAIEmbed(Base): + """text-embedding-ada-002, text-embedding-3-*""" + +class BGEEmbed(Base): + """BAAI BGE models""" + +class JinaEmbed(Base): + """Jina AI embeddings""" + +# Supported embedding models: +# - OpenAI: ada-002, embedding-3-small, embedding-3-large +# - BGE: bge-base, bge-large, bge-m3 +# - Jina: jina-embeddings-v2 +# - Cohere: embed-english-v3 +# - HuggingFace: sentence-transformers +# - Local: Ollama embeddings +``` + +#### `rerank_model.py` - Reranking Interface +```python +class Base: + """Abstract base for rerankers""" + def rerank(query: str, documents: List[str]) -> List[float]: + """Score documents against query""" + +class CohereRerank(Base): + """Cohere rerank models""" + +class JinaRerank(Base): + """Jina AI reranker""" + +class BGERerank(Base): + """BAAI BGE reranker""" +``` + +### 2.3 RAG Pipeline (`/rag/flow/`) + +#### Pipeline Architecture +``` +Document → Parser → Tokenizer → Splitter → Embedder → Index +``` + +#### `parser/parser.py` +```python +def parse(file_path, parser_config): + """ + Parse document based on file type + Returns: List of text segments with metadata + """ + # Supported parsers: + # - naive: Simple text extraction + # - paper: Academic paper structure + # - book: Book chapter detection + # - laws: Legal document parsing + # - presentation: PPT parsing + # - qa: Q&A format extraction + # - table: Table extraction + # - picture: Image description + # - one: Single chunk per doc + # - audio: Audio transcription + # - email: Email thread parsing +``` + +#### `splitter/splitter.py` +```python +class Splitter: + """Document chunking strategies""" + + def split_by_tokens(text, chunk_size=512, overlap=128): + """Token-based splitting""" + + def split_by_sentences(text, max_sentences=10): + """Sentence-based splitting""" + + def split_by_delimiter(text, delimiter='\n\n'): + """Delimiter-based splitting""" + + def split_semantic(text, threshold=0.5): + """Semantic similarity based splitting""" +``` + +#### `tokenizer/tokenizer.py` +```python +class Tokenizer: + """Text tokenization""" + + def tokenize(text): + """Convert text to tokens""" + + def count_tokens(text): + """Count tokens in text""" + + # Uses tiktoken for OpenAI models + # Uses model-specific tokenizers for others +``` + +### 2.4 RAPTOR (`/rag/raptor.py`) + +**RAPTOR** = Recursive Abstractive Processing for Tree-Organized Retrieval + +```python +class RAPTOR: + """ + Hierarchical document representation + - Clusters similar chunks + - Creates summaries of clusters + - Builds tree structure for retrieval + """ + + def build_tree(chunks): + """Build RAPTOR tree from chunks""" + + def retrieve(query, tree): + """Retrieve from tree structure""" +``` + +--- + +## 3. Module DeepDoc (`/deepdoc/`) + +### 3.1 Tổng Quan + +Deep document understanding với layout analysis và OCR. + +### 3.2 Document Parsers (`/deepdoc/parser/`) + +#### `pdf_parser.py` - PDF Processing +```python +class PdfParser: + """ + Advanced PDF parsing with: + - OCR for scanned pages + - Layout analysis (tables, figures, headers) + - Multi-column detection + - Image extraction + """ + + def __call__(file_path): + """Parse PDF file""" + # 1. Extract text with PyMuPDF + # 2. Apply OCR if needed (Tesseract) + # 3. Analyze layout (detectron2/layoutlm) + # 4. Extract tables (camelot/tabula) + # 5. Extract images + # Return structured content +``` + +#### `docx_parser.py` - Word Documents +```python +class DocxParser: + """ + Parse .docx files + - Text extraction + - Table extraction + - Image extraction + - Style preservation + """ +``` + +#### `excel_parser.py` - Spreadsheets +```python +class ExcelParser: + """ + Parse .xlsx/.xls files + - Sheet-by-sheet processing + - Table structure preservation + - Formula evaluation + """ +``` + +#### `html_parser.py` - Web Pages +```python +class HtmlParser: + """ + Parse HTML content + - Clean HTML + - Extract main content + - Handle tables + - Remove scripts/styles + """ +``` + +### 3.3 Vision Module (`/deepdoc/vision/`) + +```python +class LayoutAnalyzer: + """ + Document layout analysis using ML + - Detectron2 for object detection + - LayoutLM for document understanding + """ + + def analyze(image): + """ + Detect document regions: + - Title + - Paragraph + - Table + - Figure + - Header/Footer + - List + """ +``` + +--- + +## 4. Module Agent (`/agent/`) + +### 4.1 Tổng Quan + +Agentic workflow system với visual canvas builder. + +### 4.2 Canvas Engine (`/agent/canvas.py`) + +```python +class Canvas: + """ + Main workflow orchestrator + - Parse DSL definition + - Execute components in order + - Handle branching logic + - Manage variables + """ + + def __init__(self, dsl): + """Initialize from DSL""" + self.components = self._parse_dsl(dsl) + self.graph = self._build_graph() + + def run(self, input_data): + """Execute workflow""" + context = {"input": input_data} + + for component in self._topological_sort(): + result = component.execute(context) + context.update(result) + + return context["output"] +``` + +### 4.3 Components (`/agent/component/`) + +#### `begin.py` - Workflow Start +```python +class BeginComponent: + """ + Entry point of workflow + - Initialize variables + - Receive user input + """ + def execute(self, context): + return {"user_input": context["input"]} +``` + +#### `llm.py` - LLM Component +```python +class LLMComponent: + """ + Call LLM with configured prompt + - Template variable substitution + - Streaming support + - Output parsing + """ + def execute(self, context): + prompt = self.template.format(**context) + response = self.llm.chat(prompt) + return {"llm_output": response} +``` + +#### `retrieval.py` - Retrieval Component +```python +class RetrievalComponent: + """ + Search knowledge bases + - Multi-KB search + - Configurable top_k + - Score threshold + """ + def execute(self, context): + query = context["user_input"] + results = self.search(query, self.kb_ids) + return {"retrieved_docs": results} +``` + +#### `categorize.py` - Conditional Branching +```python +class CategorizeComponent: + """ + Route to different paths based on conditions + - LLM-based classification + - Rule-based matching + """ + def execute(self, context): + category = self._classify(context) + return {"next_node": self.routes[category]} +``` + +#### `agent_with_tools.py` - Tool-Using Agent +```python +class AgentWithToolsComponent: + """ + ReAct pattern agent + - Tool selection + - Iterative reasoning + - Observation handling + """ + def execute(self, context): + while not done: + action = self.llm.decide_action(context) + if action.type == "tool": + result = self.tools[action.tool].run(action.input) + context["observation"] = result + else: + return {"output": action.response} +``` + +### 4.4 Tools (`/agent/tools/`) + +#### External Tool Integrations + +| Tool | File | Chức năng | +|------|------|-----------| +| Tavily | `tavily.py` | Web search API | +| ArXiv | `arxiv.py` | Academic paper search | +| Google | `google.py` | Google search | +| Wikipedia | `wikipedia.py` | Wikipedia lookup | +| GitHub | `github.py` | GitHub API | +| Email | `email.py` | Send emails | +| Code Exec | `code_exec.py` | Execute Python code | +| DeepL | `deepl.py` | Translation | +| Jin10 | `jin10.py` | Financial news | +| TuShare | `tushare.py` | Chinese stock data | +| Yahoo Finance | `yahoofinance.py` | Stock data | +| QWeather | `qweather.py` | Weather data | + +```python +class BaseTool: + """Base class for all tools""" + name: str + description: str + + def run(self, input: str) -> str: + """Execute tool and return result""" + +class TavilySearch(BaseTool): + name = "tavily_search" + description = "Search the web for current information" + + def run(self, query): + response = tavily.search(query) + return format_results(response) +``` + +--- + +## 5. Module GraphRAG (`/graphrag/`) + +### 5.1 Tổng Quan + +Knowledge graph construction và querying. + +### 5.2 Entity Resolution (`/graphrag/entity_resolution.py`) + +```python +class EntityResolution: + """ + Entity extraction và linking + - Extract entities from text + - Cluster similar entities + - Resolve duplicates + """ + + def extract_entities(text): + """Extract named entities using LLM""" + prompt = f"Extract entities from: {text}" + return llm.chat(prompt) + + def resolve_entities(entities): + """Merge duplicate entities""" + clusters = self._cluster_similar(entities) + return self._merge_clusters(clusters) +``` + +### 5.3 Graph Search (`/graphrag/search.py`) + +```python +class GraphSearch: + """ + Query knowledge graph + - Entity-based search + - Relationship traversal + - Subgraph extraction + """ + + def search(query): + """Find relevant subgraph for query""" + # 1. Extract query entities + # 2. Find matching graph entities + # 3. Traverse relationships + # 4. Return context subgraph +``` + +--- + +## 6. Module Frontend (`/web/`) + +### 6.1 Tổng Quan + +React/TypeScript SPA với UmiJS framework. + +### 6.2 Pages (`/web/src/pages/`) + +| Page | Chức năng | +|------|-----------| +| `/dataset` | Knowledge base management | +| `/datasets` | Dataset list view | +| `/next-chats` | Chat interface | +| `/next-searches` | Search interface | +| `/document-viewer` | Document preview | +| `/admin` | Admin dashboard | +| `/login` | Authentication | +| `/register` | User registration | + +### 6.3 Components (`/web/src/components/`) + +**Core Components**: +- `file-upload-modal/` - File upload UI +- `pdf-drawer/` - PDF preview drawer +- `prompt-editor/` - Prompt template editor +- `document-preview/` - Document viewer +- `llm-setting-items/` - LLM configuration UI +- `ui/` - Shadcn/UI base components + +### 6.4 State Management + +```typescript +// Using Zustand for state +import { create } from 'zustand'; + +interface KnowledgebaseStore { + knowledgebases: Knowledgebase[]; + currentKb: Knowledgebase | null; + fetchKnowledgebases: () => Promise; + createKnowledgebase: (data: CreateKbRequest) => Promise; +} + +export const useKnowledgebaseStore = create((set) => ({ + knowledgebases: [], + currentKb: null, + fetchKnowledgebases: async () => { + const data = await api.get('/kb/list'); + set({ knowledgebases: data }); + }, + // ... +})); +``` + +### 6.5 API Services (`/web/src/services/`) + +```typescript +// API client using Axios +import { request } from 'umi'; + +export async function createKnowledgebase(data: CreateKbRequest) { + return request('/api/v1/kb/create', { + method: 'POST', + data, + }); +} + +export async function chat(dialogId: string, question: string) { + return request('/api/v1/dialog/chat', { + method: 'POST', + data: { dialog_id: dialogId, question }, + responseType: 'stream', + }); +} +``` + +--- + +## 7. Module Common (`/common/`) + +### 7.1 Configuration (`/common/settings.py`) + +```python +# Main configuration file +class Settings: + # Database + MYSQL_HOST = os.getenv('MYSQL_HOST', 'localhost') + MYSQL_PORT = int(os.getenv('MYSQL_PORT', 5455)) + MYSQL_USER = os.getenv('MYSQL_USER', 'root') + MYSQL_PASSWORD = os.getenv('MYSQL_PASSWORD', 'infini_rag_flow') + MYSQL_DATABASE = os.getenv('MYSQL_DATABASE', 'ragflow') + + # Elasticsearch + ES_HOSTS = os.getenv('ES_HOSTS', 'http://localhost:9200').split(',') + + # Redis + REDIS_HOST = os.getenv('REDIS_HOST', 'localhost') + REDIS_PORT = int(os.getenv('REDIS_PORT', 6379)) + + # MinIO + MINIO_HOST = os.getenv('MINIO_HOST', 'localhost:9000') + MINIO_ACCESS_KEY = os.getenv('MINIO_USER', 'rag_flow') + MINIO_SECRET_KEY = os.getenv('MINIO_PASSWORD', 'infini_rag_flow') + + # Document Engine + DOC_ENGINE = os.getenv('DOC_ENGINE', 'elasticsearch') # or 'infinity' +``` + +### 7.2 Data Source Connectors (`/common/data_source/`) + +**Supported Connectors**: + +| Connector | File | Chức năng | +|-----------|------|-----------| +| Confluence | `confluence_connector.py` (81KB) | Atlassian Confluence wiki | +| Notion | `notion_connector.py` (25KB) | Notion databases | +| Slack | `slack_connector.py` (22KB) | Slack messages | +| Gmail | `gmail_connector.py` | Gmail emails | +| Discord | `discord_connector.py` | Discord channels | +| SharePoint | `sharepoint_connector.py` | Microsoft SharePoint | +| Teams | `teams_connector.py` | Microsoft Teams | +| Dropbox | `dropbox_connector.py` | Dropbox files | +| Google Drive | `google_drive/` | Google Drive | +| WebDAV | `webdav_connector.py` | WebDAV servers | +| Moodle | `moodle_connector.py` | Moodle LMS | + +```python +class BaseConnector: + """Abstract base for connectors""" + + def authenticate(credentials): + """Authenticate with external service""" + + def list_items(): + """List available items""" + + def sync(): + """Sync data to RAGFlow""" + +class ConfluenceConnector(BaseConnector): + """Confluence integration""" + + def __init__(self, url, username, api_token): + self.client = Confluence(url, username, api_token) + + def sync_space(space_key): + """Sync all pages from a space""" + pages = self.client.get_all_pages(space_key) + for page in pages: + content = self._convert_to_markdown(page.body) + yield Document(content=content, metadata=page.metadata) +``` + +--- + +## 8. Module SDK (`/sdk/python/`) + +### 8.1 Python SDK + +```python +from ragflow import RAGFlow + +# Initialize client +client = RAGFlow( + api_key="your-api-key", + base_url="http://localhost:9380" +) + +# Create knowledge base +kb = client.create_knowledgebase( + name="My KB", + embedding_model="text-embedding-3-small" +) + +# Upload document +doc = kb.upload_document("path/to/document.pdf") + +# Wait for parsing +doc.wait_for_ready() + +# Create chat +chat = client.create_chat( + name="My Chat", + knowledgebase_ids=[kb.id] +) + +# Send message +response = chat.send_message("What is this document about?") +print(response.answer) +``` + +--- + +## 9. Tóm Tắt Module Dependencies + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Frontend (web/) │ +└─────────────────────────────┬───────────────────────────────────┘ + │ HTTP/SSE + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ API (api/) │ +│ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ +│ │ kb_app │ │doc_app │ │dialog_ │ │canvas_ │ │ +│ │ │ │ │ │app │ │app │ │ +│ └────┬────┘ └────┬────┘ └────┬────┘ └────┬────┘ │ +│ └────────────┴───────────┴────────────┘ │ +│ │ │ +│ ┌──────────────────────────┴──────────────────────────┐ │ +│ │ Services Layer │ │ +│ │ DialogService │ DocumentService │ KBService │ │ +│ └───────────────────────────┬─────────────────────────┘ │ +└───────────────────────────────┼─────────────────────────────────┘ + │ + ┌───────────────────────┼───────────────────────┐ + │ │ │ + ▼ ▼ ▼ +┌───────────────┐ ┌──────────────────┐ ┌──────────────────┐ +│ RAG (rag/) │ │ Agent (agent/) │ │GraphRAG(graphrag)│ +│ │ │ │ │ │ +│ - LLM Models │ │ - Canvas Engine │ │ - Entity Res. │ +│ - Pipeline │ │ - Components │ │ - Graph Search │ +│ - Embeddings │ │ - Tools │ │ │ +└───────┬───────┘ └────────┬─────────┘ └────────┬─────────┘ + │ │ │ + └─────────────────────┼───────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ DeepDoc (deepdoc/) │ +│ │ +│ PDF Parser │ DOCX Parser │ HTML Parser │ Vision/OCR │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ Common (common/) │ +│ │ +│ Settings │ Utilities │ Data Source Connectors │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ Data Stores │ +│ │ +│ MySQL │ Elasticsearch/Infinity │ Redis │ MinIO │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## 10. Kích Thước Code Ước Tính + +| Module | Lines of Code | Complexity | +|--------|--------------|------------| +| api/ | ~15,000 | High | +| rag/ | ~8,000 | High | +| deepdoc/ | ~5,000 | Medium | +| agent/ | ~6,000 | High | +| graphrag/ | ~3,000 | Medium | +| web/src/ | ~20,000 | High | +| common/ | ~5,000 | Medium | +| **Total** | **~62,000** | - | diff --git a/personal_analyze/05_tech_stack.md b/personal_analyze/05_tech_stack.md new file mode 100644 index 000000000..c714709e3 --- /dev/null +++ b/personal_analyze/05_tech_stack.md @@ -0,0 +1,634 @@ +# RAGFlow - Tech Stack Analysis + +## 1. Tổng Quan Tech Stack + +RAGFlow sử dụng một tech stack hiện đại, được thiết kế để xử lý các workload AI/ML nặng với khả năng scale tốt. + +``` +┌─────────────────────────────────────────────────────────────────────────┐ +│ TECH STACK OVERVIEW │ +├─────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌────────────────────────────────────────────────────────────────────┐ │ +│ │ FRONTEND │ │ +│ │ React 18 │ TypeScript │ UmiJS │ Ant Design │ Tailwind CSS │ │ +│ │ Zustand │ TanStack Query │ XYFlow │ Monaco Editor │ │ +│ └────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌────────────────────────────────────────────────────────────────────┐ │ +│ │ BACKEND │ │ +│ │ Python 3.10-3.12 │ Flask/Quart │ Peewee ORM │ Celery │ │ +│ │ AsyncIO │ JWT │ SSE Streaming │ │ +│ └────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌────────────────────────────────────────────────────────────────────┐ │ +│ │ AI/ML │ │ +│ │ LangChain │ OpenAI │ Sentence Transformers │ Hugging Face │ │ +│ │ PyTorch │ Detectron2 │ Tesseract OCR │ │ +│ └────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌────────────────────────────────────────────────────────────────────┐ │ +│ │ DATA LAYER │ │ +│ │ MySQL 8 │ Elasticsearch 8 │ Redis │ MinIO │ Infinity │ │ +│ └────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌────────────────────────────────────────────────────────────────────┐ │ +│ │ INFRASTRUCTURE │ │ +│ │ Docker │ Docker Compose │ Kubernetes │ Nginx │ Helm │ │ +│ └────────────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## 2. Frontend Technologies + +### 2.1 Core Framework + +| Technology | Version | Mục đích | +|------------|---------|----------| +| **React** | 18.x | UI library chính | +| **TypeScript** | 5.x | Type-safe JavaScript | +| **UmiJS** | 4.x | React framework (Ant Design ecosystem) | +| **Vite** | 5.x | Build tool (nhanh hơn Webpack) | + +### 2.2 UI Libraries + +| Library | Version | Mục đích | +|---------|---------|----------| +| **Ant Design** | 5.x | Primary UI component library | +| **Shadcn/UI** | Latest | Modern, customizable components | +| **Radix UI** | Latest | Headless UI primitives | +| **Tailwind CSS** | 3.x | Utility-first CSS framework | +| **LESS** | 4.x | CSS preprocessor (legacy) | + +### 2.3 State Management & Data Fetching + +| Library | Mục đích | +|---------|----------| +| **Zustand** | Lightweight state management | +| **TanStack React Query** | Server state & caching | +| **Axios** | HTTP client | + +### 2.4 Specialized Libraries + +| Library | Mục đích | +|---------|----------| +| **XYFlow (React Flow)** | Workflow/canvas visualization | +| **Monaco Editor** | Code editor (VS Code core) | +| **AntV G2/G6** | Data visualization & graphs | +| **Recharts** | Charts and analytics | +| **Lexical** | Rich text editor (Facebook) | +| **React Markdown** | Markdown rendering | +| **i18next** | Internationalization | +| **React Hook Form** | Form handling | +| **Zod** | Schema validation | + +### 2.5 Package.json Dependencies (172 packages) + +```json +{ + "dependencies": { + "react": "^18.2.0", + "react-dom": "^18.2.0", + "umi": "^4.0.0", + "antd": "^5.0.0", + "@tanstack/react-query": "^5.0.0", + "zustand": "^4.0.0", + "axios": "^1.0.0", + "tailwindcss": "^3.0.0", + "@xyflow/react": "^12.0.0", + "@monaco-editor/react": "^4.0.0", + "lexical": "^0.12.0", + "react-markdown": "^9.0.0", + "i18next": "^23.0.0", + "react-hook-form": "^7.0.0", + "zod": "^3.0.0", + "@radix-ui/react-*": "latest", + "@ant-design/icons": "^5.0.0", + "@antv/g2": "^5.0.0", + "@antv/g6": "^5.0.0" + } +} +``` + +--- + +## 3. Backend Technologies + +### 3.1 Core Framework + +| Technology | Version | Mục đích | +|------------|---------|----------| +| **Python** | 3.10-3.12 | Programming language | +| **Flask** | 3.x | Web framework | +| **Quart** | 0.19.x | Async Flask (ASGI) | +| **Hypercorn** | Latest | ASGI server | + +### 3.2 Database & ORM + +| Technology | Mục đích | +|------------|----------| +| **Peewee** | Lightweight ORM (primary) | +| **SQLAlchemy** | Advanced ORM operations | +| **PyMySQL** | MySQL driver | + +### 3.3 Authentication & Security + +| Library | Mục đích | +|---------|----------| +| **PyJWT** | JWT token handling | +| **bcrypt** | Password hashing | +| **python-jose** | JOSE implementation | +| **Authlib** | OAuth integration | + +### 3.4 Async & Background Tasks + +| Library | Mục đích | +|---------|----------| +| **asyncio** | Async I/O | +| **aiohttp** | Async HTTP client | +| **Redis/Valkey** | Task queue & caching | +| **APScheduler** | Job scheduling | + +### 3.5 API & Documentation + +| Library | Mục đích | +|---------|----------| +| **Flasgger** | Swagger/OpenAPI docs | +| **Flask-CORS** | CORS handling | +| **Werkzeug** | WSGI utilities | + +### 3.6 pyproject.toml Dependencies (150+ packages) + +```toml +[project] +name = "ragflow" +version = "0.22.1" +requires-python = ">=3.10,<3.13" + +dependencies = [ + # Web Framework + "flask>=3.0.0", + "quart>=0.19.0", + "hypercorn>=0.17.0", + "flask-cors>=4.0.0", + "flasgger>=0.9.0", + + # Database + "peewee>=3.17.0", + "pymysql>=1.1.0", + + # Authentication + "pyjwt>=2.8.0", + "bcrypt>=4.1.0", + + # Async + "aiohttp>=3.9.0", + "httpx>=0.27.0", + + # Data Processing + "pandas>=2.0.0", + "numpy>=1.26.0", + + # AI/ML (see section 4) + ... +] +``` + +--- + +## 4. AI/ML Technologies + +### 4.1 LLM Integration + +| Provider | Library | Models Supported | +|----------|---------|-----------------| +| **OpenAI** | `openai>=1.0` | GPT-3.5, GPT-4, GPT-4V | +| **Anthropic** | `anthropic>=0.20` | Claude 3 family | +| **Google** | `google-generativeai` | Gemini Pro | +| **Cohere** | `cohere>=5.0` | Command, Embed, Rerank | +| **Groq** | `groq>=0.4` | LLaMA, Mixtral | +| **Mistral** | `mistralai>=0.1` | Mistral 7B, Mixtral | +| **Ollama** | `ollama>=0.1` | Local models | +| **HuggingFace** | `huggingface_hub` | Open source models | + +### 4.2 Embedding Models + +| Library | Models | +|---------|--------| +| **Sentence Transformers** | all-MiniLM, all-mpnet, etc. | +| **OpenAI Embeddings** | text-embedding-3-small/large | +| **BGE** | bge-base, bge-large, bge-m3 | +| **Jina** | jina-embeddings-v2 | +| **Cohere** | embed-english-v3 | + +```python +# Embedding configuration +EMBEDDING_MODELS = { + "openai": { + "text-embedding-3-small": {"dim": 1536, "max_tokens": 8191}, + "text-embedding-3-large": {"dim": 3072, "max_tokens": 8191}, + }, + "bge": { + "bge-base-en-v1.5": {"dim": 768, "max_tokens": 512}, + "bge-large-en-v1.5": {"dim": 1024, "max_tokens": 512}, + "bge-m3": {"dim": 1024, "max_tokens": 8192}, + }, + "sentence-transformers": { + "all-MiniLM-L6-v2": {"dim": 384, "max_tokens": 256}, + "all-mpnet-base-v2": {"dim": 768, "max_tokens": 384}, + } +} +``` + +### 4.3 Document Processing + +| Library | Mục đích | +|---------|----------| +| **PyMuPDF (fitz)** | PDF text extraction | +| **pdf2image** | PDF to image conversion | +| **Tesseract (pytesseract)** | OCR | +| **python-docx** | Word document parsing | +| **openpyxl** | Excel parsing | +| **python-pptx** | PowerPoint parsing | +| **BeautifulSoup4** | HTML parsing | +| **markdown** | Markdown processing | +| **camelot-py** | Table extraction from PDF | +| **tabula-py** | Alternative table extraction | + +### 4.4 Computer Vision + +| Library | Mục đích | +|---------|----------| +| **Detectron2** | Layout analysis | +| **LayoutLM** | Document understanding | +| **OpenCV** | Image processing | +| **Pillow** | Image manipulation | +| **YOLO** | Object detection | + +### 4.5 NLP & Text Processing + +| Library | Mục đích | +|---------|----------| +| **tiktoken** | OpenAI tokenization | +| **nltk** | Natural language toolkit | +| **spaCy** | NLP pipeline | +| **regex** | Advanced regex | +| **chardet** | Character encoding detection | + +### 4.6 Vector Operations + +| Library | Mục đích | +|---------|----------| +| **NumPy** | Numerical operations | +| **SciPy** | Scientific computing | +| **scikit-learn** | ML utilities, clustering | +| **faiss-cpu/gpu** | Vector similarity search | + +--- + +## 5. Data Storage Technologies + +### 5.1 Relational Database + +| Technology | Mục đích | Configuration | +|------------|----------|---------------| +| **MySQL 8.0** | Primary database | Port 5455 | +| **PostgreSQL** | Alternative (supported) | - | + +**MySQL Schema Design**: +- InnoDB engine +- UTF8MB4 character set +- JSON columns for flexible data +- Foreign keys for integrity + +### 5.2 Vector/Search Database + +| Technology | Mục đích | Configuration | +|------------|----------|---------------| +| **Elasticsearch 8.12** | Default vector store | Port 9200 | +| **Infinity** | Alternative (in-house) | Port 23817 | +| **OpenSearch** | Alternative | Port 9200 | +| **OceanBase** | Alternative (distributed) | - | + +**Elasticsearch Configuration**: +```json +{ + "settings": { + "number_of_shards": 1, + "number_of_replicas": 0, + "analysis": { + "analyzer": { + "ik_smart": { "type": "ik_smart" }, + "ik_max_word": { "type": "ik_max_word" } + } + } + }, + "mappings": { + "properties": { + "content": { "type": "text", "analyzer": "ik_smart" }, + "embedding": { + "type": "dense_vector", + "dims": 1536, + "index": true, + "similarity": "cosine" + } + } + } +} +``` + +### 5.3 Cache & Message Queue + +| Technology | Mục đích | Configuration | +|------------|----------|---------------| +| **Redis 7.x** | Cache, sessions, queue | Port 6379 | +| **Valkey** | Redis alternative | Port 6379 | + +**Redis Usage**: +- Session storage +- Rate limiting +- Task queue (custom implementation) +- Cache layer + +### 5.4 Object Storage + +| Technology | Mục đích | Configuration | +|------------|----------|---------------| +| **MinIO** | S3-compatible storage | Port 9000/9001 | +| **AWS S3** | Cloud storage option | - | +| **Azure Blob** | Cloud storage option | - | + +**MinIO Structure**: +``` +ragflow/ # Bucket +├── {tenant_id}/ +│ ├── {kb_id}/ +│ │ ├── {file_id} # Original files +│ │ └── chunks/ # Processed chunks +│ └── temp/ # Temporary files +└── system/ # System files +``` + +--- + +## 6. Infrastructure Technologies + +### 6.1 Containerization + +| Technology | Mục đích | +|------------|----------| +| **Docker** | Container runtime | +| **Docker Compose** | Multi-container orchestration | +| **BuildKit** | Efficient image building | + +**Docker Images**: +```yaml +services: + ragflow-server: + image: infiniflow/ragflow:latest + # or: ragflow:nightly for development + + mysql: + image: mysql:8.0 + + elasticsearch: + image: docker.elastic.co/elasticsearch/elasticsearch:8.12.0 + + redis: + image: redis:7-alpine + + minio: + image: minio/minio:latest +``` + +### 6.2 Web Server & Proxy + +| Technology | Mục đích | Configuration | +|------------|----------|---------------| +| **Nginx** | Reverse proxy, static files | Port 80/443 | +| **Hypercorn** | ASGI server | Port 9380 | + +**Nginx Configuration**: +```nginx +upstream ragflow { + server ragflow-server:9380; +} + +server { + listen 80; + + location /api/ { + proxy_pass http://ragflow; + proxy_http_version 1.1; + proxy_set_header Connection ""; + } + + location / { + root /usr/share/nginx/html; + try_files $uri $uri/ /index.html; + } +} +``` + +### 6.3 Kubernetes Deployment + +| Technology | Mục đích | +|------------|----------| +| **Kubernetes** | Container orchestration | +| **Helm** | K8s package manager | + +**Helm Chart Structure**: +``` +helm/ +├── Chart.yaml +├── values.yaml +├── templates/ +│ ├── deployment.yaml +│ ├── service.yaml +│ ├── configmap.yaml +│ └── ingress.yaml +``` + +--- + +## 7. Development Tools + +### 7.1 Python Development + +| Tool | Mục đích | +|------|----------| +| **uv** | Package manager (fast) | +| **pip** | Traditional package manager | +| **pre-commit** | Git hooks | +| **ruff** | Linter & formatter | +| **pytest** | Testing framework | +| **mypy** | Type checking | + +### 7.2 Frontend Development + +| Tool | Mục đích | +|------|----------| +| **npm/pnpm** | Package manager | +| **ESLint** | Linting | +| **Prettier** | Code formatting | +| **Jest** | Testing | +| **Storybook** | Component development | +| **Husky** | Git hooks | + +### 7.3 Version Control & CI/CD + +| Tool | Mục đích | +|------|----------| +| **Git** | Version control | +| **GitHub Actions** | CI/CD | +| **Docker Hub** | Image registry | + +--- + +## 8. Monitoring & Observability + +### 8.1 Logging + +| Library | Mục đích | +|---------|----------| +| **Python logging** | Standard logging | +| **structlog** | Structured logging | + +### 8.2 Tracing + +| Integration | Mục đích | +|-------------|----------| +| **Langfuse** | LLM observability | +| **OpenTelemetry** | Distributed tracing | + +### 8.3 Metrics + +| Tool | Mục đích | +|------|----------| +| **Prometheus** | Metrics collection | +| **Grafana** | Visualization | + +--- + +## 9. Third-party Integrations + +### 9.1 LLM Providers + +``` +┌─────────────────────────────────────────────────────────────┐ +│ LLM Provider Support │ +├─────────────────────────────────────────────────────────────┤ +│ │ +│ Commercial APIs: │ +│ ┌───────┐ ┌───────┐ ┌───────┐ ┌───────┐ ┌───────┐ │ +│ │OpenAI │ │Claude │ │Gemini │ │Cohere │ │ Groq │ │ +│ └───────┘ └───────┘ └───────┘ └───────┘ └───────┘ │ +│ │ +│ China Providers: │ +│ ┌───────┐ ┌───────┐ ┌───────┐ ┌───────┐ ┌───────┐ │ +│ │ Qwen │ │Zhipu │ │Baichuan│ │Spark │ │ERNIE │ │ +│ └───────┘ └───────┘ └───────┘ └───────┘ └───────┘ │ +│ │ +│ Self-hosted: │ +│ ┌───────┐ ┌───────┐ ┌───────┐ │ +│ │Ollama │ │ vLLM │ │LocalAI│ │ +│ └───────┘ └───────┘ └───────┘ │ +│ │ +└─────────────────────────────────────────────────────────────┘ +``` + +### 9.2 Data Source Connectors + +| Category | Services | +|----------|----------| +| **Enterprise Wiki** | Confluence, Notion, SharePoint | +| **Communication** | Slack, Discord, Gmail, Teams | +| **Cloud Storage** | Google Drive, Dropbox, S3, WebDAV | +| **Development** | GitHub, Jira | +| **Education** | Moodle | +| **Finance** | TuShare, AkShare, Yahoo Finance | + +### 9.3 Search APIs + +| Service | Mục đích | +|---------|----------| +| **Tavily** | AI-optimized web search | +| **Google Search** | Web search | +| **Google Scholar** | Academic search | +| **SearXNG** | Meta search | +| **ArXiv** | Academic papers | +| **Wikipedia** | Knowledge lookup | + +--- + +## 10. System Requirements + +### 10.1 Minimum Requirements + +| Resource | Minimum | Recommended | +|----------|---------|-------------| +| **CPU** | 4 cores | 8+ cores | +| **RAM** | 16 GB | 32+ GB | +| **Disk** | 50 GB | 200+ GB SSD | +| **GPU** | - | NVIDIA 8GB+ VRAM | + +### 10.2 Software Requirements + +| Software | Version | +|----------|---------| +| **Docker** | 20.10+ | +| **Docker Compose** | 2.0+ | +| **Python** | 3.10-3.12 | +| **Node.js** | 18.20.4+ | + +### 10.3 Port Requirements + +| Port | Service | +|------|---------| +| 80/443 | Nginx (HTTP/HTTPS) | +| 9380 | RAGFlow API | +| 9381 | Admin Server | +| 9200 | Elasticsearch | +| 5455 | MySQL | +| 6379 | Redis | +| 9000/9001 | MinIO | + +--- + +## 11. Tóm Tắt Tech Stack + +### Production Stack + +``` +Frontend: React 18 + TypeScript + UmiJS + Ant Design + Tailwind +Backend: Python 3.11 + Flask/Quart + Peewee +AI/ML: OpenAI + Sentence Transformers + Detectron2 +Database: MySQL 8 + Elasticsearch 8 +Cache: Redis 7 +Storage: MinIO +Proxy: Nginx +Container: Docker + Docker Compose +Orchestration: Kubernetes + Helm +``` + +### Development Stack + +``` +Package Mgmt: uv (Python), npm (Node.js) +Linting: ruff (Python), ESLint (JS/TS) +Testing: pytest (Python), Jest (JS/TS) +CI/CD: GitHub Actions +Version Ctrl: Git +``` + +### Key Architectural Choices + +1. **Async-first**: Quart ASGI cho high concurrency +2. **Hybrid Search**: Vector + BM25 trong Elasticsearch +3. **Multi-tenant**: Data isolation per tenant +4. **Pluggable LLMs**: Abstract interface cho nhiều providers +5. **Containerized**: Full Docker deployment +6. **Event-driven**: Background processing với Redis queue diff --git a/personal_analyze/06_source_code_analysis.md b/personal_analyze/06_source_code_analysis.md new file mode 100644 index 000000000..77bb6327b --- /dev/null +++ b/personal_analyze/06_source_code_analysis.md @@ -0,0 +1,1509 @@ +# RAGFlow - Phân Tích Source Code Chi Tiết + +## 1. Tổng Quan Codebase + +### 1.1 Thống Kê Code + +| Metric | Giá trị | +|--------|---------| +| **Total Lines of Code** | ~62,000+ | +| **Python Files** | ~300+ | +| **TypeScript/JavaScript Files** | ~400+ | +| **Test Files** | ~100+ | +| **Configuration Files** | ~50+ | + +### 1.2 Code Quality Metrics + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ CODE QUALITY OVERVIEW │ +├─────────────────────────────────────────────────────────────────┤ +│ │ +│ Python Code: │ +│ ├── Linter: ruff (strict) │ +│ ├── Formatter: ruff format │ +│ ├── Type Hints: Partial (improving) │ +│ └── Test Coverage: ~60% │ +│ │ +│ TypeScript Code: │ +│ ├── Linter: ESLint (strict) │ +│ ├── Formatter: Prettier │ +│ ├── Type Safety: Strict mode │ +│ └── Test Coverage: ~40% │ +│ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +--- + +## 2. Backend Code Analysis + +### 2.1 Entry Point Analysis + +**File**: `api/ragflow_server.py` + +```python +# Simplified structure analysis + +from quart import Quart +from quart_cors import cors + +# Application factory pattern +def create_app(): + app = Quart(__name__) + + # CORS configuration + app = cors(app, allow_origin="*") + + # Session configuration + app.config['SECRET_KEY'] = ... + app.config['SESSION_TYPE'] = 'redis' + + # Register blueprints + from api.apps import ( + kb_app, document_app, dialog_app, + canvas_app, file_app, user_app, ... + ) + + app.register_blueprint(kb_app, url_prefix='/api/v1/kb') + app.register_blueprint(document_app, url_prefix='/api/v1/document') + # ... more blueprints + + # Swagger documentation + from flasgger import Swagger + Swagger(app) + + return app + +# Main entry +if __name__ == '__main__': + app = create_app() + app.run(host='0.0.0.0', port=9380) +``` + +**Key Patterns**: +- Application Factory Pattern +- Blueprint-based modular architecture +- ASGI với Quart (async Flask) +- Swagger/OpenAPI documentation + +### 2.2 API Blueprint Structure + +**Pattern sử dụng**: + +```python +# Typical blueprint structure (e.g., kb_app.py) + +from flask import Blueprint, request +from api.db.services import KnowledgebaseService +from api.utils.api_utils import get_data, validate_request + +kb_app = Blueprint('kb', __name__) + +@kb_app.route('/create', methods=['POST']) +@validate_request # Decorator for validation +@login_required # Authentication decorator +async def create(): + """ + Create a new knowledge base. + --- + tags: + - Knowledge Base + parameters: + - name: body + in: body + required: true + schema: + type: object + properties: + name: + type: string + description: + type: string + responses: + 200: + description: Success + """ + try: + req = await get_data(request) + tenant_id = get_tenant_id(request) + + # Validation + if not req.get('name'): + return error_response("Name is required") + + # Business logic + kb = KnowledgebaseService.create( + name=req['name'], + tenant_id=tenant_id, + description=req.get('description', '') + ) + + return success_response(kb.to_dict()) + + except Exception as e: + return error_response(str(e)) +``` + +**Design Patterns**: +- RESTful API design +- Decorator pattern cho cross-cutting concerns +- Service layer separation +- Consistent error handling + +### 2.3 Service Layer Analysis + +**File**: `api/db/services/dialog_service.py` (37KB - phức tạp nhất) + +```python +# Core RAG chat implementation + +class DialogService: + """ + Main service for RAG-based chat functionality. + Handles retrieval, reranking, and generation. + """ + + @classmethod + def chat(cls, dialog_id: str, question: str, + stream: bool = True, **kwargs): + """ + Main chat entry point. + + Flow: + 1. Load dialog configuration + 2. Get conversation history + 3. Perform retrieval + 4. Rerank results + 5. Build prompt with context + 6. Generate response (streaming) + 7. Save conversation + """ + + # 1. Load dialog + dialog = Dialog.get_by_id(dialog_id) + + # 2. Get history + conversation = Conversation.get_or_create(...) + history = conversation.messages[-10:] # Last 10 messages + + # 3. Retrieval + chunks = cls._retrieval(dialog, question) + + # 4. Reranking + if dialog.rerank_id: + chunks = cls._rerank(chunks, question, dialog.top_n) + + # 5. Build prompt + context = cls._build_context(chunks) + prompt = cls._build_prompt(dialog, question, context, history) + + # 6. Generate + if stream: + return cls._stream_generate(dialog, prompt) + else: + return cls._generate(dialog, prompt) + + @classmethod + def _retrieval(cls, dialog, question): + """ + Hybrid retrieval from Elasticsearch. + Combines vector similarity and BM25. + """ + # Generate query embedding + embedding = EmbeddingModel.encode(question) + + # Build ES query + query = { + "script_score": { + "query": { + "bool": { + "should": [ + {"match": {"content": question}}, # BM25 + ], + "filter": [ + {"terms": {"kb_id": dialog.kb_ids}} + ] + } + }, + "script": { + "source": """ + cosineSimilarity(params.query_vector, 'embedding') + 1.0 + """, + "params": {"query_vector": embedding} + } + } + } + + # Execute search + results = es.search(index="ragflow_*", body={"query": query}) + return results['hits']['hits'] + + @classmethod + def _stream_generate(cls, dialog, prompt): + """ + Streaming generation using SSE. + """ + llm = ChatModel.get(dialog.llm_id) + + for chunk in llm.chat(prompt, stream=True): + yield { + "answer": chunk.content, + "reference": {}, + "done": False + } + + yield {"answer": "", "done": True} +``` + +**Key Implementation Details**: +- Hybrid search (vector + BM25) +- Streaming response với SSE +- Conversation history management +- Configurable reranking + +### 2.4 Database Model Analysis + +**File**: `api/db/db_models.py` (54KB) + +```python +# Using Peewee ORM + +from peewee import * +from playhouse.shortcuts import model_to_dict + +# Base model with common fields +class BaseModel(Model): + id = CharField(primary_key=True, max_length=32) + create_time = BigIntegerField(default=lambda: int(time.time() * 1000)) + update_time = BigIntegerField(default=lambda: int(time.time() * 1000)) + create_date = DateTimeField(default=datetime.now) + update_date = DateTimeField(default=datetime.now) + + class Meta: + database = db + + def to_dict(self): + return model_to_dict(self) + +# User model +class User(BaseModel): + email = CharField(max_length=255, unique=True) + password = CharField(max_length=255) + nickname = CharField(max_length=255, null=True) + avatar = TextField(null=True) + status = CharField(max_length=16, default='active') + login_channel = CharField(max_length=32, default='password') + last_login_time = DateTimeField(null=True) + + class Meta: + table_name = 'user' + +# Knowledge Base model +class Knowledgebase(BaseModel): + tenant_id = CharField(max_length=32) + name = CharField(max_length=255) + description = TextField(null=True) + + # Embedding configuration + embd_id = CharField(max_length=128) + + # Parser configuration (JSON) + parser_id = CharField(max_length=32, default='naive') + parser_config = JSONField(default={}) + + # Search configuration + similarity_threshold = FloatField(default=0.2) + vector_similarity_weight = FloatField(default=0.3) + top_n = IntegerField(default=6) + + # Statistics + doc_num = IntegerField(default=0) + token_num = IntegerField(default=0) + chunk_num = IntegerField(default=0) + + class Meta: + table_name = 'knowledgebase' + indexes = ( + (('tenant_id', 'name'), True), # Unique constraint + ) + +# Document model +class Document(BaseModel): + kb_id = CharField(max_length=32) + name = CharField(max_length=512) + location = CharField(max_length=1024) # MinIO path + size = BigIntegerField(default=0) + type = CharField(max_length=32) + + # Processing status + status = CharField(max_length=16, default='UNSTART') + # UNSTART -> RUNNING -> FINISHED / FAIL + progress = FloatField(default=0) + progress_msg = TextField(null=True) + + # Parser configuration + parser_id = CharField(max_length=32) + parser_config = JSONField(default={}) + + # Statistics + chunk_num = IntegerField(default=0) + token_num = IntegerField(default=0) + process_duration = FloatField(default=0) + + class Meta: + table_name = 'document' + +# Dialog (Chat) model +class Dialog(BaseModel): + tenant_id = CharField(max_length=32) + name = CharField(max_length=255) + description = TextField(null=True) + + # Knowledge base references + kb_ids = JSONField(default=[]) # List of KB IDs + + # LLM configuration + llm_id = CharField(max_length=128) + llm_setting = JSONField(default={ + 'temperature': 0.7, + 'max_tokens': 2048, + 'top_p': 1.0 + }) + + # Prompt configuration + prompt_config = JSONField(default={ + 'system': 'You are a helpful assistant.', + 'prologue': '', + 'show_quote': True + }) + + # Retrieval configuration + similarity_threshold = FloatField(default=0.2) + vector_similarity_weight = FloatField(default=0.3) + top_n = IntegerField(default=6) + top_k = IntegerField(default=1024) + + # Reranking + rerank_id = CharField(max_length=128, null=True) + + class Meta: + table_name = 'dialog' +``` + +**ORM Patterns**: +- Active Record pattern (Peewee) +- JSON fields cho flexible data +- Soft timestamps (create/update) +- Index optimization + +### 2.5 RAG Pipeline Code Analysis + +**File**: `rag/flow/pipeline.py` + +```python +# Document processing pipeline + +class Pipeline: + """ + Main document processing pipeline. + Orchestrates: Parse → Tokenize → Split → Embed → Index + """ + + def __init__(self, document_id: str): + self.doc = Document.get_by_id(document_id) + self.kb = Knowledgebase.get_by_id(self.doc.kb_id) + + # Initialize components based on config + self.parser = self._get_parser() + self.tokenizer = self._get_tokenizer() + self.splitter = self._get_splitter() + self.embedder = self._get_embedder() + + def run(self): + """Execute the full pipeline.""" + try: + self._update_status('RUNNING') + + # 1. Download file from MinIO + file_content = self._download_file() + + # 2. Parse document + self._update_progress(0.1, "Parsing document...") + parsed = self.parser.parse(file_content) + + # 3. Extract and tokenize + self._update_progress(0.3, "Tokenizing...") + tokens = self.tokenizer.tokenize(parsed) + + # 4. Split into chunks + self._update_progress(0.5, "Chunking...") + chunks = self.splitter.split(tokens) + + # 5. Generate embeddings + self._update_progress(0.7, "Embedding...") + embedded_chunks = self._embed_chunks(chunks) + + # 6. Index to Elasticsearch + self._update_progress(0.9, "Indexing...") + self._index_chunks(embedded_chunks) + + # 7. Update statistics + self._update_status('FINISHED') + self._update_statistics(len(chunks)) + + except Exception as e: + self._update_status('FAIL', str(e)) + raise + + def _embed_chunks(self, chunks: List[str]) -> List[dict]: + """Generate embeddings for chunks in batches.""" + batch_size = 32 + results = [] + + for i in range(0, len(chunks), batch_size): + batch = chunks[i:i+batch_size] + embeddings = self.embedder.encode(batch) + + for chunk, embedding in zip(batch, embeddings): + results.append({ + 'content': chunk, + 'embedding': embedding, + 'kb_id': self.kb.id, + 'doc_id': self.doc.id + }) + + return results + + def _index_chunks(self, chunks: List[dict]): + """Bulk index chunks to Elasticsearch.""" + actions = [] + for i, chunk in enumerate(chunks): + actions.append({ + '_index': f'ragflow_{self.kb.id}', + '_id': f'{self.doc.id}_{i}', + '_source': chunk + }) + + # Bulk insert + helpers.bulk(es, actions) +``` + +**Pipeline Patterns**: +- Chain of Responsibility +- Strategy pattern cho parsers +- Batch processing +- Progress tracking + +--- + +## 3. Frontend Code Analysis + +### 3.1 Project Structure + +```typescript +// UmiJS project structure +web/src/ +├── pages/ // Route-based pages +├── components/ // Reusable components +├── services/ // API calls +├── hooks/ // Custom React hooks +├── interfaces/ // TypeScript types +├── utils/ // Utility functions +├── constants/ // Constants +├── locales/ // i18n translations +└── less/ // Global styles +``` + +### 3.2 Page Component Analysis + +**File**: `web/src/pages/dataset/index.tsx` + +```typescript +// Knowledge Base List Page + +import { useState, useEffect } from 'react'; +import { useRequest } from 'ahooks'; +import { Table, Button, Modal, message } from 'antd'; +import { useNavigate } from 'umi'; + +import { getKnowledgebases, deleteKnowledgebase } from '@/services/kb'; +import CreateKbModal from './components/CreateKbModal'; + +interface Knowledgebase { + id: string; + name: string; + description: string; + doc_num: number; + chunk_num: number; + create_time: number; +} + +const DatasetList: React.FC = () => { + const navigate = useNavigate(); + const [createModalVisible, setCreateModalVisible] = useState(false); + + // Data fetching with caching + const { data, loading, refresh } = useRequest(getKnowledgebases, { + refreshDeps: [], + }); + + // Table columns definition + const columns = [ + { + title: 'Name', + dataIndex: 'name', + key: 'name', + render: (text: string, record: Knowledgebase) => ( + navigate(`/dataset/${record.id}`)}>{text} + ), + }, + { + title: 'Documents', + dataIndex: 'doc_num', + key: 'doc_num', + }, + { + title: 'Chunks', + dataIndex: 'chunk_num', + key: 'chunk_num', + }, + { + title: 'Actions', + key: 'actions', + render: (_: any, record: Knowledgebase) => ( + + ), + }, + ]; + + const handleDelete = async (id: string) => { + Modal.confirm({ + title: 'Confirm Delete', + content: 'Are you sure you want to delete this knowledge base?', + onOk: async () => { + await deleteKnowledgebase(id); + message.success('Deleted successfully'); + refresh(); + }, + }); + }; + + return ( +
+
+

Knowledge Bases

+ +
+ + + + setCreateModalVisible(false)} + onSuccess={() => { + setCreateModalVisible(false); + refresh(); + }} + /> + + ); +}; + +export default DatasetList; +``` + +**React Patterns**: +- Functional components với hooks +- Custom hooks cho data fetching +- Controlled components +- Composition pattern + +### 3.3 State Management + +**File**: `web/src/hooks/useKnowledgebaseStore.ts` + +```typescript +// Zustand store for knowledge base state + +import { create } from 'zustand'; +import { devtools, persist } from 'zustand/middleware'; + +interface KnowledgebaseState { + knowledgebases: Knowledgebase[]; + currentKb: Knowledgebase | null; + loading: boolean; + error: string | null; + + // Actions + fetchKnowledgebases: () => Promise; + setCurrentKb: (kb: Knowledgebase | null) => void; + createKnowledgebase: (data: CreateKbRequest) => Promise; + updateKnowledgebase: (id: string, data: UpdateKbRequest) => Promise; + deleteKnowledgebase: (id: string) => Promise; +} + +export const useKnowledgebaseStore = create()( + devtools( + persist( + (set, get) => ({ + knowledgebases: [], + currentKb: null, + loading: false, + error: null, + + fetchKnowledgebases: async () => { + set({ loading: true, error: null }); + try { + const response = await api.get('/kb/list'); + set({ knowledgebases: response.data, loading: false }); + } catch (error) { + set({ error: error.message, loading: false }); + } + }, + + setCurrentKb: (kb) => set({ currentKb: kb }), + + createKnowledgebase: async (data) => { + const response = await api.post('/kb/create', data); + const newKb = response.data; + set((state) => ({ + knowledgebases: [...state.knowledgebases, newKb], + })); + return newKb; + }, + + updateKnowledgebase: async (id, data) => { + await api.put(`/kb/${id}`, data); + set((state) => ({ + knowledgebases: state.knowledgebases.map((kb) => + kb.id === id ? { ...kb, ...data } : kb + ), + })); + }, + + deleteKnowledgebase: async (id) => { + await api.delete(`/kb/${id}`); + set((state) => ({ + knowledgebases: state.knowledgebases.filter((kb) => kb.id !== id), + })); + }, + }), + { + name: 'knowledgebase-storage', + partialize: (state) => ({ currentKb: state.currentKb }), + } + ) + ) +); +``` + +**State Management Patterns**: +- Zustand cho global state +- React Query cho server state +- Middleware (devtools, persist) +- Immer-style updates + +### 3.4 API Service Layer + +**File**: `web/src/services/api.ts` + +```typescript +// API client configuration + +import axios, { AxiosInstance, AxiosRequestConfig } from 'axios'; +import { message } from 'antd'; + +class ApiClient { + private instance: AxiosInstance; + + constructor() { + this.instance = axios.create({ + baseURL: '/api/v1', + timeout: 30000, + headers: { + 'Content-Type': 'application/json', + }, + }); + + this.setupInterceptors(); + } + + private setupInterceptors() { + // Request interceptor + this.instance.interceptors.request.use( + (config) => { + const token = localStorage.getItem('access_token'); + if (token) { + config.headers.Authorization = `Bearer ${token}`; + } + return config; + }, + (error) => Promise.reject(error) + ); + + // Response interceptor + this.instance.interceptors.response.use( + (response) => response.data, + (error) => { + const { response } = error; + + if (response?.status === 401) { + // Token expired + localStorage.removeItem('access_token'); + window.location.href = '/login'; + } else if (response?.status === 403) { + message.error('Permission denied'); + } else if (response?.status >= 500) { + message.error('Server error'); + } else { + message.error(response?.data?.message || 'Request failed'); + } + + return Promise.reject(error); + } + ); + } + + // Streaming support for chat + async stream(url: string, data: any, onMessage: (data: any) => void) { + const response = await fetch(`/api/v1${url}`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + Authorization: `Bearer ${localStorage.getItem('access_token')}`, + }, + body: JSON.stringify(data), + }); + + const reader = response.body?.getReader(); + const decoder = new TextDecoder(); + + while (true) { + const { done, value } = await reader!.read(); + if (done) break; + + const text = decoder.decode(value); + const lines = text.split('\n'); + + for (const line of lines) { + if (line.startsWith('data: ')) { + const data = JSON.parse(line.slice(6)); + onMessage(data); + } + } + } + } + + get = (url: string, config?: AxiosRequestConfig) => + this.instance.get(url, config); + + post = (url: string, data?: any, config?: AxiosRequestConfig) => + this.instance.post(url, data, config); + + put = (url: string, data?: any, config?: AxiosRequestConfig) => + this.instance.put(url, data, config); + + delete = (url: string, config?: AxiosRequestConfig) => + this.instance.delete(url, config); +} + +export const api = new ApiClient(); +``` + +**API Patterns**: +- Axios interceptors +- Token management +- SSE streaming support +- Error handling + +### 3.5 Chat Component Analysis + +**File**: `web/src/pages/next-chats/components/ChatWindow.tsx` + +```typescript +// Streaming chat component + +import { useState, useRef, useEffect } from 'react'; +import { Input, Button, Spin } from 'antd'; +import { SendOutlined } from '@ant-design/icons'; +import ReactMarkdown from 'react-markdown'; + +interface Message { + role: 'user' | 'assistant'; + content: string; + sources?: Source[]; +} + +interface ChatWindowProps { + dialogId: string; +} + +const ChatWindow: React.FC = ({ dialogId }) => { + const [messages, setMessages] = useState([]); + const [input, setInput] = useState(''); + const [loading, setLoading] = useState(false); + const [streamingContent, setStreamingContent] = useState(''); + + const messagesEndRef = useRef(null); + + useEffect(() => { + messagesEndRef.current?.scrollIntoView({ behavior: 'smooth' }); + }, [messages, streamingContent]); + + const handleSend = async () => { + if (!input.trim() || loading) return; + + const question = input.trim(); + setInput(''); + setLoading(true); + setStreamingContent(''); + + // Add user message + setMessages((prev) => [...prev, { role: 'user', content: question }]); + + try { + // Stream response + await api.stream( + '/dialog/chat', + { dialog_id: dialogId, question }, + (data) => { + if (data.done) { + // Finalize message + setMessages((prev) => [ + ...prev, + { + role: 'assistant', + content: streamingContent || data.answer, + sources: data.reference?.chunks || [], + }, + ]); + setStreamingContent(''); + } else { + // Stream content + setStreamingContent((prev) => prev + data.answer); + } + } + ); + } catch (error) { + setMessages((prev) => [ + ...prev, + { role: 'assistant', content: 'Error: Failed to get response' }, + ]); + } finally { + setLoading(false); + } + }; + + return ( +
+ {/* Messages */} +
+ {messages.map((msg, idx) => ( + + ))} + + {/* Streaming content */} + {streamingContent && ( +
+ {streamingContent} + +
+ )} + +
+
+ + {/* Input */} +
+
+ setInput(e.target.value)} + onPressEnter={(e) => { + if (!e.shiftKey) { + e.preventDefault(); + handleSend(); + } + }} + placeholder="Type your message..." + autoSize={{ minRows: 1, maxRows: 4 }} + /> +
+
+
+ ); +}; +``` + +**Chat UI Patterns**: +- Real-time streaming +- Auto-scroll +- Markdown rendering +- Loading states + +--- + +## 4. Agent System Code Analysis + +### 4.1 Canvas Engine + +**File**: `agent/canvas.py` + +```python +# Workflow execution engine + +from typing import Dict, Any, Generator +import json + +class Canvas: + """ + Visual workflow execution engine. + Executes DSL-defined workflows with components. + """ + + def __init__(self, dsl: dict, tenant_id: str): + self.dsl = dsl + self.tenant_id = tenant_id + self.components = self._parse_components() + self.graph = self._build_graph() + self.context = {} + + def _parse_components(self) -> Dict[str, 'Component']: + """Parse DSL into component instances.""" + components = {} + + for node in self.dsl.get('nodes', []): + node_type = node['type'] + component_class = COMPONENT_REGISTRY.get(node_type) + + if component_class: + components[node['id']] = component_class( + node_id=node['id'], + config=node.get('config', {}), + canvas=self + ) + + return components + + def _build_graph(self) -> Dict[str, list]: + """Build execution graph from edges.""" + graph = {node_id: [] for node_id in self.components} + + for edge in self.dsl.get('edges', []): + source = edge['source'] + target = edge['target'] + condition = edge.get('condition') + + graph[source].append({ + 'target': target, + 'condition': condition + }) + + return graph + + def run(self, input_data: dict) -> Generator[dict, None, None]: + """ + Execute workflow and yield streaming results. + """ + self.context = {'input': input_data} + + # Find start node + current_node = self._find_start_node() + + while current_node: + component = self.components[current_node] + + # Execute component + for output in component.execute(self.context): + yield { + 'node_id': current_node, + 'output': output, + 'done': False + } + + # Update context with component output + self.context.update(component.output) + + # Find next node + current_node = self._get_next_node(current_node) + + yield {'done': True, 'result': self.context.get('final_output')} + + def _get_next_node(self, current: str) -> str | None: + """Determine next node based on edges and conditions.""" + edges = self.graph.get(current, []) + + for edge in edges: + if edge['condition']: + # Evaluate condition + if self._evaluate_condition(edge['condition']): + return edge['target'] + else: + return edge['target'] + + return None + + def _evaluate_condition(self, condition: dict) -> bool: + """Evaluate edge condition.""" + var_name = condition.get('variable') + operator = condition.get('operator') + value = condition.get('value') + + actual = self.context.get(var_name) + + if operator == '==': + return actual == value + elif operator == '!=': + return actual != value + elif operator == 'contains': + return value in str(actual) + + return False + +# Component Registry +COMPONENT_REGISTRY = { + 'begin': BeginComponent, + 'llm': LLMComponent, + 'retrieval': RetrievalComponent, + 'categorize': CategorizeComponent, + 'message': MessageComponent, + 'webhook': WebhookComponent, + 'iteration': IterationComponent, + 'agent': AgentWithToolsComponent, +} +``` + +### 4.2 Component Base Class + +```python +# Base component implementation + +from abc import ABC, abstractmethod +from typing import Generator, Dict, Any + +class Component(ABC): + """Abstract base for workflow components.""" + + def __init__(self, node_id: str, config: dict, canvas: 'Canvas'): + self.node_id = node_id + self.config = config + self.canvas = canvas + self.output = {} + + @abstractmethod + def execute(self, context: dict) -> Generator[dict, None, None]: + """ + Execute component logic. + Yields intermediate outputs for streaming. + """ + pass + + def get_variable(self, name: str, context: dict) -> Any: + """Get variable from context or config.""" + if name.startswith('{{') and name.endswith('}}'): + var_path = name[2:-2].strip() + return self._resolve_path(var_path, context) + return name + + def _resolve_path(self, path: str, context: dict) -> Any: + """Resolve dot-notation path in context.""" + parts = path.split('.') + value = context + + for part in parts: + if isinstance(value, dict): + value = value.get(part) + else: + return None + + return value + +class LLMComponent(Component): + """LLM invocation component.""" + + def execute(self, context: dict) -> Generator[dict, None, None]: + # Get prompt template + prompt_template = self.config.get('prompt', '') + + # Substitute variables + prompt = self._substitute_variables(prompt_template, context) + + # Get LLM + llm_id = self.config.get('llm_id') + llm = ChatModel.get(llm_id) + + # Stream response + full_response = '' + for chunk in llm.chat(prompt, stream=True): + full_response += chunk.content + yield {'type': 'token', 'content': chunk.content} + + self.output = {'llm_output': full_response} + yield {'type': 'complete', 'content': full_response} +``` + +--- + +## 5. Code Patterns & Best Practices + +### 5.1 Design Patterns Used + +| Pattern | Location | Purpose | +|---------|----------|---------| +| **Factory** | `rag/llm/*.py` | Create LLM/Embedding instances | +| **Strategy** | `deepdoc/parser/` | Different parsing strategies | +| **Observer** | `agent/canvas.py` | Event streaming | +| **Chain of Responsibility** | `rag/flow/pipeline.py` | Processing pipeline | +| **Decorator** | `api/apps/*.py` | Auth, validation | +| **Singleton** | `common/settings.py` | Configuration | +| **Repository** | `api/db/services/` | Data access | +| **Builder** | Prompt construction | Build complex prompts | + +### 5.2 Error Handling Patterns + +```python +# Consistent error handling + +from api.common.exceptions import ( + ValidationError, + AuthenticationError, + NotFoundError, + ServiceError +) + +# API level +@app.errorhandler(ValidationError) +def handle_validation_error(e): + return jsonify({ + 'code': 400, + 'message': str(e) + }), 400 + +@app.errorhandler(Exception) +def handle_exception(e): + logger.exception("Unhandled exception") + return jsonify({ + 'code': 500, + 'message': 'Internal server error' + }), 500 + +# Service level +class DocumentService: + @classmethod + def get(cls, doc_id: str) -> Document: + doc = Document.get_or_none(Document.id == doc_id) + if not doc: + raise NotFoundError(f"Document {doc_id} not found") + return doc +``` + +### 5.3 Logging Patterns + +```python +# Structured logging + +import logging +from common.log_utils import setup_logger + +logger = setup_logger(__name__) + +class DialogService: + @classmethod + def chat(cls, dialog_id: str, question: str): + logger.info( + "Chat request", + extra={ + 'dialog_id': dialog_id, + 'question_length': len(question), + 'event': 'chat_start' + } + ) + + try: + result = cls._process_chat(dialog_id, question) + logger.info( + "Chat completed", + extra={ + 'dialog_id': dialog_id, + 'chunks_retrieved': len(result['chunks']), + 'event': 'chat_complete' + } + ) + return result + except Exception as e: + logger.error( + "Chat failed", + extra={ + 'dialog_id': dialog_id, + 'error': str(e), + 'event': 'chat_error' + }, + exc_info=True + ) + raise +``` + +### 5.4 Testing Patterns + +```python +# pytest test structure + +import pytest +from unittest.mock import Mock, patch + +class TestDialogService: + """Test cases for DialogService.""" + + @pytest.fixture + def mock_dialog(self): + """Create mock dialog for testing.""" + return Mock( + id='test-dialog', + kb_ids=['kb-1'], + llm_id='openai/gpt-4' + ) + + @pytest.fixture + def mock_es(self): + """Mock Elasticsearch client.""" + with patch('api.db.services.dialog_service.es') as mock: + yield mock + + def test_retrieval_returns_chunks(self, mock_dialog, mock_es): + """Test that retrieval returns expected chunks.""" + # Arrange + mock_es.search.return_value = { + 'hits': { + 'hits': [ + {'_source': {'content': 'chunk 1'}}, + {'_source': {'content': 'chunk 2'}} + ] + } + } + + # Act + chunks = DialogService._retrieval(mock_dialog, "test query") + + # Assert + assert len(chunks) == 2 + mock_es.search.assert_called_once() + + @pytest.mark.parametrize("question,expected_chunks", [ + ("simple query", 5), + ("complex multi-word query", 10), + ]) + def test_retrieval_with_different_queries( + self, mock_dialog, mock_es, question, expected_chunks + ): + """Parameterized test for different query types.""" + # Test implementation + pass +``` + +--- + +## 6. Security Analysis + +### 6.1 Authentication Implementation + +```python +# JWT authentication + +import jwt +from functools import wraps + +def login_required(f): + """Decorator to require authentication.""" + @wraps(f) + async def decorated(*args, **kwargs): + token = request.headers.get('Authorization', '').replace('Bearer ', '') + + if not token: + return jsonify({'error': 'Token required'}), 401 + + try: + payload = jwt.decode( + token, + current_app.config['SECRET_KEY'], + algorithms=['HS256'] + ) + g.user_id = payload['user_id'] + g.tenant_id = payload['tenant_id'] + except jwt.ExpiredSignatureError: + return jsonify({'error': 'Token expired'}), 401 + except jwt.InvalidTokenError: + return jsonify({'error': 'Invalid token'}), 401 + + return await f(*args, **kwargs) + + return decorated +``` + +### 6.2 Input Validation + +```python +# Request validation + +from marshmallow import Schema, fields, validate + +class CreateKbSchema(Schema): + name = fields.Str(required=True, validate=validate.Length(min=1, max=255)) + description = fields.Str(validate=validate.Length(max=1024)) + embedding_model = fields.Str(required=True) + parser_id = fields.Str(validate=validate.OneOf(['naive', 'paper', 'book'])) + +def validate_request(schema_class): + """Decorator for request validation.""" + def decorator(f): + @wraps(f) + async def decorated(*args, **kwargs): + schema = schema_class() + try: + data = await request.get_json() + validated = schema.load(data) + g.validated_data = validated + except ValidationError as e: + return jsonify({'error': e.messages}), 400 + return await f(*args, **kwargs) + return decorated + return decorator +``` + +### 6.3 SQL Injection Prevention + +```python +# Using Peewee ORM (parameterized queries) + +# Safe - uses parameterized query +documents = Document.select().where( + Document.kb_id == kb_id, + Document.status == 'FINISHED' +) + +# Unsafe - raw SQL (avoided in codebase) +# cursor.execute(f"SELECT * FROM document WHERE kb_id = '{kb_id}'") # DON'T DO THIS +``` + +--- + +## 7. Performance Optimizations + +### 7.1 Database Optimizations + +```python +# Batch operations + +def bulk_create_chunks(chunks: List[dict]): + """Bulk insert chunks for performance.""" + with db.atomic(): + for batch in chunked(chunks, 1000): + Chunk.insert_many(batch).execute() + +# Connection pooling +from playhouse.pool import PooledMySQLDatabase + +db = PooledMySQLDatabase( + 'ragflow', + max_connections=32, + stale_timeout=300, + **connection_params +) +``` + +### 7.2 Caching Strategies + +```python +# Redis caching + +import redis +from functools import lru_cache + +redis_client = redis.Redis(host='localhost', port=6379) + +def cache_result(ttl=3600): + """Decorator for Redis caching.""" + def decorator(f): + @wraps(f) + def decorated(*args, **kwargs): + cache_key = f"{f.__name__}:{hash(str(args) + str(kwargs))}" + + cached = redis_client.get(cache_key) + if cached: + return json.loads(cached) + + result = f(*args, **kwargs) + redis_client.setex(cache_key, ttl, json.dumps(result)) + return result + return decorated + return decorator + +@cache_result(ttl=600) +def get_embedding_model_config(model_id: str): + """Cached embedding model configuration.""" + return LLMFactories.get_model_config(model_id) +``` + +### 7.3 Async Operations + +```python +# Async document processing + +import asyncio +from concurrent.futures import ThreadPoolExecutor + +async def process_documents_async(doc_ids: List[str]): + """Process multiple documents concurrently.""" + + async def process_one(doc_id: str): + pipeline = Pipeline(doc_id) + await asyncio.to_thread(pipeline.run) + + tasks = [process_one(doc_id) for doc_id in doc_ids] + await asyncio.gather(*tasks, return_exceptions=True) +``` + +--- + +## 8. Tóm Tắt Code Quality + +### Strengths + +1. **Clean Architecture**: Separation of concerns với layers rõ ràng +2. **Consistent Patterns**: Decorator, factory patterns được sử dụng nhất quán +3. **Type Hints**: TypeScript strict mode, Python type hints improving +4. **Error Handling**: Consistent error handling across layers +5. **Async Support**: Full async support với Quart +6. **Streaming**: SSE streaming cho real-time responses + +### Areas for Improvement + +1. **Test Coverage**: Cần tăng coverage (hiện ~50-60%) +2. **Documentation**: Inline docs có thể chi tiết hơn +3. **Type Hints (Python)**: Chưa hoàn toàn consistent +4. **Error Messages**: Một số error messages chưa user-friendly + +### Code Metrics Summary + +| Metric | Value | Status | +|--------|-------|--------| +| Lines of Code | ~62,000 | Large | +| Cyclomatic Complexity | Moderate | OK | +| Technical Debt | Low-Medium | Acceptable | +| Test Coverage | ~50-60% | Needs improvement | +| Documentation | Partial | Needs improvement | diff --git a/personal_analyze/README.md b/personal_analyze/README.md new file mode 100644 index 000000000..2b71e89c5 --- /dev/null +++ b/personal_analyze/README.md @@ -0,0 +1,134 @@ +# RAGFlow Analysis Documentation + +Tài liệu phân tích chi tiết về RAGFlow - Open-source RAG Engine. + +## Tổng Quan RAGFlow + +**RAGFlow** (v0.22.1) là một **Retrieval-Augmented Generation (RAG) engine** mã nguồn mở, được xây dựng dựa trên **deep document understanding**. Đây là một ứng dụng full-stack với: + +- **Backend**: Python (Flask/Quart) +- **Frontend**: React/TypeScript (UmiJS) +- **Kiến trúc**: Microservices với Docker +- **Data Stores**: MySQL, Elasticsearch/Infinity, Redis, MinIO + +## Danh Sách Tài Liệu + +| File | Nội dung | +|------|----------| +| [01_directory_structure.md](./01_directory_structure.md) | Cấu trúc cây thư mục chi tiết | +| [02_system_architecture.md](./02_system_architecture.md) | Kiến trúc hệ thống với diagrams | +| [03_sequence_diagrams.md](./03_sequence_diagrams.md) | Sequence diagrams cho các flows chính | +| [04_modules_analysis.md](./04_modules_analysis.md) | Phân tích chi tiết từng module | +| [05_tech_stack.md](./05_tech_stack.md) | Tech stack và dependencies | +| [06_source_code_analysis.md](./06_source_code_analysis.md) | Phân tích source code chi tiết | + +## Tóm Tắt Chức Năng Chính + +### 1. Document Processing +- Upload và parse nhiều định dạng (PDF, Word, Excel, PPT, HTML...) +- OCR và layout analysis cho PDF +- Intelligent chunking strategies + +### 2. RAG Pipeline +- Hybrid search (Vector + BM25) +- Multiple embedding models support +- Reranking với cross-encoder + +### 3. Chat/Dialog +- Streaming responses (SSE) +- Multi-knowledge base retrieval +- Conversation history + +### 4. Agent Workflows +- Visual canvas builder +- 15+ built-in components +- 20+ external tool integrations + +### 5. Knowledge Graph (GraphRAG) +- Entity extraction và resolution +- Graph-based retrieval +- Relationship visualization + +## Kiến Trúc High-Level + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ CLIENTS │ +│ Web App │ Mobile │ Python SDK │ REST API │ +└────────────────────────────┬────────────────────────────────────┘ + │ +┌────────────────────────────┼────────────────────────────────────┐ +│ NGINX (Gateway) │ +└────────────────────────────┬────────────────────────────────────┘ + │ +┌────────────────────────────┼────────────────────────────────────┐ +│ APPLICATION LAYER │ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ +│ │RAGFlow Server│ │ Admin Server │ │ MCP Server │ │ +│ │ (Port 9380) │ │ (Port 9381) │ │ (Port 9382) │ │ +│ └──────────────┘ └──────────────┘ └──────────────┘ │ +└────────────────────────────┬────────────────────────────────────┘ + │ +┌────────────────────────────┼────────────────────────────────────┐ +│ SERVICE LAYER │ +│ ┌────────┐ ┌────────┐ ┌────────┐ ┌────────┐ ┌────────┐ │ +│ │ RAG │ │DeepDoc │ │ Agent │ │GraphRAG│ │Services│ │ +│ │Pipeline│ │Parsers │ │ Canvas │ │ Engine │ │ Layer │ │ +│ └────────┘ └────────┘ └────────┘ └────────┘ └────────┘ │ +└────────────────────────────┬────────────────────────────────────┘ + │ +┌────────────────────────────┼────────────────────────────────────┐ +│ DATA LAYER │ +│ ┌────────┐ ┌────────────┐ ┌────────┐ ┌────────┐ │ +│ │ MySQL │ │Elasticsearch│ │ Redis │ │ MinIO │ │ +│ │(5455) │ │ (9200) │ │ (6379) │ │ (9000) │ │ +│ └────────┘ └────────────┘ └────────┘ └────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## Tech Stack Summary + +| Layer | Technologies | +|-------|-------------| +| **Frontend** | React 18, TypeScript, UmiJS, Ant Design, Tailwind CSS | +| **Backend** | Python 3.10-3.12, Flask/Quart, Peewee ORM | +| **AI/ML** | OpenAI, Sentence Transformers, Detectron2, PyTorch | +| **Database** | MySQL 8, Elasticsearch 8, Redis 7 | +| **Storage** | MinIO (S3-compatible) | +| **Infrastructure** | Docker, Nginx, Kubernetes/Helm | + +## LLM Providers Supported + +- OpenAI (GPT-3.5, GPT-4, GPT-4V) +- Anthropic (Claude 3) +- Google (Gemini) +- Alibaba (Qwen) +- Groq, Mistral, Cohere +- Ollama (local models) +- 20+ more providers + +## Data Connectors + +- Enterprise: Confluence, Notion, SharePoint, Jira +- Communication: Slack, Discord, Gmail, Teams +- Storage: Google Drive, Dropbox, S3, WebDAV + +## Quick Stats + +| Metric | Value | +|--------|-------| +| Total LOC | ~62,000+ | +| Python Files | ~300+ | +| TS/JS Files | ~400+ | +| Database Models | 25+ | +| API Endpoints | ~50+ | +| LLM Providers | 20+ | +| Data Connectors | 15+ | + +## License + +RAGFlow is open-source under Apache 2.0 license. + +--- + +*Documentation generated: 2025-11-26*