From a7d40e91328e35cf15d1ceacc5792a8332ff7f44 Mon Sep 17 00:00:00 2001 From: Jin Hai Date: Wed, 3 Dec 2025 18:32:15 +0800 Subject: [PATCH 1/3] Update since 'File manager' is renamed to 'File' (#11698) ### What problem does this PR solve? Update some docs and comments, since 'File manager' is rename to 'File' ### Type of change - [x] Documentation Update - [x] Refactoring --------- Signed-off-by: Jin Hai Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com> --- api/apps/sdk/files.py | 20 +++++++++---------- docker/.env | 2 +- .../agent/agent_component_reference/begin.mdx | 2 +- docs/guides/chat/start_chat.md | 2 +- .../dataset/configure_knowledge_base.md | 8 ++++---- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/api/apps/sdk/files.py b/api/apps/sdk/files.py index fde3befa8..981d3975e 100644 --- a/api/apps/sdk/files.py +++ b/api/apps/sdk/files.py @@ -39,7 +39,7 @@ async def upload(tenant_id): Upload a file to the system. --- tags: - - File Management + - File security: - ApiKeyAuth: [] parameters: @@ -155,7 +155,7 @@ async def create(tenant_id): Create a new file or folder. --- tags: - - File Management + - File security: - ApiKeyAuth: [] parameters: @@ -233,7 +233,7 @@ async def list_files(tenant_id): List files under a specific folder. --- tags: - - File Management + - File security: - ApiKeyAuth: [] parameters: @@ -325,7 +325,7 @@ async def get_root_folder(tenant_id): Get user's root folder. --- tags: - - File Management + - File security: - ApiKeyAuth: [] responses: @@ -361,7 +361,7 @@ async def get_parent_folder(): Get parent folder info of a file. --- tags: - - File Management + - File security: - ApiKeyAuth: [] parameters: @@ -406,7 +406,7 @@ async def get_all_parent_folders(tenant_id): Get all parent folders of a file. --- tags: - - File Management + - File security: - ApiKeyAuth: [] parameters: @@ -454,7 +454,7 @@ async def rm(tenant_id): Delete one or multiple files/folders. --- tags: - - File Management + - File security: - ApiKeyAuth: [] parameters: @@ -528,7 +528,7 @@ async def rename(tenant_id): Rename a file. --- tags: - - File Management + - File security: - ApiKeyAuth: [] parameters: @@ -589,7 +589,7 @@ async def get(tenant_id, file_id): Download a file. --- tags: - - File Management + - File security: - ApiKeyAuth: [] produces: @@ -637,7 +637,7 @@ async def move(tenant_id): Move one or multiple files to another folder. --- tags: - - File Management + - File security: - ApiKeyAuth: [] parameters: diff --git a/docker/.env b/docker/.env index 6423b7824..3d90d2c55 100644 --- a/docker/.env +++ b/docker/.env @@ -170,7 +170,7 @@ TZ=Asia/Shanghai # Uncomment the following line if your operating system is MacOS: # MACOS=1 -# The maximum file size limit (in bytes) for each upload to your knowledge base or File Management. +# The maximum file size limit (in bytes) for each upload to your dataset or RAGFlow's File system. # To change the 1GB file size limit, uncomment the line below and update as needed. # MAX_CONTENT_LENGTH=1073741824 # After updating, ensure `client_max_body_size` in nginx/nginx.conf is updated accordingly. diff --git a/docs/guides/agent/agent_component_reference/begin.mdx b/docs/guides/agent/agent_component_reference/begin.mdx index 597d93905..c265bd2c6 100644 --- a/docs/guides/agent/agent_component_reference/begin.mdx +++ b/docs/guides/agent/agent_component_reference/begin.mdx @@ -76,5 +76,5 @@ No. Files uploaded to an agent as input are not stored in a dataset and hence wi There is no _specific_ file size limit for a file uploaded to an agent. However, note that model providers typically have a default or explicit maximum token setting, which can range from 8196 to 128k: The plain text part of the uploaded file will be passed in as the key value, but if the file's token count exceeds this limit, the string will be truncated and incomplete. :::tip NOTE -The variables `MAX_CONTENT_LENGTH` in `/docker/.env` and `client_max_body_size` in `/docker/nginx/nginx.conf` set the file size limit for each upload to a dataset or **File Management**. These settings DO NOT apply in this scenario. +The variables `MAX_CONTENT_LENGTH` in `/docker/.env` and `client_max_body_size` in `/docker/nginx/nginx.conf` set the file size limit for each upload to a dataset or RAGFlow's File system. These settings DO NOT apply in this scenario. ::: diff --git a/docs/guides/chat/start_chat.md b/docs/guides/chat/start_chat.md index e31c6b408..1e0dd0f10 100644 --- a/docs/guides/chat/start_chat.md +++ b/docs/guides/chat/start_chat.md @@ -9,7 +9,7 @@ Initiate an AI-powered chat with a configured chat assistant. --- -Knowledge base, hallucination-free chat, and file management are the three pillars of RAGFlow. Chats in RAGFlow are based on a particular dataset or multiple datasets. Once you have created your dataset, finished file parsing, and [run a retrieval test](../dataset/run_retrieval_test.md), you can go ahead and start an AI conversation. +Chats in RAGFlow are based on a particular dataset or multiple datasets. Once you have created your dataset, finished file parsing, and [run a retrieval test](../dataset/run_retrieval_test.md), you can go ahead and start an AI conversation. ## Start an AI chat diff --git a/docs/guides/dataset/configure_knowledge_base.md b/docs/guides/dataset/configure_knowledge_base.md index 8f0443244..464a5ca36 100644 --- a/docs/guides/dataset/configure_knowledge_base.md +++ b/docs/guides/dataset/configure_knowledge_base.md @@ -5,7 +5,7 @@ slug: /configure_knowledge_base # Configure dataset -Most of RAGFlow's chat assistants and Agents are based on datasets. Each of RAGFlow's datasets serves as a knowledge source, *parsing* files uploaded from your local machine and file references generated in **File Management** into the real 'knowledge' for future AI chats. This guide demonstrates some basic usages of the dataset feature, covering the following topics: +Most of RAGFlow's chat assistants and Agents are based on datasets. Each of RAGFlow's datasets serves as a knowledge source, *parsing* files uploaded from your local machine and file references generated in RAGFlow's File system into the real 'knowledge' for future AI chats. This guide demonstrates some basic usages of the dataset feature, covering the following topics: - Create a dataset - Configure a dataset @@ -82,10 +82,10 @@ Some embedding models are optimized for specific languages, so performance may b ### Upload file -- RAGFlow's **File Management** allows you to link a file to multiple datasets, in which case each target dataset holds a reference to the file. +- RAGFlow's File system allows you to link a file to multiple datasets, in which case each target dataset holds a reference to the file. - In **Knowledge Base**, you are also given the option of uploading a single file or a folder of files (bulk upload) from your local machine to a dataset, in which case the dataset holds file copies. -While uploading files directly to a dataset seems more convenient, we *highly* recommend uploading files to **File Management** and then linking them to the target datasets. This way, you can avoid permanently deleting files uploaded to the dataset. +While uploading files directly to a dataset seems more convenient, we *highly* recommend uploading files to RAGFlow's File system and then linking them to the target datasets. This way, you can avoid permanently deleting files uploaded to the dataset. ### Parse file @@ -142,6 +142,6 @@ As of RAGFlow v0.22.1, the search feature is still in a rudimentary form, suppor You are allowed to delete a dataset. Hover your mouse over the three dot of the intended dataset card and the **Delete** option appears. Once you delete a dataset, the associated folder under **root/.knowledge** directory is AUTOMATICALLY REMOVED. The consequence is: - The files uploaded directly to the dataset are gone; -- The file references, which you created from within **File Management**, are gone, but the associated files still exist in **File Management**. +- The file references, which you created from within RAGFlow's File system, are gone, but the associated files still exist. ![delete dataset](https://raw.githubusercontent.com/infiniflow/ragflow-docs/main/images/delete_datasets.jpg) From a3c940221891dafd7c145ec43d7c65fc91496d9d Mon Sep 17 00:00:00 2001 From: hsparks-codes <32576329+hsparks-codes@users.noreply.github.com> Date: Wed, 3 Dec 2025 06:17:47 -0500 Subject: [PATCH 2/3] Feat: confluence space key (#11706) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # PR Description: Add Space Key Configuration for Confluence Data Source ### What problem does this PR solve? This PR addresses issue #11638 where users requested the ability to specify Confluence Space Keys when configuring a Confluence data source connector. **Problem:** Currently, the RAGFlow UI for Confluence data sources only provides fields for: - Username - Access Token - Wiki Base URL - Is Cloud checkbox There is no way to specify which Confluence space(s) to sync, causing RAGFlow to attempt syncing all accessible spaces. This is problematic for users who: - Only want to index specific spaces (e.g., only the HR or Documentation space) - Have access to many spaces but only need a subset - Want to avoid unnecessary data transfer and processing **Solution:** The backend `ConfluenceConnector` class already supports a `space` parameter in its `__init__()` method (line 1282 in `common/data_source/confluence_connector.py`), but this parameter was never exposed in the UI. This PR adds the missing UI field to allow users to configure space filtering. **User Impact:** Users can now: - Leave the field empty to sync all accessible spaces (default behavior) - Specify a single space key (e.g., `DEV`) - Specify multiple space keys separated by commas (e.g., `DEV,DOCS,HR`) This gives users fine-grained control over which Confluence content gets indexed into their RAGFlow knowledge base. Fixes #11638 ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- ## Implementation Details ### Changes Made **1. Frontend UI (`web/src/pages/user-setting/data-source/contant.tsx`)** - Added "Space Key" text input field to Confluence configuration form - Field is optional (not required) - Positioned after "Is Cloud" checkbox for logical grouping - Added to initial values with empty string default **2. Internationalization (`web/src/locales/*.ts`)** - **English (`en.ts`)**: Added `confluenceSpaceKeyTip` with clear instructions and examples - **Chinese (`zh.ts`)**: Added Chinese translation for the tooltip - **Russian (`ru.ts`)**: Added Russian translation for the tooltip - **Bonus Fix**: Removed duplicate `deleteModal` object in `zh.ts` that was causing TypeScript lint errors ### Backend Compatibility No backend changes were needed! The `ConfluenceConnector` class already supports the `space` parameter: ```python def __init__( self, wiki_base: str, is_cloud: bool, space: str = "", # ← Already supported! page_id: str = "", index_recursively: bool = False, cql_query: str | None = None, ... ) ``` The connector uses this parameter to filter the CQL query (line 1328-1330): ```python elif space: uri_safe_space = quote(space) base_cql_page_query += f" and space='{uri_safe_space}'" ``` ### User Experience **Before:** - Users could only sync ALL accessible spaces - No UI option to limit scope **After:** - Users see "Space Key" field with helpful tooltip - Tooltip explains: - Optional field (leave empty for all spaces) - Single space example: `DEV` - Multiple spaces example: `DEV,DOCS,HR` - Available in English, Chinese, and Russian ### Future Enhancements Potential improvements for future PRs: - Add validation to check if space key exists before saving - Add autocomplete/dropdown to show available spaces - Add UI hints about space key format requirements - Support for page_id filtering (already supported in backend) --- ## Related Issues - Fixes #11638 - [Confluence] How to specify Space Key when adding Confluence data source? --- web/src/locales/en.ts | 2 ++ web/src/locales/ru.ts | 2 ++ web/src/locales/zh.ts | 13 ++----------- web/src/pages/user-setting/data-source/contant.tsx | 8 ++++++++ 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/web/src/locales/en.ts b/web/src/locales/en.ts index 9091833f5..479cafb64 100644 --- a/web/src/locales/en.ts +++ b/web/src/locales/en.ts @@ -714,6 +714,8 @@ This auto-tagging feature enhances retrieval by adding another layer of domain-s 'Check if this is a Confluence Cloud instance, uncheck for Confluence Server/Data Center', confluenceWikiBaseUrlTip: 'The base URL of your Confluence instance (e.g., https://your-domain.atlassian.net/wiki)', + confluenceSpaceKeyTip: + 'Optional: Specify a space key to limit syncing to a specific space. Leave empty to sync all accessible spaces. For multiple spaces, separate with commas (e.g., DEV,DOCS,HR)', s3PrefixTip: `Specify the folder path within your S3 bucket to fetch files from. Example: general/v2/`, S3CompatibleEndpointUrlTip: `Required for S3 compatible Storage Box. Specify the S3-compatible endpoint URL. diff --git a/web/src/locales/ru.ts b/web/src/locales/ru.ts index bbdfcc278..6c6141abc 100644 --- a/web/src/locales/ru.ts +++ b/web/src/locales/ru.ts @@ -711,6 +711,8 @@ export default { 'Отметьте, если это экземпляр Confluence Cloud, снимите для Confluence Server/Data Center', confluenceWikiBaseUrlTip: 'Базовый URL вашего экземпляра Confluence (например, https://your-domain.atlassian.net/wiki)', + confluenceSpaceKeyTip: + 'Необязательно: Укажите ключ пространства для синхронизации только определенного пространства. Оставьте пустым для синхронизации всех доступных пространств. Для нескольких пространств разделите запятыми (например, DEV,DOCS,HR)', s3PrefixTip: `Укажите путь к папке в вашем S3 бакете для получения файлов. Пример: general/v2/`, S3CompatibleEndpointUrlTip: `Требуется для S3 совместимого Storage Box. Укажите URL конечной точки, совместимой с S3. diff --git a/web/src/locales/zh.ts b/web/src/locales/zh.ts index baf1c75d9..4179557c3 100644 --- a/web/src/locales/zh.ts +++ b/web/src/locales/zh.ts @@ -701,6 +701,8 @@ General:实体和关系提取提示来自 GitHub - microsoft/graphrag:基于 '检查这是否是 Confluence Cloud 实例,如果是 Confluence 服务/数据中心,则取消选中。', confluenceWikiBaseUrlTip: 'Confluence Wiki 的基础 URL(例如 https://your-domain.atlassian.net/wiki)', + confluenceSpaceKeyTip: + '可选:指定空间键以限制同步到特定空间。留空则同步所有可访问的空间。多个空间请用逗号分隔(例如:DEV,DOCS,HR)', s3PrefixTip: `指定 S3 存储桶内的文件夹路径,用于读取文件。 示例:general/v2/`, addDataSourceModalTital: '创建你的 {{name}} 链接', @@ -1903,16 +1905,5 @@ Tokenizer 会根据所选方式将内容存储为对应的数据结构。`, searchTitle: '尚未创建搜索应用', addNow: '立即添加', }, - - deleteModal: { - delAgent: '删除智能体', - delDataset: '删除知识库', - delSearch: '删除搜索', - delFile: '删除文件', - delFiles: '删除文件', - delFilesContent: '已选择 {{count}} 个文件', - delChat: '删除聊天', - delMember: '删除成员', - }, }, }; diff --git a/web/src/pages/user-setting/data-source/contant.tsx b/web/src/pages/user-setting/data-source/contant.tsx index f1359860a..db2392711 100644 --- a/web/src/pages/user-setting/data-source/contant.tsx +++ b/web/src/pages/user-setting/data-source/contant.tsx @@ -230,6 +230,13 @@ export const DataSourceFormFields = { required: false, tooltip: t('setting.confluenceIsCloudTip'), }, + { + label: 'Space Key', + name: 'config.space', + type: FormFieldType.Text, + required: false, + tooltip: t('setting.confluenceSpaceKeyTip'), + }, ], [DataSourceKey.GOOGLE_DRIVE]: [ { @@ -563,6 +570,7 @@ export const DataSourceFormDefaultValues = { config: { wiki_base: '', is_cloud: true, + space: '', credentials: { confluence_username: '', confluence_access_token: '', From 3c224c817b4a5c19cd26a35049866e00b5bbefda Mon Sep 17 00:00:00 2001 From: David Eberto Domenech Castillo Date: Wed, 3 Dec 2025 05:44:20 -0600 Subject: [PATCH 3/3] Fix: Correct pagination and early termination bugs in chunk_list() (#11692) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary This PR fixes two critical bugs in `chunk_list()` method that prevent processing large documents (>128 chunks) in GraphRAG and other workflows. ## Bugs Fixed ### Bug 1: Incorrect pagination offset calculation **Location:** `rag/nlp/search.py` lines 530-531 **Problem:** The loop variable `p` was used directly as offset, causing incorrect pagination: ```python # BEFORE (BUGGY): for p in range(offset, max_count, bs): # p = 0, 128, 256, 384... es_res = self.dataStore.search(..., p, bs, ...) # p used as offset Fix: Use page number multiplied by batch size: # AFTER (FIXED): for page_num, p in enumerate(range(offset, max_count, bs)): es_res = self.dataStore.search(..., page_num * bs, bs, ...) Bug 2: Premature loop termination Location: rag/nlp/search.py lines 538-539 Problem: Loop terminates when any page returns fewer than 128 chunks, even when thousands more remain: # BEFORE (BUGGY): if len(dict_chunks.values()) < bs: # Breaks at 126 chunks even if 3,000+ remain break Fix: Only terminate when zero chunks returned: # AFTER (FIXED): if len(dict_chunks.values()) == 0: break Enhancement: Add max_count parameter to GraphRAG Location: graphrag/general/index.py line 60 Added max_count=10000 parameter to chunk loading for both LightRAG and General GraphRAG paths to ensure all chunks are processed. Testing Validated with a 314-page legal document containing 3,207 chunks: Before fixes: - Only 2-126 chunks processed - GraphRAG generated 25 nodes, 8 edges After fixes: - All 3,209 chunks processed ✅ - GraphRAG processing complete dataset Impact These bugs affect any workflow using chunk_list() with large documents, particularly: - GraphRAG knowledge graph generation - RAPTOR hierarchical summarization - Document processing pipelines with >128 chunks Related Issue Fixes #11687 Checklist - Code follows project style guidelines - Tested with large documents (3,207+ chunks) - Both bugs validated by Dosu bot in issue #11687 - No breaking changes to API --------- Co-authored-by: Kevin Hu --- graphrag/general/index.py | 12 +++++++++--- rag/nlp/search.py | 3 ++- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/graphrag/general/index.py b/graphrag/general/index.py index 12b39400e..f307e5d91 100644 --- a/graphrag/general/index.py +++ b/graphrag/general/index.py @@ -57,7 +57,7 @@ async def run_graphrag( start = trio.current_time() tenant_id, kb_id, doc_id = row["tenant_id"], str(row["kb_id"]), row["doc_id"] chunks = [] - for d in settings.retriever.chunk_list(doc_id, tenant_id, [kb_id], fields=["content_with_weight", "doc_id"], sort_by_position=True): + for d in settings.retriever.chunk_list(doc_id, tenant_id, [kb_id], max_count=10000, fields=["content_with_weight", "doc_id"], sort_by_position=True): chunks.append(d["content_with_weight"]) with trio.fail_after(max(120, len(chunks) * 60 * 10) if enable_timeout_assertion else 10000000000): @@ -174,13 +174,19 @@ async def run_graphrag_for_kb( chunks = [] current_chunk = "" - for d in settings.retriever.chunk_list( + # DEBUG: Obtener todos los chunks primero + raw_chunks = list(settings.retriever.chunk_list( doc_id, tenant_id, [kb_id], + max_count=10000, # FIX: Aumentar límite para procesar todos los chunks fields=fields_for_chunks, sort_by_position=True, - ): + )) + + callback(msg=f"[DEBUG] chunk_list() returned {len(raw_chunks)} raw chunks for doc {doc_id}") + + for d in raw_chunks: content = d["content_with_weight"] if num_tokens_from_string(current_chunk + content) < 1024: current_chunk += content diff --git a/rag/nlp/search.py b/rag/nlp/search.py index 6cf3200b0..1ca70f678 100644 --- a/rag/nlp/search.py +++ b/rag/nlp/search.py @@ -537,7 +537,8 @@ class Dealer: doc["id"] = id if dict_chunks: res.extend(dict_chunks.values()) - if len(dict_chunks.values()) < bs: + # FIX: Solo terminar si no hay chunks, no si hay menos de bs + if len(dict_chunks.values()) == 0: break return res