From 871055b0fc5fc98ecc5fc5bdfc90f7012320ef09 Mon Sep 17 00:00:00 2001 From: buua436 <66937541+buua436@users.noreply.github.com> Date: Thu, 13 Nov 2025 15:17:52 +0800 Subject: [PATCH 1/3] Feat:support API for generating knowledge graph and raptor (#11229) ### What problem does this PR solve? issue: [#11195](https://github.com/infiniflow/ragflow/issues/11195) change: support API for generating knowledge graph and raptor ### Type of change - [x] New Feature (non-breaking change which adds functionality) - [x] Documentation Update --- api/apps/sdk/dataset.py | 159 +++++++++++++++++- docs/references/http_api_reference.md | 231 ++++++++++++++++++++++++++ 2 files changed, 387 insertions(+), 3 deletions(-) diff --git a/api/apps/sdk/dataset.py b/api/apps/sdk/dataset.py index 8a315ce69..de5434de7 100644 --- a/api/apps/sdk/dataset.py +++ b/api/apps/sdk/dataset.py @@ -21,10 +21,11 @@ import json from flask import request from peewee import OperationalError from api.db.db_models import File -from api.db.services.document_service import DocumentService +from api.db.services.document_service import DocumentService, queue_raptor_o_graphrag_tasks from api.db.services.file2document_service import File2DocumentService from api.db.services.file_service import FileService from api.db.services.knowledgebase_service import KnowledgebaseService +from api.db.services.task_service import GRAPH_RAPTOR_FAKE_DOC_ID, TaskService from api.db.services.user_service import TenantService from common.constants import RetCode, FileSource, StatusEnum from api.utils.api_utils import ( @@ -118,7 +119,6 @@ def create(tenant_id): req, err = validate_and_parse_json_request(request, CreateDatasetReq) if err is not None: return get_error_argument_result(err) - req = KnowledgebaseService.create_with_name( name = req.pop("name", None), tenant_id = tenant_id, @@ -144,7 +144,6 @@ def create(tenant_id): ok, k = KnowledgebaseService.get_by_id(req["id"]) if not ok: return get_error_data_result(message="Dataset created failed") - response_data = remap_dictionary_keys(k.to_dict()) return get_result(data=response_data) except Exception as e: @@ -532,3 +531,157 @@ def delete_knowledge_graph(tenant_id, dataset_id): search.index_name(kb.tenant_id), dataset_id) return get_result(data=True) + + +@manager.route("/datasets//run_graphrag", methods=["POST"]) # noqa: F821 +@token_required +def run_graphrag(tenant_id,dataset_id): + if not dataset_id: + return get_error_data_result(message='Lack of "Dataset ID"') + if not KnowledgebaseService.accessible(dataset_id, tenant_id): + return get_result( + data=False, + message='No authorization.', + code=RetCode.AUTHENTICATION_ERROR + ) + + ok, kb = KnowledgebaseService.get_by_id(dataset_id) + if not ok: + return get_error_data_result(message="Invalid Dataset ID") + + task_id = kb.graphrag_task_id + if task_id: + ok, task = TaskService.get_by_id(task_id) + if not ok: + logging.warning(f"A valid GraphRAG task id is expected for Dataset {dataset_id}") + + if task and task.progress not in [-1, 1]: + return get_error_data_result(message=f"Task {task_id} in progress with status {task.progress}. A Graph Task is already running.") + + documents, _ = DocumentService.get_by_kb_id( + kb_id=dataset_id, + page_number=0, + items_per_page=0, + orderby="create_time", + desc=False, + keywords="", + run_status=[], + types=[], + suffix=[], + ) + if not documents: + return get_error_data_result(message=f"No documents in Dataset {dataset_id}") + + sample_document = documents[0] + document_ids = [document["id"] for document in documents] + + task_id = queue_raptor_o_graphrag_tasks(sample_doc_id=sample_document, ty="graphrag", priority=0, fake_doc_id=GRAPH_RAPTOR_FAKE_DOC_ID, doc_ids=list(document_ids)) + + if not KnowledgebaseService.update_by_id(kb.id, {"graphrag_task_id": task_id}): + logging.warning(f"Cannot save graphrag_task_id for Dataset {dataset_id}") + + return get_result(data={"graphrag_task_id": task_id}) + + +@manager.route("/datasets//trace_graphrag", methods=["GET"]) # noqa: F821 +@token_required +def trace_graphrag(tenant_id,dataset_id): + if not dataset_id: + return get_error_data_result(message='Lack of "Dataset ID"') + if not KnowledgebaseService.accessible(dataset_id, tenant_id): + return get_result( + data=False, + message='No authorization.', + code=RetCode.AUTHENTICATION_ERROR + ) + + ok, kb = KnowledgebaseService.get_by_id(dataset_id) + if not ok: + return get_error_data_result(message="Invalid Dataset ID") + + task_id = kb.graphrag_task_id + if not task_id: + return get_result(data={}) + + ok, task = TaskService.get_by_id(task_id) + if not ok: + return get_result(data={}) + + return get_result(data=task.to_dict()) + + +@manager.route("/datasets//run_raptor", methods=["POST"]) # noqa: F821 +@token_required +def run_raptor(tenant_id,dataset_id): + if not dataset_id: + return get_error_data_result(message='Lack of "Dataset ID"') + if not KnowledgebaseService.accessible(dataset_id, tenant_id): + return get_result( + data=False, + message='No authorization.', + code=RetCode.AUTHENTICATION_ERROR + ) + + ok, kb = KnowledgebaseService.get_by_id(dataset_id) + if not ok: + return get_error_data_result(message="Invalid Dataset ID") + + task_id = kb.raptor_task_id + if task_id: + ok, task = TaskService.get_by_id(task_id) + if not ok: + logging.warning(f"A valid RAPTOR task id is expected for Dataset {dataset_id}") + + if task and task.progress not in [-1, 1]: + return get_error_data_result(message=f"Task {task_id} in progress with status {task.progress}. A RAPTOR Task is already running.") + + documents, _ = DocumentService.get_by_kb_id( + kb_id=dataset_id, + page_number=0, + items_per_page=0, + orderby="create_time", + desc=False, + keywords="", + run_status=[], + types=[], + suffix=[], + ) + if not documents: + return get_error_data_result(message=f"No documents in Dataset {dataset_id}") + + sample_document = documents[0] + document_ids = [document["id"] for document in documents] + + task_id = queue_raptor_o_graphrag_tasks(sample_doc_id=sample_document, ty="raptor", priority=0, fake_doc_id=GRAPH_RAPTOR_FAKE_DOC_ID, doc_ids=list(document_ids)) + + if not KnowledgebaseService.update_by_id(kb.id, {"raptor_task_id": task_id}): + logging.warning(f"Cannot save raptor_task_id for Dataset {dataset_id}") + + return get_result(data={"raptor_task_id": task_id}) + + +@manager.route("/datasets//trace_raptor", methods=["GET"]) # noqa: F821 +@token_required +def trace_raptor(tenant_id,dataset_id): + if not dataset_id: + return get_error_data_result(message='Lack of "Dataset ID"') + + if not KnowledgebaseService.accessible(dataset_id, tenant_id): + return get_result( + data=False, + message='No authorization.', + code=RetCode.AUTHENTICATION_ERROR + ) + ok, kb = KnowledgebaseService.get_by_id(dataset_id) + if not ok: + return get_error_data_result(message="Invalid Dataset ID") + + task_id = kb.raptor_task_id + if not task_id: + return get_result(data={}) + + ok, task = TaskService.get_by_id(task_id) + if not ok: + return get_error_data_result(message="RAPTOR Task Not Found or Error Occurred") + + return get_result(data=task.to_dict()) \ No newline at end of file diff --git a/docs/references/http_api_reference.md b/docs/references/http_api_reference.md index f2b86a735..481614d13 100644 --- a/docs/references/http_api_reference.md +++ b/docs/references/http_api_reference.md @@ -974,6 +974,237 @@ Failure: --- +### Construct knowledge graph + +**POST** `/api/v1/datasets/{dataset_id}/run_graphrag` + +Constructs a knowledge graph from a specified dataset. + +#### Request + +- Method: POST +- URL: `/api/v1/datasets/{dataset_id}/run_graphrag` +- Headers: + - `'Authorization: Bearer '` + +##### Request example + +```bash +curl --request POST \ + --url http://{address}/api/v1/datasets/{dataset_id}/run_graphrag \ + --header 'Authorization: Bearer ' +``` + +##### Request parameters + +- `dataset_id`: (*Path parameter*) + The ID of the target dataset. + +#### Response + +Success: + +```json +{ + "code":0, + "data":{ + "graphrag_task_id":"e498de54bfbb11f0ba028f704583b57b" + } +} +``` + +Failure: + +```json +{ + "code": 102, + "message": "Invalid Dataset ID" +} +``` + +--- + +### Get knowledge graph construction status + +**GET** `/api/v1/datasets/{dataset_id}/trace_graphrag` + +Retrieves the knowledge graph construction status for a specified dataset. + +#### Request + +- Method: GET +- URL: `/api/v1/datasets/{dataset_id}/trace_graphrag` +- Headers: + - `'Authorization: Bearer '` + +##### Request example + +```bash +curl --request GET \ + --url http://{address}/api/v1/datasets/{dataset_id}/trace_graphrag \ + --header 'Authorization: Bearer ' +``` + +##### Request parameters + +- `dataset_id`: (*Path parameter*) + The ID of the target dataset. + +#### Response + +Success: + +```json +{ + "code":0, + "data":{ + "begin_at":"Wed, 12 Nov 2025 19:36:56 GMT", + "chunk_ids":"", + "create_date":"Wed, 12 Nov 2025 19:36:56 GMT", + "create_time":1762947416350, + "digest":"39e43572e3dcd84f", + "doc_id":"44661c10bde211f0bc93c164a47ffc40", + "from_page":100000000, + "id":"e498de54bfbb11f0ba028f704583b57b", + "priority":0, + "process_duration":2.45419, + "progress":1.0, + "progress_msg":"19:36:56 created task graphrag\n19:36:57 Task has been received.\n19:36:58 [GraphRAG] doc:083661febe2411f0bc79456921e5745f has no available chunks, skip generation.\n19:36:58 [GraphRAG] build_subgraph doc:44661c10bde211f0bc93c164a47ffc40 start (chunks=1, timeout=10000000000s)\n19:36:58 Graph already contains 44661c10bde211f0bc93c164a47ffc40\n19:36:58 [GraphRAG] build_subgraph doc:44661c10bde211f0bc93c164a47ffc40 empty\n19:36:58 [GraphRAG] kb:33137ed0bde211f0bc93c164a47ffc40 no subgraphs generated successfully, end.\n19:36:58 Knowledge Graph done (0.72s)","retry_count":1, + "task_type":"graphrag", + "to_page":100000000, + "update_date":"Wed, 12 Nov 2025 19:36:58 GMT", + "update_time":1762947418454 + } +} +``` + +Failure: + +```json +{ + "code": 102, + "message": "Invalid Dataset ID" +} +``` + +--- + +### Construct RAPTOR + +**POST** `/api/v1/datasets/{dataset_id}/run_raptor` + +Construct a RAPTOR from a specified dataset. + +#### Request + +- Method: POST +- URL: `/api/v1/datasets/{dataset_id}/run_raptor` +- Headers: + - `'Authorization: Bearer '` + +##### Request example + +```bash +curl --request POST \ + --url http://{address}/api/v1/datasets/{dataset_id}/run_raptor \ + --header 'Authorization: Bearer ' +``` + +##### Request parameters + +- `dataset_id`: (*Path parameter*) + The ID of the target dataset. + +#### Response + +Success: + +```json +{ + "code":0, + "data":{ + "raptor_task_id":"50d3c31cbfbd11f0ba028f704583b57b" + } +} +``` + +Failure: + +```json +{ + "code": 102, + "message": "Invalid Dataset ID" +} +``` + +--- + +### Get RAPTOR construction status + +**GET** `/api/v1/datasets/{dataset_id}/trace_raptor` + +Retrieves the RAPTOR construction status for a specified dataset. + +#### Request + +- Method: GET +- URL: `/api/v1/datasets/{dataset_id}/trace_raptor` +- Headers: + - `'Authorization: Bearer '` + +##### Request example + +```bash +curl --request GET \ + --url http://{address}/api/v1/datasets/{dataset_id}/trace_raptor \ + --header 'Authorization: Bearer ' +``` + +##### Request parameters + +- `dataset_id`: (*Path parameter*) + The ID of the target dataset. + +#### Response + +Success: + +```json +{ + "code":0, + "data":{ + "begin_at":"Wed, 12 Nov 2025 19:47:07 GMT", + "chunk_ids":"", + "create_date":"Wed, 12 Nov 2025 19:47:07 GMT", + "create_time":1762948027427, + "digest":"8b279a6248cb8fc6", + "doc_id":"44661c10bde211f0bc93c164a47ffc40", + "from_page":100000000, + "id":"50d3c31cbfbd11f0ba028f704583b57b", + "priority":0, + "process_duration":0.948244, + "progress":1.0, + "progress_msg":"19:47:07 created task raptor\n19:47:07 Task has been received.\n19:47:07 Processing...\n19:47:07 Processing...\n19:47:07 Indexing done (0.01s).\n19:47:07 Task done (0.29s)", + "retry_count":1, + "task_type":"raptor", + "to_page":100000000, + "update_date":"Wed, 12 Nov 2025 19:47:07 GMT", + "update_time":1762948027948 + } +} +``` + +Failure: + +```json +{ + "code": 102, + "message": "Invalid Dataset ID" +} +``` + +--- + ## FILE MANAGEMENT WITHIN DATASET --- From bfc84ba95bff47eb839076eb01077add64400205 Mon Sep 17 00:00:00 2001 From: Liu An Date: Thu, 13 Nov 2025 15:18:32 +0800 Subject: [PATCH 2/3] Test: handle duplicate names by appending "(1)" (#11244) ### What problem does this PR solve? - Updated tests to reflect new behavior of handling duplicate dataset names - Instead of returning an error, the system now appends "(1)" to duplicate names - This problem was introduced by PR #10960 ### Type of change - [x] Testcase update --- .../test_dataset_mangement/test_create_dataset.py | 15 ++++++++------- .../test_dataset_mangement/test_create_dataset.py | 14 ++++++-------- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/test/testcases/test_http_api/test_dataset_mangement/test_create_dataset.py b/test/testcases/test_http_api/test_dataset_mangement/test_create_dataset.py index 507570fba..559b41f3c 100644 --- a/test/testcases/test_http_api/test_dataset_mangement/test_create_dataset.py +++ b/test/testcases/test_http_api/test_dataset_mangement/test_create_dataset.py @@ -16,14 +16,15 @@ from concurrent.futures import ThreadPoolExecutor, as_completed import pytest -from common import create_dataset -from configs import DATASET_NAME_LIMIT, INVALID_API_TOKEN +from configs import DATASET_NAME_LIMIT, DEFAULT_PARSER_CONFIG, INVALID_API_TOKEN from hypothesis import example, given, settings from libs.auth import RAGFlowHttpApiAuth from utils import encode_avatar from utils.file_utils import create_image_file from utils.hypothesis_utils import valid_names -from configs import DEFAULT_PARSER_CONFIG + +from common import create_dataset + @pytest.mark.usefixtures("clear_datasets") class TestAuthorization: @@ -125,8 +126,8 @@ class TestDatasetCreate: assert res["code"] == 0, res res = create_dataset(HttpApiAuth, payload) - assert res["code"] == 103, res - assert res["message"] == f"Dataset name '{name}' already exists", res + assert res["code"] == 0, res + assert res["data"]["name"] == name + "(1)", res @pytest.mark.p3 def test_name_case_insensitive(self, HttpApiAuth): @@ -137,8 +138,8 @@ class TestDatasetCreate: payload = {"name": name.lower()} res = create_dataset(HttpApiAuth, payload) - assert res["code"] == 103, res - assert res["message"] == f"Dataset name '{name.lower()}' already exists", res + assert res["code"] == 0, res + assert res["data"]["name"] == name.lower() + "(1)", res @pytest.mark.p2 def test_avatar(self, HttpApiAuth, tmp_path): diff --git a/test/testcases/test_sdk_api/test_dataset_mangement/test_create_dataset.py b/test/testcases/test_sdk_api/test_dataset_mangement/test_create_dataset.py index 049f288b3..a97cb66c8 100644 --- a/test/testcases/test_sdk_api/test_dataset_mangement/test_create_dataset.py +++ b/test/testcases/test_sdk_api/test_dataset_mangement/test_create_dataset.py @@ -17,13 +17,13 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from operator import attrgetter import pytest -from configs import DATASET_NAME_LIMIT, HOST_ADDRESS, INVALID_API_TOKEN +from configs import DATASET_NAME_LIMIT, DEFAULT_PARSER_CONFIG, HOST_ADDRESS, INVALID_API_TOKEN from hypothesis import example, given, settings from ragflow_sdk import DataSet, RAGFlow from utils import encode_avatar from utils.file_utils import create_image_file from utils.hypothesis_utils import valid_names -from configs import DEFAULT_PARSER_CONFIG + @pytest.mark.usefixtures("clear_datasets") class TestAuthorization: @@ -95,9 +95,8 @@ class TestDatasetCreate: payload = {"name": name} client.create_dataset(**payload) - with pytest.raises(Exception) as excinfo: - client.create_dataset(**payload) - assert str(excinfo.value) == f"Dataset name '{name}' already exists", str(excinfo.value) + dataset = client.create_dataset(**payload) + assert dataset.name == name + "(1)", str(dataset) @pytest.mark.p3 def test_name_case_insensitive(self, client): @@ -106,9 +105,8 @@ class TestDatasetCreate: client.create_dataset(**payload) payload = {"name": name.lower()} - with pytest.raises(Exception) as excinfo: - client.create_dataset(**payload) - assert str(excinfo.value) == f"Dataset name '{name.lower()}' already exists", str(excinfo.value) + dataset = client.create_dataset(**payload) + assert dataset.name == name.lower() + "(1)", str(dataset) @pytest.mark.p2 def test_avatar(self, client, tmp_path): From 93422fa8cc121148dfd5ff76b1a1bdc32c42ff48 Mon Sep 17 00:00:00 2001 From: Billy Bao Date: Thu, 13 Nov 2025 15:19:02 +0800 Subject: [PATCH 3/3] Fix: Law parser (#11246) ### What problem does this PR solve? Fix: Law parser ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- rag/nlp/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index 61a3b6f3a..80acf1d8f 100644 --- a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -482,7 +482,7 @@ def tree_merge(bull, sections, depth): root = Node(level=0, depth=target_level, texts=[]) root.build_tree(lines) - return [("\n").join(element) for element in root.get_tree() if element] + return [element for element in root.get_tree() if element] def hierarchical_merge(bull, sections, depth):