diff --git a/README.md b/README.md
index d82721d98..ded81f099 100644
--- a/README.md
+++ b/README.md
@@ -86,7 +86,7 @@ Try our demo at [https://demo.ragflow.io](https://demo.ragflow.io).
## 🔥 Latest Updates
- 2025-11-19 Supports Gemini 3 Pro.
-- 2025-11-12 Supports data synchronization from Confluence, AWS S3, Discord, Google Drive.
+- 2025-11-12 Supports data synchronization from Confluence, S3, Notion, Discord, Google Drive.
- 2025-10-23 Supports MinerU & Docling as document parsing methods.
- 2025-10-15 Supports orchestrable ingestion pipeline.
- 2025-08-08 Supports OpenAI's latest GPT-5 series models.
diff --git a/README_id.md b/README_id.md
index 953fce4c5..11b09b4fb 100644
--- a/README_id.md
+++ b/README_id.md
@@ -86,7 +86,7 @@ Coba demo kami di [https://demo.ragflow.io](https://demo.ragflow.io).
## 🔥 Pembaruan Terbaru
- 2025-11-19 Mendukung Gemini 3 Pro.
-- 2025-11-12 Mendukung sinkronisasi data dari Confluence, AWS S3, Discord, Google Drive.
+- 2025-11-12 Mendukung sinkronisasi data dari Confluence, S3, Notion, Discord, Google Drive.
- 2025-10-23 Mendukung MinerU & Docling sebagai metode penguraian dokumen.
- 2025-10-15 Dukungan untuk jalur data yang terorkestrasi.
- 2025-08-08 Mendukung model seri GPT-5 terbaru dari OpenAI.
diff --git a/README_ja.md b/README_ja.md
index 7711d3ff0..5e471b5c2 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -67,7 +67,7 @@
## 🔥 最新情報
- 2025-11-19 Gemini 3 Proをサポートしています
-- 2025-11-12 Confluence、AWS S3、Discord、Google Drive からのデータ同期をサポートします。
+- 2025-11-12 Confluence、S3、Notion、Discord、Google Drive からのデータ同期をサポートします。
- 2025-10-23 ドキュメント解析方法として MinerU と Docling をサポートします。
- 2025-10-15 オーケストレーションされたデータパイプラインのサポート。
- 2025-08-08 OpenAI の最新 GPT-5 シリーズモデルをサポートします。
diff --git a/README_ko.md b/README_ko.md
index 386fd2faa..f34f23279 100644
--- a/README_ko.md
+++ b/README_ko.md
@@ -68,7 +68,7 @@
## 🔥 업데이트
- 2025-11-19 Gemini 3 Pro를 지원합니다.
-- 2025-11-12 Confluence, AWS S3, Discord, Google Drive에서 데이터 동기화를 지원합니다.
+- 2025-11-12 Confluence, S3, Notion, Discord, Google Drive에서 데이터 동기화를 지원합니다.
- 2025-10-23 문서 파싱 방법으로 MinerU 및 Docling을 지원합니다.
- 2025-10-15 조정된 데이터 파이프라인 지원.
- 2025-08-08 OpenAI의 최신 GPT-5 시리즈 모델을 지원합니다.
diff --git a/README_pt_br.md b/README_pt_br.md
index 487ec5530..71690ebb9 100644
--- a/README_pt_br.md
+++ b/README_pt_br.md
@@ -87,7 +87,7 @@ Experimente nossa demo em [https://demo.ragflow.io](https://demo.ragflow.io).
## 🔥 Últimas Atualizações
- 19-11-2025 Suporta Gemini 3 Pro.
-- 12-11-2025 Suporta a sincronização de dados do Confluence, AWS S3, Discord e Google Drive.
+- 12-11-2025 Suporta a sincronização de dados do Confluence, S3, Notion, Discord e Google Drive.
- 23-10-2025 Suporta MinerU e Docling como métodos de análise de documentos.
- 15-10-2025 Suporte para pipelines de dados orquestrados.
- 08-08-2025 Suporta a mais recente série GPT-5 da OpenAI.
diff --git a/README_tzh.md b/README_tzh.md
index eab5938e4..7756aacc8 100644
--- a/README_tzh.md
+++ b/README_tzh.md
@@ -86,7 +86,7 @@
## 🔥 近期更新
- 2025-11-19 支援 Gemini 3 Pro.
-- 2025-11-12 支援從 Confluence、AWS S3、Discord、Google Drive 進行資料同步。
+- 2025-11-12 支援從 Confluence、S3、Notion、Discord、Google Drive 進行資料同步。
- 2025-10-23 支援 MinerU 和 Docling 作為文件解析方法。
- 2025-10-15 支援可編排的資料管道。
- 2025-08-08 支援 OpenAI 最新的 GPT-5 系列模型。
diff --git a/README_zh.md b/README_zh.md
index 58394b5fd..799c3aaea 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -86,7 +86,7 @@
## 🔥 近期更新
- 2025-11-19 支持 Gemini 3 Pro.
-- 2025-11-12 支持从 Confluence、AWS S3、Discord、Google Drive 进行数据同步。
+- 2025-11-12 支持从 Confluence、S3、Notion、Discord、Google Drive 进行数据同步。
- 2025-10-23 支持 MinerU 和 Docling 作为文档解析方法。
- 2025-10-15 支持可编排的数据管道。
- 2025-08-08 支持 OpenAI 最新的 GPT-5 系列模型。
diff --git a/agent/component/iteration.py b/agent/component/iteration.py
index cff09d622..ae5c0b677 100644
--- a/agent/component/iteration.py
+++ b/agent/component/iteration.py
@@ -32,7 +32,7 @@ class IterationParam(ComponentParamBase):
def __init__(self):
super().__init__()
self.items_ref = ""
- self.veriable={}
+ self.variable={}
def get_input_form(self) -> dict[str, dict]:
return {
diff --git a/agent/tools/retrieval.py b/agent/tools/retrieval.py
index c3a01e517..fd9096cb6 100644
--- a/agent/tools/retrieval.py
+++ b/agent/tools/retrieval.py
@@ -137,7 +137,7 @@ class Retrieval(ToolBase, ABC):
if not doc_ids:
doc_ids = None
elif self._param.meta_data_filter.get("method") == "manual":
- filters=self._param.meta_data_filter["manual"]
+ filters = self._param.meta_data_filter["manual"]
for flt in filters:
pat = re.compile(self.variable_ref_patt)
s = flt["value"]
@@ -166,8 +166,8 @@ class Retrieval(ToolBase, ABC):
out_parts.append(s[last:])
flt["value"] = "".join(out_parts)
doc_ids.extend(meta_filter(metas, filters, self._param.meta_data_filter.get("logic", "and")))
- if not doc_ids:
- doc_ids = None
+ if filters and not doc_ids:
+ doc_ids = ["-999"]
if self._param.cross_languages:
query = cross_languages(kbs[0].tenant_id, None, query, self._param.cross_languages)
diff --git a/api/apps/__init__.py b/api/apps/__init__.py
index a53f67c06..a6e33c13b 100644
--- a/api/apps/__init__.py
+++ b/api/apps/__init__.py
@@ -24,7 +24,7 @@ from flasgger import Swagger
from itsdangerous.url_safe import URLSafeTimedSerializer as Serializer
from quart_cors import cors
from common.constants import StatusEnum
-from api.db.db_models import close_connection
+from api.db.db_models import close_connection, APIToken
from api.db.services import UserService
from api.utils.json_encode import CustomJSONEncoder
from api.utils import commands
@@ -124,6 +124,10 @@ def _load_user():
user = UserService.query(
access_token=access_token, status=StatusEnum.VALID.value
)
+ if not user and len(authorization.split()) == 2:
+ objs = APIToken.query(token=authorization.split()[1])
+ if objs:
+ user = UserService.query(id=objs[0].tenant_id, status=StatusEnum.VALID.value)
if user:
if not user[0].access_token or not user[0].access_token.strip():
logging.warning(f"User {user[0].email} has empty access_token in database")
diff --git a/api/apps/chunk_app.py b/api/apps/chunk_app.py
index e121bcba7..b43fb9af1 100644
--- a/api/apps/chunk_app.py
+++ b/api/apps/chunk_app.py
@@ -311,8 +311,8 @@ async def retrieval_test():
doc_ids = None
elif meta_data_filter.get("method") == "manual":
doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
- if not doc_ids:
- doc_ids = None
+ if meta_data_filter["manual"] and not doc_ids:
+ doc_ids = ["-999"]
try:
tenants = UserTenantService.query(user_id=current_user.id)
diff --git a/api/apps/sdk/doc.py b/api/apps/sdk/doc.py
index 84300ac3c..52acebc43 100644
--- a/api/apps/sdk/doc.py
+++ b/api/apps/sdk/doc.py
@@ -1434,6 +1434,7 @@ async def retrieval_test(tenant_id):
question = req["question"]
doc_ids = req.get("document_ids", [])
use_kg = req.get("use_kg", False)
+ toc_enhance = req.get("toc_enhance", False)
langs = req.get("cross_languages", [])
if not isinstance(doc_ids, list):
return get_error_data_result("`documents` should be a list")
@@ -1445,6 +1446,8 @@ async def retrieval_test(tenant_id):
metadata_condition = req.get("metadata_condition", {}) or {}
metas = DocumentService.get_meta_by_kbs(kb_ids)
doc_ids = meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and"))
+ if metadata_condition and not doc_ids:
+ doc_ids = ["-999"]
similarity_threshold = float(req.get("similarity_threshold", 0.2))
vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
top = int(req.get("top_k", 1024))
@@ -1485,6 +1488,11 @@ async def retrieval_test(tenant_id):
highlight=highlight,
rank_feature=label_question(question, kbs),
)
+ if toc_enhance:
+ chat_mdl = LLMBundle(kb.tenant_id, LLMType.CHAT)
+ cks = settings.retriever.retrieval_by_toc(question, ranks["chunks"], tenant_ids, chat_mdl, size)
+ if cks:
+ ranks["chunks"] = cks
if use_kg:
ck = settings.kg_retriever.retrieval(question, [k.tenant_id for k in kbs], kb_ids, embd_mdl, LLMBundle(kb.tenant_id, LLMType.CHAT))
if ck["content_with_weight"]:
diff --git a/api/apps/sdk/session.py b/api/apps/sdk/session.py
index 533375622..074401ede 100644
--- a/api/apps/sdk/session.py
+++ b/api/apps/sdk/session.py
@@ -446,8 +446,8 @@ async def agent_completions(tenant_id, agent_id):
if req.get("stream", True):
- def generate():
- for answer in agent_completion(tenant_id=tenant_id, agent_id=agent_id, **req):
+ async def generate():
+ async for answer in agent_completion(tenant_id=tenant_id, agent_id=agent_id, **req):
if isinstance(answer, str):
try:
ans = json.loads(answer[5:]) # remove "data:"
@@ -471,7 +471,7 @@ async def agent_completions(tenant_id, agent_id):
full_content = ""
reference = {}
final_ans = ""
- for answer in agent_completion(tenant_id=tenant_id, agent_id=agent_id, **req):
+ async for answer in agent_completion(tenant_id=tenant_id, agent_id=agent_id, **req):
try:
ans = json.loads(answer[5:])
@@ -873,7 +873,7 @@ async def agent_bot_completions(agent_id):
resp.headers.add_header("Content-Type", "text/event-stream; charset=utf-8")
return resp
- for answer in agent_completion(objs[0].tenant_id, agent_id, **req):
+ async for answer in agent_completion(objs[0].tenant_id, agent_id, **req):
return get_result(data=answer)
@@ -981,8 +981,8 @@ async def retrieval_test_embedded():
doc_ids = None
elif meta_data_filter.get("method") == "manual":
doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
- if not doc_ids:
- doc_ids = None
+ if meta_data_filter["manual"] and not doc_ids:
+ doc_ids = ["-999"]
try:
tenants = UserTenantService.query(user_id=tenant_id)
diff --git a/api/db/services/dialog_service.py b/api/db/services/dialog_service.py
index db878574d..0a09ea532 100644
--- a/api/db/services/dialog_service.py
+++ b/api/db/services/dialog_service.py
@@ -415,9 +415,10 @@ def chat(dialog, messages, stream=True, **kwargs):
if not attachments:
attachments = None
elif dialog.meta_data_filter.get("method") == "manual":
- attachments.extend(meta_filter(metas, dialog.meta_data_filter["manual"], dialog.meta_data_filter.get("logic", "and")))
- if not attachments:
- attachments = None
+ conds = dialog.meta_data_filter["manual"]
+ attachments.extend(meta_filter(metas, conds, dialog.meta_data_filter.get("logic", "and")))
+ if conds and not attachments:
+ attachments = ["-999"]
if prompt_config.get("keyword", False):
questions[-1] += keyword_extraction(chat_mdl, questions[-1])
@@ -787,8 +788,8 @@ def ask(question, kb_ids, tenant_id, chat_llm_name=None, search_config={}):
doc_ids = None
elif meta_data_filter.get("method") == "manual":
doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
- if not doc_ids:
- doc_ids = None
+ if meta_data_filter["manual"] and not doc_ids:
+ doc_ids = ["-999"]
kbinfos = retriever.retrieval(
question=question,
@@ -862,8 +863,8 @@ def gen_mindmap(question, kb_ids, tenant_id, search_config={}):
doc_ids = None
elif meta_data_filter.get("method") == "manual":
doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
- if not doc_ids:
- doc_ids = None
+ if meta_data_filter["manual"] and not doc_ids:
+ doc_ids = ["-999"]
ranks = settings.retriever.retrieval(
question=question,
diff --git a/common/data_source/notion_connector.py b/common/data_source/notion_connector.py
index 8c6a522ad..e29bbbe76 100644
--- a/common/data_source/notion_connector.py
+++ b/common/data_source/notion_connector.py
@@ -1,38 +1,45 @@
+import html
import logging
from collections.abc import Generator
+from datetime import datetime, timezone
+from pathlib import Path
from typing import Any, Optional
+from urllib.parse import urlparse
+
from retry import retry
from common.data_source.config import (
INDEX_BATCH_SIZE,
- DocumentSource, NOTION_CONNECTOR_DISABLE_RECURSIVE_PAGE_LOOKUP
+ NOTION_CONNECTOR_DISABLE_RECURSIVE_PAGE_LOOKUP,
+ DocumentSource,
+)
+from common.data_source.exceptions import (
+ ConnectorMissingCredentialError,
+ ConnectorValidationError,
+ CredentialExpiredError,
+ InsufficientPermissionsError,
+ UnexpectedValidationError,
)
from common.data_source.interfaces import (
LoadConnector,
PollConnector,
- SecondsSinceUnixEpoch
+ SecondsSinceUnixEpoch,
)
from common.data_source.models import (
Document,
- TextSection, GenerateDocumentsOutput
-)
-from common.data_source.exceptions import (
- ConnectorValidationError,
- CredentialExpiredError,
- InsufficientPermissionsError,
- UnexpectedValidationError, ConnectorMissingCredentialError
-)
-from common.data_source.models import (
- NotionPage,
+ GenerateDocumentsOutput,
NotionBlock,
- NotionSearchResponse
+ NotionPage,
+ NotionSearchResponse,
+ TextSection,
)
from common.data_source.utils import (
- rl_requests,
batch_generator,
+ datetime_from_string,
fetch_notion_data,
+ filter_pages_by_time,
properties_to_str,
- filter_pages_by_time, datetime_from_string
+ rl_requests,
)
@@ -61,11 +68,9 @@ class NotionConnector(LoadConnector, PollConnector):
self.recursive_index_enabled = recursive_index_enabled or bool(root_page_id)
@retry(tries=3, delay=1, backoff=2)
- def _fetch_child_blocks(
- self, block_id: str, cursor: Optional[str] = None
- ) -> dict[str, Any] | None:
+ def _fetch_child_blocks(self, block_id: str, cursor: Optional[str] = None) -> dict[str, Any] | None:
"""Fetch all child blocks via the Notion API."""
- logging.debug(f"Fetching children of block with ID '{block_id}'")
+ logging.debug(f"[Notion]: Fetching children of block with ID {block_id}")
block_url = f"https://api.notion.com/v1/blocks/{block_id}/children"
query_params = {"start_cursor": cursor} if cursor else None
@@ -79,49 +84,42 @@ class NotionConnector(LoadConnector, PollConnector):
response.raise_for_status()
return response.json()
except Exception as e:
- if hasattr(e, 'response') and e.response.status_code == 404:
- logging.error(
- f"Unable to access block with ID '{block_id}'. "
- f"This is likely due to the block not being shared with the integration."
- )
+ if hasattr(e, "response") and e.response.status_code == 404:
+ logging.error(f"[Notion]: Unable to access block with ID {block_id}. This is likely due to the block not being shared with the integration.")
return None
else:
- logging.exception(f"Error fetching blocks: {e}")
+ logging.exception(f"[Notion]: Error fetching blocks: {e}")
raise
@retry(tries=3, delay=1, backoff=2)
def _fetch_page(self, page_id: str) -> NotionPage:
"""Fetch a page from its ID via the Notion API."""
- logging.debug(f"Fetching page for ID '{page_id}'")
+ logging.debug(f"[Notion]: Fetching page for ID {page_id}")
page_url = f"https://api.notion.com/v1/pages/{page_id}"
try:
data = fetch_notion_data(page_url, self.headers, "GET")
return NotionPage(**data)
except Exception as e:
- logging.warning(f"Failed to fetch page, trying database for ID '{page_id}': {e}")
+ logging.warning(f"[Notion]: Failed to fetch page, trying database for ID {page_id}: {e}")
return self._fetch_database_as_page(page_id)
@retry(tries=3, delay=1, backoff=2)
def _fetch_database_as_page(self, database_id: str) -> NotionPage:
"""Attempt to fetch a database as a page."""
- logging.debug(f"Fetching database for ID '{database_id}' as a page")
+ logging.debug(f"[Notion]: Fetching database for ID {database_id} as a page")
database_url = f"https://api.notion.com/v1/databases/{database_id}"
data = fetch_notion_data(database_url, self.headers, "GET")
database_name = data.get("title")
- database_name = (
- database_name[0].get("text", {}).get("content") if database_name else None
- )
+ database_name = database_name[0].get("text", {}).get("content") if database_name else None
return NotionPage(**data, database_name=database_name)
@retry(tries=3, delay=1, backoff=2)
- def _fetch_database(
- self, database_id: str, cursor: Optional[str] = None
- ) -> dict[str, Any]:
+ def _fetch_database(self, database_id: str, cursor: Optional[str] = None) -> dict[str, Any]:
"""Fetch a database from its ID via the Notion API."""
- logging.debug(f"Fetching database for ID '{database_id}'")
+ logging.debug(f"[Notion]: Fetching database for ID {database_id}")
block_url = f"https://api.notion.com/v1/databases/{database_id}/query"
body = {"start_cursor": cursor} if cursor else None
@@ -129,17 +127,12 @@ class NotionConnector(LoadConnector, PollConnector):
data = fetch_notion_data(block_url, self.headers, "POST", body)
return data
except Exception as e:
- if hasattr(e, 'response') and e.response.status_code in [404, 400]:
- logging.error(
- f"Unable to access database with ID '{database_id}'. "
- f"This is likely due to the database not being shared with the integration."
- )
+ if hasattr(e, "response") and e.response.status_code in [404, 400]:
+ logging.error(f"[Notion]: Unable to access database with ID {database_id}. This is likely due to the database not being shared with the integration.")
return {"results": [], "next_cursor": None}
raise
- def _read_pages_from_database(
- self, database_id: str
- ) -> tuple[list[NotionBlock], list[str]]:
+ def _read_pages_from_database(self, database_id: str) -> tuple[list[NotionBlock], list[str]]:
"""Returns a list of top level blocks and all page IDs in the database."""
result_blocks: list[NotionBlock] = []
result_pages: list[str] = []
@@ -158,10 +151,10 @@ class NotionConnector(LoadConnector, PollConnector):
if self.recursive_index_enabled:
if obj_type == "page":
- logging.debug(f"Found page with ID '{obj_id}' in database '{database_id}'")
+ logging.debug(f"[Notion]: Found page with ID {obj_id} in database {database_id}")
result_pages.append(result["id"])
elif obj_type == "database":
- logging.debug(f"Found database with ID '{obj_id}' in database '{database_id}'")
+ logging.debug(f"[Notion]: Found database with ID {obj_id} in database {database_id}")
_, child_pages = self._read_pages_from_database(obj_id)
result_pages.extend(child_pages)
@@ -172,44 +165,229 @@ class NotionConnector(LoadConnector, PollConnector):
return result_blocks, result_pages
- def _read_blocks(self, base_block_id: str) -> tuple[list[NotionBlock], list[str]]:
- """Reads all child blocks for the specified block, returns blocks and child page ids."""
+ def _extract_rich_text(self, rich_text_array: list[dict[str, Any]]) -> str:
+ collected_text: list[str] = []
+ for rich_text in rich_text_array:
+ content = ""
+ r_type = rich_text.get("type")
+
+ if r_type == "equation":
+ expr = rich_text.get("equation", {}).get("expression")
+ if expr:
+ content = expr
+ elif r_type == "mention":
+ mention = rich_text.get("mention", {}) or {}
+ mention_type = mention.get("type")
+ mention_value = mention.get(mention_type, {}) if mention_type else {}
+ if mention_type == "date":
+ start = mention_value.get("start")
+ end = mention_value.get("end")
+ if start and end:
+ content = f"{start} - {end}"
+ elif start:
+ content = start
+ elif mention_type in {"page", "database"}:
+ content = mention_value.get("id", rich_text.get("plain_text", ""))
+ elif mention_type == "link_preview":
+ content = mention_value.get("url", rich_text.get("plain_text", ""))
+ else:
+ content = rich_text.get("plain_text", "") or str(mention_value)
+ else:
+ if rich_text.get("plain_text"):
+ content = rich_text["plain_text"]
+ elif "text" in rich_text and rich_text["text"].get("content"):
+ content = rich_text["text"]["content"]
+
+ href = rich_text.get("href")
+ if content and href:
+ content = f"{content} ({href})"
+
+ if content:
+ collected_text.append(content)
+
+ return "".join(collected_text).strip()
+
+ def _build_table_html(self, table_block_id: str) -> str | None:
+ rows: list[str] = []
+ cursor = None
+ while True:
+ data = self._fetch_child_blocks(table_block_id, cursor)
+ if data is None:
+ break
+
+ for result in data["results"]:
+ if result.get("type") != "table_row":
+ continue
+ cells_html: list[str] = []
+ for cell in result["table_row"].get("cells", []):
+ cell_text = self._extract_rich_text(cell)
+ cell_html = html.escape(cell_text) if cell_text else ""
+                    cells_html.append(f"<td>{cell_html}</td>")
+                rows.append(f"<tr>{''.join(cells_html)}</tr>")
+
+ if data.get("next_cursor") is None:
+ break
+ cursor = data["next_cursor"]
+
+ if not rows:
+ return None
+        return "<table>\n" + "\n".join(rows) + "\n</table>"
+
+ def _download_file(self, url: str) -> bytes | None:
+ try:
+ response = rl_requests.get(url, timeout=60)
+ response.raise_for_status()
+ return response.content
+ except Exception as exc:
+ logging.warning(f"[Notion]: Failed to download Notion file from {url}: {exc}")
+ return None
+
+ def _extract_file_metadata(self, result_obj: dict[str, Any], block_id: str) -> tuple[str | None, str, str | None]:
+ file_source_type = result_obj.get("type")
+ file_source = result_obj.get(file_source_type, {}) if file_source_type else {}
+ url = file_source.get("url")
+
+ name = result_obj.get("name") or file_source.get("name")
+ if url and not name:
+ parsed_name = Path(urlparse(url).path).name
+ name = parsed_name or f"notion_file_{block_id}"
+ elif not name:
+ name = f"notion_file_{block_id}"
+
+ caption = self._extract_rich_text(result_obj.get("caption", [])) if "caption" in result_obj else None
+
+ return url, name, caption
+
+ def _build_attachment_document(
+ self,
+ block_id: str,
+ url: str,
+ name: str,
+ caption: Optional[str],
+ page_last_edited_time: Optional[str],
+ ) -> Document | None:
+ file_bytes = self._download_file(url)
+ if file_bytes is None:
+ return None
+
+ extension = Path(name).suffix or Path(urlparse(url).path).suffix or ".bin"
+ if extension and not extension.startswith("."):
+ extension = f".{extension}"
+ if not extension:
+ extension = ".bin"
+
+ updated_at = datetime_from_string(page_last_edited_time) if page_last_edited_time else datetime.now(timezone.utc)
+ semantic_identifier = caption or name or f"Notion file {block_id}"
+
+ return Document(
+ id=block_id,
+ blob=file_bytes,
+ source=DocumentSource.NOTION,
+ semantic_identifier=semantic_identifier,
+ extension=extension,
+ size_bytes=len(file_bytes),
+ doc_updated_at=updated_at,
+ )
+
+ def _read_blocks(self, base_block_id: str, page_last_edited_time: Optional[str] = None) -> tuple[list[NotionBlock], list[str], list[Document]]:
result_blocks: list[NotionBlock] = []
child_pages: list[str] = []
+ attachments: list[Document] = []
cursor = None
while True:
data = self._fetch_child_blocks(base_block_id, cursor)
if data is None:
- return result_blocks, child_pages
+ return result_blocks, child_pages, attachments
for result in data["results"]:
- logging.debug(f"Found child block for block with ID '{base_block_id}': {result}")
+ logging.debug(f"[Notion]: Found child block for block with ID {base_block_id}: {result}")
result_block_id = result["id"]
result_type = result["type"]
result_obj = result[result_type]
if result_type in ["ai_block", "unsupported", "external_object_instance_page"]:
- logging.warning(f"Skipping unsupported block type '{result_type}'")
+ logging.warning(f"[Notion]: Skipping unsupported block type {result_type}")
+ continue
+
+ if result_type == "table":
+ table_html = self._build_table_html(result_block_id)
+ if table_html:
+ result_blocks.append(
+ NotionBlock(
+ id=result_block_id,
+ text=table_html,
+ prefix="\n\n",
+ )
+ )
+ continue
+
+ if result_type == "equation":
+ expr = result_obj.get("expression")
+ if expr:
+ result_blocks.append(
+ NotionBlock(
+ id=result_block_id,
+ text=expr,
+ prefix="\n",
+ )
+ )
continue
cur_result_text_arr = []
if "rich_text" in result_obj:
- for rich_text in result_obj["rich_text"]:
- if "text" in rich_text:
- text = rich_text["text"]["content"]
- cur_result_text_arr.append(text)
+ text = self._extract_rich_text(result_obj["rich_text"])
+ if text:
+ cur_result_text_arr.append(text)
+
+ if result_type == "bulleted_list_item":
+ if cur_result_text_arr:
+ cur_result_text_arr[0] = f"- {cur_result_text_arr[0]}"
+ else:
+ cur_result_text_arr = ["- "]
+
+ if result_type == "numbered_list_item":
+ if cur_result_text_arr:
+ cur_result_text_arr[0] = f"1. {cur_result_text_arr[0]}"
+ else:
+ cur_result_text_arr = ["1. "]
+
+ if result_type == "to_do":
+ checked = result_obj.get("checked")
+ checkbox_prefix = "[x]" if checked else "[ ]"
+ if cur_result_text_arr:
+ cur_result_text_arr = [f"{checkbox_prefix} {cur_result_text_arr[0]}"] + cur_result_text_arr[1:]
+ else:
+ cur_result_text_arr = [checkbox_prefix]
+
+ if result_type in {"file", "image", "pdf", "video", "audio"}:
+ file_url, file_name, caption = self._extract_file_metadata(result_obj, result_block_id)
+ if file_url:
+ attachment_doc = self._build_attachment_document(
+ block_id=result_block_id,
+ url=file_url,
+ name=file_name,
+ caption=caption,
+ page_last_edited_time=page_last_edited_time,
+ )
+ if attachment_doc:
+ attachments.append(attachment_doc)
+
+ attachment_label = caption or file_name
+ if attachment_label:
+ cur_result_text_arr.append(f"{result_type.capitalize()}: {attachment_label}")
if result["has_children"]:
if result_type == "child_page":
child_pages.append(result_block_id)
else:
- logging.debug(f"Entering sub-block: {result_block_id}")
- subblocks, subblock_child_pages = self._read_blocks(result_block_id)
- logging.debug(f"Finished sub-block: {result_block_id}")
+ logging.debug(f"[Notion]: Entering sub-block: {result_block_id}")
+ subblocks, subblock_child_pages, subblock_attachments = self._read_blocks(result_block_id, page_last_edited_time)
+ logging.debug(f"[Notion]: Finished sub-block: {result_block_id}")
result_blocks.extend(subblocks)
child_pages.extend(subblock_child_pages)
+ attachments.extend(subblock_attachments)
if result_type == "child_database":
inner_blocks, inner_child_pages = self._read_pages_from_database(result_block_id)
@@ -231,7 +409,7 @@ class NotionConnector(LoadConnector, PollConnector):
cursor = data["next_cursor"]
- return result_blocks, child_pages
+ return result_blocks, child_pages, attachments
def _read_page_title(self, page: NotionPage) -> Optional[str]:
"""Extracts the title from a Notion page."""
@@ -245,9 +423,7 @@ class NotionConnector(LoadConnector, PollConnector):
return None
- def _read_pages(
- self, pages: list[NotionPage]
- ) -> Generator[Document, None, None]:
+ def _read_pages(self, pages: list[NotionPage], start: SecondsSinceUnixEpoch | None = None, end: SecondsSinceUnixEpoch | None = None) -> Generator[Document, None, None]:
"""Reads pages for rich text content and generates Documents."""
all_child_page_ids: list[str] = []
@@ -255,11 +431,17 @@ class NotionConnector(LoadConnector, PollConnector):
if isinstance(page, dict):
page = NotionPage(**page)
if page.id in self.indexed_pages:
- logging.debug(f"Already indexed page with ID '{page.id}'. Skipping.")
+ logging.debug(f"[Notion]: Already indexed page with ID {page.id}. Skipping.")
continue
- logging.info(f"Reading page with ID '{page.id}', with url {page.url}")
- page_blocks, child_page_ids = self._read_blocks(page.id)
+ if start is not None and end is not None:
+ page_ts = datetime_from_string(page.last_edited_time).timestamp()
+ if not (page_ts > start and page_ts <= end):
+ logging.debug(f"[Notion]: Skipping page {page.id} outside polling window.")
+ continue
+
+ logging.info(f"[Notion]: Reading page with ID {page.id}, with url {page.url}")
+ page_blocks, child_page_ids, attachment_docs = self._read_blocks(page.id, page.last_edited_time)
all_child_page_ids.extend(child_page_ids)
self.indexed_pages.add(page.id)
@@ -268,14 +450,12 @@ class NotionConnector(LoadConnector, PollConnector):
if not page_blocks:
if not raw_page_title:
- logging.warning(f"No blocks OR title found for page with ID '{page.id}'. Skipping.")
+ logging.warning(f"[Notion]: No blocks OR title found for page with ID {page.id}. Skipping.")
continue
text = page_title
if page.properties:
- text += "\n\n" + "\n".join(
- [f"{key}: {value}" for key, value in page.properties.items()]
- )
+ text += "\n\n" + "\n".join([f"{key}: {value}" for key, value in page.properties.items()])
sections = [TextSection(link=page.url, text=text)]
else:
sections = [
@@ -286,45 +466,39 @@ class NotionConnector(LoadConnector, PollConnector):
for block in page_blocks
]
- blob = ("\n".join([sec.text for sec in sections])).encode("utf-8")
+ joined_text = "\n".join(sec.text for sec in sections)
+ blob = joined_text.encode("utf-8")
yield Document(
- id=page.id,
- blob=blob,
- source=DocumentSource.NOTION,
- semantic_identifier=page_title,
- extension=".txt",
- size_bytes=len(blob),
- doc_updated_at=datetime_from_string(page.last_edited_time)
+ id=page.id, blob=blob, source=DocumentSource.NOTION, semantic_identifier=page_title, extension=".txt", size_bytes=len(blob), doc_updated_at=datetime_from_string(page.last_edited_time)
)
+ for attachment_doc in attachment_docs:
+ yield attachment_doc
+
if self.recursive_index_enabled and all_child_page_ids:
for child_page_batch_ids in batch_generator(all_child_page_ids, INDEX_BATCH_SIZE):
- child_page_batch = [
- self._fetch_page(page_id)
- for page_id in child_page_batch_ids
- if page_id not in self.indexed_pages
- ]
- yield from self._read_pages(child_page_batch)
+ child_page_batch = [self._fetch_page(page_id) for page_id in child_page_batch_ids if page_id not in self.indexed_pages]
+ yield from self._read_pages(child_page_batch, start, end)
@retry(tries=3, delay=1, backoff=2)
def _search_notion(self, query_dict: dict[str, Any]) -> NotionSearchResponse:
"""Search for pages from a Notion database."""
- logging.debug(f"Searching for pages in Notion with query_dict: {query_dict}")
+ logging.debug(f"[Notion]: Searching for pages in Notion with query_dict: {query_dict}")
data = fetch_notion_data("https://api.notion.com/v1/search", self.headers, "POST", query_dict)
return NotionSearchResponse(**data)
- def _recursive_load(self) -> Generator[list[Document], None, None]:
+ def _recursive_load(self, start: SecondsSinceUnixEpoch | None = None, end: SecondsSinceUnixEpoch | None = None) -> Generator[list[Document], None, None]:
"""Recursively load pages starting from root page ID."""
if self.root_page_id is None or not self.recursive_index_enabled:
raise RuntimeError("Recursive page lookup is not enabled")
- logging.info(f"Recursively loading pages from Notion based on root page with ID: {self.root_page_id}")
+ logging.info(f"[Notion]: Recursively loading pages from Notion based on root page with ID: {self.root_page_id}")
pages = [self._fetch_page(page_id=self.root_page_id)]
- yield from batch_generator(self._read_pages(pages), self.batch_size)
+ yield from batch_generator(self._read_pages(pages, start, end), self.batch_size)
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
"""Applies integration token to headers."""
- self.headers["Authorization"] = f'Bearer {credentials["notion_integration_token"]}'
+ self.headers["Authorization"] = f"Bearer {credentials['notion_integration_token']}"
return None
def load_from_state(self) -> GenerateDocumentsOutput:
@@ -348,12 +522,10 @@ class NotionConnector(LoadConnector, PollConnector):
else:
break
- def poll_source(
- self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
- ) -> GenerateDocumentsOutput:
+ def poll_source(self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch) -> GenerateDocumentsOutput:
"""Poll Notion for updated pages within a time period."""
if self.recursive_index_enabled and self.root_page_id:
- yield from self._recursive_load()
+ yield from self._recursive_load(start, end)
return
query_dict = {
@@ -367,7 +539,7 @@ class NotionConnector(LoadConnector, PollConnector):
pages = filter_pages_by_time(db_res.results, start, end, "last_edited_time")
if pages:
- yield from batch_generator(self._read_pages(pages), self.batch_size)
+ yield from batch_generator(self._read_pages(pages, start, end), self.batch_size)
if db_res.has_more:
query_dict["start_cursor"] = db_res.next_cursor
else:
diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py
index 6d8431c82..f6613c2f5 100644
--- a/deepdoc/parser/pdf_parser.py
+++ b/deepdoc/parser/pdf_parser.py
@@ -1091,7 +1091,7 @@ class RAGFlowPdfParser:
logging.debug("Images converted.")
self.is_english = [
- re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i])))))
+ re.search(r"[ a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i])))))
for i in range(len(self.page_chars))
]
if sum([1 if e else 0 for e in self.is_english]) > len(self.page_images) / 2:
@@ -1148,7 +1148,7 @@ class RAGFlowPdfParser:
if not self.is_english and not any([c for c in self.page_chars]) and self.boxes:
bxes = [b for bxs in self.boxes for b in bxs]
- self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))
+ self.is_english = re.search(r"[ \na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))
logging.debug(f"Is it English: {self.is_english}")
diff --git a/docs/references/http_api_reference.md b/docs/references/http_api_reference.md
index bc1b15670..253745432 100644
--- a/docs/references/http_api_reference.md
+++ b/docs/references/http_api_reference.md
@@ -2072,6 +2072,7 @@ Retrieves chunks from specified datasets.
- `"cross_languages"`: `list[string]`
- `"metadata_condition"`: `object`
- `"use_kg"`: `boolean`
+ - `"toc_enhance"`: `boolean`
##### Request example
```bash
@@ -2122,6 +2123,8 @@ curl --request POST \
The number of chunks engaged in vector cosine computation. Defaults to `1024`.
- `"use_kg"`: (*Body parameter*), `boolean`
The search includes text chunks related to the knowledge graph of the selected dataset to handle complex multi-hop queries. Defaults to `False`.
+- `"toc_enhance"`: (*Body parameter*), `boolean`
+ The search includes table of contents enhancement to boost the rank of relevant chunks. Files parsed with `TOC Enhance` enabled are a prerequisite. Defaults to `False`.
- `"rerank_id"`: (*Body parameter*), `integer`
The ID of the rerank model.
- `"keyword"`: (*Body parameter*), `boolean`
@@ -2136,6 +2139,9 @@ curl --request POST \
The languages that should be translated into, in order to achieve keywords retrievals in different languages.
- `"metadata_condition"`: (*Body parameter*), `object`
The metadata condition used for filtering chunks:
+ - `"logic"`: (*Body parameter*), `string`
+ - `"and"`: Intersection of the results from each condition (default).
+ - `"or"`: Union of the results from each condition.
- `"conditions"`: (*Body parameter*), `array`
A list of metadata filter conditions.
- `"name"`: `string` - The metadata field name to filter by, e.g., `"author"`, `"company"`, `"url"`. Ensure this parameter before use. See [Set metadata](../guides/dataset/set_metadata.md) for details.
diff --git a/rag/app/book.py b/rag/app/book.py
index 5ea28d40d..5bdaec72d 100644
--- a/rag/app/book.py
+++ b/rag/app/book.py
@@ -113,6 +113,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
lang = lang,
callback = callback,
pdf_cls = Pdf,
+ layout_recognizer = layout_recognizer,
**kwargs
)
diff --git a/rag/app/laws.py b/rag/app/laws.py
index dd97e4e3a..ba2592833 100644
--- a/rag/app/laws.py
+++ b/rag/app/laws.py
@@ -172,6 +172,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
lang = lang,
callback = callback,
pdf_cls = Pdf,
+ layout_recognizer = layout_recognizer,
**kwargs
)
diff --git a/rag/app/manual.py b/rag/app/manual.py
index 124864041..b3a4ae38d 100644
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@@ -213,6 +213,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
lang = lang,
callback = callback,
pdf_cls = Pdf,
+ layout_recognizer = layout_recognizer,
parse_method = "manual",
**kwargs
)
diff --git a/rag/app/one.py b/rag/app/one.py
index 5574aaa51..7cd1bb785 100644
--- a/rag/app/one.py
+++ b/rag/app/one.py
@@ -99,6 +99,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
lang = lang,
callback = callback,
pdf_cls = Pdf,
+ layout_recognizer = layout_recognizer,
**kwargs
)
diff --git a/rag/app/presentation.py b/rag/app/presentation.py
index cd1d308ec..6a872528f 100644
--- a/rag/app/presentation.py
+++ b/rag/app/presentation.py
@@ -142,6 +142,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
lang = lang,
callback = callback,
pdf_cls = Pdf,
+ layout_recognizer = layout_recognizer,
**kwargs
)
diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py
index f61019377..add454ade 100644
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@@ -437,16 +437,16 @@ def not_title(txt):
return re.search(r"[,;,。;!!]", txt)
def tree_merge(bull, sections, depth):
-
+
if not sections or bull < 0:
return sections
if isinstance(sections[0], type("")):
sections = [(s, "") for s in sections]
-
+
# filter out position information in pdf sections
sections = [(t, o) for t, o in sections if
t and len(t.split("@")[0].strip()) > 1 and not re.match(r"[0-9]+$", t.split("@")[0].strip())]
-
+
def get_level(bull, section):
text, layout = section
text = re.sub(r"\u3000", " ", text).strip()
@@ -465,7 +465,7 @@ def tree_merge(bull, sections, depth):
level, text = get_level(bull, section)
if not text.strip("\n"):
continue
-
+
lines.append((level, text))
level_set.add(level)
@@ -608,6 +608,26 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。;
cks[-1] += t
tk_nums[-1] += tnum
+ custom_delimiters = [m.group(1) for m in re.finditer(r"`([^`]+)`", delimiter)]
+ has_custom = bool(custom_delimiters)
+ if has_custom:
+ custom_pattern = "|".join(re.escape(t) for t in sorted(set(custom_delimiters), key=len, reverse=True))
+ cks, tk_nums = [], []
+ for sec, pos in sections:
+ split_sec = re.split(r"(%s)" % custom_pattern, sec, flags=re.DOTALL)
+ for sub_sec in split_sec:
+ if re.fullmatch(custom_pattern, sub_sec or ""):
+ continue
+ text = "\n" + sub_sec
+ local_pos = pos
+ if num_tokens_from_string(text) < 8:
+ local_pos = ""
+ if local_pos and text.find(local_pos) < 0:
+ text += local_pos
+ cks.append(text)
+ tk_nums.append(num_tokens_from_string(text))
+ return cks
+
dels = get_delimiters(delimiter)
for sec, pos in sections:
if num_tokens_from_string(sec) < chunk_token_num:
@@ -657,6 +677,29 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
result_images[-1] = concat_img(result_images[-1], image)
tk_nums[-1] += tnum
+ custom_delimiters = [m.group(1) for m in re.finditer(r"`([^`]+)`", delimiter)]
+ has_custom = bool(custom_delimiters)
+ if has_custom:
+ custom_pattern = "|".join(re.escape(t) for t in sorted(set(custom_delimiters), key=len, reverse=True))
+ cks, result_images, tk_nums = [], [], []
+ for text, image in zip(texts, images):
+ text_str = text[0] if isinstance(text, tuple) else text
+ text_pos = text[1] if isinstance(text, tuple) and len(text) > 1 else ""
+ split_sec = re.split(r"(%s)" % custom_pattern, text_str)
+ for sub_sec in split_sec:
+ if re.fullmatch(custom_pattern, sub_sec or ""):
+ continue
+ text_seg = "\n" + sub_sec
+ local_pos = text_pos
+ if num_tokens_from_string(text_seg) < 8:
+ local_pos = ""
+ if local_pos and text_seg.find(local_pos) < 0:
+ text_seg += local_pos
+ cks.append(text_seg)
+ result_images.append(image)
+ tk_nums.append(num_tokens_from_string(text_seg))
+ return cks, result_images
+
dels = get_delimiters(delimiter)
for text, image in zip(texts, images):
# if text is tuple, unpack it
@@ -748,6 +791,23 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"):
images[-1] = concat_img(images[-1], image)
tk_nums[-1] += tnum
+ custom_delimiters = [m.group(1) for m in re.finditer(r"`([^`]+)`", delimiter)]
+ has_custom = bool(custom_delimiters)
+ if has_custom:
+ custom_pattern = "|".join(re.escape(t) for t in sorted(set(custom_delimiters), key=len, reverse=True))
+ cks, images, tk_nums = [], [], []
+ pattern = r"(%s)" % custom_pattern
+ for sec, image in sections:
+ split_sec = re.split(pattern, sec)
+ for sub_sec in split_sec:
+ if not sub_sec or re.fullmatch(custom_pattern, sub_sec):
+ continue
+ text_seg = "\n" + sub_sec
+ cks.append(text_seg)
+ images.append(image)
+ tk_nums.append(num_tokens_from_string(text_seg))
+ return cks, images
+
dels = get_delimiters(delimiter)
pattern = r"(%s)" % dels
@@ -789,7 +849,7 @@ class Node:
self.level = level
self.depth = depth
self.texts = texts or []
- self.children = []
+ self.children = []
def add_child(self, child_node):
self.children.append(child_node)
@@ -835,7 +895,7 @@ class Node:
return self
def get_tree(self):
- tree_list = []
+ tree_list = []
self._dfs(self, tree_list, [])
return tree_list
@@ -860,7 +920,7 @@ class Node:
# A leaf title within depth emits its title path as a chunk (header-only section)
elif not child and (1 <= level <= self.depth):
tree_list.append("\n".join(path_titles))
-
+
# Recurse into children with the updated title path
for c in child:
- self._dfs(c, tree_list, path_titles)
\ No newline at end of file
+ self._dfs(c, tree_list, path_titles)
diff --git a/web/src/components/back-button/index.tsx b/web/src/components/back-button/index.tsx
index c790d6882..118042128 100644
--- a/web/src/components/back-button/index.tsx
+++ b/web/src/components/back-button/index.tsx
@@ -29,7 +29,10 @@ const BackButton: React.FC = ({
return (