diff --git a/README.md b/README.md index d82721d98..ded81f099 100644 --- a/README.md +++ b/README.md @@ -86,7 +86,7 @@ Try our demo at [https://demo.ragflow.io](https://demo.ragflow.io). ## 🔥 Latest Updates - 2025-11-19 Supports Gemini 3 Pro. -- 2025-11-12 Supports data synchronization from Confluence, AWS S3, Discord, Google Drive. +- 2025-11-12 Supports data synchronization from Confluence, S3, Notion, Discord, Google Drive. - 2025-10-23 Supports MinerU & Docling as document parsing methods. - 2025-10-15 Supports orchestrable ingestion pipeline. - 2025-08-08 Supports OpenAI's latest GPT-5 series models. diff --git a/README_id.md b/README_id.md index 953fce4c5..11b09b4fb 100644 --- a/README_id.md +++ b/README_id.md @@ -86,7 +86,7 @@ Coba demo kami di [https://demo.ragflow.io](https://demo.ragflow.io). ## 🔥 Pembaruan Terbaru - 2025-11-19 Mendukung Gemini 3 Pro. -- 2025-11-12 Mendukung sinkronisasi data dari Confluence, AWS S3, Discord, Google Drive. +- 2025-11-12 Mendukung sinkronisasi data dari Confluence, S3, Notion, Discord, Google Drive. - 2025-10-23 Mendukung MinerU & Docling sebagai metode penguraian dokumen. - 2025-10-15 Dukungan untuk jalur data yang terorkestrasi. - 2025-08-08 Mendukung model seri GPT-5 terbaru dari OpenAI. diff --git a/README_ja.md b/README_ja.md index 7711d3ff0..5e471b5c2 100644 --- a/README_ja.md +++ b/README_ja.md @@ -67,7 +67,7 @@ ## 🔥 最新情報 - 2025-11-19 Gemini 3 Proをサポートしています -- 2025-11-12 Confluence、AWS S3、Discord、Google Drive からのデータ同期をサポートします。 +- 2025-11-12 Confluence、S3、Notion、Discord、Google Drive からのデータ同期をサポートします。 - 2025-10-23 ドキュメント解析方法として MinerU と Docling をサポートします。 - 2025-10-15 オーケストレーションされたデータパイプラインのサポート。 - 2025-08-08 OpenAI の最新 GPT-5 シリーズモデルをサポートします。 diff --git a/README_ko.md b/README_ko.md index 386fd2faa..f34f23279 100644 --- a/README_ko.md +++ b/README_ko.md @@ -68,7 +68,7 @@ ## 🔥 업데이트 - 2025-11-19 Gemini 3 Pro를 지원합니다. -- 2025-11-12 Confluence, AWS S3, Discord, Google Drive에서 데이터 동기화를 지원합니다. 
+- 2025-11-12 Confluence, S3, Notion, Discord, Google Drive에서 데이터 동기화를 지원합니다. - 2025-10-23 문서 파싱 방법으로 MinerU 및 Docling을 지원합니다. - 2025-10-15 조정된 데이터 파이프라인 지원. - 2025-08-08 OpenAI의 최신 GPT-5 시리즈 모델을 지원합니다. diff --git a/README_pt_br.md b/README_pt_br.md index 487ec5530..71690ebb9 100644 --- a/README_pt_br.md +++ b/README_pt_br.md @@ -87,7 +87,7 @@ Experimente nossa demo em [https://demo.ragflow.io](https://demo.ragflow.io). ## 🔥 Últimas Atualizações - 19-11-2025 Suporta Gemini 3 Pro. -- 12-11-2025 Suporta a sincronização de dados do Confluence, AWS S3, Discord e Google Drive. +- 12-11-2025 Suporta a sincronização de dados do Confluence, S3, Notion, Discord e Google Drive. - 23-10-2025 Suporta MinerU e Docling como métodos de análise de documentos. - 15-10-2025 Suporte para pipelines de dados orquestrados. - 08-08-2025 Suporta a mais recente série GPT-5 da OpenAI. diff --git a/README_tzh.md b/README_tzh.md index eab5938e4..7756aacc8 100644 --- a/README_tzh.md +++ b/README_tzh.md @@ -86,7 +86,7 @@ ## 🔥 近期更新 - 2025-11-19 支援 Gemini 3 Pro. -- 2025-11-12 支援從 Confluence、AWS S3、Discord、Google Drive 進行資料同步。 +- 2025-11-12 支援從 Confluence、S3、Notion、Discord、Google Drive 進行資料同步。 - 2025-10-23 支援 MinerU 和 Docling 作為文件解析方法。 - 2025-10-15 支援可編排的資料管道。 - 2025-08-08 支援 OpenAI 最新的 GPT-5 系列模型。 diff --git a/README_zh.md b/README_zh.md index 58394b5fd..799c3aaea 100644 --- a/README_zh.md +++ b/README_zh.md @@ -86,7 +86,7 @@ ## 🔥 近期更新 - 2025-11-19 支持 Gemini 3 Pro. 
-- 2025-11-12 支持从 Confluence、AWS S3、Discord、Google Drive 进行数据同步。 +- 2025-11-12 支持从 Confluence、S3、Notion、Discord、Google Drive 进行数据同步。 - 2025-10-23 支持 MinerU 和 Docling 作为文档解析方法。 - 2025-10-15 支持可编排的数据管道。 - 2025-08-08 支持 OpenAI 最新的 GPT-5 系列模型。 diff --git a/agent/component/iteration.py b/agent/component/iteration.py index cff09d622..ae5c0b677 100644 --- a/agent/component/iteration.py +++ b/agent/component/iteration.py @@ -32,7 +32,7 @@ class IterationParam(ComponentParamBase): def __init__(self): super().__init__() self.items_ref = "" - self.veriable={} + self.variable={} def get_input_form(self) -> dict[str, dict]: return { diff --git a/api/apps/__init__.py b/api/apps/__init__.py index a53f67c06..a6e33c13b 100644 --- a/api/apps/__init__.py +++ b/api/apps/__init__.py @@ -24,7 +24,7 @@ from flasgger import Swagger from itsdangerous.url_safe import URLSafeTimedSerializer as Serializer from quart_cors import cors from common.constants import StatusEnum -from api.db.db_models import close_connection +from api.db.db_models import close_connection, APIToken from api.db.services import UserService from api.utils.json_encode import CustomJSONEncoder from api.utils import commands @@ -124,6 +124,10 @@ def _load_user(): user = UserService.query( access_token=access_token, status=StatusEnum.VALID.value ) + if not user and len(authorization.split()) == 2: + objs = APIToken.query(token=authorization.split()[1]) + if objs: + user = UserService.query(id=objs[0].tenant_id, status=StatusEnum.VALID.value) if user: if not user[0].access_token or not user[0].access_token.strip(): logging.warning(f"User {user[0].email} has empty access_token in database") diff --git a/api/apps/sdk/doc.py b/api/apps/sdk/doc.py index 30fbd835e..52acebc43 100644 --- a/api/apps/sdk/doc.py +++ b/api/apps/sdk/doc.py @@ -1434,6 +1434,7 @@ async def retrieval_test(tenant_id): question = req["question"] doc_ids = req.get("document_ids", []) use_kg = req.get("use_kg", False) + toc_enhance = req.get("toc_enhance", 
False) langs = req.get("cross_languages", []) if not isinstance(doc_ids, list): return get_error_data_result("`documents` should be a list") @@ -1487,6 +1488,11 @@ async def retrieval_test(tenant_id): highlight=highlight, rank_feature=label_question(question, kbs), ) + if toc_enhance: + chat_mdl = LLMBundle(kb.tenant_id, LLMType.CHAT) + cks = settings.retriever.retrieval_by_toc(question, ranks["chunks"], tenant_ids, chat_mdl, size) + if cks: + ranks["chunks"] = cks if use_kg: ck = settings.kg_retriever.retrieval(question, [k.tenant_id for k in kbs], kb_ids, embd_mdl, LLMBundle(kb.tenant_id, LLMType.CHAT)) if ck["content_with_weight"]: diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index 6d8431c82..f6613c2f5 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -1091,7 +1091,7 @@ class RAGFlowPdfParser: logging.debug("Images converted.") self.is_english = [ - re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) + re.search(r"[ a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in range(len(self.page_chars)) ] if sum([1 if e else 0 for e in self.is_english]) > len(self.page_images) / 2: @@ -1148,7 +1148,7 @@ class RAGFlowPdfParser: if not self.is_english and not any([c for c in self.page_chars]) and self.boxes: bxes = [b for bxs in self.boxes for b in bxs] - self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))])) + self.is_english = re.search(r"[ \na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))])) logging.debug(f"Is it English: {self.is_english}") diff --git a/docs/references/http_api_reference.md 
b/docs/references/http_api_reference.md index bc1b15670..253745432 100644 --- a/docs/references/http_api_reference.md +++ b/docs/references/http_api_reference.md @@ -2072,6 +2072,7 @@ Retrieves chunks from specified datasets. - `"cross_languages"`: `list[string]` - `"metadata_condition"`: `object` - `"use_kg"`: `boolean` + - `"toc_enhance"`: `boolean` ##### Request example ```bash @@ -2122,6 +2123,8 @@ curl --request POST \ The number of chunks engaged in vector cosine computation. Defaults to `1024`. - `"use_kg"`: (*Body parameter*), `boolean` The search includes text chunks related to the knowledge graph of the selected dataset to handle complex multi-hop queries. Defaults to `False`. +- `"toc_enhance"`: (*Body parameter*), `boolean` + The search applies table-of-contents enhancement to boost the rank of relevant chunks. Requires files that were parsed with `TOC Enhance` enabled. Defaults to `False`. - `"rerank_id"`: (*Body parameter*), `integer` The ID of the rerank model. - `"keyword"`: (*Body parameter*), `boolean` @@ -2136,6 +2139,9 @@ curl --request POST \ The languages that should be translated into, in order to achieve keywords retrievals in different languages. - `"metadata_condition"`: (*Body parameter*), `object` The metadata condition used for filtering chunks: + - `"logic"`: (*Body parameter*), `string` + - `"and"`: Intersection of the results from each condition (default). + - `"or"`: Union of the results from each condition. - `"conditions"`: (*Body parameter*), `array` A list of metadata filter conditions. - `"name"`: `string` - The metadata field name to filter by, e.g., `"author"`, `"company"`, `"url"`. Ensure this parameter before use. See [Set metadata](../guides/dataset/set_metadata.md) for details. 
diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index f61019377..add454ade 100644 --- a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -437,16 +437,16 @@ def not_title(txt): return re.search(r"[,;,。;!!]", txt) def tree_merge(bull, sections, depth): - + if not sections or bull < 0: return sections if isinstance(sections[0], type("")): sections = [(s, "") for s in sections] - + # filter out position information in pdf sections sections = [(t, o) for t, o in sections if t and len(t.split("@")[0].strip()) > 1 and not re.match(r"[0-9]+$", t.split("@")[0].strip())] - + def get_level(bull, section): text, layout = section text = re.sub(r"\u3000", " ", text).strip() @@ -465,7 +465,7 @@ def tree_merge(bull, sections, depth): level, text = get_level(bull, section) if not text.strip("\n"): continue - + lines.append((level, text)) level_set.add(level) @@ -608,6 +608,26 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。; cks[-1] += t tk_nums[-1] += tnum + custom_delimiters = [m.group(1) for m in re.finditer(r"`([^`]+)`", delimiter)] + has_custom = bool(custom_delimiters) + if has_custom: + custom_pattern = "|".join(re.escape(t) for t in sorted(set(custom_delimiters), key=len, reverse=True)) + cks, tk_nums = [], [] + for sec, pos in sections: + split_sec = re.split(r"(%s)" % custom_pattern, sec, flags=re.DOTALL) + for sub_sec in split_sec: + if re.fullmatch(custom_pattern, sub_sec or ""): + continue + text = "\n" + sub_sec + local_pos = pos + if num_tokens_from_string(text) < 8: + local_pos = "" + if local_pos and text.find(local_pos) < 0: + text += local_pos + cks.append(text) + tk_nums.append(num_tokens_from_string(text)) + return cks + dels = get_delimiters(delimiter) for sec, pos in sections: if num_tokens_from_string(sec) < chunk_token_num: @@ -657,6 +677,29 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。 result_images[-1] = concat_img(result_images[-1], image) tk_nums[-1] += tnum + custom_delimiters = 
[m.group(1) for m in re.finditer(r"`([^`]+)`", delimiter)] + has_custom = bool(custom_delimiters) + if has_custom: + custom_pattern = "|".join(re.escape(t) for t in sorted(set(custom_delimiters), key=len, reverse=True)) + cks, result_images, tk_nums = [], [], [] + for text, image in zip(texts, images): + text_str = text[0] if isinstance(text, tuple) else text + text_pos = text[1] if isinstance(text, tuple) and len(text) > 1 else "" + split_sec = re.split(r"(%s)" % custom_pattern, text_str) + for sub_sec in split_sec: + if re.fullmatch(custom_pattern, sub_sec or ""): + continue + text_seg = "\n" + sub_sec + local_pos = text_pos + if num_tokens_from_string(text_seg) < 8: + local_pos = "" + if local_pos and text_seg.find(local_pos) < 0: + text_seg += local_pos + cks.append(text_seg) + result_images.append(image) + tk_nums.append(num_tokens_from_string(text_seg)) + return cks, result_images + dels = get_delimiters(delimiter) for text, image in zip(texts, images): # if text is tuple, unpack it @@ -748,6 +791,23 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"): images[-1] = concat_img(images[-1], image) tk_nums[-1] += tnum + custom_delimiters = [m.group(1) for m in re.finditer(r"`([^`]+)`", delimiter)] + has_custom = bool(custom_delimiters) + if has_custom: + custom_pattern = "|".join(re.escape(t) for t in sorted(set(custom_delimiters), key=len, reverse=True)) + cks, images, tk_nums = [], [], [] + pattern = r"(%s)" % custom_pattern + for sec, image in sections: + split_sec = re.split(pattern, sec) + for sub_sec in split_sec: + if not sub_sec or re.fullmatch(custom_pattern, sub_sec): + continue + text_seg = "\n" + sub_sec + cks.append(text_seg) + images.append(image) + tk_nums.append(num_tokens_from_string(text_seg)) + return cks, images + dels = get_delimiters(delimiter) pattern = r"(%s)" % dels @@ -789,7 +849,7 @@ class Node: self.level = level self.depth = depth self.texts = texts or [] - self.children = [] + self.children = [] def 
add_child(self, child_node): self.children.append(child_node) @@ -835,7 +895,7 @@ class Node: return self def get_tree(self): - tree_list = [] + tree_list = [] self._dfs(self, tree_list, []) return tree_list @@ -860,7 +920,7 @@ class Node: # A leaf title within depth emits its title path as a chunk (header-only section) elif not child and (1 <= level <= self.depth): tree_list.append("\n".join(path_titles)) - + # Recurse into children with the updated title path for c in child: - self._dfs(c, tree_list, path_titles) \ No newline at end of file + self._dfs(c, tree_list, path_titles) diff --git a/web/src/components/metadata-filter/index.tsx b/web/src/components/metadata-filter/index.tsx index 8dbdce42f..48388e4c2 100644 --- a/web/src/components/metadata-filter/index.tsx +++ b/web/src/components/metadata-filter/index.tsx @@ -14,6 +14,7 @@ type MetadataFilterProps = { export const MetadataFilterSchema = { meta_data_filter: z .object({ + logic: z.string().optional(), method: z.string().optional(), manual: z .array( diff --git a/web/src/components/metadata-filter/metadata-filter-conditions.tsx b/web/src/components/metadata-filter/metadata-filter-conditions.tsx index 80cb6409b..aee103a1f 100644 --- a/web/src/components/metadata-filter/metadata-filter-conditions.tsx +++ b/web/src/components/metadata-filter/metadata-filter-conditions.tsx @@ -15,14 +15,17 @@ import { } from '@/components/ui/form'; import { Input } from '@/components/ui/input'; import { Separator } from '@/components/ui/separator'; -import { SwitchOperatorOptions } from '@/constants/agent'; +import { SwitchLogicOperator, SwitchOperatorOptions } from '@/constants/agent'; import { useBuildSwitchOperatorOptions } from '@/hooks/logic-hooks/use-build-operator-options'; +import { useBuildSwitchLogicOperatorOptions } from '@/hooks/logic-hooks/use-build-options'; import { useFetchKnowledgeMetadata } from '@/hooks/use-knowledge-request'; import { PromptEditor } from '@/pages/agent/form/components/prompt-editor'; import 
{ Plus, X } from 'lucide-react'; import { useCallback } from 'react'; import { useFieldArray, useFormContext } from 'react-hook-form'; import { useTranslation } from 'react-i18next'; +import { RAGFlowFormItem } from '../ragflow-form'; +import { RAGFlowSelect } from '../ui/select'; export function MetadataFilterConditions({ kbIds, @@ -36,10 +39,13 @@ export function MetadataFilterConditions({ const { t } = useTranslation(); const form = useFormContext(); const name = prefix + 'meta_data_filter.manual'; + const logic = prefix + 'meta_data_filter.logic'; const metadata = useFetchKnowledgeMetadata(kbIds); const switchOperatorOptions = useBuildSwitchOperatorOptions(); + const switchLogicOperatorOptions = useBuildSwitchLogicOperatorOptions(); + const { fields, remove, append } = useFieldArray({ name, control: form.control, @@ -47,13 +53,14 @@ export function MetadataFilterConditions({ const add = useCallback( (key: string) => () => { + form.setValue(logic, SwitchLogicOperator.And); append({ key, value: '', op: SwitchOperatorOptions[0].value, }); }, - [append], + [append, form, logic], ); return ( @@ -77,73 +84,92 @@ export function MetadataFilterConditions({ -
- {fields.map((field, index) => { - const typeField = `${name}.${index}.key`; - return ( -
- ( - - - - - - - )} - /> - - ( - - - - - - - )} - /> - - ( - - - {canReference ? ( - - ) : ( - +
+ {fields.length > 1 && ( +
+ + + +
+
+ )} +
+ {fields.map((field, index) => { + const typeField = `${name}.${index}.key`; + return ( +
+
+
+ ( + + + + + + )} - - - - )} - /> - -
- ); - })} -
+ /> + + ( + + + + + + + )} + /> +
+ ( + + + {canReference ? ( + + ) : ( + + )} + + + + )} + /> +
+ + + ); + })} +
+ ); } diff --git a/web/src/constants/agent.tsx b/web/src/constants/agent.tsx index 0ba1d927c..5877b91b1 100644 --- a/web/src/constants/agent.tsx +++ b/web/src/constants/agent.tsx @@ -179,3 +179,8 @@ export enum JsonSchemaDataType { Array = 'array', Object = 'object', } + +export enum SwitchLogicOperator { + And = 'and', + Or = 'or', +} diff --git a/web/src/hooks/logic-hooks/use-build-options.ts b/web/src/hooks/logic-hooks/use-build-options.ts new file mode 100644 index 000000000..62370e9bd --- /dev/null +++ b/web/src/hooks/logic-hooks/use-build-options.ts @@ -0,0 +1,12 @@ +import { SwitchLogicOperator } from '@/constants/agent'; +import { buildOptions } from '@/utils/form'; +import { useTranslation } from 'react-i18next'; + +export function useBuildSwitchLogicOperatorOptions() { + const { t } = useTranslation(); + return buildOptions( + SwitchLogicOperator, + t, + 'flow.switchLogicOperatorOptions', + ); +} diff --git a/web/src/pages/agent/constant/index.tsx b/web/src/pages/agent/constant/index.tsx index c357e8cb7..4a442271b 100644 --- a/web/src/pages/agent/constant/index.tsx +++ b/web/src/pages/agent/constant/index.tsx @@ -10,6 +10,7 @@ import { JsonSchemaDataType, Operator, ProgrammingLanguage, + SwitchLogicOperator, SwitchOperatorOptions, initialLlmBaseValues, } from '@/constants/agent'; @@ -51,8 +52,6 @@ import { export const BeginId = 'begin'; -export const SwitchLogicOperatorOptions = ['and', 'or']; - export const CommonOperatorList = Object.values(Operator).filter( (x) => x !== Operator.Note, ); @@ -308,7 +307,7 @@ export const initialExeSqlValues = { export const initialSwitchValues = { conditions: [ { - logical_operator: SwitchLogicOperatorOptions[0], + logical_operator: SwitchLogicOperator.And, items: [ { operator: SwitchOperatorOptions[0].value, diff --git a/web/src/pages/agent/form/switch-form/index.tsx b/web/src/pages/agent/form/switch-form/index.tsx index f9ccee919..53f4995af 100644 --- a/web/src/pages/agent/form/switch-form/index.tsx +++ 
b/web/src/pages/agent/form/switch-form/index.tsx @@ -11,16 +11,17 @@ import { import { RAGFlowSelect } from '@/components/ui/select'; import { Separator } from '@/components/ui/separator'; import { Textarea } from '@/components/ui/textarea'; +import { SwitchLogicOperator } from '@/constants/agent'; import { useBuildSwitchOperatorOptions } from '@/hooks/logic-hooks/use-build-operator-options'; +import { useBuildSwitchLogicOperatorOptions } from '@/hooks/logic-hooks/use-build-options'; import { cn } from '@/lib/utils'; import { zodResolver } from '@hookform/resolvers/zod'; import { t } from 'i18next'; import { X } from 'lucide-react'; -import { memo, useCallback, useMemo } from 'react'; +import { memo, useCallback } from 'react'; import { useFieldArray, useForm, useFormContext } from 'react-hook-form'; import { useTranslation } from 'react-i18next'; import { z } from 'zod'; -import { SwitchLogicOperatorOptions } from '../../constant'; import { IOperatorForm } from '../../interface'; import { FormWrapper } from '../components/form-wrapper'; import { QueryVariable } from '../components/query-variable'; @@ -185,12 +186,7 @@ function SwitchForm({ node }: IOperatorForm) { control: form.control, }); - const switchLogicOperatorOptions = useMemo(() => { - return SwitchLogicOperatorOptions.map((x) => ({ - value: x, - label: t(`flow.switchLogicOperatorOptions.${x}`), - })); - }, [t]); + const switchLogicOperatorOptions = useBuildSwitchLogicOperatorOptions(); useWatchFormChange(node?.id, form); @@ -253,7 +249,7 @@ function SwitchForm({ node }: IOperatorForm) { append({ - logical_operator: SwitchLogicOperatorOptions[0], + logical_operator: SwitchLogicOperator.And, [ItemKey]: [ { operator: switchOperatorOptions[0].value,