Merge branch 'main' of github.com:infiniflow/ragflow into feature/1121

This commit is contained in:
chanx 2025-11-21 17:11:14 +08:00
commit 516bd6be3a
19 changed files with 214 additions and 99 deletions

View file

@ -86,7 +86,7 @@ Try our demo at [https://demo.ragflow.io](https://demo.ragflow.io).
## 🔥 Latest Updates
- 2025-11-19 Supports Gemini 3 Pro.
- 2025-11-12 Supports data synchronization from Confluence, AWS S3, Discord, Google Drive.
- 2025-11-12 Supports data synchronization from Confluence, S3, Notion, Discord, Google Drive.
- 2025-10-23 Supports MinerU & Docling as document parsing methods.
- 2025-10-15 Supports orchestrable ingestion pipeline.
- 2025-08-08 Supports OpenAI's latest GPT-5 series models.

View file

@ -86,7 +86,7 @@ Coba demo kami di [https://demo.ragflow.io](https://demo.ragflow.io).
## 🔥 Pembaruan Terbaru
- 2025-11-19 Mendukung Gemini 3 Pro.
- 2025-11-12 Mendukung sinkronisasi data dari Confluence, AWS S3, Discord, Google Drive.
- 2025-11-12 Mendukung sinkronisasi data dari Confluence, S3, Notion, Discord, Google Drive.
- 2025-10-23 Mendukung MinerU & Docling sebagai metode penguraian dokumen.
- 2025-10-15 Dukungan untuk jalur data yang terorkestrasi.
- 2025-08-08 Mendukung model seri GPT-5 terbaru dari OpenAI.

View file

@ -67,7 +67,7 @@
## 🔥 最新情報
- 2025-11-19 Gemini 3 Proをサポートしています
- 2025-11-12 Confluence、AWS S3、Discord、Google Drive からのデータ同期をサポートします。
- 2025-11-12 Confluence、S3、Notion、Discord、Google Drive からのデータ同期をサポートします。
- 2025-10-23 ドキュメント解析方法として MinerU と Docling をサポートします。
- 2025-10-15 オーケストレーションされたデータパイプラインのサポート。
- 2025-08-08 OpenAI の最新 GPT-5 シリーズモデルをサポートします。

View file

@ -68,7 +68,7 @@
## 🔥 업데이트
- 2025-11-19 Gemini 3 Pro를 지원합니다.
- 2025-11-12 Confluence, AWS S3, Discord, Google Drive에서 데이터 동기화를 지원합니다.
- 2025-11-12 Confluence, S3, Notion, Discord, Google Drive에서 데이터 동기화를 지원합니다.
- 2025-10-23 문서 파싱 방법으로 MinerU 및 Docling을 지원합니다.
- 2025-10-15 조정된 데이터 파이프라인 지원.
- 2025-08-08 OpenAI의 최신 GPT-5 시리즈 모델을 지원합니다.

View file

@ -87,7 +87,7 @@ Experimente nossa demo em [https://demo.ragflow.io](https://demo.ragflow.io).
## 🔥 Últimas Atualizações
- 19-11-2025 Suporta Gemini 3 Pro.
- 12-11-2025 Suporta a sincronização de dados do Confluence, AWS S3, Discord e Google Drive.
- 12-11-2025 Suporta a sincronização de dados do Confluence, S3, Notion, Discord e Google Drive.
- 23-10-2025 Suporta MinerU e Docling como métodos de análise de documentos.
- 15-10-2025 Suporte para pipelines de dados orquestrados.
- 08-08-2025 Suporta a mais recente série GPT-5 da OpenAI.

View file

@ -86,7 +86,7 @@
## 🔥 近期更新
- 2025-11-19 支援 Gemini 3 Pro.
- 2025-11-12 支援從 Confluence、AWS S3、Discord、Google Drive 進行資料同步。
- 2025-11-12 支援從 Confluence、S3、Notion、Discord、Google Drive 進行資料同步。
- 2025-10-23 支援 MinerU 和 Docling 作為文件解析方法。
- 2025-10-15 支援可編排的資料管道。
- 2025-08-08 支援 OpenAI 最新的 GPT-5 系列模型。

View file

@ -86,7 +86,7 @@
## 🔥 近期更新
- 2025-11-19 支持 Gemini 3 Pro.
- 2025-11-12 支持从 Confluence、AWS S3、Discord、Google Drive 进行数据同步。
- 2025-11-12 支持从 Confluence、S3、Notion、Discord、Google Drive 进行数据同步。
- 2025-10-23 支持 MinerU 和 Docling 作为文档解析方法。
- 2025-10-15 支持可编排的数据管道。
- 2025-08-08 支持 OpenAI 最新的 GPT-5 系列模型。

View file

@ -32,7 +32,7 @@ class IterationParam(ComponentParamBase):
def __init__(self):
super().__init__()
self.items_ref = ""
self.veriable={}
self.variable={}
def get_input_form(self) -> dict[str, dict]:
return {

View file

@ -24,7 +24,7 @@ from flasgger import Swagger
from itsdangerous.url_safe import URLSafeTimedSerializer as Serializer
from quart_cors import cors
from common.constants import StatusEnum
from api.db.db_models import close_connection
from api.db.db_models import close_connection, APIToken
from api.db.services import UserService
from api.utils.json_encode import CustomJSONEncoder
from api.utils import commands
@ -124,6 +124,10 @@ def _load_user():
user = UserService.query(
access_token=access_token, status=StatusEnum.VALID.value
)
if not user and len(authorization.split()) == 2:
objs = APIToken.query(token=authorization.split()[1])
if objs:
user = UserService.query(id=objs[0].tenant_id, status=StatusEnum.VALID.value)
if user:
if not user[0].access_token or not user[0].access_token.strip():
logging.warning(f"User {user[0].email} has empty access_token in database")

View file

@ -1434,6 +1434,7 @@ async def retrieval_test(tenant_id):
question = req["question"]
doc_ids = req.get("document_ids", [])
use_kg = req.get("use_kg", False)
toc_enhance = req.get("toc_enhance", False)
langs = req.get("cross_languages", [])
if not isinstance(doc_ids, list):
return get_error_data_result("`documents` should be a list")
@ -1487,6 +1488,11 @@ async def retrieval_test(tenant_id):
highlight=highlight,
rank_feature=label_question(question, kbs),
)
if toc_enhance:
chat_mdl = LLMBundle(kb.tenant_id, LLMType.CHAT)
cks = settings.retriever.retrieval_by_toc(question, ranks["chunks"], tenant_ids, chat_mdl, size)
if cks:
ranks["chunks"] = cks
if use_kg:
ck = settings.kg_retriever.retrieval(question, [k.tenant_id for k in kbs], kb_ids, embd_mdl, LLMBundle(kb.tenant_id, LLMType.CHAT))
if ck["content_with_weight"]:

View file

@ -1091,7 +1091,7 @@ class RAGFlowPdfParser:
logging.debug("Images converted.")
self.is_english = [
re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i])))))
re.search(r"[ a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i])))))
for i in range(len(self.page_chars))
]
if sum([1 if e else 0 for e in self.is_english]) > len(self.page_images) / 2:
@ -1148,7 +1148,7 @@ class RAGFlowPdfParser:
if not self.is_english and not any([c for c in self.page_chars]) and self.boxes:
bxes = [b for bxs in self.boxes for b in bxs]
self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))
self.is_english = re.search(r"[ \na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))
logging.debug(f"Is it English: {self.is_english}")

View file

@ -2072,6 +2072,7 @@ Retrieves chunks from specified datasets.
- `"cross_languages"`: `list[string]`
- `"metadata_condition"`: `object`
- `"use_kg"`: `boolean`
- `"toc_enhance"`: `boolean`
##### Request example
```bash
@ -2122,6 +2123,8 @@ curl --request POST \
The number of chunks engaged in vector cosine computation. Defaults to `1024`.
- `"use_kg"`: (*Body parameter*), `boolean`
The search includes text chunks related to the knowledge graph of the selected dataset to handle complex multi-hop queries. Defaults to `False`.
- `"toc_enhance"`: (*Body parameter*), `boolean`
The search includes table of content enhancement in order to boost rank of relevant chunks. Files parsed with `TOC Enhance` enabled is prerequisite. Defaults to `False`.
- `"rerank_id"`: (*Body parameter*), `integer`
The ID of the rerank model.
- `"keyword"`: (*Body parameter*), `boolean`
@ -2136,6 +2139,9 @@ curl --request POST \
The languages that should be translated into, in order to achieve keywords retrievals in different languages.
- `"metadata_condition"`: (*Body parameter*), `object`
The metadata condition used for filtering chunks:
- `"logic"`: (*Body parameter*), `string`
- `"and"` Intersection of the result from each condition (default).
- `"or"` union of the result from each condition.
- `"conditions"`: (*Body parameter*), `array`
A list of metadata filter conditions.
- `"name"`: `string` - The metadata field name to filter by, e.g., `"author"`, `"company"`, `"url"`. Ensure this parameter before use. See [Set metadata](../guides/dataset/set_metadata.md) for details.

View file

@ -437,16 +437,16 @@ def not_title(txt):
return re.search(r"[,;,。;!!]", txt)
def tree_merge(bull, sections, depth):
if not sections or bull < 0:
return sections
if isinstance(sections[0], type("")):
sections = [(s, "") for s in sections]
# filter out position information in pdf sections
sections = [(t, o) for t, o in sections if
t and len(t.split("@")[0].strip()) > 1 and not re.match(r"[0-9]+$", t.split("@")[0].strip())]
def get_level(bull, section):
text, layout = section
text = re.sub(r"\u3000", " ", text).strip()
@ -465,7 +465,7 @@ def tree_merge(bull, sections, depth):
level, text = get_level(bull, section)
if not text.strip("\n"):
continue
lines.append((level, text))
level_set.add(level)
@ -608,6 +608,26 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。
cks[-1] += t
tk_nums[-1] += tnum
custom_delimiters = [m.group(1) for m in re.finditer(r"`([^`]+)`", delimiter)]
has_custom = bool(custom_delimiters)
if has_custom:
custom_pattern = "|".join(re.escape(t) for t in sorted(set(custom_delimiters), key=len, reverse=True))
cks, tk_nums = [], []
for sec, pos in sections:
split_sec = re.split(r"(%s)" % custom_pattern, sec, flags=re.DOTALL)
for sub_sec in split_sec:
if re.fullmatch(custom_pattern, sub_sec or ""):
continue
text = "\n" + sub_sec
local_pos = pos
if num_tokens_from_string(text) < 8:
local_pos = ""
if local_pos and text.find(local_pos) < 0:
text += local_pos
cks.append(text)
tk_nums.append(num_tokens_from_string(text))
return cks
dels = get_delimiters(delimiter)
for sec, pos in sections:
if num_tokens_from_string(sec) < chunk_token_num:
@ -657,6 +677,29 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
result_images[-1] = concat_img(result_images[-1], image)
tk_nums[-1] += tnum
custom_delimiters = [m.group(1) for m in re.finditer(r"`([^`]+)`", delimiter)]
has_custom = bool(custom_delimiters)
if has_custom:
custom_pattern = "|".join(re.escape(t) for t in sorted(set(custom_delimiters), key=len, reverse=True))
cks, result_images, tk_nums = [], [], []
for text, image in zip(texts, images):
text_str = text[0] if isinstance(text, tuple) else text
text_pos = text[1] if isinstance(text, tuple) and len(text) > 1 else ""
split_sec = re.split(r"(%s)" % custom_pattern, text_str)
for sub_sec in split_sec:
if re.fullmatch(custom_pattern, sub_sec or ""):
continue
text_seg = "\n" + sub_sec
local_pos = text_pos
if num_tokens_from_string(text_seg) < 8:
local_pos = ""
if local_pos and text_seg.find(local_pos) < 0:
text_seg += local_pos
cks.append(text_seg)
result_images.append(image)
tk_nums.append(num_tokens_from_string(text_seg))
return cks, result_images
dels = get_delimiters(delimiter)
for text, image in zip(texts, images):
# if text is tuple, unpack it
@ -748,6 +791,23 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。"):
images[-1] = concat_img(images[-1], image)
tk_nums[-1] += tnum
custom_delimiters = [m.group(1) for m in re.finditer(r"`([^`]+)`", delimiter)]
has_custom = bool(custom_delimiters)
if has_custom:
custom_pattern = "|".join(re.escape(t) for t in sorted(set(custom_delimiters), key=len, reverse=True))
cks, images, tk_nums = [], [], []
pattern = r"(%s)" % custom_pattern
for sec, image in sections:
split_sec = re.split(pattern, sec)
for sub_sec in split_sec:
if not sub_sec or re.fullmatch(custom_pattern, sub_sec):
continue
text_seg = "\n" + sub_sec
cks.append(text_seg)
images.append(image)
tk_nums.append(num_tokens_from_string(text_seg))
return cks, images
dels = get_delimiters(delimiter)
pattern = r"(%s)" % dels
@ -789,7 +849,7 @@ class Node:
self.level = level
self.depth = depth
self.texts = texts or []
self.children = []
self.children = []
def add_child(self, child_node):
self.children.append(child_node)
@ -835,7 +895,7 @@ class Node:
return self
def get_tree(self):
tree_list = []
tree_list = []
self._dfs(self, tree_list, [])
return tree_list
@ -860,7 +920,7 @@ class Node:
# A leaf title within depth emits its title path as a chunk (header-only section)
elif not child and (1 <= level <= self.depth):
tree_list.append("\n".join(path_titles))
# Recurse into children with the updated title path
for c in child:
self._dfs(c, tree_list, path_titles)
self._dfs(c, tree_list, path_titles)

View file

@ -14,6 +14,7 @@ type MetadataFilterProps = {
export const MetadataFilterSchema = {
meta_data_filter: z
.object({
logic: z.string().optional(),
method: z.string().optional(),
manual: z
.array(

View file

@ -15,14 +15,17 @@ import {
} from '@/components/ui/form';
import { Input } from '@/components/ui/input';
import { Separator } from '@/components/ui/separator';
import { SwitchOperatorOptions } from '@/constants/agent';
import { SwitchLogicOperator, SwitchOperatorOptions } from '@/constants/agent';
import { useBuildSwitchOperatorOptions } from '@/hooks/logic-hooks/use-build-operator-options';
import { useBuildSwitchLogicOperatorOptions } from '@/hooks/logic-hooks/use-build-options';
import { useFetchKnowledgeMetadata } from '@/hooks/use-knowledge-request';
import { PromptEditor } from '@/pages/agent/form/components/prompt-editor';
import { Plus, X } from 'lucide-react';
import { useCallback } from 'react';
import { useFieldArray, useFormContext } from 'react-hook-form';
import { useTranslation } from 'react-i18next';
import { RAGFlowFormItem } from '../ragflow-form';
import { RAGFlowSelect } from '../ui/select';
export function MetadataFilterConditions({
kbIds,
@ -36,10 +39,13 @@ export function MetadataFilterConditions({
const { t } = useTranslation();
const form = useFormContext();
const name = prefix + 'meta_data_filter.manual';
const logic = prefix + 'meta_data_filter.logic';
const metadata = useFetchKnowledgeMetadata(kbIds);
const switchOperatorOptions = useBuildSwitchOperatorOptions();
const switchLogicOperatorOptions = useBuildSwitchLogicOperatorOptions();
const { fields, remove, append } = useFieldArray({
name,
control: form.control,
@ -47,13 +53,14 @@ export function MetadataFilterConditions({
const add = useCallback(
(key: string) => () => {
form.setValue(logic, SwitchLogicOperator.And);
append({
key,
value: '',
op: SwitchOperatorOptions[0].value,
});
},
[append],
[append, form, logic],
);
return (
@ -77,73 +84,92 @@ export function MetadataFilterConditions({
</DropdownMenuContent>
</DropdownMenu>
</div>
<div className="space-y-5">
{fields.map((field, index) => {
const typeField = `${name}.${index}.key`;
return (
<div key={field.id} className="flex w-full items-center gap-2">
<FormField
control={form.control}
name={typeField}
render={({ field }) => (
<FormItem className="flex-1 overflow-hidden">
<FormControl>
<Input
{...field}
placeholder={t('common.pleaseInput')}
></Input>
</FormControl>
<FormMessage />
</FormItem>
)}
/>
<Separator className="w-3 text-text-secondary" />
<FormField
control={form.control}
name={`${name}.${index}.op`}
render={({ field }) => (
<FormItem className="flex-1 overflow-hidden">
<FormControl>
<SelectWithSearch
{...field}
options={switchOperatorOptions}
></SelectWithSearch>
</FormControl>
<FormMessage />
</FormItem>
)}
/>
<Separator className="w-3 text-text-secondary" />
<FormField
control={form.control}
name={`${name}.${index}.value`}
render={({ field }) => (
<FormItem className="flex-1 overflow-hidden">
<FormControl>
{canReference ? (
<PromptEditor
{...field}
multiLine={false}
showToolbar={false}
></PromptEditor>
) : (
<Input
placeholder={t('common.pleaseInput')}
{...field}
/>
<section className="flex">
{fields.length > 1 && (
<div className="relative min-w-14">
<RAGFlowFormItem
name={logic}
className="absolute top-1/2 -translate-y-1/2 right-1 left-0 z-10 bg-bg-base"
>
<RAGFlowSelect
options={switchLogicOperatorOptions}
triggerClassName="w-full text-xs px-1 py-0 h-6"
></RAGFlowSelect>
</RAGFlowFormItem>
<div className="absolute border-l border-y w-5 right-0 top-4 bottom-4 rounded-l-lg"></div>
</div>
)}
<div className="space-y-5 flex-1">
{fields.map((field, index) => {
const typeField = `${name}.${index}.key`;
return (
<section key={field.id} className="flex gap-2">
<div className="w-full space-y-2">
<div className="flex items-center gap-1">
<FormField
control={form.control}
name={typeField}
render={({ field }) => (
<FormItem className="flex-1 overflow-hidden">
<FormControl>
<Input
{...field}
placeholder={t('common.pleaseInput')}
></Input>
</FormControl>
<FormMessage />
</FormItem>
)}
</FormControl>
<FormMessage />
</FormItem>
)}
/>
<Button variant={'ghost'} onClick={() => remove(index)}>
<X className="text-text-sub-title-invert " />
</Button>
</div>
);
})}
</div>
/>
<Separator className="w-1 text-text-secondary" />
<FormField
control={form.control}
name={`${name}.${index}.op`}
render={({ field }) => (
<FormItem className="flex-1 overflow-hidden">
<FormControl>
<SelectWithSearch
{...field}
options={switchOperatorOptions}
></SelectWithSearch>
</FormControl>
<FormMessage />
</FormItem>
)}
/>
</div>
<FormField
control={form.control}
name={`${name}.${index}.value`}
render={({ field }) => (
<FormItem className="flex-1 overflow-hidden">
<FormControl>
{canReference ? (
<PromptEditor
{...field}
multiLine={false}
showToolbar={false}
></PromptEditor>
) : (
<Input
placeholder={t('common.pleaseInput')}
{...field}
/>
)}
</FormControl>
<FormMessage />
</FormItem>
)}
/>
</div>
<Button variant={'ghost'} onClick={() => remove(index)}>
<X className="text-text-sub-title-invert " />
</Button>
</section>
);
})}
</div>
</section>
</section>
);
}

View file

@ -179,3 +179,8 @@ export enum JsonSchemaDataType {
Array = 'array',
Object = 'object',
}
export enum SwitchLogicOperator {
And = 'and',
Or = 'or',
}

View file

@ -0,0 +1,12 @@
import { SwitchLogicOperator } from '@/constants/agent';
import { buildOptions } from '@/utils/form';
import { useTranslation } from 'react-i18next';
export function useBuildSwitchLogicOperatorOptions() {
const { t } = useTranslation();
return buildOptions(
SwitchLogicOperator,
t,
'flow.switchLogicOperatorOptions',
);
}

View file

@ -10,6 +10,7 @@ import {
JsonSchemaDataType,
Operator,
ProgrammingLanguage,
SwitchLogicOperator,
SwitchOperatorOptions,
initialLlmBaseValues,
} from '@/constants/agent';
@ -51,8 +52,6 @@ import {
export const BeginId = 'begin';
export const SwitchLogicOperatorOptions = ['and', 'or'];
export const CommonOperatorList = Object.values(Operator).filter(
(x) => x !== Operator.Note,
);
@ -308,7 +307,7 @@ export const initialExeSqlValues = {
export const initialSwitchValues = {
conditions: [
{
logical_operator: SwitchLogicOperatorOptions[0],
logical_operator: SwitchLogicOperator.And,
items: [
{
operator: SwitchOperatorOptions[0].value,

View file

@ -11,16 +11,17 @@ import {
import { RAGFlowSelect } from '@/components/ui/select';
import { Separator } from '@/components/ui/separator';
import { Textarea } from '@/components/ui/textarea';
import { SwitchLogicOperator } from '@/constants/agent';
import { useBuildSwitchOperatorOptions } from '@/hooks/logic-hooks/use-build-operator-options';
import { useBuildSwitchLogicOperatorOptions } from '@/hooks/logic-hooks/use-build-options';
import { cn } from '@/lib/utils';
import { zodResolver } from '@hookform/resolvers/zod';
import { t } from 'i18next';
import { X } from 'lucide-react';
import { memo, useCallback, useMemo } from 'react';
import { memo, useCallback } from 'react';
import { useFieldArray, useForm, useFormContext } from 'react-hook-form';
import { useTranslation } from 'react-i18next';
import { z } from 'zod';
import { SwitchLogicOperatorOptions } from '../../constant';
import { IOperatorForm } from '../../interface';
import { FormWrapper } from '../components/form-wrapper';
import { QueryVariable } from '../components/query-variable';
@ -185,12 +186,7 @@ function SwitchForm({ node }: IOperatorForm) {
control: form.control,
});
const switchLogicOperatorOptions = useMemo(() => {
return SwitchLogicOperatorOptions.map((x) => ({
value: x,
label: t(`flow.switchLogicOperatorOptions.${x}`),
}));
}, [t]);
const switchLogicOperatorOptions = useBuildSwitchLogicOperatorOptions();
useWatchFormChange(node?.id, form);
@ -253,7 +249,7 @@ function SwitchForm({ node }: IOperatorForm) {
<BlockButton
onClick={() =>
append({
logical_operator: SwitchLogicOperatorOptions[0],
logical_operator: SwitchLogicOperator.And,
[ItemKey]: [
{
operator: switchOperatorOptions[0].value,