Merge branch 'main' of github.com:infiniflow/ragflow into feature/1117

This commit is contained in:
chanx 2025-11-20 19:45:39 +08:00
commit 8c55eaa983
23 changed files with 395 additions and 188 deletions

View file

@ -132,8 +132,8 @@ class Retrieval(ToolBase, ABC):
metas = DocumentService.get_meta_by_kbs(kb_ids)
if self._param.meta_data_filter.get("method") == "auto":
chat_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.CHAT)
filters = gen_meta_filter(chat_mdl, metas, query)
doc_ids.extend(meta_filter(metas, filters))
filters: dict = gen_meta_filter(chat_mdl, metas, query)
doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not doc_ids:
doc_ids = None
elif self._param.meta_data_filter.get("method") == "manual":
@ -165,7 +165,7 @@ class Retrieval(ToolBase, ABC):
out_parts.append(s[last:])
flt["value"] = "".join(out_parts)
doc_ids.extend(meta_filter(metas, filters))
doc_ids.extend(meta_filter(metas, filters, self._param.meta_data_filter.get("logic", "and")))
if not doc_ids:
doc_ids = None

View file

@ -305,12 +305,12 @@ async def retrieval_test():
metas = DocumentService.get_meta_by_kbs(kb_ids)
if meta_data_filter.get("method") == "auto":
chat_mdl = LLMBundle(current_user.id, LLMType.CHAT, llm_name=search_config.get("chat_id", ""))
filters = gen_meta_filter(chat_mdl, metas, question)
doc_ids.extend(meta_filter(metas, filters))
filters: dict = gen_meta_filter(chat_mdl, metas, question)
doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not doc_ids:
doc_ids = None
elif meta_data_filter.get("method") == "manual":
doc_ids.extend(meta_filter(metas, meta_data_filter["manual"]))
doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
if not doc_ids:
doc_ids = None

View file

@ -159,10 +159,10 @@ async def webhook(tenant_id: str, agent_id: str):
data=False, message=str(e),
code=RetCode.EXCEPTION_ERROR)
def sse():
async def sse():
nonlocal canvas
try:
for ans in canvas.run(query=req.get("query", ""), files=req.get("files", []), user_id=req.get("user_id", tenant_id), webhook_payload=req):
async for ans in canvas.run(query=req.get("query", ""), files=req.get("files", []), user_id=req.get("user_id", tenant_id), webhook_payload=req):
yield "data:" + json.dumps(ans, ensure_ascii=False) + "\n\n"
cvs.dsl = json.loads(str(canvas))

View file

@ -120,7 +120,7 @@ async def retrieval(tenant_id):
retrieval_setting = req.get("retrieval_setting", {})
similarity_threshold = float(retrieval_setting.get("score_threshold", 0.0))
top = int(retrieval_setting.get("top_k", 1024))
metadata_condition = req.get("metadata_condition", {})
metadata_condition = req.get("metadata_condition", {}) or {}
metas = DocumentService.get_meta_by_kbs([kb_id])
doc_ids = []
@ -132,7 +132,7 @@ async def retrieval(tenant_id):
embd_mdl = LLMBundle(kb.tenant_id, LLMType.EMBEDDING.value, llm_name=kb.embd_id)
if metadata_condition:
doc_ids.extend(meta_filter(metas, convert_conditions(metadata_condition)))
doc_ids.extend(meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and")))
if not doc_ids and metadata_condition:
doc_ids = ["-999"]
ranks = settings.retriever.retrieval(

View file

@ -1442,9 +1442,9 @@ async def retrieval_test(tenant_id):
if doc_id not in doc_ids_list:
return get_error_data_result(f"The datasets don't own the document {doc_id}")
if not doc_ids:
metadata_condition = req.get("metadata_condition", {})
metadata_condition = req.get("metadata_condition", {}) or {}
metas = DocumentService.get_meta_by_kbs(kb_ids)
doc_ids = meta_filter(metas, convert_conditions(metadata_condition))
doc_ids = meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and"))
similarity_threshold = float(req.get("similarity_threshold", 0.2))
vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
top = int(req.get("top_k", 1024))

View file

@ -428,17 +428,15 @@ async def agents_completion_openai_compatibility(tenant_id, agent_id):
return resp
else:
# For non-streaming, just return the response directly
response = next(
completion_openai(
async for response in completion_openai(
tenant_id,
agent_id,
question,
session_id=req.pop("session_id", req.get("id", "")) or req.get("metadata", {}).get("id", ""),
stream=False,
**req,
)
)
return jsonify(response)
):
return jsonify(response)
@manager.route("/agents/<agent_id>/completions", methods=["POST"]) # noqa: F821
@ -977,12 +975,12 @@ async def retrieval_test_embedded():
metas = DocumentService.get_meta_by_kbs(kb_ids)
if meta_data_filter.get("method") == "auto":
chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_name=search_config.get("chat_id", ""))
filters = gen_meta_filter(chat_mdl, metas, question)
doc_ids.extend(meta_filter(metas, filters))
filters: dict = gen_meta_filter(chat_mdl, metas, question)
doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not doc_ids:
doc_ids = None
elif meta_data_filter.get("method") == "manual":
doc_ids.extend(meta_filter(metas, meta_data_filter["manual"]))
doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
if not doc_ids:
doc_ids = None

View file

@ -177,7 +177,7 @@ class UserCanvasService(CommonService):
return True
def completion(tenant_id, agent_id, session_id=None, **kwargs):
async def completion(tenant_id, agent_id, session_id=None, **kwargs):
query = kwargs.get("query", "") or kwargs.get("question", "")
files = kwargs.get("files", [])
inputs = kwargs.get("inputs", {})
@ -219,7 +219,7 @@ def completion(tenant_id, agent_id, session_id=None, **kwargs):
"id": message_id
})
txt = ""
for ans in canvas.run(query=query, files=files, user_id=user_id, inputs=inputs):
async for ans in canvas.run(query=query, files=files, user_id=user_id, inputs=inputs):
ans["session_id"] = session_id
if ans["event"] == "message":
txt += ans["data"]["content"]
@ -237,7 +237,7 @@ def completion(tenant_id, agent_id, session_id=None, **kwargs):
API4ConversationService.append_message(conv["id"], conv)
def completion_openai(tenant_id, agent_id, question, session_id=None, stream=True, **kwargs):
async def completion_openai(tenant_id, agent_id, question, session_id=None, stream=True, **kwargs):
tiktoken_encoder = tiktoken.get_encoding("cl100k_base")
prompt_tokens = len(tiktoken_encoder.encode(str(question)))
user_id = kwargs.get("user_id", "")
@ -245,7 +245,7 @@ def completion_openai(tenant_id, agent_id, question, session_id=None, stream=Tru
if stream:
completion_tokens = 0
try:
for ans in completion(
async for ans in completion(
tenant_id=tenant_id,
agent_id=agent_id,
session_id=session_id,
@ -304,7 +304,7 @@ def completion_openai(tenant_id, agent_id, question, session_id=None, stream=Tru
try:
all_content = ""
reference = {}
for ans in completion(
async for ans in completion(
tenant_id=tenant_id,
agent_id=agent_id,
session_id=session_id,

View file

@ -287,7 +287,7 @@ def convert_conditions(metadata_condition):
]
def meta_filter(metas: dict, filters: list[dict]):
def meta_filter(metas: dict, filters: list[dict], logic: str = "and"):
doc_ids = set([])
def filter_out(v2docs, operator, value):
@ -331,7 +331,10 @@ def meta_filter(metas: dict, filters: list[dict]):
if not doc_ids:
doc_ids = set(ids)
else:
doc_ids = doc_ids & set(ids)
if logic == "and":
doc_ids = doc_ids & set(ids)
else:
doc_ids = doc_ids | set(ids)
if not doc_ids:
return []
return list(doc_ids)
@ -407,12 +410,12 @@ def chat(dialog, messages, stream=True, **kwargs):
if dialog.meta_data_filter:
metas = DocumentService.get_meta_by_kbs(dialog.kb_ids)
if dialog.meta_data_filter.get("method") == "auto":
filters = gen_meta_filter(chat_mdl, metas, questions[-1])
attachments.extend(meta_filter(metas, filters))
filters: dict = gen_meta_filter(chat_mdl, metas, questions[-1])
attachments.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not attachments:
attachments = None
elif dialog.meta_data_filter.get("method") == "manual":
attachments.extend(meta_filter(metas, dialog.meta_data_filter["manual"]))
attachments.extend(meta_filter(metas, dialog.meta_data_filter["manual"], dialog.meta_data_filter.get("logic", "and")))
if not attachments:
attachments = None
@ -778,12 +781,12 @@ def ask(question, kb_ids, tenant_id, chat_llm_name=None, search_config={}):
if meta_data_filter:
metas = DocumentService.get_meta_by_kbs(kb_ids)
if meta_data_filter.get("method") == "auto":
filters = gen_meta_filter(chat_mdl, metas, question)
doc_ids.extend(meta_filter(metas, filters))
filters: dict = gen_meta_filter(chat_mdl, metas, question)
doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not doc_ids:
doc_ids = None
elif meta_data_filter.get("method") == "manual":
doc_ids.extend(meta_filter(metas, meta_data_filter["manual"]))
doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
if not doc_ids:
doc_ids = None
@ -853,12 +856,12 @@ def gen_mindmap(question, kb_ids, tenant_id, search_config={}):
if meta_data_filter:
metas = DocumentService.get_meta_by_kbs(kb_ids)
if meta_data_filter.get("method") == "auto":
filters = gen_meta_filter(chat_mdl, metas, question)
doc_ids.extend(meta_filter(metas, filters))
filters: dict = gen_meta_filter(chat_mdl, metas, question)
doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not doc_ids:
doc_ids = None
elif meta_data_filter.get("method") == "manual":
doc_ids.extend(meta_filter(metas, meta_data_filter["manual"]))
doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
if not doc_ids:
doc_ids = None

View file

@ -187,7 +187,7 @@ class DoclingParser(RAGFlowPdfParser):
bbox = _BBox(int(pn), bb[0], bb[1], bb[2], bb[3])
yield (DoclingContentType.EQUATION.value, text, bbox)
def _transfer_to_sections(self, doc) -> list[tuple[str, str]]:
def _transfer_to_sections(self, doc, parse_method: str) -> list[tuple[str, str]]:
sections: list[tuple[str, str]] = []
for typ, payload, bbox in self._iter_doc_items(doc):
if typ == DoclingContentType.TEXT.value:
@ -200,7 +200,12 @@ class DoclingParser(RAGFlowPdfParser):
continue
tag = self._make_line_tag(bbox) if isinstance(bbox,_BBox) else ""
sections.append((section, tag))
if parse_method == "manual":
sections.append((section, typ, tag))
elif parse_method == "paper":
sections.append((section + tag, typ))
else:
sections.append((section, tag))
return sections
def cropout_docling_table(self, page_no: int, bbox: tuple[float, float, float, float], zoomin: int = 1):
@ -282,7 +287,8 @@ class DoclingParser(RAGFlowPdfParser):
output_dir: Optional[str] = None,
lang: Optional[str] = None,
method: str = "auto",
delete_output: bool = True,
delete_output: bool = True,
parse_method: str = "raw"
):
if not self.check_installation():
@ -318,7 +324,7 @@ class DoclingParser(RAGFlowPdfParser):
if callback:
callback(0.7, f"[Docling] Parsed doc: {getattr(doc, 'num_pages', 'n/a')} pages")
sections = self._transfer_to_sections(doc)
sections = self._transfer_to_sections(doc, parse_method=parse_method)
tables = self._transfer_to_tables(doc)
if callback:

View file

@ -476,7 +476,7 @@ class MinerUParser(RAGFlowPdfParser):
item[key] = str((subdir / item[key]).resolve())
return data
def _transfer_to_sections(self, outputs: list[dict[str, Any]]):
def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: str = None):
sections = []
for output in outputs:
match output["type"]:
@ -497,7 +497,11 @@ class MinerUParser(RAGFlowPdfParser):
case MinerUContentType.DISCARDED:
pass
if section:
if section and parse_method == "manual":
sections.append((section, output["type"], self._line_tag(output)))
elif section and parse_method == "paper":
sections.append((section + self._line_tag(output), output["type"]))
else:
sections.append((section, self._line_tag(output)))
return sections
@ -516,6 +520,7 @@ class MinerUParser(RAGFlowPdfParser):
method: str = "auto",
server_url: Optional[str] = None,
delete_output: bool = True,
parse_method: str = "raw"
) -> tuple:
import shutil
@ -565,7 +570,8 @@ class MinerUParser(RAGFlowPdfParser):
self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
if callback:
callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
return self._transfer_to_sections(outputs), self._transfer_to_tables(outputs)
return self._transfer_to_sections(outputs, parse_method), self._transfer_to_tables(outputs)
finally:
if temp_pdf and temp_pdf.exists():
try:

View file

@ -33,6 +33,8 @@ import xgboost as xgb
from huggingface_hub import snapshot_download
from PIL import Image
from pypdf import PdfReader as pdf2_read
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from common.file_utils import get_project_base_directory
from common.misc_utils import pip_install_torch
@ -353,7 +355,6 @@ class RAGFlowPdfParser:
def _assign_column(self, boxes, zoomin=3):
if not boxes:
return boxes
if all("col_id" in b for b in boxes):
return boxes
@ -361,61 +362,80 @@ class RAGFlowPdfParser:
for b in boxes:
by_page[b["page_number"]].append(b)
page_info = {} # pg -> dict(page_w, left_edge, cand_cols)
counter = Counter()
page_cols = {}
for pg, bxs in by_page.items():
if not bxs:
page_info[pg] = {"page_w": 1.0, "left_edge": 0.0, "cand": 1}
counter[1] += 1
page_cols[pg] = 1
continue
if hasattr(self, "page_images") and self.page_images and len(self.page_images) >= pg:
page_w = self.page_images[pg - 1].size[0] / max(1, zoomin)
left_edge = 0.0
else:
xs0 = [box["x0"] for box in bxs]
xs1 = [box["x1"] for box in bxs]
left_edge = float(min(xs0))
page_w = max(1.0, float(max(xs1) - left_edge))
x0s_raw = np.array([b["x0"] for b in bxs], dtype=float)
widths = [max(1.0, (box["x1"] - box["x0"])) for box in bxs]
median_w = float(np.median(widths)) if widths else 1.0
min_x0 = np.min(x0s_raw)
max_x1 = np.max([b["x1"] for b in bxs])
width = max_x1 - min_x0
raw_cols = int(page_w / max(1.0, median_w))
INDENT_TOL = width * 0.12
x0s = []
for x in x0s_raw:
if abs(x - min_x0) < INDENT_TOL:
x0s.append([min_x0])
else:
x0s.append([x])
x0s = np.array(x0s, dtype=float)
max_try = min(4, len(bxs))
if max_try < 2:
max_try = 1
best_k = 1
best_score = -1
# cand = raw_cols if (raw_cols >= 2 and median_w < page_w / raw_cols * 0.8) else 1
cand = raw_cols
for k in range(1, max_try + 1):
km = KMeans(n_clusters=k, n_init="auto")
labels = km.fit_predict(x0s)
page_info[pg] = {"page_w": page_w, "left_edge": left_edge, "cand": cand}
counter[cand] += 1
centers = np.sort(km.cluster_centers_.flatten())
if len(centers) > 1:
try:
score = silhouette_score(x0s, labels)
except ValueError:
continue
else:
score = 0
print(f"{k=},{score=}",flush=True)
if score > best_score:
best_score = score
best_k = k
logging.info(f"[Page {pg}] median_w={median_w:.2f}, page_w={page_w:.2f}, raw_cols={raw_cols}, cand={cand}")
page_cols[pg] = best_k
logging.info(f"[Page {pg}] best_score={best_score:.2f}, best_k={best_k}")
global_cols = counter.most_common(1)[0][0]
global_cols = Counter(page_cols.values()).most_common(1)[0][0]
logging.info(f"Global column_num decided by majority: {global_cols}")
for pg, bxs in by_page.items():
if not bxs:
continue
k = page_cols[pg]
if len(bxs) < k:
k = 1
x0s = np.array([[b["x0"]] for b in bxs], dtype=float)
km = KMeans(n_clusters=k, n_init="auto")
labels = km.fit_predict(x0s)
page_w = page_info[pg]["page_w"]
left_edge = page_info[pg]["left_edge"]
centers = km.cluster_centers_.flatten()
order = np.argsort(centers)
if global_cols == 1:
for box in bxs:
box["col_id"] = 0
continue
remap = {orig: new for new, orig in enumerate(order)}
for box in bxs:
w = box["x1"] - box["x0"]
if w >= 0.8 * page_w:
box["col_id"] = 0
continue
cx = 0.5 * (box["x0"] + box["x1"])
norm_cx = (cx - left_edge) / page_w
norm_cx = max(0.0, min(norm_cx, 0.999999))
box["col_id"] = int(min(global_cols - 1, norm_cx * global_cols))
for b, lb in zip(bxs, labels):
b["col_id"] = remap[lb]
grouped = defaultdict(list)
for b in bxs:
grouped[b["col_id"]].append(b)
return boxes
@ -1303,7 +1323,10 @@ class RAGFlowPdfParser:
positions = []
for ii, (pns, left, right, top, bottom) in enumerate(poss):
right = left + max_width
if 0 < ii < len(poss) - 1:
right = max(left + 10, right)
else:
right = left + max_width
bottom *= ZM
for pn in pns[1:]:
if 0 <= pn - 1 < page_count:

View file

@ -230,9 +230,16 @@ REGISTER_ENABLED=1
# SANDBOX_MAX_MEMORY=256m # b, k, m, g
# SANDBOX_TIMEOUT=10s # s, m, 1m30s
# Enable DocLing and Mineru
# Enable DocLing
USE_DOCLING=false
# Enable Mineru
USE_MINERU=false
MINERU_EXECUTABLE="$HOME/uv_tools/.venv/bin/mineru"
MINERU_DELETE_OUTPUT=0 # keep output directory
MINERU_BACKEND=pipeline # or another backend you prefer
# pptx support
DOTNET_SYSTEM_GLOBALIZATION_INVARIANT=1

View file

@ -2085,6 +2085,7 @@ curl --request POST \
"dataset_ids": ["b2a62730759d11ef987d0242ac120004"],
"document_ids": ["77df9ef4759a11ef8bdd0242ac120004"],
"metadata_condition": {
"logic": "and",
"conditions": [
{
"name": "author",

View file

@ -213,6 +213,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
lang = lang,
callback = callback,
pdf_cls = Pdf,
parse_method = "manual",
**kwargs
)
@ -225,7 +226,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
elif len(section) != 3:
raise ValueError(f"Unexpected section length: {len(section)} (value={section!r})")
txt, sec_id, poss = section
txt, layoutno, poss = section
if isinstance(poss, str):
poss = pdf_parser.extract_positions(poss)
first = poss[0] # tuple: ([pn], x1, x2, y1, y2)
@ -235,7 +236,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
pn = pn[0] # [pn] -> pn
poss[0] = (pn, *first[1:])
return (txt, sec_id, poss)
return (txt, layoutno, poss)
sections = [_normalize_section(sec) for sec in sections]

View file

@ -59,6 +59,7 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987")
pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api)
parse_method = kwargs.get("parse_method", "raw")
if not pdf_parser.check_installation():
callback(-1, "MinerU not found.")
@ -72,12 +73,14 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
backend=os.environ.get("MINERU_BACKEND", "pipeline"),
server_url=os.environ.get("MINERU_SERVER_URL", ""),
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
parse_method=parse_method
)
return sections, tables, pdf_parser
def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
pdf_parser = DoclingParser()
parse_method = kwargs.get("parse_method", "raw")
if not pdf_parser.check_installation():
callback(-1, "Docling not found.")
@ -89,6 +92,7 @@ def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
callback=callback,
output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
parse_method=parse_method
)
return sections, tables, pdf_parser

View file

@ -21,8 +21,10 @@ import re
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper
from common.constants import ParserType
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
from deepdoc.parser import PdfParser, PlainParser
from deepdoc.parser import PdfParser
import numpy as np
from rag.app.naive import by_plaintext, PARSERS
class Pdf(PdfParser):
def __init__(self):
@ -147,19 +149,40 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
"parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
if re.search(r"\.pdf$", filename, re.IGNORECASE):
if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
pdf_parser = PlainParser()
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
name = layout_recognizer.strip().lower()
pdf_parser = PARSERS.get(name, by_plaintext)
callback(0.1, "Start to parse.")
if name == "deepdoc":
pdf_parser = Pdf()
paper = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
else:
sections, tables, pdf_parser = pdf_parser(
filename=filename,
binary=binary,
from_page=from_page,
to_page=to_page,
lang=lang,
callback=callback,
pdf_cls=Pdf,
parse_method="paper",
**kwargs
)
paper = {
"title": filename,
"authors": " ",
"abstract": "",
"sections": pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page)[0],
"tables": []
"sections": sections,
"tables": tables
}
else:
pdf_parser = Pdf()
paper = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
tbls=paper["tables"]
tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs)
paper["tables"] = tbls

View file

@ -355,75 +355,102 @@ class Dealer:
rag_tokenizer.tokenize(ans).split(),
rag_tokenizer.tokenize(inst).split())
def retrieval(self, question, embd_mdl, tenant_ids, kb_ids, page, page_size, similarity_threshold=0.2,
vector_similarity_weight=0.3, top=1024, doc_ids=None, aggs=True,
rerank_mdl=None, highlight=False,
rank_feature: dict | None = {PAGERANK_FLD: 10}):
def retrieval(
self,
question,
embd_mdl,
tenant_ids,
kb_ids,
page,
page_size,
similarity_threshold=0.2,
vector_similarity_weight=0.3,
top=1024,
doc_ids=None,
aggs=True,
rerank_mdl=None,
highlight=False,
rank_feature: dict | None = {PAGERANK_FLD: 10},
):
ranks = {"total": 0, "chunks": [], "doc_aggs": {}}
if not question:
return ranks
# Ensure RERANK_LIMIT is multiple of page_size
RERANK_LIMIT = math.ceil(64/page_size) * page_size if page_size>1 else 1
req = {"kb_ids": kb_ids, "doc_ids": doc_ids, "page": math.ceil(page_size*page/RERANK_LIMIT), "size": RERANK_LIMIT,
"question": question, "vector": True, "topk": top,
"similarity": similarity_threshold,
"available_int": 1}
RERANK_LIMIT = math.ceil(64 / page_size) * page_size if page_size > 1 else 1
req = {
"kb_ids": kb_ids,
"doc_ids": doc_ids,
"page": math.ceil(page_size * page / RERANK_LIMIT),
"size": RERANK_LIMIT,
"question": question,
"vector": True,
"topk": top,
"similarity": similarity_threshold,
"available_int": 1,
}
if isinstance(tenant_ids, str):
tenant_ids = tenant_ids.split(",")
sres = self.search(req, [index_name(tid) for tid in tenant_ids],
kb_ids, embd_mdl, highlight, rank_feature=rank_feature)
sres = self.search(req, [index_name(tid) for tid in tenant_ids], kb_ids, embd_mdl, highlight, rank_feature=rank_feature)
if rerank_mdl and sres.total > 0:
sim, tsim, vsim = self.rerank_by_model(rerank_mdl,
sres, question, 1 - vector_similarity_weight,
vector_similarity_weight,
rank_feature=rank_feature)
sim, tsim, vsim = self.rerank_by_model(
rerank_mdl,
sres,
question,
1 - vector_similarity_weight,
vector_similarity_weight,
rank_feature=rank_feature,
)
else:
lower_case_doc_engine = os.getenv('DOC_ENGINE', 'elasticsearch')
if lower_case_doc_engine in ["elasticsearch","opensearch"]:
lower_case_doc_engine = os.getenv("DOC_ENGINE", "elasticsearch")
if lower_case_doc_engine in ["elasticsearch", "opensearch"]:
# ElasticSearch doesn't normalize each way score before fusion.
sim, tsim, vsim = self.rerank(
sres, question, 1 - vector_similarity_weight, vector_similarity_weight,
rank_feature=rank_feature)
sres,
question,
1 - vector_similarity_weight,
vector_similarity_weight,
rank_feature=rank_feature,
)
else:
# Don't need rerank here since Infinity normalizes each way score before fusion.
sim = [sres.field[id].get("_score", 0.0) for id in sres.ids]
sim = [s if s is not None else 0. for s in sim]
sim = [s if s is not None else 0.0 for s in sim]
tsim = sim
vsim = sim
# Already paginated in search function
max_pages = RERANK_LIMIT // page_size
page_index = (page % max_pages) - 1
begin = max(page_index * page_size, 0)
sim = sim[begin : begin + page_size]
sim_np = np.array(sim, dtype=np.float64)
idx = np.argsort(sim_np * -1)
if sim_np.size == 0:
return ranks
sorted_idx = np.argsort(sim_np * -1)
valid_idx = [int(i) for i in sorted_idx if sim_np[i] >= similarity_threshold]
filtered_count = len(valid_idx)
ranks["total"] = int(filtered_count)
if filtered_count == 0:
return ranks
max_pages = max(RERANK_LIMIT // max(page_size, 1), 1)
page_index = (page - 1) % max_pages
begin = page_index * page_size
end = begin + page_size
page_idx = valid_idx[begin:end]
dim = len(sres.query_vector)
vector_column = f"q_{dim}_vec"
zero_vector = [0.0] * dim
filtered_count = (sim_np >= similarity_threshold).sum()
ranks["total"] = int(filtered_count) # Convert from np.int64 to Python int otherwise JSON serializable error
for i in idx:
if np.float64(sim[i]) < similarity_threshold:
break
for i in page_idx:
id = sres.ids[i]
chunk = sres.field[id]
dnm = chunk.get("docnm_kwd", "")
did = chunk.get("doc_id", "")
if len(ranks["chunks"]) >= page_size:
if aggs:
if dnm not in ranks["doc_aggs"]:
ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
ranks["doc_aggs"][dnm]["count"] += 1
continue
break
position_int = chunk.get("position_int", [])
d = {
"chunk_id": id,
@ -434,12 +461,12 @@ class Dealer:
"kb_id": chunk["kb_id"],
"important_kwd": chunk.get("important_kwd", []),
"image_id": chunk.get("img_id", ""),
"similarity": sim[i],
"vector_similarity": vsim[i],
"term_similarity": tsim[i],
"similarity": float(sim_np[i]),
"vector_similarity": float(vsim[i]),
"term_similarity": float(tsim[i]),
"vector": chunk.get(vector_column, zero_vector),
"positions": position_int,
"doc_type_kwd": chunk.get("doc_type_kwd", "")
"doc_type_kwd": chunk.get("doc_type_kwd", ""),
}
if highlight and sres.highlight:
if id in sres.highlight:
@ -447,15 +474,30 @@ class Dealer:
else:
d["highlight"] = d["content_with_weight"]
ranks["chunks"].append(d)
if dnm not in ranks["doc_aggs"]:
ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
ranks["doc_aggs"][dnm]["count"] += 1
ranks["doc_aggs"] = [{"doc_name": k,
"doc_id": v["doc_id"],
"count": v["count"]} for k,
v in sorted(ranks["doc_aggs"].items(),
key=lambda x: x[1]["count"] * -1)]
ranks["chunks"] = ranks["chunks"][:page_size]
if aggs:
for i in valid_idx:
id = sres.ids[i]
chunk = sres.field[id]
dnm = chunk.get("docnm_kwd", "")
did = chunk.get("doc_id", "")
if dnm not in ranks["doc_aggs"]:
ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
ranks["doc_aggs"][dnm]["count"] += 1
ranks["doc_aggs"] = [
{
"doc_name": k,
"doc_id": v["doc_id"],
"count": v["count"],
}
for k, v in sorted(
ranks["doc_aggs"].items(),
key=lambda x: x[1]["count"] * -1,
)
]
else:
ranks["doc_aggs"] = []
return ranks
@ -564,7 +606,7 @@ class Dealer:
ids = relevant_chunks_with_toc(query, toc, chat_mdl, topn*2)
if not ids:
return chunks
vector_size = 1024
id2idx = {ck["chunk_id"]: i for i, ck in enumerate(chunks)}
for cid, sim in ids:

View file

@ -429,7 +429,7 @@ def rank_memories(chat_mdl, goal:str, sub_goal:str, tool_call_summaries: list[st
return re.sub(r"^.*</think>", "", ans, flags=re.DOTALL)
def gen_meta_filter(chat_mdl, meta_data:dict, query: str) -> list:
def gen_meta_filter(chat_mdl, meta_data:dict, query: str) -> dict:
sys_prompt = PROMPT_JINJA_ENV.from_string(META_FILTER).render(
current_date=datetime.datetime.today().strftime('%Y-%m-%d'),
metadata_keys=json.dumps(meta_data),
@ -440,11 +440,13 @@ def gen_meta_filter(chat_mdl, meta_data:dict, query: str) -> list:
ans = re.sub(r"(^.*</think>|```json\n|```\n*$)", "", ans, flags=re.DOTALL)
try:
ans = json_repair.loads(ans)
assert isinstance(ans, list), ans
assert isinstance(ans, dict), ans
assert "conditions" in ans and isinstance(ans["conditions"], list), ans
return ans
except Exception:
logging.exception(f"Loading json failure: {ans}")
return []
return {"conditions": []}
def gen_json(system_prompt:str, user_prompt:str, chat_mdl, gen_conf = None):

View file

@ -9,11 +9,13 @@ You are a metadata filtering condition generator. Analyze the user's question an
}
2. **Output Requirements**:
- Always output a JSON array of filter objects
- Each object must have:
- Always output a JSON dictionary with only 2 keys: 'conditions'(filter objects) and 'logic' between the conditions ('and' or 'or').
- Each filter object in conditions must have:
"key": (metadata attribute name),
"value": (string value to compare),
"op": (operator from allowed list)
- Logic between all the conditions: 'and'(Intersection of results for each condition) / 'or' (union of results for all conditions)
3. **Operator Guide**:
- Use these operators only: ["contains", "not contains", "start with", "end with", "empty", "not empty", "=", "≠", ">", "<", "≥", "≤"]
@ -32,22 +34,97 @@ You are a metadata filtering condition generator. Analyze the user's question an
- Attribute doesn't exist in metadata
- Value has no match in metadata
5. **Example**:
5. **Example A**:
- User query: "上市日期七月份的有哪些商品,不要蓝色的"
- Metadata: { "color": {...}, "listing_date": {...} }
- Output:
[
{
"logic": "and",
"conditions": [
{"key": "listing_date", "value": "2025-07-01", "op": "≥"},
{"key": "listing_date", "value": "2025-08-01", "op": "<"},
{"key": "color", "value": "blue", "op": "≠"}
]
}
6. **Final Output**:
- ONLY output valid JSON array
6. **Example B**:
- User query: "Both blue and red are acceptable."
- Metadata: { "color": {...}, "listing_date": {...} }
- Output:
{
"logic": "or",
"conditions": [
{"key": "color", "value": "blue", "op": "="},
{"key": "color", "value": "red", "op": "="}
]
}
7. **Final Output**:
- ONLY output valid JSON dictionary
- NO additional text/explanations
- Json schema is as following:
```json
{
"type": "object",
"properties": {
"logic": {
"type": "string",
"description": "Logic relationship between all the conditions, the default is 'and'.",
"enum": [
"and",
"or"
]
},
"conditions": {
"type": "array",
"items": {
"type": "object",
"properties": {
"key": {
"type": "string",
"description": "Metadata attribute name."
},
"value": {
"type": "string",
"description": "Value to compare."
},
"op": {
"type": "string",
"description": "Operator from allowed list.",
"enum": [
"contains",
"not contains",
"start with",
"end with",
"empty",
"not empty",
"=",
"≠",
">",
"<",
"≥",
"≤"
]
}
},
"required": [
"key",
"value",
"op"
],
"additionalProperties": false
}
}
},
"required": [
"conditions"
],
"additionalProperties": false
}
```
**Current Task**:
- Today's date: {{current_date}}
- Available metadata keys: {{metadata_keys}}
- User query: "{{user_question}}"
- Today's date: {{ current_date }}
- Available metadata keys: {{ metadata_keys }}
- User query: "{{ user_question }}"

View file

@ -22,7 +22,8 @@ import { Switch } from '@/components/ui/switch';
import { LlmModelType } from '@/constants/knowledge';
import { useFindLlmByUuid } from '@/hooks/use-llm-request';
import { zodResolver } from '@hookform/resolvers/zod';
import { memo, useCallback, useEffect, useMemo } from 'react';
import { get } from 'lodash';
import { memo, useEffect, useMemo } from 'react';
import { useForm, useWatch } from 'react-hook-form';
import { useTranslation } from 'react-i18next';
import { z } from 'zod';
@ -45,7 +46,10 @@ import { AgentTools, Agents } from './agent-tools';
import { StructuredOutputDialog } from './structured-output-dialog';
import { StructuredOutputPanel } from './structured-output-panel';
import { useBuildPromptExtraPromptOptions } from './use-build-prompt-options';
import { useShowStructuredOutputDialog } from './use-show-structured-output-dialog';
import {
useHandleShowStructuredOutput,
useShowStructuredOutputDialog,
} from './use-show-structured-output-dialog';
import { useValues } from './use-values';
import { useWatchFormChange } from './use-watch-change';
@ -121,22 +125,19 @@ function AgentForm({ node }: INextOperatorForm) {
});
const {
initialStructuredOutput,
showStructuredOutputDialog,
structuredOutputDialogVisible,
hideStructuredOutputDialog,
handleStructuredOutputDialogOk,
} = useShowStructuredOutputDialog(node?.id);
const updateNodeForm = useGraphStore((state) => state.updateNodeForm);
const structuredOutput = get(
node,
`data.form.outputs.${AgentStructuredOutputField}`,
);
const handleShowStructuredOutput = useCallback(
(val: boolean) => {
if (node?.id && val) {
updateNodeForm(node?.id, {}, ['outputs', AgentStructuredOutputField]);
}
},
[node?.id, updateNodeForm],
const { handleShowStructuredOutput } = useHandleShowStructuredOutput(
node?.id,
);
useEffect(() => {
@ -327,7 +328,7 @@ function AgentForm({ node }: INextOperatorForm) {
</div>
<StructuredOutputPanel
value={initialStructuredOutput}
value={structuredOutput}
></StructuredOutputPanel>
</section>
)}
@ -337,7 +338,7 @@ function AgentForm({ node }: INextOperatorForm) {
<StructuredOutputDialog
hideModal={hideStructuredOutputDialog}
onOk={handleStructuredOutputDialogOk}
initialValues={initialStructuredOutput}
initialValues={structuredOutput}
></StructuredOutputDialog>
)}
</>

View file

@ -1,6 +1,8 @@
import { JSONSchema } from '@/components/jsonjoy-builder';
import { AgentStructuredOutputField } from '@/constants/agent';
import { useSetModalState } from '@/hooks/common-hooks';
import { useCallback } from 'react';
import { initialAgentValues } from '../../constant';
import useGraphStore from '../../store';
export function useShowStructuredOutputDialog(nodeId?: string) {
@ -9,15 +11,13 @@ export function useShowStructuredOutputDialog(nodeId?: string) {
showModal: showStructuredOutputDialog,
hideModal: hideStructuredOutputDialog,
} = useSetModalState();
const { updateNodeForm, getNode } = useGraphStore((state) => state);
const initialStructuredOutput = getNode(nodeId)?.data.form.outputs.structured;
const { updateNodeForm } = useGraphStore((state) => state);
const handleStructuredOutputDialogOk = useCallback(
(values: JSONSchema) => {
// Sync data to canvas
if (nodeId) {
updateNodeForm(nodeId, values, ['outputs', 'structured']);
updateNodeForm(nodeId, values, ['outputs', AgentStructuredOutputField]);
}
hideStructuredOutputDialog();
},
@ -25,10 +25,30 @@ export function useShowStructuredOutputDialog(nodeId?: string) {
);
return {
initialStructuredOutput,
structuredOutputDialogVisible,
showStructuredOutputDialog,
hideStructuredOutputDialog,
handleStructuredOutputDialogOk,
};
}
export function useHandleShowStructuredOutput(nodeId?: string) {
const updateNodeForm = useGraphStore((state) => state.updateNodeForm);
const handleShowStructuredOutput = useCallback(
(val: boolean) => {
if (nodeId) {
if (val) {
updateNodeForm(nodeId, {}, ['outputs', AgentStructuredOutputField]);
} else {
updateNodeForm(nodeId, initialAgentValues.outputs, ['outputs']);
}
}
},
[nodeId, updateNodeForm],
);
return {
handleShowStructuredOutput,
};
}

View file

@ -6,8 +6,10 @@ import { initialAgentValues } from '../../constant';
// You need to exclude the mcp and tools fields that are not in the form,
// otherwise the form data update will reset the tools or mcp data to an array
// Exclude data that is not in the form to avoid writing this data to the canvas when using useWatch.
// Outputs, tools, and MCP data are directly synchronized to the canvas without going through the form.
function omitToolsAndMcp(values: Record<string, any>) {
return omit(values, ['mcp', 'tools']);
return omit(values, ['mcp', 'tools', 'outputs']);
}
export function useValues(node?: RAGFlowNodeType) {

View file

@ -1,7 +1,6 @@
import { omit } from 'lodash';
import { useEffect } from 'react';
import { UseFormReturn, useWatch } from 'react-hook-form';
import { AgentStructuredOutputField, PromptRole } from '../../constant';
import { PromptRole } from '../../constant';
import useGraphStore from '../../store';
export function useWatchFormChange(id?: string, form?: UseFormReturn<any>) {
@ -17,14 +16,6 @@ export function useWatchFormChange(id?: string, form?: UseFormReturn<any>) {
prompts: [{ role: PromptRole.User, content: values.prompts }],
};
if (!values.showStructuredOutput) {
nextValues = {
...nextValues,
outputs: omit(values.outputs, [AgentStructuredOutputField]),
};
} else {
nextValues = omit(nextValues, 'outputs');
}
updateNodeForm(id, nextValues);
}
}, [form?.formState.isDirty, id, updateNodeForm, values]);