Feat: API supports toc_enhance.

This commit is contained in:
Kevin Hu 2025-11-21 14:24:41 +08:00
parent 4c8f9f0d77
commit 1d786aef25
4 changed files with 18 additions and 2 deletions

View file

@ -32,7 +32,7 @@ class IterationParam(ComponentParamBase):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.items_ref = "" self.items_ref = ""
self.veriable={} self.variable={}
def get_input_form(self) -> dict[str, dict]: def get_input_form(self) -> dict[str, dict]:
return { return {

View file

@ -24,7 +24,7 @@ from flasgger import Swagger
from itsdangerous.url_safe import URLSafeTimedSerializer as Serializer from itsdangerous.url_safe import URLSafeTimedSerializer as Serializer
from quart_cors import cors from quart_cors import cors
from common.constants import StatusEnum from common.constants import StatusEnum
from api.db.db_models import close_connection from api.db.db_models import close_connection, APIToken
from api.db.services import UserService from api.db.services import UserService
from api.utils.json_encode import CustomJSONEncoder from api.utils.json_encode import CustomJSONEncoder
from api.utils import commands from api.utils import commands
@ -124,6 +124,10 @@ def _load_user():
user = UserService.query( user = UserService.query(
access_token=access_token, status=StatusEnum.VALID.value access_token=access_token, status=StatusEnum.VALID.value
) )
if not user and len(authorization.split()) == 2:
objs = APIToken.query(token=authorization.split()[1])
if objs:
user = UserService.query(id=objs[0].tenant_id, status=StatusEnum.VALID.value)
if user: if user:
if not user[0].access_token or not user[0].access_token.strip(): if not user[0].access_token or not user[0].access_token.strip():
logging.warning(f"User {user[0].email} has empty access_token in database") logging.warning(f"User {user[0].email} has empty access_token in database")

View file

@ -1434,6 +1434,7 @@ async def retrieval_test(tenant_id):
question = req["question"] question = req["question"]
doc_ids = req.get("document_ids", []) doc_ids = req.get("document_ids", [])
use_kg = req.get("use_kg", False) use_kg = req.get("use_kg", False)
toc_enhance = req.get("toc_enhance", False)
langs = req.get("cross_languages", []) langs = req.get("cross_languages", [])
if not isinstance(doc_ids, list): if not isinstance(doc_ids, list):
return get_error_data_result("`documents` should be a list") return get_error_data_result("`documents` should be a list")
@ -1487,6 +1488,11 @@ async def retrieval_test(tenant_id):
highlight=highlight, highlight=highlight,
rank_feature=label_question(question, kbs), rank_feature=label_question(question, kbs),
) )
if toc_enhance:
chat_mdl = LLMBundle(kb.tenant_id, LLMType.CHAT)
cks = settings.retriever.retrieval_by_toc(question, ranks["chunks"], tenant_ids, chat_mdl, size)
if cks:
ranks["chunks"] = cks
if use_kg: if use_kg:
ck = settings.kg_retriever.retrieval(question, [k.tenant_id for k in kbs], kb_ids, embd_mdl, LLMBundle(kb.tenant_id, LLMType.CHAT)) ck = settings.kg_retriever.retrieval(question, [k.tenant_id for k in kbs], kb_ids, embd_mdl, LLMBundle(kb.tenant_id, LLMType.CHAT))
if ck["content_with_weight"]: if ck["content_with_weight"]:

View file

@ -2072,6 +2072,7 @@ Retrieves chunks from specified datasets.
- `"cross_languages"`: `list[string]` - `"cross_languages"`: `list[string]`
- `"metadata_condition"`: `object` - `"metadata_condition"`: `object`
- `"use_kg"`: `boolean` - `"use_kg"`: `boolean`
- `"toc_enhance"`: `boolean`
##### Request example ##### Request example
```bash ```bash
@ -2122,6 +2123,8 @@ curl --request POST \
The number of chunks engaged in vector cosine computation. Defaults to `1024`. The number of chunks engaged in vector cosine computation. Defaults to `1024`.
- `"use_kg"`: (*Body parameter*), `boolean` - `"use_kg"`: (*Body parameter*), `boolean`
The search includes text chunks related to the knowledge graph of the selected dataset to handle complex multi-hop queries. Defaults to `False`. The search includes text chunks related to the knowledge graph of the selected dataset to handle complex multi-hop queries. Defaults to `False`.
- `"toc_enhance"`: (*Body parameter*), `boolean`
The search includes table of content enhancement in order to boost rank of relevant chunks. Files parsed with `TOC Enhance` enabled is prerequisite. Defaults to `False`.
- `"rerank_id"`: (*Body parameter*), `integer` - `"rerank_id"`: (*Body parameter*), `integer`
The ID of the rerank model. The ID of the rerank model.
- `"keyword"`: (*Body parameter*), `boolean` - `"keyword"`: (*Body parameter*), `boolean`
@ -2136,6 +2139,9 @@ curl --request POST \
The languages that should be translated into, in order to achieve keywords retrievals in different languages. The languages that should be translated into, in order to achieve keywords retrievals in different languages.
- `"metadata_condition"`: (*Body parameter*), `object` - `"metadata_condition"`: (*Body parameter*), `object`
The metadata condition used for filtering chunks: The metadata condition used for filtering chunks:
- `"logic"`: (*Body parameter*), `string`
- `"and"` Intersection of the result from each condition (default).
- `"or"` union of the result from each condition.
- `"conditions"`: (*Body parameter*), `array` - `"conditions"`: (*Body parameter*), `array`
A list of metadata filter conditions. A list of metadata filter conditions.
- `"name"`: `string` - The metadata field name to filter by, e.g., `"author"`, `"company"`, `"url"`. Ensure this parameter before use. See [Set metadata](../guides/dataset/set_metadata.md) for details. - `"name"`: `string` - The metadata field name to filter by, e.g., `"author"`, `"company"`, `"url"`. Ensure this parameter before use. See [Set metadata](../guides/dataset/set_metadata.md) for details.