/datasets support ingestion pipeline
This commit is contained in:
parent
918d5a9ff8
commit
19fb8d4783
2 changed files with 114 additions and 3 deletions
|
|
@ -14,6 +14,7 @@
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
#
|
#
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
|
import string
|
||||||
from typing import Annotated, Any, Literal
|
from typing import Annotated, Any, Literal
|
||||||
from uuid import UUID
|
from uuid import UUID
|
||||||
|
|
||||||
|
|
@ -25,6 +26,7 @@ from pydantic import (
|
||||||
StringConstraints,
|
StringConstraints,
|
||||||
ValidationError,
|
ValidationError,
|
||||||
field_validator,
|
field_validator,
|
||||||
|
model_validator,
|
||||||
)
|
)
|
||||||
from pydantic_core import PydanticCustomError
|
from pydantic_core import PydanticCustomError
|
||||||
from werkzeug.exceptions import BadRequest, UnsupportedMediaType
|
from werkzeug.exceptions import BadRequest, UnsupportedMediaType
|
||||||
|
|
@ -361,10 +363,15 @@ class CreateDatasetReq(Base):
|
||||||
description: Annotated[str | None, Field(default=None, max_length=65535)]
|
description: Annotated[str | None, Field(default=None, max_length=65535)]
|
||||||
embedding_model: Annotated[str | None, Field(default=None, max_length=255, serialization_alias="embd_id")]
|
embedding_model: Annotated[str | None, Field(default=None, max_length=255, serialization_alias="embd_id")]
|
||||||
permission: Annotated[Literal["me", "team"], Field(default="me", min_length=1, max_length=16)]
|
permission: Annotated[Literal["me", "team"], Field(default="me", min_length=1, max_length=16)]
|
||||||
|
# Allow empty parser_id: when the user does not specify a parser, leave it blank and downstream logic can choose based on parse_type or a default strategy
|
||||||
chunk_method: Annotated[
|
chunk_method: Annotated[
|
||||||
Literal["naive", "book", "email", "laws", "manual", "one", "paper", "picture", "presentation", "qa", "table", "tag"],
|
Literal[None, "", "naive", "book", "email", "laws", "manual", "one", "paper", "picture", "presentation", "qa", "table", "tag"],
|
||||||
Field(default="naive", min_length=1, max_length=32, serialization_alias="parser_id"),
|
Field(default=None, serialization_alias="parser_id"),
|
||||||
]
|
]
|
||||||
|
# Optional parse_type (e.g. distinguish pipeline/custom parsing flows); None means this mode is not used
|
||||||
|
parse_type: Annotated[int | None, Field(default=None, ge=0, le=64)]
|
||||||
|
# Processing pipeline ID; optional; must be a 32-character hexadecimal string (UUID hex without hyphens)
|
||||||
|
pipeline_id: Annotated[str | None, Field(default=None, min_length=32, max_length=32, serialization_alias="pipeline_id")]
|
||||||
parser_config: Annotated[ParserConfig | None, Field(default=None)]
|
parser_config: Annotated[ParserConfig | None, Field(default=None)]
|
||||||
|
|
||||||
@field_validator("avatar", mode="after")
|
@field_validator("avatar", mode="after")
|
||||||
|
|
@ -525,6 +532,71 @@ class CreateDatasetReq(Base):
|
||||||
raise PydanticCustomError("string_too_long", "Parser config exceeds size limit (max 65,535 characters). Current size: {actual}", {"actual": len(json_str)})
|
raise PydanticCustomError("string_too_long", "Parser config exceeds size limit (max 65,535 characters). Current size: {actual}", {"actual": len(json_str)})
|
||||||
return v
|
return v
|
||||||
|
|
||||||
|
@field_validator("pipeline_id", mode="after")
@classmethod
def validate_pipeline_id(cls, v: str | None) -> str | None:
    """Normalize and check an optional ``pipeline_id``.

    Args:
        v: The already type-validated field value, or ``None``.

    Returns:
        ``None`` when ``v`` is ``None`` or an empty string; otherwise the
        value converted to lowercase.

    Raises:
        PydanticCustomError: With code ``format_invalid`` when the value is
            not exactly 32 characters long, or contains a character that
            is not a hexadecimal digit (``0-9``, ``a-f``, ``A-F``).
    """
    # NOTE(review): the field declaration also sets min_length=32, which
    # presumably rejects "" before this after-validator runs -- confirm
    # whether the empty-string case is actually reachable.
    if not v:
        return None

    if len(v) != 32:
        raise PydanticCustomError("format_invalid", "pipeline_id must be 32 hex characters")

    is_hex = all(ch in string.hexdigits for ch in v)
    if not is_hex:
        raise PydanticCustomError("format_invalid", "pipeline_id must be hexadecimal")

    # Uppercase digits are accepted on input; the stored form is lowercase.
    return v.lower()
|
||||||
|
|
||||||
|
@model_validator(mode="after")
def validate_parser_dependency(self) -> "CreateDatasetReq":
    """Enforce that exactly one ingestion mode is selected.

    Rules, checked on the fully validated model:

    - ``chunk_method`` set to a non-empty value: ``parse_type`` and
      ``pipeline_id`` must both be ``None``.
    - ``chunk_method`` empty/``None`` and both ``parse_type`` and
      ``pipeline_id`` are ``None``: ``chunk_method`` falls back to
      ``"naive"``.
    - ``chunk_method`` empty/``None`` otherwise: ``parse_type`` and
      ``pipeline_id`` must both be present.

    Returns:
        The same model instance.

    Raises:
        PydanticCustomError: With code ``dependency_error``; the message
            lists the missing or disallowed field names.
    """
    chunk_method_given = self.chunk_method not in (None, "")
    # Insertion order fixes the order of names in the error message.
    pipeline_fields = {"parse_type": self.parse_type, "pipeline_id": self.pipeline_id}

    if chunk_method_given:
        # Chunking mode: the pipeline fields must not be mixed in.
        unexpected = [name for name, value in pipeline_fields.items() if value is not None]
        if unexpected:
            raise PydanticCustomError(
                "dependency_error",
                "parser_id provided → disallowed fields present: {fields}",
                {"fields": ", ".join(unexpected)},
            )
        return self

    absent = [name for name, value in pipeline_fields.items() if value is None]

    if len(absent) == len(pipeline_fields):
        # Nothing specified at all: default to the naive chunk method.
        # Written through object.__setattr__, i.e. not via the model's own
        # attribute-assignment hook.
        object.__setattr__(self, "chunk_method", "naive")
        return self

    if absent:
        # Pipeline mode requires both fields; a partial set is rejected.
        raise PydanticCustomError(
            "dependency_error",
            "parser_id empty/None → required fields missing: {fields}",
            {"fields": ", ".join(absent)},
        )
    return self
|
||||||
|
|
||||||
|
|
||||||
class UpdateDatasetReq(CreateDatasetReq):
|
class UpdateDatasetReq(CreateDatasetReq):
|
||||||
dataset_id: Annotated[str, Field(...)]
|
dataset_id: Annotated[str, Field(...)]
|
||||||
|
|
|
||||||
|
|
@ -419,7 +419,15 @@ Creates a dataset.
|
||||||
- `"embedding_model"`: `string`
|
- `"embedding_model"`: `string`
|
||||||
- `"permission"`: `string`
|
- `"permission"`: `string`
|
||||||
- `"chunk_method"`: `string`
|
- `"chunk_method"`: `string`
|
||||||
- `"parser_config"`: `object`
|
- `"parser_config"`: `object`
|
||||||
|
- `"parse_type"`: `int`
|
||||||
|
- `"pipeline_id"`: `string`
|
||||||
|
|
||||||
|
Note: Choose exactly one ingestion mode when creating a dataset.
|
||||||
|
- Chunking method: provide `"chunk_method"` (optionally with `"parser_config"`).
|
||||||
|
- Ingestion pipeline: provide both `"parse_type"` and `"pipeline_id"` and do not provide `"chunk_method"`.
|
||||||
|
|
||||||
|
These options are mutually exclusive. If all three of `chunk_method`, `parse_type`, and `pipeline_id` are omitted, the system defaults to `chunk_method = "naive"`.
|
||||||
|
|
||||||
##### Request example
|
##### Request example
|
||||||
|
|
||||||
|
|
@ -433,6 +441,26 @@ curl --request POST \
|
||||||
}'
|
}'
|
||||||
```
|
```
|
||||||
|
|
||||||
|
##### Request example (ingestion pipeline)
|
||||||
|
|
||||||
|
Use this form when specifying an ingestion pipeline (do not include `chunk_method`).
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl --request POST \
|
||||||
|
--url http://{address}/api/v1/datasets \
|
||||||
|
--header 'Content-Type: application/json' \
|
||||||
|
--header 'Authorization: Bearer <YOUR_API_KEY>' \
|
||||||
|
--data '{
|
||||||
|
"name": "test-sdk",
|
||||||
|
"parse_type": <NUMBER_OF_FORMATS_IN_PARSE>,
|
||||||
|
"pipeline_id": "<PIPELINE_ID_32_HEX>"
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
- `parse_type` is an integer. Replace `<NUMBER_OF_FORMATS_IN_PARSE>` with your pipeline's parse-type value.
|
||||||
|
- `pipeline_id` must be a 32-character hexadecimal string. Uppercase hex digits are accepted and normalized to lowercase.
|
||||||
|
|
||||||
##### Request parameters
|
##### Request parameters
|
||||||
|
|
||||||
- `"name"`: (*Body parameter*), `string`, *Required*
|
- `"name"`: (*Body parameter*), `string`, *Required*
|
||||||
|
|
@ -473,6 +501,7 @@ curl --request POST \
|
||||||
- `"qa"`: Q&A
|
- `"qa"`: Q&A
|
||||||
- `"table"`: Table
|
- `"table"`: Table
|
||||||
- `"tag"`: Tag
|
- `"tag"`: Tag
|
||||||
|
- Mutually exclusive with `parse_type` and `pipeline_id`. If you set `chunk_method`, do not include `parse_type` or `pipeline_id`.
|
||||||
|
|
||||||
- `"parser_config"`: (*Body parameter*), `object`
|
- `"parser_config"`: (*Body parameter*), `object`
|
||||||
The configuration settings for the dataset parser. The attributes in this JSON object vary with the selected `"chunk_method"`:
|
The configuration settings for the dataset parser. The attributes in this JSON object vary with the selected `"chunk_method"`:
|
||||||
|
|
@ -509,6 +538,16 @@ curl --request POST \
|
||||||
- Defaults to: `{"use_raptor": false}`.
|
- Defaults to: `{"use_raptor": false}`.
|
||||||
- If `"chunk_method"` is `"table"`, `"picture"`, `"one"`, or `"email"`, `"parser_config"` is an empty JSON object.
|
- If `"chunk_method"` is `"table"`, `"picture"`, `"one"`, or `"email"`, `"parser_config"` is an empty JSON object.
|
||||||
|
|
||||||
|
- `"parse_type"`: (*Body parameter*), `int`
|
||||||
|
The ingestion pipeline parse type identifier. Required if and only if you are using an ingestion pipeline (together with `"pipeline_id"`). Must not be provided when `"chunk_method"` is set.
|
||||||
|
|
||||||
|
- `"pipeline_id"`: (*Body parameter*), `string`
|
||||||
|
The ingestion pipeline ID. Required if and only if you are using an ingestion pipeline (together with `"parse_type"`).
|
||||||
|
- Must be a 32-character hexadecimal string (e.g., `"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"`). Uppercase hex digits are accepted and normalized to lowercase.
|
||||||
|
- Must not be provided when `"chunk_method"` is set.
|
||||||
|
|
||||||
|
Note: If none of `chunk_method`, `parse_type`, and `pipeline_id` are provided, the system will default to `chunk_method = "naive"`.
|
||||||
|
|
||||||
#### Response
|
#### Response
|
||||||
|
|
||||||
Success:
|
Success:
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue