Update docling preset options

This commit is contained in:
Mike Fortman 2025-09-29 16:40:29 -05:00
parent 8933131b4b
commit 2341bf4700
7 changed files with 211 additions and 153 deletions

View file

@ -21,7 +21,9 @@ knowledge:
# Overlap between chunks
chunk_overlap: 200
# Docling preset setting
doclingPresets: standard
ocr: false
picture_descriptions: false
table_structure: false
# AI agent configuration
agent:

View file

@ -8,7 +8,9 @@ interface UpdateFlowSettingVariables {
llm_model?: string;
system_prompt?: string;
embedding_model?: string;
doclingPresets?: string;
table_structure?: boolean;
ocr?: boolean;
picture_descriptions?: boolean;
chunk_size?: number;
chunk_overlap?: number;
}

View file

@ -13,7 +13,9 @@ export interface KnowledgeSettings {
embedding_model?: string;
chunk_size?: number;
chunk_overlap?: number;
doclingPresets?: string;
table_structure?: boolean;
ocr?: boolean;
picture_descriptions?: boolean;
}
export interface Settings {

View file

@ -22,9 +22,9 @@ import {
CardTitle,
} from "@/components/ui/card";
import { Checkbox } from "@/components/ui/checkbox";
import { Switch } from "@/components/ui/switch";
import { Input } from "@/components/ui/input";
import { Label } from "@/components/ui/label";
import { RadioGroup, RadioGroupItem } from "@/components/ui/radio-group";
import {
Select,
SelectContent,
@ -39,11 +39,6 @@ import { DEFAULT_AGENT_SETTINGS, DEFAULT_KNOWLEDGE_SETTINGS, UI_CONSTANTS } from
import { getFallbackModels, type ModelProvider } from "./helpers/model-helpers";
import { ModelSelectItems } from "./helpers/model-select-item";
import { LabelWrapper } from "@/components/label-wrapper";
import {
Tooltip,
TooltipContent,
TooltipTrigger,
} from "@radix-ui/react-tooltip";
const { MAX_SYSTEM_PROMPT_CHARS } = UI_CONSTANTS;
@ -112,7 +107,9 @@ function KnowledgeSourcesPage() {
const [systemPrompt, setSystemPrompt] = useState<string>("");
const [chunkSize, setChunkSize] = useState<number>(1024);
const [chunkOverlap, setChunkOverlap] = useState<number>(50);
const [processingMode, setProcessingMode] = useState<string>("standard");
const [tableStructure, setTableStructure] = useState<boolean>(false);
const [ocr, setOcr] = useState<boolean>(false);
const [pictureDescriptions, setPictureDescriptions] = useState<boolean>(false);
// Fetch settings using React Query
const { data: settings = {} } = useGetSettingsQuery({
@ -195,12 +192,24 @@ function KnowledgeSourcesPage() {
}
}, [settings.knowledge?.chunk_overlap]);
// Sync processing mode with settings data
// Sync docling settings with settings data
useEffect(() => {
if (settings.knowledge?.doclingPresets) {
setProcessingMode(settings.knowledge.doclingPresets);
if (settings.knowledge?.table_structure !== undefined) {
setTableStructure(settings.knowledge.table_structure);
}
}, [settings.knowledge?.doclingPresets]);
}, [settings.knowledge?.table_structure]);
// Mirror the fetched `ocr` setting into the local toggle state. The explicit
// `!== undefined` comparison (rather than a truthiness check) lets a stored
// `false` overwrite local state; only a missing value is skipped.
useEffect(() => {
if (settings.knowledge?.ocr !== undefined) {
setOcr(settings.knowledge.ocr);
}
// Re-run only when the fetched `ocr` value itself changes.
}, [settings.knowledge?.ocr]);
// Mirror the fetched `picture_descriptions` setting into the local toggle
// state. The explicit `!== undefined` comparison (rather than a truthiness
// check) lets a stored `false` overwrite local state; only a missing value is
// skipped.
useEffect(() => {
if (settings.knowledge?.picture_descriptions !== undefined) {
setPictureDescriptions(settings.knowledge.picture_descriptions);
}
// Re-run only when the fetched `picture_descriptions` value itself changes.
}, [settings.knowledge?.picture_descriptions]);
// Update model selection immediately
const handleModelChange = (newModel: string) => {
@ -231,11 +240,20 @@ function KnowledgeSourcesPage() {
debouncedUpdate({ chunk_overlap: numValue });
};
// Update processing mode
const handleProcessingModeChange = (mode: string) => {
setProcessingMode(mode);
// Update the configuration setting (backend will also update the flow automatically)
debouncedUpdate({ doclingPresets: mode });
// Update docling settings
/**
 * Switch handler for the "Table Structure" toggle.
 *
 * Reflects the new switch position in local state, then submits it through
 * the settings mutation right away (this path is not debounced).
 *
 * @param checked - New on/off position reported by the switch.
 */
const handleTableStructureChange = (checked: boolean) => {
  const patch = { table_structure: checked };
  setTableStructure(checked);
  updateFlowSettingMutation.mutate(patch);
};
/**
 * Switch handler for the "OCR" toggle.
 *
 * Reflects the new switch position in local state, then submits it through
 * the settings mutation right away (this path is not debounced).
 *
 * @param checked - New on/off position reported by the switch.
 */
const handleOcrChange = (checked: boolean) => {
  const patch = { ocr: checked };
  setOcr(checked);
  updateFlowSettingMutation.mutate(patch);
};
/**
 * Switch handler for the "Picture Descriptions" toggle.
 *
 * Reflects the new switch position in local state, then submits it through
 * the settings mutation right away (this path is not debounced).
 *
 * @param checked - New on/off position reported by the switch.
 */
const handlePictureDescriptionsChange = (checked: boolean) => {
  const patch = { picture_descriptions: checked };
  setPictureDescriptions(checked);
  updateFlowSettingMutation.mutate(patch);
};
// Helper function to get connector icon
@ -569,7 +587,9 @@ function KnowledgeSourcesPage() {
// Only reset form values if the API call was successful
setChunkSize(DEFAULT_KNOWLEDGE_SETTINGS.chunk_size);
setChunkOverlap(DEFAULT_KNOWLEDGE_SETTINGS.chunk_overlap);
setProcessingMode(DEFAULT_KNOWLEDGE_SETTINGS.processing_mode);
setTableStructure(false);
setOcr(false);
setPictureDescriptions(false);
closeDialog(); // Close after successful completion
})
.catch((error) => {
@ -1064,75 +1084,60 @@ function KnowledgeSourcesPage() {
</div>
</div>
<div className="space-y-3">
<Label className="text-base font-medium">Ingestion presets</Label>
<RadioGroup
value={processingMode}
onValueChange={handleProcessingModeChange}
className="space-y-3"
>
<div className="flex items-center space-x-3">
<RadioGroupItem value="standard" id="standard" />
<div className="flex-1">
<Label
htmlFor="standard"
className="text-base font-medium cursor-pointer"
>
No OCR
</Label>
<div className="text-sm text-muted-foreground">
Fast ingest for documents with selectable text. Images are
ignored.
</div>
<div className="flex items-center justify-between">
<div className="flex-1">
<Label
htmlFor="table-structure"
className="text-base font-medium cursor-pointer pb-3"
>
Table Structure
</Label>
<div className="text-sm text-muted-foreground">
Capture table structure during ingest.
</div>
</div>
<div className="flex items-center space-x-3">
<RadioGroupItem value="ocr" id="ocr" />
<div className="flex-1">
<Label
htmlFor="ocr"
className="text-base font-medium cursor-pointer"
>
OCR
</Label>
<div className="text-sm text-muted-foreground">
Extracts text from images and scanned pages.
</div>
<Switch
id="table-structure"
checked={tableStructure}
onCheckedChange={handleTableStructureChange}
/>
</div>
<div className="flex items-center justify-between">
<div className="flex-1">
<Label
htmlFor="ocr"
className="text-base font-medium cursor-pointer pb-3"
>
OCR
</Label>
<div className="text-sm text-muted-foreground">
Extracts text from images/PDFs. Ingest is slower when enabled.
</div>
</div>
<div className="flex items-center space-x-3">
<RadioGroupItem
value="picture_description"
id="picture_description"
/>
<div className="flex-1">
<Label
htmlFor="picture_description"
className="text-base font-medium cursor-pointer"
>
OCR + Captions
</Label>
<div className="text-sm text-muted-foreground">
Extracts text from images and scanned pages. Generates
short image captions.
</div>
<Switch
id="ocr"
checked={ocr}
onCheckedChange={handleOcrChange}
/>
</div>
<div className="flex items-center justify-between">
<div className="flex-1">
<Label
htmlFor="picture-descriptions"
className="text-base font-medium cursor-pointer pb-3"
>
Picture Descriptions
</Label>
<div className="text-sm text-muted-foreground">
Adds captions for images. Ingest is slower when enabled.
</div>
</div>
<div className="flex items-center space-x-3">
<RadioGroupItem value="VLM" id="VLM" />
<div className="flex-1">
<Label
htmlFor="VLM"
className="text-base font-medium cursor-pointer"
>
VLM
</Label>
<div className="text-sm text-muted-foreground">
Extracts text from layout-aware parsing of text, tables,
and sections.
</div>
</div>
</div>
</RadioGroup>
<Switch
id="picture-descriptions"
checked={pictureDescriptions}
onCheckedChange={handlePictureDescriptionsChange}
/>
</div>
</div>
</div>
</CardContent>

View file

@ -12,7 +12,9 @@ export const DEFAULT_AGENT_SETTINGS = {
export const DEFAULT_KNOWLEDGE_SETTINGS = {
chunk_size: 1000,
chunk_overlap: 200,
processing_mode: "standard"
table_structure: false,
ocr: false,
picture_descriptions: false
} as const;
/**

View file

@ -17,35 +17,30 @@ logger = get_logger(__name__)
# Docling preset configurations
def get_docling_preset_configs():
"""Get docling preset configurations with platform-specific settings"""
def get_docling_preset_configs(table_structure=False, ocr=False, picture_descriptions=False):
"""Get docling preset configurations based on toggle settings
Args:
table_structure: Enable table structure parsing (default: False)
ocr: Enable OCR for text extraction from images (default: False)
picture_descriptions: Enable picture descriptions/captions (default: False)
"""
is_macos = platform.system() == "Darwin"
return {
"standard": {"do_ocr": False},
"ocr": {"do_ocr": True, "ocr_engine": "ocrmac" if is_macos else "easyocr"},
"picture_description": {
"do_ocr": True,
"ocr_engine": "ocrmac" if is_macos else "easyocr",
"do_picture_classification": True,
"do_picture_description": True,
"picture_description_local": {
"repo_id": "HuggingFaceTB/SmolVLM-256M-Instruct",
"prompt": "Describe this image in a few sentences.",
},
},
"VLM": {
"pipeline": "vlm",
"vlm_pipeline_model_local": {
"repo_id": "ds4sd/SmolDocling-256M-preview-mlx-bf16"
if is_macos
else "ds4sd/SmolDocling-256M-preview",
"response_format": "doctags",
"inference_framework": "mlx",
},
},
config = {
"do_ocr": ocr,
"ocr_engine": "ocrmac" if is_macos else "easyocr",
"do_table_structure": table_structure,
"do_picture_classification": picture_descriptions,
"do_picture_description": picture_descriptions,
"picture_description_local": {
"repo_id": "HuggingFaceTB/SmolVLM-256M-Instruct",
"prompt": "Describe this image in a few sentences.",
}
}
return config
async def get_settings(request, session_manager):
"""Get application settings"""
@ -71,7 +66,9 @@ async def get_settings(request, session_manager):
"embedding_model": knowledge_config.embedding_model,
"chunk_size": knowledge_config.chunk_size,
"chunk_overlap": knowledge_config.chunk_overlap,
"doclingPresets": knowledge_config.doclingPresets,
"table_structure": knowledge_config.table_structure,
"ocr": knowledge_config.ocr,
"picture_descriptions": knowledge_config.picture_descriptions,
},
"agent": {
"llm_model": agent_config.llm_model,
@ -178,7 +175,9 @@ async def update_settings(request, session_manager):
"system_prompt",
"chunk_size",
"chunk_overlap",
"doclingPresets",
"table_structure",
"ocr",
"picture_descriptions",
"embedding_model",
}
@ -255,32 +254,68 @@ async def update_settings(request, session_manager):
# Don't fail the entire settings update if flow update fails
# The config will still be saved
if "doclingPresets" in body:
preset_configs = get_docling_preset_configs()
valid_presets = list(preset_configs.keys())
if body["doclingPresets"] not in valid_presets:
if "table_structure" in body:
if not isinstance(body["table_structure"], bool):
return JSONResponse(
{
"error": f"doclingPresets must be one of: {', '.join(valid_presets)}"
},
status_code=400,
{"error": "table_structure must be a boolean"}, status_code=400
)
current_config.knowledge.doclingPresets = body["doclingPresets"]
current_config.knowledge.table_structure = body["table_structure"]
config_updated = True
# Also update the flow with the new docling preset
# Also update the flow with the new docling settings
try:
flows_service = _get_flows_service()
await flows_service.update_flow_docling_preset(
body["doclingPresets"], preset_configs[body["doclingPresets"]]
)
logger.info(
f"Successfully updated docling preset in flow to '{body['doclingPresets']}'"
preset_config = get_docling_preset_configs(
table_structure=body["table_structure"],
ocr=current_config.knowledge.ocr,
picture_descriptions=current_config.knowledge.picture_descriptions
)
await flows_service.update_flow_docling_preset("custom", preset_config)
logger.info(f"Successfully updated table_structure setting in flow")
except Exception as e:
logger.error(f"Failed to update docling preset in flow: {str(e)}")
# Don't fail the entire settings update if flow update fails
# The config will still be saved
logger.error(f"Failed to update docling settings in flow: {str(e)}")
if "ocr" in body:
if not isinstance(body["ocr"], bool):
return JSONResponse(
{"error": "ocr must be a boolean"}, status_code=400
)
current_config.knowledge.ocr = body["ocr"]
config_updated = True
# Also update the flow with the new docling settings
try:
flows_service = _get_flows_service()
preset_config = get_docling_preset_configs(
table_structure=current_config.knowledge.table_structure,
ocr=body["ocr"],
picture_descriptions=current_config.knowledge.picture_descriptions
)
await flows_service.update_flow_docling_preset("custom", preset_config)
logger.info(f"Successfully updated ocr setting in flow")
except Exception as e:
logger.error(f"Failed to update docling settings in flow: {str(e)}")
if "picture_descriptions" in body:
if not isinstance(body["picture_descriptions"], bool):
return JSONResponse(
{"error": "picture_descriptions must be a boolean"}, status_code=400
)
current_config.knowledge.picture_descriptions = body["picture_descriptions"]
config_updated = True
# Also update the flow with the new docling settings
try:
flows_service = _get_flows_service()
preset_config = get_docling_preset_configs(
table_structure=current_config.knowledge.table_structure,
ocr=current_config.knowledge.ocr,
picture_descriptions=body["picture_descriptions"]
)
await flows_service.update_flow_docling_preset("custom", preset_config)
logger.info(f"Successfully updated picture_descriptions setting in flow")
except Exception as e:
logger.error(f"Failed to update docling settings in flow: {str(e)}")
if "chunk_size" in body:
if not isinstance(body["chunk_size"], int) or body["chunk_size"] <= 0:
@ -624,48 +659,56 @@ def _get_flows_service():
async def update_docling_preset(request, session_manager):
"""Update docling preset in the ingest flow"""
"""Update docling settings in the ingest flow - deprecated endpoint, use /settings instead"""
try:
# Parse request body
body = await request.json()
# Validate preset parameter
if "preset" not in body:
return JSONResponse(
{"error": "preset parameter is required"}, status_code=400
)
# Support old preset-based API for backwards compatibility
if "preset" in body:
# Map old presets to new toggle settings
preset_map = {
"standard": {"table_structure": False, "ocr": False, "picture_descriptions": False},
"ocr": {"table_structure": False, "ocr": True, "picture_descriptions": False},
"picture_description": {"table_structure": False, "ocr": True, "picture_descriptions": True},
"VLM": {"table_structure": False, "ocr": False, "picture_descriptions": False},
}
preset = body["preset"]
preset_configs = get_docling_preset_configs()
preset = body["preset"]
if preset not in preset_map:
return JSONResponse(
{"error": f"Invalid preset '{preset}'. Valid presets: {', '.join(preset_map.keys())}"},
status_code=400,
)
if preset not in preset_configs:
valid_presets = list(preset_configs.keys())
return JSONResponse(
{
"error": f"Invalid preset '{preset}'. Valid presets: {', '.join(valid_presets)}"
},
status_code=400,
)
settings = preset_map[preset]
else:
# Support new toggle-based API
settings = {
"table_structure": body.get("table_structure", False),
"ocr": body.get("ocr", False),
"picture_descriptions": body.get("picture_descriptions", False),
}
# Get the preset configuration
preset_config = preset_configs[preset]
preset_config = get_docling_preset_configs(**settings)
# Use the helper function to update the flow
flows_service = _get_flows_service()
await flows_service.update_flow_docling_preset(preset, preset_config)
await flows_service.update_flow_docling_preset("custom", preset_config)
logger.info(f"Successfully updated docling preset to '{preset}' in ingest flow")
logger.info(f"Successfully updated docling settings in ingest flow")
return JSONResponse(
{
"message": f"Successfully updated docling preset to '{preset}'",
"preset": preset,
"message": f"Successfully updated docling settings",
"settings": settings,
"preset_config": preset_config,
}
)
except Exception as e:
logger.error("Failed to update docling preset", error=str(e))
logger.error("Failed to update docling settings", error=str(e))
return JSONResponse(
{"error": f"Failed to update docling preset: {str(e)}"}, status_code=500
{"error": f"Failed to update docling settings: {str(e)}"}, status_code=500
)

View file

@ -27,7 +27,9 @@ class KnowledgeConfig:
embedding_model: str = "text-embedding-3-small"
chunk_size: int = 1000
chunk_overlap: int = 200
doclingPresets: str = "standard"
table_structure: bool = False
ocr: bool = False
picture_descriptions: bool = False
@dataclass