Add file path filters to query parameters and update UI components
- Introduced `file_path_filters` to `QueryRequest` and `QueryParam` for limiting retrieval based on file paths.
- Implemented filtering logic in various functions to apply these filters during data retrieval.
- Updated UI components to support input for file path filters, including localization for multiple languages.
This commit is contained in:
parent
779a0d664c
commit
c09b0d2b02
11 changed files with 270 additions and 2 deletions
|
|
@ -103,6 +103,11 @@ class QueryRequest(BaseModel):
|
|||
description="If True, includes reference list in responses. Affects /query and /query/stream endpoints. /query/data always includes references.",
|
||||
)
|
||||
|
||||
file_path_filters: Optional[List[str]] = Field(
|
||||
default=None,
|
||||
description="Optional list of filename/path substrings; when provided, retrieval is restricted to matching sources.",
|
||||
)
|
||||
|
||||
stream: Optional[bool] = Field(
|
||||
default=True,
|
||||
description="If True, enables streaming output for real-time responses. Only affects /query/stream endpoint.",
|
||||
|
|
|
|||
|
|
@ -130,6 +130,9 @@ class QueryParam:
|
|||
)
|
||||
"""Maximum total tokens budget for the entire query context (entities + relations + chunks + system prompt)."""
|
||||
|
||||
file_path_filters: list[str] | None = None
|
||||
"""Optional filename/path substrings; when provided, retrieval limits results to matching sources."""
|
||||
|
||||
hl_keywords: list[str] = field(default_factory=list)
|
||||
"""List of high-level keywords to prioritize in retrieval."""
|
||||
|
||||
|
|
|
|||
|
|
@ -149,6 +149,118 @@ def chunking_by_token_size(
|
|||
)
|
||||
return results
|
||||
|
||||
def _get_normalized_file_filters(query_param: QueryParam | None) -> list[str]:
|
||||
"""Prepare normalized (lower-cased) file path filters from the query parameters."""
|
||||
if query_param is None:
|
||||
return []
|
||||
|
||||
filters = getattr(query_param, "file_path_filters", None)
|
||||
if not filters:
|
||||
return []
|
||||
|
||||
normalized_filters: list[str] = []
|
||||
for value in filters:
|
||||
if isinstance(value, str):
|
||||
trimmed = value.strip()
|
||||
if trimmed:
|
||||
normalized_filters.append(trimmed.lower())
|
||||
|
||||
# Remove duplicates while preserving order
|
||||
seen = set()
|
||||
deduped_filters: list[str] = []
|
||||
for value in normalized_filters:
|
||||
if value not in seen:
|
||||
seen.add(value)
|
||||
deduped_filters.append(value)
|
||||
return deduped_filters
|
||||
|
||||
|
||||
def _split_file_path_values(file_path_value: "Any") -> list[str]:
    """Expand stored file_path metadata into individual comparable path strings.

    Accepts plain strings, lists of values, or other scalars; composite
    strings joined with GRAPH_FIELD_SEP are split into their parts. The
    sentinel "unknown_source" and empty/None values yield no paths.
    """
    if not file_path_value or file_path_value == "unknown_source":
        return []

    # Normalize scalars into a single-element list for uniform handling
    values = (
        file_path_value
        if isinstance(file_path_value, list)
        else [file_path_value]
    )

    collected: list[str] = []
    for raw in values:
        if raw is None:
            continue
        text = str(raw)
        if not text:
            continue
        for piece in split_string_by_multi_markers(text, [GRAPH_FIELD_SEP]):
            stripped = piece.strip()
            if stripped:
                collected.append(stripped)
    return collected
|
||||
|
||||
|
||||
def _file_path_matches(file_path_value: Any, normalized_filters: list[str]) -> bool:
|
||||
"""Return True when the stored file path metadata matches any filter substring."""
|
||||
if not normalized_filters:
|
||||
return True
|
||||
|
||||
candidate_paths = _split_file_path_values(file_path_value)
|
||||
if not candidate_paths:
|
||||
return False
|
||||
|
||||
for candidate in candidate_paths:
|
||||
lower_candidate = candidate.lower()
|
||||
candidate_basename = Path(candidate).name.lower()
|
||||
for filter_value in normalized_filters:
|
||||
if filter_value in lower_candidate or filter_value in candidate_basename:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _filter_items_by_file_path(
|
||||
items: list[dict],
|
||||
normalized_filters: list[str],
|
||||
) -> list[dict]:
|
||||
"""Filter entity/relation dictionaries by file path metadata."""
|
||||
if not normalized_filters:
|
||||
return items
|
||||
|
||||
filtered: list[dict] = []
|
||||
for item in items:
|
||||
if _file_path_matches(item.get("file_path"), normalized_filters):
|
||||
filtered.append(item)
|
||||
return filtered
|
||||
|
||||
|
||||
def _filter_vector_chunks_by_file_path(
|
||||
vector_chunks: list[dict],
|
||||
normalized_filters: list[str],
|
||||
chunk_tracking: dict | None = None,
|
||||
) -> list[dict]:
|
||||
"""Filter vector chunk results and prune chunk tracking accordingly."""
|
||||
if not normalized_filters:
|
||||
return vector_chunks
|
||||
|
||||
filtered_chunks: list[dict] = []
|
||||
allowed_chunk_ids: set[str] = set()
|
||||
|
||||
for chunk in vector_chunks:
|
||||
if _file_path_matches(chunk.get("file_path"), normalized_filters):
|
||||
filtered_chunks.append(chunk)
|
||||
chunk_id = chunk.get("chunk_id") or chunk.get("id")
|
||||
if chunk_id:
|
||||
allowed_chunk_ids.add(chunk_id)
|
||||
|
||||
if chunk_tracking is not None:
|
||||
for chunk_id in list(chunk_tracking.keys()):
|
||||
tracking_source = chunk_tracking[chunk_id].get("source")
|
||||
if tracking_source == "C" and chunk_id not in allowed_chunk_ids:
|
||||
del chunk_tracking[chunk_id]
|
||||
|
||||
return filtered_chunks
|
||||
|
||||
|
||||
async def _handle_entity_relation_summary(
|
||||
description_type: str,
|
||||
|
|
@ -3418,6 +3530,20 @@ async def _get_vector_context(
|
|||
}
|
||||
valid_chunks.append(chunk_with_metadata)
|
||||
|
||||
normalized_filters = _get_normalized_file_filters(query_param)
|
||||
if normalized_filters:
|
||||
original_len = len(valid_chunks)
|
||||
valid_chunks = [
|
||||
chunk
|
||||
for chunk in valid_chunks
|
||||
if _file_path_matches(chunk.get("file_path"), normalized_filters)
|
||||
]
|
||||
logger.info(
|
||||
"Vector chunk filter applied %s -> %d/%d",
|
||||
query_param.file_path_filters,
|
||||
len(valid_chunks),
|
||||
original_len,
|
||||
)
|
||||
logger.info(
|
||||
f"Naive query: {len(valid_chunks)} chunks (chunk_top_k:{search_top_k} cosine:{cosine_threshold})"
|
||||
)
|
||||
|
|
@ -3584,6 +3710,34 @@ async def _perform_kg_search(
|
|||
final_relations.append(relation)
|
||||
seen_relations.add(rel_key)
|
||||
|
||||
normalized_file_filters = _get_normalized_file_filters(query_param)
|
||||
if normalized_file_filters:
|
||||
original_counts = (
|
||||
len(final_entities),
|
||||
len(final_relations),
|
||||
len(vector_chunks),
|
||||
)
|
||||
final_entities = _filter_items_by_file_path(
|
||||
final_entities, normalized_file_filters
|
||||
)
|
||||
final_relations = _filter_items_by_file_path(
|
||||
final_relations, normalized_file_filters
|
||||
)
|
||||
vector_chunks = _filter_vector_chunks_by_file_path(
|
||||
vector_chunks, normalized_file_filters, chunk_tracking
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"Applied file path filters %s -> entities %d/%d, relations %d/%d, vector chunks %d/%d",
|
||||
query_param.file_path_filters,
|
||||
len(final_entities),
|
||||
original_counts[0],
|
||||
len(final_relations),
|
||||
original_counts[1],
|
||||
len(vector_chunks),
|
||||
original_counts[2],
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Raw search results: {len(final_entities)} entities, {len(final_relations)} relations, {len(vector_chunks)} vector chunks"
|
||||
)
|
||||
|
|
@ -3867,6 +4021,35 @@ async def _merge_all_chunks(
|
|||
f"Round-robin merged chunks: {origin_len} -> {len(merged_chunks)} (deduplicated {origin_len - len(merged_chunks)})"
|
||||
)
|
||||
|
||||
normalized_filters = _get_normalized_file_filters(query_param)
|
||||
if normalized_filters:
|
||||
original_len = len(merged_chunks)
|
||||
merged_chunks = [
|
||||
chunk
|
||||
for chunk in merged_chunks
|
||||
if _file_path_matches(chunk.get("file_path"), normalized_filters)
|
||||
]
|
||||
if chunk_tracking is not None:
|
||||
remaining_ids = {
|
||||
chunk.get("chunk_id")
|
||||
for chunk in merged_chunks
|
||||
if chunk.get("chunk_id")
|
||||
}
|
||||
for chunk_id in list(chunk_tracking.keys()):
|
||||
is_known_source = chunk_tracking[chunk_id].get("source") in {
|
||||
"C",
|
||||
"E",
|
||||
"R",
|
||||
}
|
||||
if is_known_source and chunk_id not in remaining_ids:
|
||||
del chunk_tracking[chunk_id]
|
||||
logger.info(
|
||||
"Merged chunks filtered by file paths %s -> %d/%d",
|
||||
query_param.file_path_filters,
|
||||
len(merged_chunks),
|
||||
original_len,
|
||||
)
|
||||
|
||||
return merged_chunks
|
||||
|
||||
|
||||
|
|
@ -4309,9 +4492,15 @@ async def _find_related_text_unit_from_entities(
|
|||
if not node_datas:
|
||||
return []
|
||||
|
||||
normalized_filters = _get_normalized_file_filters(query_param)
|
||||
|
||||
# Step 1: Collect all text chunks for each entity
|
||||
entities_with_chunks = []
|
||||
for entity in node_datas:
|
||||
if normalized_filters and not _file_path_matches(
|
||||
entity.get("file_path"), normalized_filters
|
||||
):
|
||||
continue
|
||||
if entity.get("source_id"):
|
||||
chunks = split_string_by_multi_markers(
|
||||
entity["source_id"], [GRAPH_FIELD_SEP]
|
||||
|
|
@ -4434,6 +4623,10 @@ async def _find_related_text_unit_from_entities(
|
|||
result_chunks = []
|
||||
for i, (chunk_id, chunk_data) in enumerate(zip(unique_chunk_ids, chunk_data_list)):
|
||||
if chunk_data is not None and "content" in chunk_data:
|
||||
if normalized_filters and not _file_path_matches(
|
||||
chunk_data.get("file_path"), normalized_filters
|
||||
):
|
||||
continue
|
||||
chunk_data_copy = chunk_data.copy()
|
||||
chunk_data_copy["source_type"] = "entity"
|
||||
chunk_data_copy["chunk_id"] = chunk_id # Add chunk_id for deduplication
|
||||
|
|
@ -4561,9 +4754,15 @@ async def _find_related_text_unit_from_relations(
|
|||
if not edge_datas:
|
||||
return []
|
||||
|
||||
normalized_filters = _get_normalized_file_filters(query_param)
|
||||
|
||||
# Step 1: Collect all text chunks for each relationship
|
||||
relations_with_chunks = []
|
||||
for relation in edge_datas:
|
||||
if normalized_filters and not _file_path_matches(
|
||||
relation.get("file_path"), normalized_filters
|
||||
):
|
||||
continue
|
||||
if relation.get("source_id"):
|
||||
chunks = split_string_by_multi_markers(
|
||||
relation["source_id"], [GRAPH_FIELD_SEP]
|
||||
|
|
@ -4729,6 +4928,10 @@ async def _find_related_text_unit_from_relations(
|
|||
result_chunks = []
|
||||
for i, (chunk_id, chunk_data) in enumerate(zip(unique_chunk_ids, chunk_data_list)):
|
||||
if chunk_data is not None and "content" in chunk_data:
|
||||
if normalized_filters and not _file_path_matches(
|
||||
chunk_data.get("file_path"), normalized_filters
|
||||
):
|
||||
continue
|
||||
chunk_data_copy = chunk_data.copy()
|
||||
chunk_data_copy["source_type"] = "relationship"
|
||||
chunk_data_copy["chunk_id"] = chunk_id # Add chunk_id for deduplication
|
||||
|
|
|
|||
|
|
@ -137,6 +137,8 @@ export type QueryRequest = {
|
|||
user_prompt?: string
|
||||
/** Enable reranking for retrieved text chunks. If True but no rerank model is configured, a warning will be issued. Default is True. */
|
||||
enable_rerank?: boolean
|
||||
/** Optional list of filename/path substrings used to limit retrieval to matching sources. */
|
||||
file_path_filters?: string[]
|
||||
}
|
||||
|
||||
export type QueryResponse = {
|
||||
|
|
|
|||
|
|
@ -45,7 +45,8 @@ export default function QuerySettings() {
|
|||
chunk_top_k: 20,
|
||||
max_entity_tokens: 6000,
|
||||
max_relation_tokens: 8000,
|
||||
max_total_tokens: 30000
|
||||
max_total_tokens: 30000,
|
||||
file_path_filters: [] as string[]
|
||||
}), [])
|
||||
|
||||
const handleReset = useCallback((key: keyof typeof defaultValues) => {
|
||||
|
|
@ -73,6 +74,14 @@ export default function QuerySettings() {
|
|||
</TooltipProvider>
|
||||
)
|
||||
|
||||
const handleFilePathFiltersChange = useCallback((value: string) => {
|
||||
const filters = value
|
||||
.split(',')
|
||||
.map((item) => item.trim())
|
||||
.filter(Boolean)
|
||||
handleChange('file_path_filters', filters)
|
||||
}, [handleChange])
|
||||
|
||||
return (
|
||||
<Card className="flex shrink-0 flex-col w-[280px]">
|
||||
<CardHeader className="px-4 pt-4 pb-2">
|
||||
|
|
@ -273,6 +282,36 @@ export default function QuerySettings() {
|
|||
</div>
|
||||
</>
|
||||
|
||||
{/* File Path Filters */}
|
||||
<>
|
||||
<TooltipProvider>
|
||||
<Tooltip>
|
||||
<TooltipTrigger asChild>
|
||||
<label htmlFor="file_path_filters" className="ml-1 cursor-help">
|
||||
{t('retrievePanel.querySettings.filePathFilters')}
|
||||
</label>
|
||||
</TooltipTrigger>
|
||||
<TooltipContent side="left">
|
||||
<p>{t('retrievePanel.querySettings.filePathFiltersTooltip')}</p>
|
||||
</TooltipContent>
|
||||
</Tooltip>
|
||||
</TooltipProvider>
|
||||
<div className="flex items-center gap-1">
|
||||
<Input
|
||||
id="file_path_filters"
|
||||
type="text"
|
||||
value={(querySettings.file_path_filters ?? []).join(', ')}
|
||||
onChange={(e) => handleFilePathFiltersChange(e.target.value)}
|
||||
placeholder={t('retrievePanel.querySettings.filePathFiltersPlaceholder')}
|
||||
className="h-9 flex-1 pr-2"
|
||||
/>
|
||||
<ResetButton
|
||||
onClick={() => handleReset('file_path_filters')}
|
||||
title="Clear filters"
|
||||
/>
|
||||
</div>
|
||||
</>
|
||||
|
||||
{/* Max Entity Tokens */}
|
||||
<>
|
||||
<TooltipProvider>
|
||||
|
|
|
|||
|
|
@ -421,6 +421,9 @@
|
|||
"chunkTopK": "أعلى K للقطع",
|
||||
"chunkTopKTooltip": "عدد أجزاء النص المطلوب استردادها، وينطبق على جميع الأوضاع.",
|
||||
"chunkTopKPlaceholder": "أدخل قيمة chunk_top_k",
|
||||
"filePathFilters": "مرشحات مسار الملف",
|
||||
"filePathFiltersTooltip": "قم بتقييد الاسترجاع بالوثائق التي يحتوي مسار الملف المخزن على أي من السلاسل المحددة. افصل بين المرشحات باستخدام فواصل.",
|
||||
"filePathFiltersPlaceholder": "مثال: reports/، meeting-notes.pdf",
|
||||
"maxEntityTokens": "الحد الأقصى لرموز الكيان",
|
||||
"maxEntityTokensTooltip": "الحد الأقصى لعدد الرموز المخصصة لسياق الكيان في نظام التحكم الموحد في الرموز",
|
||||
"maxRelationTokens": "الحد الأقصى لرموز العلاقة",
|
||||
|
|
|
|||
|
|
@ -421,6 +421,9 @@
|
|||
"chunkTopK": "Chunk Top K",
|
||||
"chunkTopKTooltip": "Number of text chunks to retrieve, applicable for all modes.",
|
||||
"chunkTopKPlaceholder": "Enter chunk_top_k value",
|
||||
"filePathFilters": "File Path Filters",
|
||||
"filePathFiltersTooltip": "Restrict retrieval to documents whose stored file path contains any of the specified substrings. Separate multiple filters with commas.",
|
||||
"filePathFiltersPlaceholder": "e.g. reports/, meeting-notes.pdf",
|
||||
"maxEntityTokens": "Max Entity Tokens",
|
||||
"maxEntityTokensTooltip": "Maximum number of tokens allocated for entity context in unified token control system",
|
||||
"maxRelationTokens": "Max Relation Tokens",
|
||||
|
|
|
|||
|
|
@ -421,6 +421,9 @@
|
|||
"chunkTopK": "Top K des Chunks",
|
||||
"chunkTopKTooltip": "Nombre de morceaux de texte à récupérer, applicable à tous les modes.",
|
||||
"chunkTopKPlaceholder": "Entrez la valeur chunk_top_k",
|
||||
"filePathFilters": "Filtres de chemin de fichier",
|
||||
"filePathFiltersTooltip": "Limiter la récupération aux documents dont le chemin de fichier enregistré contient l'une des sous-chaînes spécifiées. Séparez plusieurs filtres par des virgules.",
|
||||
"filePathFiltersPlaceholder": "ex. reports/, meeting-notes.pdf",
|
||||
"maxEntityTokens": "Limite de jetons d'entité",
|
||||
"maxEntityTokensTooltip": "Nombre maximum de jetons alloués au contexte d'entité dans le système de contrôle de jetons unifié",
|
||||
"maxRelationTokens": "Limite de jetons de relation",
|
||||
|
|
|
|||
|
|
@ -421,6 +421,9 @@
|
|||
"chunkTopK": "文本块 Top K",
|
||||
"chunkTopKTooltip": "文本块检索数量, 适用于所有模式",
|
||||
"chunkTopKPlaceholder": "输入文本块chunk_top_k值",
|
||||
"filePathFilters": "文件路径过滤",
|
||||
"filePathFiltersTooltip": "仅检索文件路径包含任一指定子串的文档。多个过滤条件用逗号分隔。",
|
||||
"filePathFiltersPlaceholder": "例如:reports/,meeting-notes.pdf",
|
||||
"maxEntityTokens": "实体令牌数上限",
|
||||
"maxEntityTokensTooltip": "统一令牌控制系统中分配给实体上下文的最大令牌数",
|
||||
"maxRelationTokens": "关系令牌数上限",
|
||||
|
|
|
|||
|
|
@ -421,6 +421,9 @@
|
|||
"chunkTopK": "文本區塊 Top K",
|
||||
"chunkTopKTooltip": "文本區塊檢索數量,適用於所有模式。",
|
||||
"chunkTopKPlaceholder": "輸入文本區塊 chunk_top_k 值",
|
||||
"filePathFilters": "檔案路徑篩選",
|
||||
"filePathFiltersTooltip": "僅檢索儲存的檔案路徑包含任一指定子字串的文件。多個篩選條件請以逗號分隔。",
|
||||
"filePathFiltersPlaceholder": "例如:reports/,meeting-notes.pdf",
|
||||
"historyTurns": "歷史輪次",
|
||||
"historyTurnsTooltip": "回應上下文中考慮的完整對話輪次(使用者-助手對)數量",
|
||||
"historyTurnsPlaceholder": "歷史輪次數",
|
||||
|
|
|
|||
|
|
@ -134,7 +134,8 @@ const useSettingsStoreBase = create<SettingsState>()(
|
|||
stream: true,
|
||||
history_turns: 0,
|
||||
user_prompt: '',
|
||||
enable_rerank: true
|
||||
enable_rerank: true,
|
||||
file_path_filters: []
|
||||
},
|
||||
|
||||
setTheme: (theme: Theme) => set({ theme }),
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue