Add file path filters to query parameters and update UI components

- Introduced `file_path_filters` to `QueryRequest` and `QueryParam` for limiting retrieval based on file paths.
- Implemented filtering logic across the retrieval pipeline (vector search, knowledge-graph search, chunk merging, and entity/relation text-unit lookup) to apply these filters during data retrieval.
- Updated UI components to accept file path filter input, including localized labels, tooltips, and placeholders for multiple languages.
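For illustration, a minimal client-side request sketch (not part of this commit; the base URL and port are assumptions, and /query is the endpoint named in the QueryRequest field descriptions below):

import requests

# Hypothetical call against a locally running server; only "query" and the
# new "file_path_filters" field are shown. Matching is substring-based, so
# a directory prefix and a bare filename both work as filters.
response = requests.post(
    "http://localhost:9621/query",  # assumed host/port for illustration
    json={
        "query": "What did the Q3 report conclude?",
        "file_path_filters": ["reports/", "meeting-notes.pdf"],
    },
)
print(response.json())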
Mohit Tilwani 2025-10-28 16:03:26 +01:00
parent 779a0d664c
commit c09b0d2b02
11 changed files with 270 additions and 2 deletions

View file

@@ -103,6 +103,11 @@ class QueryRequest(BaseModel):
description="If True, includes reference list in responses. Affects /query and /query/stream endpoints. /query/data always includes references.",
)
file_path_filters: Optional[List[str]] = Field(
default=None,
description="Optional list of filename/path substrings; when provided, retrieval is restricted to matching sources.",
)
stream: Optional[bool] = Field(
default=True,
description="If True, enables streaming output for real-time responses. Only affects /query/stream endpoint.",

View file

@@ -130,6 +130,9 @@ class QueryParam:
)
"""Maximum total tokens budget for the entire query context (entities + relations + chunks + system prompt)."""
file_path_filters: list[str] | None = None
"""Optional filename/path substrings; when provided, retrieval limits results to matching sources."""
hl_keywords: list[str] = field(default_factory=list)
"""List of high-level keywords to prioritize in retrieval."""

View file

@@ -149,6 +149,118 @@ def chunking_by_token_size(
)
return results
def _get_normalized_file_filters(query_param: QueryParam | None) -> list[str]:
"""Prepare normalized (lower-cased) file path filters from the query parameters."""
if query_param is None:
return []
filters = getattr(query_param, "file_path_filters", None)
if not filters:
return []
normalized_filters: list[str] = []
for value in filters:
if isinstance(value, str):
trimmed = value.strip()
if trimmed:
normalized_filters.append(trimmed.lower())
# Remove duplicates while preserving order
seen = set()
deduped_filters: list[str] = []
for value in normalized_filters:
if value not in seen:
seen.add(value)
deduped_filters.append(value)
return deduped_filters
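A quick standalone sketch of the normalization contract above (hypothetical input; values are stripped, lower-cased, blanks dropped, and duplicates removed while preserving order):

raw_filters = ["  Reports/ ", "REPORTS/", "", "Meeting-Notes.pdf", "reports/"]
seen: set[str] = set()
normalized: list[str] = []
for value in raw_filters:
    trimmed = value.strip().lower()
    if trimmed and trimmed not in seen:
        seen.add(trimmed)
        normalized.append(trimmed)
# normalized -> ["reports/", "meeting-notes.pdf"]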
def _split_file_path_values(file_path_value: Any) -> list[str]:
"""Split stored file_path metadata into comparable parts."""
if not file_path_value or file_path_value == "unknown_source":
return []
paths: list[str] = []
# Support strings, lists, or other iterables
if isinstance(file_path_value, list):
iterable = file_path_value
else:
iterable = [file_path_value]
for item in iterable:
if item is None:
continue
item_str = str(item)
if not item_str:
continue
fragments = split_string_by_multi_markers(item_str, [GRAPH_FIELD_SEP])
for fragment in fragments:
fragment = fragment.strip()
if fragment:
paths.append(fragment)
return paths
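Assuming GRAPH_FIELD_SEP resolves to a literal join marker such as "<SEP>" (an assumption for illustration; the real constant is imported elsewhere in this module), the splitting step behaves like:

GRAPH_FIELD_SEP = "<SEP>"  # assumed value, for illustration only
stored = "docs/a.pdf<SEP> reports/b.md <SEP>"
parts = [p.strip() for p in stored.split(GRAPH_FIELD_SEP) if p.strip()]
# parts -> ["docs/a.pdf", "reports/b.md"]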
def _file_path_matches(file_path_value: Any, normalized_filters: list[str]) -> bool:
"""Return True when the stored file path metadata matches any filter substring."""
if not normalized_filters:
return True
candidate_paths = _split_file_path_values(file_path_value)
if not candidate_paths:
return False
for candidate in candidate_paths:
lower_candidate = candidate.lower()
candidate_basename = Path(candidate).name.lower()
for filter_value in normalized_filters:
if filter_value in lower_candidate or filter_value in candidate_basename:
return True
return False
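Each filter is tested case-insensitively as a substring of the full stored path (and of the basename alone), so both directory prefixes and plain filenames work as filters. A minimal sketch with hypothetical inputs:

from pathlib import Path

candidate = "Projects/Reports/Q3-Summary.PDF"
filters = ["reports/", "q3-summary.pdf"]  # a directory prefix and a bare filename
hit = any(
    f in candidate.lower() or f in Path(candidate).name.lower()
    for f in filters
)
# hit -> True (both filters match; comparison is case-insensitive)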
def _filter_items_by_file_path(
items: list[dict],
normalized_filters: list[str],
) -> list[dict]:
"""Filter entity/relation dictionaries by file path metadata."""
if not normalized_filters:
return items
filtered: list[dict] = []
for item in items:
if _file_path_matches(item.get("file_path"), normalized_filters):
filtered.append(item)
return filtered
def _filter_vector_chunks_by_file_path(
vector_chunks: list[dict],
normalized_filters: list[str],
chunk_tracking: dict | None = None,
) -> list[dict]:
"""Filter vector chunk results and prune chunk tracking accordingly."""
if not normalized_filters:
return vector_chunks
filtered_chunks: list[dict] = []
allowed_chunk_ids: set[str] = set()
for chunk in vector_chunks:
if _file_path_matches(chunk.get("file_path"), normalized_filters):
filtered_chunks.append(chunk)
chunk_id = chunk.get("chunk_id") or chunk.get("id")
if chunk_id:
allowed_chunk_ids.add(chunk_id)
if chunk_tracking is not None:
for chunk_id in list(chunk_tracking.keys()):
tracking_source = chunk_tracking[chunk_id].get("source")
if tracking_source == "C" and chunk_id not in allowed_chunk_ids:
del chunk_tracking[chunk_id]
return filtered_chunks
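Only tracking entries whose source is "C" (chunks that came from vector search) are pruned here; entity- and relation-sourced entries ("E"/"R") are reconciled later by the merged-chunk filter below. A small sketch of the pruning with hypothetical tracking data:

chunk_tracking = {
    "c1": {"source": "C"},
    "c2": {"source": "C"},
    "e1": {"source": "E"},
}
allowed_chunk_ids = {"c1"}
for chunk_id in list(chunk_tracking):
    if chunk_tracking[chunk_id].get("source") == "C" and chunk_id not in allowed_chunk_ids:
        del chunk_tracking[chunk_id]
# chunk_tracking -> {"c1": {"source": "C"}, "e1": {"source": "E"}}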
async def _handle_entity_relation_summary(
description_type: str,
@@ -3418,6 +3530,20 @@ async def _get_vector_context(
}
valid_chunks.append(chunk_with_metadata)
normalized_filters = _get_normalized_file_filters(query_param)
if normalized_filters:
original_len = len(valid_chunks)
valid_chunks = [
chunk
for chunk in valid_chunks
if _file_path_matches(chunk.get("file_path"), normalized_filters)
]
logger.info(
"Vector chunk filter applied %s -> %d/%d",
query_param.file_path_filters,
len(valid_chunks),
original_len,
)
logger.info(
f"Naive query: {len(valid_chunks)} chunks (chunk_top_k:{search_top_k} cosine:{cosine_threshold})"
)
@@ -3584,6 +3710,34 @@ async def _perform_kg_search(
final_relations.append(relation)
seen_relations.add(rel_key)
normalized_file_filters = _get_normalized_file_filters(query_param)
if normalized_file_filters:
original_counts = (
len(final_entities),
len(final_relations),
len(vector_chunks),
)
final_entities = _filter_items_by_file_path(
final_entities, normalized_file_filters
)
final_relations = _filter_items_by_file_path(
final_relations, normalized_file_filters
)
vector_chunks = _filter_vector_chunks_by_file_path(
vector_chunks, normalized_file_filters, chunk_tracking
)
logger.info(
"Applied file path filters %s -> entities %d/%d, relations %d/%d, vector chunks %d/%d",
query_param.file_path_filters,
len(final_entities),
original_counts[0],
len(final_relations),
original_counts[1],
len(vector_chunks),
original_counts[2],
)
logger.info(
f"Raw search results: {len(final_entities)} entities, {len(final_relations)} relations, {len(vector_chunks)} vector chunks"
)
@@ -3867,6 +4021,35 @@ async def _merge_all_chunks(
f"Round-robin merged chunks: {origin_len} -> {len(merged_chunks)} (deduplicated {origin_len - len(merged_chunks)})"
)
normalized_filters = _get_normalized_file_filters(query_param)
if normalized_filters:
original_len = len(merged_chunks)
merged_chunks = [
chunk
for chunk in merged_chunks
if _file_path_matches(chunk.get("file_path"), normalized_filters)
]
if chunk_tracking is not None:
remaining_ids = {
chunk.get("chunk_id")
for chunk in merged_chunks
if chunk.get("chunk_id")
}
for chunk_id in list(chunk_tracking.keys()):
is_known_source = chunk_tracking[chunk_id].get("source") in {
"C",
"E",
"R",
}
if is_known_source and chunk_id not in remaining_ids:
del chunk_tracking[chunk_id]
logger.info(
"Merged chunks filtered by file paths %s -> %d/%d",
query_param.file_path_filters,
len(merged_chunks),
original_len,
)
return merged_chunks
@@ -4309,9 +4492,15 @@ async def _find_related_text_unit_from_entities(
if not node_datas:
return []
normalized_filters = _get_normalized_file_filters(query_param)
# Step 1: Collect all text chunks for each entity
entities_with_chunks = []
for entity in node_datas:
if normalized_filters and not _file_path_matches(
entity.get("file_path"), normalized_filters
):
continue
if entity.get("source_id"):
chunks = split_string_by_multi_markers(
entity["source_id"], [GRAPH_FIELD_SEP]
@@ -4434,6 +4623,10 @@ async def _find_related_text_unit_from_entities(
result_chunks = []
for i, (chunk_id, chunk_data) in enumerate(zip(unique_chunk_ids, chunk_data_list)):
if chunk_data is not None and "content" in chunk_data:
if normalized_filters and not _file_path_matches(
chunk_data.get("file_path"), normalized_filters
):
continue
chunk_data_copy = chunk_data.copy()
chunk_data_copy["source_type"] = "entity"
chunk_data_copy["chunk_id"] = chunk_id # Add chunk_id for deduplication
@@ -4561,9 +4754,15 @@ async def _find_related_text_unit_from_relations(
if not edge_datas:
return []
normalized_filters = _get_normalized_file_filters(query_param)
# Step 1: Collect all text chunks for each relationship
relations_with_chunks = []
for relation in edge_datas:
if normalized_filters and not _file_path_matches(
relation.get("file_path"), normalized_filters
):
continue
if relation.get("source_id"):
chunks = split_string_by_multi_markers(
relation["source_id"], [GRAPH_FIELD_SEP]
@@ -4729,6 +4928,10 @@ async def _find_related_text_unit_from_relations(
result_chunks = []
for i, (chunk_id, chunk_data) in enumerate(zip(unique_chunk_ids, chunk_data_list)):
if chunk_data is not None and "content" in chunk_data:
if normalized_filters and not _file_path_matches(
chunk_data.get("file_path"), normalized_filters
):
continue
chunk_data_copy = chunk_data.copy()
chunk_data_copy["source_type"] = "relationship"
chunk_data_copy["chunk_id"] = chunk_id # Add chunk_id for deduplication

View file

@@ -137,6 +137,8 @@ export type QueryRequest = {
user_prompt?: string
/** Enable reranking for retrieved text chunks. If True but no rerank model is configured, a warning will be issued. Default is True. */
enable_rerank?: boolean
/** Optional list of filename/path substrings used to limit retrieval to matching sources. */
file_path_filters?: string[]
}
export type QueryResponse = {

View file

@@ -45,7 +45,8 @@ export default function QuerySettings() {
chunk_top_k: 20,
max_entity_tokens: 6000,
max_relation_tokens: 8000,
max_total_tokens: 30000
max_total_tokens: 30000,
file_path_filters: [] as string[]
}), [])
const handleReset = useCallback((key: keyof typeof defaultValues) => {
@@ -73,6 +74,14 @@ export default function QuerySettings() {
</TooltipProvider>
)
const handleFilePathFiltersChange = useCallback((value: string) => {
const filters = value
.split(',')
.map((item) => item.trim())
.filter(Boolean)
handleChange('file_path_filters', filters)
}, [handleChange])
return (
<Card className="flex shrink-0 flex-col w-[280px]">
<CardHeader className="px-4 pt-4 pb-2">
@@ -273,6 +282,36 @@ export default function QuerySettings() {
</div>
</>
{/* File Path Filters */}
<>
<TooltipProvider>
<Tooltip>
<TooltipTrigger asChild>
<label htmlFor="file_path_filters" className="ml-1 cursor-help">
{t('retrievePanel.querySettings.filePathFilters')}
</label>
</TooltipTrigger>
<TooltipContent side="left">
<p>{t('retrievePanel.querySettings.filePathFiltersTooltip')}</p>
</TooltipContent>
</Tooltip>
</TooltipProvider>
<div className="flex items-center gap-1">
<Input
id="file_path_filters"
type="text"
value={(querySettings.file_path_filters ?? []).join(', ')}
onChange={(e) => handleFilePathFiltersChange(e.target.value)}
placeholder={t('retrievePanel.querySettings.filePathFiltersPlaceholder')}
className="h-9 flex-1 pr-2"
/>
<ResetButton
onClick={() => handleReset('file_path_filters')}
title="Clear filters"
/>
</div>
</>
{/* Max Entity Tokens */}
<>
<TooltipProvider>

View file

@@ -421,6 +421,9 @@
"chunkTopK": "أعلى K للقطع",
"chunkTopKTooltip": "عدد أجزاء النص المطلوب استردادها، وينطبق على جميع الأوضاع.",
"chunkTopKPlaceholder": "أدخل قيمة chunk_top_k",
"filePathFilters": "مرشحات مسار الملف",
"filePathFiltersTooltip": "قم بتقييد الاسترجاع بالوثائق التي يحتوي مسار الملف المخزن على أي من السلاسل المحددة. افصل بين المرشحات باستخدام فواصل.",
"filePathFiltersPlaceholder": "مثال: reports/، meeting-notes.pdf",
"maxEntityTokens": "الحد الأقصى لرموز الكيان",
"maxEntityTokensTooltip": "الحد الأقصى لعدد الرموز المخصصة لسياق الكيان في نظام التحكم الموحد في الرموز",
"maxRelationTokens": "الحد الأقصى لرموز العلاقة",

View file

@@ -421,6 +421,9 @@
"chunkTopK": "Chunk Top K",
"chunkTopKTooltip": "Number of text chunks to retrieve, applicable for all modes.",
"chunkTopKPlaceholder": "Enter chunk_top_k value",
"filePathFilters": "File Path Filters",
"filePathFiltersTooltip": "Restrict retrieval to documents whose stored file path contains any of the specified substrings. Separate multiple filters with commas.",
"filePathFiltersPlaceholder": "e.g. reports/, meeting-notes.pdf",
"maxEntityTokens": "Max Entity Tokens",
"maxEntityTokensTooltip": "Maximum number of tokens allocated for entity context in unified token control system",
"maxRelationTokens": "Max Relation Tokens",

View file

@@ -421,6 +421,9 @@
"chunkTopK": "Top K des Chunks",
"chunkTopKTooltip": "Nombre de morceaux de texte à récupérer, applicable à tous les modes.",
"chunkTopKPlaceholder": "Entrez la valeur chunk_top_k",
"filePathFilters": "Filtres de chemin de fichier",
"filePathFiltersTooltip": "Limiter la récupération aux documents dont le chemin de fichier enregistré contient l'une des sous-chaînes spécifiées. Séparez plusieurs filtres par des virgules.",
"filePathFiltersPlaceholder": "ex. reports/, meeting-notes.pdf",
"maxEntityTokens": "Limite de jetons d'entité",
"maxEntityTokensTooltip": "Nombre maximum de jetons alloués au contexte d'entité dans le système de contrôle de jetons unifié",
"maxRelationTokens": "Limite de jetons de relation",

View file

@@ -421,6 +421,9 @@
"chunkTopK": "文本块 Top K",
"chunkTopKTooltip": "文本块检索数量, 适用于所有模式",
"chunkTopKPlaceholder": "输入文本块chunk_top_k值",
"filePathFilters": "文件路径过滤",
"filePathFiltersTooltip": "仅检索文件路径包含任一指定子串的文档。多个过滤条件用逗号分隔。",
"filePathFiltersPlaceholder": "例如reports/meeting-notes.pdf",
"maxEntityTokens": "实体令牌数上限",
"maxEntityTokensTooltip": "统一令牌控制系统中分配给实体上下文的最大令牌数",
"maxRelationTokens": "关系令牌数上限",

View file

@@ -421,6 +421,9 @@
"chunkTopK": "文本區塊 Top K",
"chunkTopKTooltip": "文本區塊檢索數量,適用於所有模式。",
"chunkTopKPlaceholder": "輸入文本區塊 chunk_top_k 值",
"filePathFilters": "檔案路徑篩選",
"filePathFiltersTooltip": "僅檢索儲存的檔案路徑包含任一指定子字串的文件。多個篩選條件請以逗號分隔。",
"filePathFiltersPlaceholder": "例如reports/meeting-notes.pdf",
"historyTurns": "歷史輪次",
"historyTurnsTooltip": "回應上下文中考慮的完整對話輪次(使用者-助手對)數量",
"historyTurnsPlaceholder": "歷史輪次數",

View file

@@ -134,7 +134,8 @@ const useSettingsStoreBase = create<SettingsState>()(
stream: true,
history_turns: 0,
user_prompt: '',
enable_rerank: true
enable_rerank: true,
file_path_filters: []
},
setTheme: (theme: Theme) => set({ theme }),