def _get_normalized_file_filters(query_param: QueryParam | None) -> list[str]:
    """Return the query's file path filters, lower-cased, trimmed, and de-duplicated.

    Non-string and blank entries are dropped; original order of first
    occurrence is preserved. Returns an empty list when no filters are set.
    """
    if query_param is None:
        return []

    raw_filters = getattr(query_param, "file_path_filters", None)
    if not raw_filters:
        return []

    cleaned = [
        value.strip().lower()
        for value in raw_filters
        if isinstance(value, str) and value.strip()
    ]
    # dict preserves insertion order, so this de-duplicates while keeping order
    return list(dict.fromkeys(cleaned))


def _split_file_path_values(file_path_value: Any) -> list[str]:
    """Break stored file_path metadata into individual, comparable path strings.

    Accepts a single value or a list of values; each value may itself pack
    several paths joined with GRAPH_FIELD_SEP. Empty values and the
    "unknown_source" placeholder yield an empty list.
    """
    if not file_path_value or file_path_value == "unknown_source":
        return []

    # Normalize to a list so strings and list-valued metadata share one path
    values = file_path_value if isinstance(file_path_value, list) else [file_path_value]

    parts: list[str] = []
    for entry in values:
        if entry is None:
            continue
        text = str(entry)
        if not text:
            continue
        for piece in split_string_by_multi_markers(text, [GRAPH_FIELD_SEP]):
            piece = piece.strip()
            if piece:
                parts.append(piece)
    return parts


def _file_path_matches(file_path_value: Any, normalized_filters: list[str]) -> bool:
    """Return True when any filter substring occurs in the stored file path(s).

    Each filter is tested against both the full lower-cased path and its
    basename. An empty filter list matches everything; metadata that yields
    no candidate paths matches nothing.
    """
    if not normalized_filters:
        return True

    candidates = _split_file_path_values(file_path_value)
    if not candidates:
        return False

    for path in candidates:
        lowered = path.lower()
        basename = Path(path).name.lower()
        if any(f in lowered or f in basename for f in normalized_filters):
            return True
    return False


def _filter_items_by_file_path(
    items: list[dict],
    normalized_filters: list[str],
) -> list[dict]:
    """Keep only entity/relation dicts whose file_path metadata matches a filter.

    With no filters, the input list is returned unchanged (same object).
    """
    if not normalized_filters:
        return items
    return [
        item
        for item in items
        if _file_path_matches(item.get("file_path"), normalized_filters)
    ]


def _filter_vector_chunks_by_file_path(
    vector_chunks: list[dict],
    normalized_filters: list[str],
    chunk_tracking: dict | None = None,
) -> list[dict]:
    """Filter vector chunk results by file path and prune chunk tracking to match.

    Only tracking entries with source "C" (vector-chunk origin) are pruned;
    entity/relation-sourced entries are left untouched. With no filters the
    input list is returned unchanged.
    """
    if not normalized_filters:
        return vector_chunks

    kept: list[dict] = []
    kept_ids: set[str] = set()
    for chunk in vector_chunks:
        if not _file_path_matches(chunk.get("file_path"), normalized_filters):
            continue
        kept.append(chunk)
        chunk_id = chunk.get("chunk_id") or chunk.get("id")
        if chunk_id:
            kept_ids.add(chunk_id)

    if chunk_tracking is not None:
        # Collect stale ids first so we never mutate the dict while iterating it
        stale_ids = [
            cid
            for cid, info in chunk_tracking.items()
            if info.get("source") == "C" and cid not in kept_ids
        ]
        for cid in stale_ids:
            del chunk_tracking[cid]

    return kept
_filter_vector_chunks_by_file_path( + vector_chunks, normalized_file_filters, chunk_tracking + ) + + logger.info( + "Applied file path filters %s -> entities %d/%d, relations %d/%d, vector chunks %d/%d", + query_param.file_path_filters, + len(final_entities), + original_counts[0], + len(final_relations), + original_counts[1], + len(vector_chunks), + original_counts[2], + ) logger.info( f"Raw search results: {len(final_entities)} entities, {len(final_relations)} relations, {len(vector_chunks)} vector chunks" @@ -3867,6 +4021,35 @@ async def _merge_all_chunks( f"Round-robin merged chunks: {origin_len} -> {len(merged_chunks)} (deduplicated {origin_len - len(merged_chunks)})" ) + normalized_filters = _get_normalized_file_filters(query_param) + if normalized_filters: + original_len = len(merged_chunks) + merged_chunks = [ + chunk + for chunk in merged_chunks + if _file_path_matches(chunk.get("file_path"), normalized_filters) + ] + if chunk_tracking is not None: + remaining_ids = { + chunk.get("chunk_id") + for chunk in merged_chunks + if chunk.get("chunk_id") + } + for chunk_id in list(chunk_tracking.keys()): + is_known_source = chunk_tracking[chunk_id].get("source") in { + "C", + "E", + "R", + } + if is_known_source and chunk_id not in remaining_ids: + del chunk_tracking[chunk_id] + logger.info( + "Merged chunks filtered by file paths %s -> %d/%d", + query_param.file_path_filters, + len(merged_chunks), + original_len, + ) + return merged_chunks @@ -4308,10 +4491,16 @@ async def _find_related_text_unit_from_entities( if not node_datas: return [] + + normalized_filters = _get_normalized_file_filters(query_param) # Step 1: Collect all text chunks for each entity entities_with_chunks = [] for entity in node_datas: + if normalized_filters and not _file_path_matches( + entity.get("file_path"), normalized_filters + ): + continue if entity.get("source_id"): chunks = split_string_by_multi_markers( entity["source_id"], [GRAPH_FIELD_SEP] @@ -4434,6 +4623,10 @@ async def 
_find_related_text_unit_from_entities( result_chunks = [] for i, (chunk_id, chunk_data) in enumerate(zip(unique_chunk_ids, chunk_data_list)): if chunk_data is not None and "content" in chunk_data: + if normalized_filters and not _file_path_matches( + chunk_data.get("file_path"), normalized_filters + ): + continue chunk_data_copy = chunk_data.copy() chunk_data_copy["source_type"] = "entity" chunk_data_copy["chunk_id"] = chunk_id # Add chunk_id for deduplication @@ -4560,10 +4753,16 @@ async def _find_related_text_unit_from_relations( if not edge_datas: return [] + + normalized_filters = _get_normalized_file_filters(query_param) # Step 1: Collect all text chunks for each relationship relations_with_chunks = [] for relation in edge_datas: + if normalized_filters and not _file_path_matches( + relation.get("file_path"), normalized_filters + ): + continue if relation.get("source_id"): chunks = split_string_by_multi_markers( relation["source_id"], [GRAPH_FIELD_SEP] @@ -4729,6 +4928,10 @@ async def _find_related_text_unit_from_relations( result_chunks = [] for i, (chunk_id, chunk_data) in enumerate(zip(unique_chunk_ids, chunk_data_list)): if chunk_data is not None and "content" in chunk_data: + if normalized_filters and not _file_path_matches( + chunk_data.get("file_path"), normalized_filters + ): + continue chunk_data_copy = chunk_data.copy() chunk_data_copy["source_type"] = "relationship" chunk_data_copy["chunk_id"] = chunk_id # Add chunk_id for deduplication diff --git a/lightrag_webui/src/api/lightrag.ts b/lightrag_webui/src/api/lightrag.ts index 7cf1aec6..0c7b8406 100644 --- a/lightrag_webui/src/api/lightrag.ts +++ b/lightrag_webui/src/api/lightrag.ts @@ -137,6 +137,8 @@ export type QueryRequest = { user_prompt?: string /** Enable reranking for retrieved text chunks. If True but no rerank model is configured, a warning will be issued. Default is True. 
*/ enable_rerank?: boolean + /** Optional list of filename/path substrings used to limit retrieval to matching sources. */ + file_path_filters?: string[] } export type QueryResponse = { diff --git a/lightrag_webui/src/components/retrieval/QuerySettings.tsx b/lightrag_webui/src/components/retrieval/QuerySettings.tsx index 4ffebbb1..f6aef4c0 100644 --- a/lightrag_webui/src/components/retrieval/QuerySettings.tsx +++ b/lightrag_webui/src/components/retrieval/QuerySettings.tsx @@ -45,7 +45,8 @@ export default function QuerySettings() { chunk_top_k: 20, max_entity_tokens: 6000, max_relation_tokens: 8000, - max_total_tokens: 30000 + max_total_tokens: 30000, + file_path_filters: [] as string[] }), []) const handleReset = useCallback((key: keyof typeof defaultValues) => { @@ -73,6 +74,14 @@ export default function QuerySettings() { ) + const handleFilePathFiltersChange = useCallback((value: string) => { + const filters = value + .split(',') + .map((item) => item.trim()) + .filter(Boolean) + handleChange('file_path_filters', filters) + }, [handleChange]) + return ( @@ -273,6 +282,36 @@ export default function QuerySettings() { + {/* File Path Filters */} + <> + + + + + + +

{t('retrievePanel.querySettings.filePathFiltersTooltip')}

+
+
+
+
+ handleFilePathFiltersChange(e.target.value)} + placeholder={t('retrievePanel.querySettings.filePathFiltersPlaceholder')} + className="h-9 flex-1 pr-2" + /> + handleReset('file_path_filters')} + title="Clear filters" + /> +
+ + {/* Max Entity Tokens */} <> diff --git a/lightrag_webui/src/locales/ar.json b/lightrag_webui/src/locales/ar.json index 5c8561dc..c2f1a481 100644 --- a/lightrag_webui/src/locales/ar.json +++ b/lightrag_webui/src/locales/ar.json @@ -421,6 +421,9 @@ "chunkTopK": "أعلى K للقطع", "chunkTopKTooltip": "عدد أجزاء النص المطلوب استردادها، وينطبق على جميع الأوضاع.", "chunkTopKPlaceholder": "أدخل قيمة chunk_top_k", + "filePathFilters": "مرشحات مسار الملف", + "filePathFiltersTooltip": "قم بتقييد الاسترجاع بالوثائق التي يحتوي مسار الملف المخزن على أي من السلاسل المحددة. افصل بين المرشحات باستخدام فواصل.", + "filePathFiltersPlaceholder": "مثال: reports/، meeting-notes.pdf", "maxEntityTokens": "الحد الأقصى لرموز الكيان", "maxEntityTokensTooltip": "الحد الأقصى لعدد الرموز المخصصة لسياق الكيان في نظام التحكم الموحد في الرموز", "maxRelationTokens": "الحد الأقصى لرموز العلاقة", diff --git a/lightrag_webui/src/locales/en.json b/lightrag_webui/src/locales/en.json index d3243e98..af35cb4d 100644 --- a/lightrag_webui/src/locales/en.json +++ b/lightrag_webui/src/locales/en.json @@ -421,6 +421,9 @@ "chunkTopK": "Chunk Top K", "chunkTopKTooltip": "Number of text chunks to retrieve, applicable for all modes.", "chunkTopKPlaceholder": "Enter chunk_top_k value", + "filePathFilters": "File Path Filters", + "filePathFiltersTooltip": "Restrict retrieval to documents whose stored file path contains any of the specified substrings. Separate multiple filters with commas.", + "filePathFiltersPlaceholder": "e.g. 
reports/, meeting-notes.pdf", "maxEntityTokens": "Max Entity Tokens", "maxEntityTokensTooltip": "Maximum number of tokens allocated for entity context in unified token control system", "maxRelationTokens": "Max Relation Tokens", diff --git a/lightrag_webui/src/locales/fr.json b/lightrag_webui/src/locales/fr.json index 7913cb71..822393b7 100644 --- a/lightrag_webui/src/locales/fr.json +++ b/lightrag_webui/src/locales/fr.json @@ -421,6 +421,9 @@ "chunkTopK": "Top K des Chunks", "chunkTopKTooltip": "Nombre de morceaux de texte à récupérer, applicable à tous les modes.", "chunkTopKPlaceholder": "Entrez la valeur chunk_top_k", + "filePathFilters": "Filtres de chemin de fichier", + "filePathFiltersTooltip": "Limiter la récupération aux documents dont le chemin de fichier enregistré contient l'une des sous-chaînes spécifiées. Séparez plusieurs filtres par des virgules.", + "filePathFiltersPlaceholder": "ex. reports/, meeting-notes.pdf", "maxEntityTokens": "Limite de jetons d'entité", "maxEntityTokensTooltip": "Nombre maximum de jetons alloués au contexte d'entité dans le système de contrôle de jetons unifié", "maxRelationTokens": "Limite de jetons de relation", diff --git a/lightrag_webui/src/locales/zh.json b/lightrag_webui/src/locales/zh.json index 0261e9ce..dfb2304d 100644 --- a/lightrag_webui/src/locales/zh.json +++ b/lightrag_webui/src/locales/zh.json @@ -421,6 +421,9 @@ "chunkTopK": "文本块 Top K", "chunkTopKTooltip": "文本块检索数量, 适用于所有模式", "chunkTopKPlaceholder": "输入文本块chunk_top_k值", + "filePathFilters": "文件路径过滤", + "filePathFiltersTooltip": "仅检索文件路径包含任一指定子串的文档。多个过滤条件用逗号分隔。", + "filePathFiltersPlaceholder": "例如:reports/,meeting-notes.pdf", "maxEntityTokens": "实体令牌数上限", "maxEntityTokensTooltip": "统一令牌控制系统中分配给实体上下文的最大令牌数", "maxRelationTokens": "关系令牌数上限", diff --git a/lightrag_webui/src/locales/zh_TW.json b/lightrag_webui/src/locales/zh_TW.json index 0448e36a..946d0e2e 100644 --- a/lightrag_webui/src/locales/zh_TW.json +++ b/lightrag_webui/src/locales/zh_TW.json @@ -421,6 
+421,9 @@ "chunkTopK": "文本區塊 Top K", "chunkTopKTooltip": "文本區塊檢索數量,適用於所有模式。", "chunkTopKPlaceholder": "輸入文本區塊 chunk_top_k 值", + "filePathFilters": "檔案路徑篩選", + "filePathFiltersTooltip": "僅檢索儲存的檔案路徑包含任一指定子字串的文件。多個篩選條件請以逗號分隔。", + "filePathFiltersPlaceholder": "例如:reports/,meeting-notes.pdf", "historyTurns": "歷史輪次", "historyTurnsTooltip": "回應上下文中考慮的完整對話輪次(使用者-助手對)數量", "historyTurnsPlaceholder": "歷史輪次數", diff --git a/lightrag_webui/src/stores/settings.ts b/lightrag_webui/src/stores/settings.ts index 983f5c43..4c60be8f 100644 --- a/lightrag_webui/src/stores/settings.ts +++ b/lightrag_webui/src/stores/settings.ts @@ -134,7 +134,8 @@ const useSettingsStoreBase = create()( stream: true, history_turns: 0, user_prompt: '', - enable_rerank: true + enable_rerank: true, + file_path_filters: [] }, setTheme: (theme: Theme) => set({ theme }),