fix: Make excluded paths use absolute paths

This commit is contained in:
Igor Ilic 2025-08-29 16:04:14 +02:00
parent fd4deee27c
commit 4159846bb3
3 changed files with 29 additions and 8 deletions

View file

@ -1,6 +1,7 @@
import os import os
import pathlib import pathlib
import asyncio import asyncio
from typing import Optional
from cognee.shared.logging_utils import get_logger, setup_logging from cognee.shared.logging_utils import get_logger, setup_logging
from cognee.modules.observability.get_observe import get_observe from cognee.modules.observability.get_observe import get_observe
@ -28,7 +29,12 @@ logger = get_logger("code_graph_pipeline")
@observe @observe
async def run_code_graph_pipeline(repo_path, include_docs=False, excluded_paths=None): async def run_code_graph_pipeline(
repo_path,
include_docs=False,
excluded_paths: Optional[list[str]] = None,
supported_languages: Optional[list[str]] = None,
):
import cognee import cognee
from cognee.low_level import setup from cognee.low_level import setup
@ -40,8 +46,6 @@ async def run_code_graph_pipeline(repo_path, include_docs=False, excluded_paths=
user = await get_default_user() user = await get_default_user()
detailed_extraction = True detailed_extraction = True
# Multi-language support: allow passing supported_languages
supported_languages = None # defer to task defaults
tasks = [ tasks = [
Task( Task(
get_repo_file_dependencies, get_repo_file_dependencies,
@ -95,7 +99,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=False, excluded_paths=
if __name__ == "__main__": if __name__ == "__main__":
async def main(): async def main():
async for run_status in run_code_graph_pipeline("/Users/igorilic/Desktop/cognee/examples"): async for run_status in run_code_graph_pipeline("REPO_PATH"):
print(f"{run_status.pipeline_run_id}: {run_status.status}") print(f"{run_status.pipeline_run_id}: {run_status.status}")
file_path = os.path.join( file_path = os.path.join(

View file

@ -94,7 +94,15 @@ class CodeRetriever(BaseRetriever):
{"id": res.id, "score": res.score, "payload": res.payload} {"id": res.id, "score": res.score, "payload": res.payload}
) )
existing_collection = []
for collection in self.classes_and_functions_collections: for collection in self.classes_and_functions_collections:
if await vector_engine.has_collection(collection):
existing_collection.append(collection)
if not existing_collection:
raise RuntimeError("No collection found for code retriever")
for collection in existing_collection:
logger.debug(f"Searching {collection} collection with general query") logger.debug(f"Searching {collection} collection with general query")
search_results_code = await vector_engine.search( search_results_code = await vector_engine.search(
collection, query, limit=self.top_k collection, query, limit=self.top_k

View file

@ -1,6 +1,7 @@
import asyncio import asyncio
import math import math
import os import os
from pathlib import Path
from typing import Set from typing import Set
from typing import AsyncGenerator, Optional, List from typing import AsyncGenerator, Optional, List
from uuid import NAMESPACE_OID, uuid5 from uuid import NAMESPACE_OID, uuid5
@ -78,15 +79,22 @@ async def get_source_code_files(
if lang is None: if lang is None:
continue continue
# Exclude tests, common build/venv directories and files provided in exclude_paths # Exclude tests, common build/venv directories and files provided in exclude_paths
excluded_dirs = EXCLUDED_DIRS | set(excluded_paths or []) excluded_dirs = EXCLUDED_DIRS
root_parts = set(os.path.normpath(root).split(os.sep)) excluded_paths = {Path(p).resolve() for p in (excluded_paths or [])} # full paths
root_path = Path(root).resolve()
root_parts = set(root_path.parts) # same as before
base_name, _ext = os.path.splitext(file) base_name, _ext = os.path.splitext(file)
if ( if (
base_name.startswith("test_") base_name.startswith("test_")
or base_name.endswith("_test") # catches Go's *_test.go and similar or base_name.endswith("_test")
or ".test." in file or ".test." in file
or ".spec." in file or ".spec." in file
or (excluded_dirs & root_parts) or (excluded_dirs & root_parts) # name match
or any(
root_path.is_relative_to(p) # full-path match
for p in excluded_paths
)
): ):
continue continue
file_path = os.path.abspath(os.path.join(root, file)) file_path = os.path.abspath(os.path.join(root, file))
@ -164,6 +172,7 @@ async def get_repo_file_dependencies(
"go": [".go"], "go": [".go"],
"rust": [".rs"], "rust": [".rs"],
"cpp": [".cpp", ".c", ".h", ".hpp"], "cpp": [".cpp", ".c", ".h", ".hpp"],
"c": [".c", ".h"],
} }
if supported_languages is not None: if supported_languages is not None:
language_config = { language_config = {