Merge pull request #414 from topoteretes/COG-949

Code graph pipeline improvements and fixes
2025-01-10 14:32:06 +01:00 · 2025-01-10 14:32:06 +01:00 · f7e808eddd
commit f7e808eddd
parent 892666caec 5839ab04db
18 changed files with 255 additions and 54 deletions
--- a/cognee/api/v1/cognify/code_graph_pipeline.py
+++ b/cognee/api/v1/cognify/code_graph_pipeline.py
@@ -3,7 +3,6 @@ import logging
 from pathlib import Path

 from cognee.base_config import get_base_config
-from cognee.infrastructure.databases.vector.embeddings import get_embedding_engine
 from cognee.modules.cognify.config import get_cognify_config
 from cognee.modules.pipelines import run_tasks
 from cognee.modules.pipelines.tasks.Task import Task
@@ -54,8 +53,6 @@ async def run_code_graph_pipeline(repo_path, include_docs=True):
    await cognee.prune.prune_system(metadata=True)
    await create_db_and_tables()

-    embedding_engine = get_embedding_engine()
-
    cognee_config = get_cognify_config()
    user = await get_default_user()

@@ -63,11 +60,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True):
        Task(get_repo_file_dependencies),
        Task(enrich_dependency_graph),
        Task(expand_dependency_graph, task_config={"batch_size": 50}),
-        Task(
-            get_source_code_chunks,
-            embedding_model=embedding_engine.model,
-            task_config={"batch_size": 50},
-        ),
+        Task(get_source_code_chunks, task_config={"batch_size": 50}),
        Task(summarize_code, task_config={"batch_size": 50}),
        Task(add_data_points, task_config={"batch_size": 50}),
    ]
@@ -78,7 +71,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True):
            Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user),
            Task(get_data_list_for_user, dataset_name="repo_docs", user=user),
            Task(classify_documents),
-            Task(extract_chunks_from_documents),
+            Task(extract_chunks_from_documents, max_tokens=cognee_config.max_tokens),
            Task(
                extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50}
            ),
--- a/cognee/modules/chunking/TextChunker.py
+++ b/cognee/modules/chunking/TextChunker.py
@@ -1,7 +1,9 @@
-from uuid import uuid5, NAMESPACE_OID
+from typing import Optional
+from uuid import NAMESPACE_OID, uuid5
+
+from cognee.tasks.chunks import chunk_by_paragraph

 from .models.DocumentChunk import DocumentChunk
-from cognee.tasks.chunks import chunk_by_paragraph


 class TextChunker:
@@ -10,23 +12,36 @@ class TextChunker:

    chunk_index = 0
    chunk_size = 0
+    token_count = 0

-    def __init__(self, document, get_text: callable, chunk_size: int = 1024):
+    def __init__(
+        self, document, get_text: callable, max_tokens: Optional[int] = None, chunk_size: int = 1024
+    ):
        self.document = document
        self.max_chunk_size = chunk_size
        self.get_text = get_text
+        self.max_tokens = max_tokens if max_tokens else float("inf")
+
+    def check_word_count_and_token_count(self, word_count_before, token_count_before, chunk_data):
+        word_count_fits = word_count_before + chunk_data["word_count"] <= self.max_chunk_size
+        token_count_fits = token_count_before + chunk_data["token_count"] <= self.max_tokens
+        return word_count_fits and token_count_fits

    def read(self):
        paragraph_chunks = []
        for content_text in self.get_text():
            for chunk_data in chunk_by_paragraph(
                content_text,
+                self.max_tokens,
                self.max_chunk_size,
                batch_paragraphs=True,
            ):
-                if self.chunk_size + chunk_data["word_count"] <= self.max_chunk_size:
+                if self.check_word_count_and_token_count(
+                    self.chunk_size, self.token_count, chunk_data
+                ):
                    paragraph_chunks.append(chunk_data)
                    self.chunk_size += chunk_data["word_count"]
+                    self.token_count += chunk_data["token_count"]
                else:
                    if len(paragraph_chunks) == 0:
                        yield DocumentChunk(
@@ -66,6 +81,7 @@ class TextChunker:
                            print(e)
                        paragraph_chunks = [chunk_data]
                        self.chunk_size = chunk_data["word_count"]
+                        self.token_count = chunk_data["token_count"]

                    self.chunk_index += 1

--- a/cognee/modules/cognify/config.py
+++ b/cognee/modules/cognify/config.py
@@ -1,12 +1,14 @@
 from functools import lru_cache
 from pydantic_settings import BaseSettings, SettingsConfigDict
 from cognee.shared.data_models import DefaultContentPrediction, SummarizedContent
+from typing import Optional
+import os


 class CognifyConfig(BaseSettings):
    classification_model: object = DefaultContentPrediction
    summarization_model: object = SummarizedContent
-
+    max_tokens: Optional[int] = os.getenv("MAX_TOKENS")
    model_config = SettingsConfigDict(env_file=".env", extra="allow")

    def to_dict(self) -> dict:
--- a/cognee/modules/data/processing/document_types/AudioDocument.py
+++ b/cognee/modules/data/processing/document_types/AudioDocument.py
@@ -1,6 +1,9 @@
+from typing import Optional
+
 from cognee.infrastructure.llm.get_llm_client import get_llm_client
-from .Document import Document
+
 from .ChunkerMapping import ChunkerConfig
+from .Document import Document


 class AudioDocument(Document):
@@ -10,12 +13,14 @@ class AudioDocument(Document):
        result = get_llm_client().create_transcript(self.raw_data_location)
        return result.text

-    def read(self, chunk_size: int, chunker: str):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
        # Transcribe the audio file

        text = self.create_transcript()

        chunker_func = ChunkerConfig.get_chunker(chunker)
-        chunker = chunker_func(self, chunk_size=chunk_size, get_text=lambda: [text])
+        chunker = chunker_func(
+            self, chunk_size=chunk_size, get_text=lambda: [text], max_tokens=max_tokens
+        )

        yield from chunker.read()
--- a/cognee/modules/data/processing/document_types/Document.py
+++ b/cognee/modules/data/processing/document_types/Document.py
@@ -1,3 +1,4 @@
+from typing import Optional
 from uuid import UUID

 from cognee.infrastructure.engine import DataPoint
@@ -10,5 +11,5 @@ class Document(DataPoint):
    mime_type: str
    _metadata: dict = {"index_fields": ["name"], "type": "Document"}

-    def read(self, chunk_size: int, chunker=str) -> str:
+    def read(self, chunk_size: int, chunker=str, max_tokens: Optional[int] = None) -> str:
        pass
--- a/cognee/modules/data/processing/document_types/ImageDocument.py
+++ b/cognee/modules/data/processing/document_types/ImageDocument.py
@@ -1,6 +1,9 @@
+from typing import Optional
+
 from cognee.infrastructure.llm.get_llm_client import get_llm_client
-from .Document import Document
+
 from .ChunkerMapping import ChunkerConfig
+from .Document import Document


 class ImageDocument(Document):
@@ -10,11 +13,13 @@ class ImageDocument(Document):
        result = get_llm_client().transcribe_image(self.raw_data_location)
        return result.choices[0].message.content

-    def read(self, chunk_size: int, chunker: str):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
        # Transcribe the image file
        text = self.transcribe_image()

        chunker_func = ChunkerConfig.get_chunker(chunker)
-        chunker = chunker_func(self, chunk_size=chunk_size, get_text=lambda: [text])
+        chunker = chunker_func(
+            self, chunk_size=chunk_size, get_text=lambda: [text], max_tokens=max_tokens
+        )

        yield from chunker.read()
--- a/cognee/modules/data/processing/document_types/PdfDocument.py
+++ b/cognee/modules/data/processing/document_types/PdfDocument.py
@@ -1,12 +1,15 @@
+from typing import Optional
+
 from pypdf import PdfReader
-from .Document import Document
+
 from .ChunkerMapping import ChunkerConfig
+from .Document import Document


 class PdfDocument(Document):
    type: str = "pdf"

-    def read(self, chunk_size: int, chunker: str):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
        file = PdfReader(self.raw_data_location)

        def get_text():
@@ -15,7 +18,9 @@ class PdfDocument(Document):
                yield page_text

        chunker_func = ChunkerConfig.get_chunker(chunker)
-        chunker = chunker_func(self, chunk_size=chunk_size, get_text=get_text)
+        chunker = chunker_func(
+            self, chunk_size=chunk_size, get_text=get_text, max_tokens=max_tokens
+        )

        yield from chunker.read()

--- a/cognee/modules/data/processing/document_types/TextDocument.py
+++ b/cognee/modules/data/processing/document_types/TextDocument.py
@@ -1,11 +1,13 @@
-from .Document import Document
+from typing import Optional
+
 from .ChunkerMapping import ChunkerConfig
+from .Document import Document


 class TextDocument(Document):
    type: str = "text"

-    def read(self, chunk_size: int, chunker: str):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
        def get_text():
            with open(self.raw_data_location, mode="r", encoding="utf-8") as file:
                while True:
@@ -18,6 +20,8 @@ class TextDocument(Document):

        chunker_func = ChunkerConfig.get_chunker(chunker)

-        chunker = chunker_func(self, chunk_size=chunk_size, get_text=get_text)
+        chunker = chunker_func(
+            self, chunk_size=chunk_size, get_text=get_text, max_tokens=max_tokens
+        )

        yield from chunker.read()
--- a/cognee/modules/data/processing/document_types/UnstructuredDocument.py
+++ b/cognee/modules/data/processing/document_types/UnstructuredDocument.py
@@ -1,14 +1,16 @@
 from io import StringIO
+from typing import Optional

 from cognee.modules.chunking.TextChunker import TextChunker
-from .Document import Document
 from cognee.modules.data.exceptions import UnstructuredLibraryImportError

+from .Document import Document
+

 class UnstructuredDocument(Document):
    type: str = "unstructured"

-    def read(self, chunk_size: int):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None) -> str:
        def get_text():
            try:
                from unstructured.partition.auto import partition
@@ -27,6 +29,6 @@ class UnstructuredDocument(Document):

                yield text

-        chunker = TextChunker(self, chunk_size=chunk_size, get_text=get_text)
+        chunker = TextChunker(self, chunk_size=chunk_size, get_text=get_text, max_tokens=max_tokens)

        yield from chunker.read()
--- a/cognee/tasks/chunks/chunk_by_paragraph.py
+++ b/cognee/tasks/chunks/chunk_by_paragraph.py
@@ -1,10 +1,18 @@
-from uuid import uuid5, NAMESPACE_OID
-from typing import Dict, Any, Iterator
+from typing import Any, Dict, Iterator, Optional, Union
+from uuid import NAMESPACE_OID, uuid5
+
+import tiktoken
+
+from cognee.infrastructure.databases.vector import get_vector_engine
+
 from .chunk_by_sentence import chunk_by_sentence


 def chunk_by_paragraph(
-    data: str, paragraph_length: int = 1024, batch_paragraphs: bool = True
+    data: str,
+    max_tokens: Optional[Union[int, float]] = None,
+    paragraph_length: int = 1024,
+    batch_paragraphs: bool = True,
 ) -> Iterator[Dict[str, Any]]:
    """
    Chunks text by paragraph while preserving exact text reconstruction capability.
@@ -15,16 +23,31 @@ def chunk_by_paragraph(
    chunk_index = 0
    paragraph_ids = []
    last_cut_type = None
+    current_token_count = 0
+    if not max_tokens:
+        max_tokens = float("inf")
+
+    vector_engine = get_vector_engine()
+    embedding_model = vector_engine.embedding_engine.model
+    embedding_model = embedding_model.split("/")[-1]

    for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(
        data, maximum_length=paragraph_length
    ):
        # Check if this sentence would exceed length limit
-        if current_word_count > 0 and current_word_count + word_count > paragraph_length:
+
+        tokenizer = tiktoken.encoding_for_model(embedding_model)
+        token_count = len(tokenizer.encode(sentence))
+
+        if current_word_count > 0 and (
+            current_word_count + word_count > paragraph_length
+            or current_token_count + token_count > max_tokens
+        ):
            # Yield current chunk
            chunk_dict = {
                "text": current_chunk,
                "word_count": current_word_count,
+                "token_count": current_token_count,
                "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
                "paragraph_ids": paragraph_ids,
                "chunk_index": chunk_index,
@@ -37,11 +60,13 @@ def chunk_by_paragraph(
            paragraph_ids = []
            current_chunk = ""
            current_word_count = 0
+            current_token_count = 0
            chunk_index += 1

        paragraph_ids.append(paragraph_id)
        current_chunk += sentence
        current_word_count += word_count
+        current_token_count += token_count

        # Handle end of paragraph
        if end_type in ("paragraph_end", "sentence_cut") and not batch_paragraphs:
@@ -49,6 +74,7 @@ def chunk_by_paragraph(
            chunk_dict = {
                "text": current_chunk,
                "word_count": current_word_count,
+                "token_count": current_token_count,
                "paragraph_ids": paragraph_ids,
                "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
                "chunk_index": chunk_index,
@@ -58,6 +84,7 @@ def chunk_by_paragraph(
            paragraph_ids = []
            current_chunk = ""
            current_word_count = 0
+            current_token_count = 0
            chunk_index += 1

        last_cut_type = end_type
@@ -67,6 +94,7 @@ def chunk_by_paragraph(
        chunk_dict = {
            "text": current_chunk,
            "word_count": current_word_count,
+            "token_count": current_token_count,
            "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
            "paragraph_ids": paragraph_ids,
            "chunk_index": chunk_index,
--- a/cognee/tasks/documents/extract_chunks_from_documents.py
+++ b/cognee/tasks/documents/extract_chunks_from_documents.py
@@ -1,9 +1,16 @@
+from typing import Optional
+
 from cognee.modules.data.processing.document_types.Document import Document


 async def extract_chunks_from_documents(
-    documents: list[Document], chunk_size: int = 1024, chunker="text_chunker"
+    documents: list[Document],
+    chunk_size: int = 1024,
+    chunker="text_chunker",
+    max_tokens: Optional[int] = None,
 ):
    for document in documents:
-        for document_chunk in document.read(chunk_size=chunk_size, chunker=chunker):
+        for document_chunk in document.read(
+            chunk_size=chunk_size, chunker=chunker, max_tokens=max_tokens
+        ):
            yield document_chunk
--- a/cognee/tasks/repo_processor/get_non_code_files.py
+++ b/cognee/tasks/repo_processor/get_non_code_files.py
@@ -29,8 +29,105 @@ async def get_non_py_files(repo_path):
        "*.egg-info",
    }

+    ALLOWED_EXTENSIONS = {
+        ".txt",
+        ".md",
+        ".csv",
+        ".json",
+        ".xml",
+        ".yaml",
+        ".yml",
+        ".html",
+        ".css",
+        ".js",
+        ".ts",
+        ".jsx",
+        ".tsx",
+        ".sql",
+        ".log",
+        ".ini",
+        ".toml",
+        ".properties",
+        ".sh",
+        ".bash",
+        ".dockerfile",
+        ".gitignore",
+        ".gitattributes",
+        ".makefile",
+        ".pyproject",
+        ".requirements",
+        ".env",
+        ".pdf",
+        ".doc",
+        ".docx",
+        ".dot",
+        ".dotx",
+        ".rtf",
+        ".wps",
+        ".wpd",
+        ".odt",
+        ".ott",
+        ".ottx",
+        ".txt",
+        ".wp",
+        ".sdw",
+        ".sdx",
+        ".docm",
+        ".dotm",
+        # Additional extensions for other programming languages
+        ".java",
+        ".c",
+        ".cpp",
+        ".h",
+        ".cs",
+        ".go",
+        ".php",
+        ".rb",
+        ".swift",
+        ".pl",
+        ".lua",
+        ".rs",
+        ".scala",
+        ".kt",
+        ".sh",
+        ".sql",
+        ".v",
+        ".asm",
+        ".pas",
+        ".d",
+        ".ml",
+        ".clj",
+        ".cljs",
+        ".erl",
+        ".ex",
+        ".exs",
+        ".f",
+        ".fs",
+        ".r",
+        ".pyi",
+        ".pdb",
+        ".ipynb",
+        ".rmd",
+        ".cabal",
+        ".hs",
+        ".nim",
+        ".vhdl",
+        ".verilog",
+        ".svelte",
+        ".html",
+        ".css",
+        ".scss",
+        ".less",
+        ".json5",
+        ".yaml",
+        ".yml",
+    }
+
    def should_process(path):
-        return not any(pattern in path for pattern in IGNORED_PATTERNS)
+        _, ext = os.path.splitext(path)
+        return ext in ALLOWED_EXTENSIONS and not any(
+            pattern in path for pattern in IGNORED_PATTERNS
+        )

    non_py_files_paths = [
        os.path.join(root, file)
--- a/cognee/tasks/repo_processor/get_source_code_chunks.py
+++ b/cognee/tasks/repo_processor/get_source_code_chunks.py
@@ -5,6 +5,7 @@ from uuid import NAMESPACE_OID, uuid5
 import parso
 import tiktoken

+from cognee.infrastructure.databases.vector import get_vector_engine
 from cognee.infrastructure.engine import DataPoint
 from cognee.shared.CodeGraphEntities import CodeFile, CodePart, SourceCodeChunk

@@ -126,6 +127,9 @@ def get_source_code_chunks_from_code_part(
        logger.error(f"No source code in CodeFile {code_file_part.id}")
        return

+    vector_engine = get_vector_engine()
+    embedding_model = vector_engine.embedding_engine.model
+    model_name = embedding_model.split("/")[-1]
    tokenizer = tiktoken.encoding_for_model(model_name)
    max_subchunk_tokens = max(1, int(granularity * max_tokens))
    subchunk_token_counts = _get_subchunk_token_counts(
@@ -150,7 +154,7 @@ def get_source_code_chunks_from_code_part(


 async def get_source_code_chunks(
-    data_points: list[DataPoint], embedding_model="text-embedding-3-large"
+    data_points: list[DataPoint],
 ) -> AsyncGenerator[list[DataPoint], None]:
    """Processes code graph datapoints, create SourceCodeChink datapoints."""
    # TODO: Add support for other embedding models, with max_token mapping
@@ -165,9 +169,7 @@ async def get_source_code_chunks(
            for code_part in data_point.contains:
                try:
                    yield code_part
-                    for source_code_chunk in get_source_code_chunks_from_code_part(
-                        code_part, model_name=embedding_model
-                    ):
+                    for source_code_chunk in get_source_code_chunks_from_code_part(code_part):
                        yield source_code_chunk
                except Exception as e:
                    logger.error(f"Error processing code part: {e}")
--- a/cognee/tests/integration/documents/UnstructuredDocument_test.py
+++ b/cognee/tests/integration/documents/UnstructuredDocument_test.py
@@ -68,7 +68,7 @@ def test_UnstructuredDocument():
    )

    # Test PPTX
-    for paragraph_data in pptx_document.read(chunk_size=1024):
+    for paragraph_data in pptx_document.read(chunk_size=1024, chunker="text_chunker"):
        assert 19 == paragraph_data.word_count, f" 19 != {paragraph_data.word_count = }"
        assert 104 == len(paragraph_data.text), f" 104 != {len(paragraph_data.text) = }"
        assert (
@@ -76,7 +76,7 @@ def test_UnstructuredDocument():
        ), f" sentence_cut != {paragraph_data.cut_type = }"

    # Test DOCX
-    for paragraph_data in docx_document.read(chunk_size=1024):
+    for paragraph_data in docx_document.read(chunk_size=1024, chunker="text_chunker"):
        assert 16 == paragraph_data.word_count, f" 16 != {paragraph_data.word_count = }"
        assert 145 == len(paragraph_data.text), f" 145 != {len(paragraph_data.text) = }"
        assert (
@@ -84,7 +84,7 @@ def test_UnstructuredDocument():
        ), f" sentence_end != {paragraph_data.cut_type = }"

    # TEST CSV
-    for paragraph_data in csv_document.read(chunk_size=1024):
+    for paragraph_data in csv_document.read(chunk_size=1024, chunker="text_chunker"):
        assert 15 == paragraph_data.word_count, f" 15 != {paragraph_data.word_count = }"
        assert (
            "A A A A A A A A A,A A A A A A,A A" == paragraph_data.text
@@ -94,7 +94,7 @@ def test_UnstructuredDocument():
        ), f" sentence_cut != {paragraph_data.cut_type = }"

    # Test XLSX
-    for paragraph_data in xlsx_document.read(chunk_size=1024):
+    for paragraph_data in xlsx_document.read(chunk_size=1024, chunker="text_chunker"):
        assert 36 == paragraph_data.word_count, f" 36 != {paragraph_data.word_count = }"
        assert 171 == len(paragraph_data.text), f" 171 != {len(paragraph_data.text) = }"
        assert (
--- a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py
+++ b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py
@@ -27,7 +27,11 @@ def test_chunk_by_paragraph_isomorphism(input_text, paragraph_length, batch_para
    list(product(list(INPUT_TEXTS.values()), paragraph_lengths, batch_paragraphs_vals)),
 )
 def test_paragraph_chunk_length(input_text, paragraph_length, batch_paragraphs):
-    chunks = list(chunk_by_paragraph(input_text, paragraph_length, batch_paragraphs))
+    chunks = list(
+        chunk_by_paragraph(
+            data=input_text, paragraph_length=paragraph_length, batch_paragraphs=batch_paragraphs
+        )
+    )

    chunk_lengths = np.array([len(list(chunk_by_word(chunk["text"]))) for chunk in chunks])

@@ -42,7 +46,9 @@ def test_paragraph_chunk_length(input_text, paragraph_length, batch_paragraphs):
    list(product(list(INPUT_TEXTS.values()), paragraph_lengths, batch_paragraphs_vals)),
 )
 def test_chunk_by_paragraph_chunk_numbering(input_text, paragraph_length, batch_paragraphs):
-    chunks = chunk_by_paragraph(input_text, paragraph_length, batch_paragraphs)
+    chunks = chunk_by_paragraph(
+        data=input_text, paragraph_length=paragraph_length, batch_paragraphs=batch_paragraphs
+    )
    chunk_indices = np.array([chunk["chunk_index"] for chunk in chunks])
    assert np.all(
        chunk_indices == np.arange(len(chunk_indices))
--- a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py
+++ b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py
@@ -49,7 +49,9 @@ Third paragraph is cut and is missing the dot at the end""",

 def run_chunking_test(test_text, expected_chunks):
    chunks = []
-    for chunk_data in chunk_by_paragraph(test_text, 12, batch_paragraphs=False):
+    for chunk_data in chunk_by_paragraph(
+        data=test_text, paragraph_length=12, batch_paragraphs=False
+    ):
        chunks.append(chunk_data)

    assert len(chunks) == 3
--- a/evals/eval_swe_bench.py
+++ b/evals/eval_swe_bench.py
@@ -34,9 +34,8 @@ def check_install_package(package_name):

 async def generate_patch_with_cognee(instance, llm_client, search_type=SearchType.CHUNKS):
    repo_path = download_github_repo(instance, "../RAW_GIT_REPOS")
-    pipeline = await run_code_graph_pipeline(repo_path)

-    async for result in pipeline:
+    async for result in run_code_graph_pipeline(repo_path, include_docs=True):
        print(result)

    print("Here we have the repo under the repo_path")
@@ -47,7 +46,9 @@ async def generate_patch_with_cognee(instance, llm_client, search_type=SearchTyp
    instructions = read_query_prompt("patch_gen_kg_instructions.txt")

    retrieved_edges = await brute_force_triplet_search(
-        problem_statement, top_k=3, collections=["data_point_source_code", "data_point_text"]
+        problem_statement,
+        top_k=3,
+        collections=["code_summary_text"],
    )

    retrieved_edges_str = retrieved_edges_to_string(retrieved_edges)
--- a/examples/python/code_graph_example.py
+++ b/examples/python/code_graph_example.py
@@ -1,7 +1,9 @@
 import argparse
 import asyncio
+import logging

 from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline
+from cognee.shared.utils import setup_logging


 async def main(repo_path, include_docs):
@@ -9,7 +11,7 @@ async def main(repo_path, include_docs):
        print(result)


-if __name__ == "__main__":
+def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--repo_path", type=str, required=True, help="Path to the repository")
    parser.add_argument(
@@ -18,5 +20,28 @@ if __name__ == "__main__":
        default=True,
        help="Whether or not to process non-code files",
    )
-    args = parser.parse_args()
-    asyncio.run(main(args.repo_path, args.include_docs))
+    parser.add_argument(
+        "--time",
+        type=lambda x: x.lower() in ("true", "1"),
+        default=True,
+        help="Whether or not to time the pipeline run",
+    )
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    setup_logging(logging.ERROR)
+
+    args = parse_args()
+
+    if args.time:
+        import time
+
+        start_time = time.time()
+        asyncio.run(main(args.repo_path, args.include_docs))
+        end_time = time.time()
+        print("\n" + "=" * 50)
+        print(f"Pipeline Execution Time: {end_time - start_time:.2f} seconds")
+        print("=" * 50 + "\n")
+    else:
+        asyncio.run(main(args.repo_path, args.include_docs))