Docstring tasks. (#878)

## Description  ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.
2025-05-27 21:33:16 +02:00 · 2025-05-27 21:33:16 +02:00 · bb68d6a0df
commit bb68d6a0df
parent ec68e99438
18 changed files with 610 additions and 37 deletions
--- a/cognee/tasks/chunks/chunk_by_paragraph.py
+++ b/cognee/tasks/chunks/chunk_by_paragraph.py
@ -10,14 +10,23 @@ def chunk_by_paragraph(
    batch_paragraphs: bool = True,
 ) -> Iterator[Dict[str, Any]]:
    """
-    Chunks text by paragraph while preserving exact text reconstruction capability.
+    Chunk the input text by paragraph while enabling exact text reconstruction.
    When chunks are joined with empty string "", they reproduce the original text exactly.
-    Notes:
+    This function divides the given text data into smaller chunks based on the specified
-        - Tokenization is handled using our tokenization adapters, ensuring compatibility with the vector engine's embedding model.
+    maximum chunk size. It ensures that when the generated chunks are concatenated, they
-        - If `batch_paragraphs` is False, each paragraph will be yielded as a separate chunk.
+    reproduce the original text accurately. The tokenization process is handled by adapters
-        - Handles cases where paragraphs exceed the specified token or word limits by splitting them as needed.
+    compatible with the vector engine's embedding model, and the function can operate in
-        - Remaining text at the end of the input will be yielded as a final chunk.
+    either batch mode or paragraph mode, based on the `batch_paragraphs` flag.
    Parameters:
    -----------
        - data (str): The input text to be chunked.
        - max_chunk_size: The maximum allowed size for each chunk, in terms of tokens or
          words.
        - batch_paragraphs (bool): Flag indicating whether paragraphs are batched
          together into chunks. If set to False, each paragraph is yielded as a
          separate chunk. (default True)
    """
    current_chunk = ""
    chunk_index = 0
--- a/cognee/tasks/chunks/chunk_by_sentence.py
+++ b/cognee/tasks/chunks/chunk_by_sentence.py
@ -5,6 +5,23 @@ from cognee.infrastructure.databases.vector.embeddings import get_embedding_engi
 def get_word_size(word: str) -> int:
    """
    Calculate the size of a given word in terms of tokens.
    If an embedding engine's tokenizer is available, count the tokens for the provided word.
    If the tokenizer is not available, assume the word counts as one token.
    Parameters:
    -----------
        - word (str): The word for which the token size is to be calculated.
    Returns:
    --------
        - int: The number of tokens in the word as counted by the embedding engine's
          tokenizer, or 1 when no tokenizer is available.
    """
    embedding_engine = get_embedding_engine()
    if embedding_engine.tokenizer:
        return embedding_engine.tokenizer.count_tokens(word)
@ -16,12 +33,22 @@ def chunk_by_sentence(
    data: str, maximum_size: Optional[int] = None
 ) -> Iterator[Tuple[UUID, str, int, Optional[str]]]:
    """
-    Splits the input text into sentences based on word-level processing, with optional sentence length constraints.
+    Splits text into sentences while preserving word and paragraph boundaries.
-    Notes:
+    This function processes the input string, dividing it into sentences based on word-level
-        - Relies on the `chunk_by_word` function for word-level tokenization and classification.
+    tokenization. Each sentence is identified with a unique UUID, and it handles scenarios
-        - Ensures sentences within paragraphs are uniquely identifiable using UUIDs.
+    where the text may end mid-sentence by tagging it with a specific type. If a maximum
-        - Handles cases where the text ends mid-sentence by appending a special "sentence_cut" type.
+    sentence length is specified, the function ensures that sentences do not exceed this
    length, raising a ValueError if an individual word surpasses it. The function utilizes
    an external word processing function `chunk_by_word` to determine the structure of the
    text.
    Parameters:
    -----------
        - data (str): The input text to be split into sentences.
        - maximum_size (Optional[int]): An optional limit on the maximum size of sentences
          generated. (default None)
    """
    sentence = ""
    paragraph_id = uuid4()
--- a/cognee/tasks/chunks/chunk_by_word.py
+++ b/cognee/tasks/chunks/chunk_by_word.py
@ -8,15 +8,23 @@ PARAGRAPH_ENDINGS = r"[\n\r]"
 def is_real_paragraph_end(last_char: str, current_pos: int, text: str) -> bool:
    """
-    Determines if the current position represents a real paragraph ending.
+    Determine if the current position represents a valid paragraph end.
-    Args:
+    The function checks if the last character indicates a possible sentence ending, then
-        last_char: The last processed character
+    verifies if the subsequent characters lead to a valid paragraph end based on specific
-        current_pos: Current position in the text
+    conditions.
-        text: The input text
+
    Parameters:
    -----------
        - last_char (str): The last processed character
        - current_pos (int): Current position in the text
        - text (str): The input text
    Returns:
-        bool: True if this is a real paragraph end, False otherwise
+    --------
        - bool: True if this is a real paragraph end, False otherwise
    """
    if re.match(SENTENCE_ENDINGS, last_char):
        return True
@ -38,9 +46,16 @@ def is_real_paragraph_end(last_char: str, current_pos: int, text: str) -> bool:
 def chunk_by_word(data: str) -> Iterator[Tuple[str, str]]:
    """
-    Chunks text into words and endings while preserving whitespace.
+    Chunk text into words and sentence endings, preserving whitespace.
-    Whitespace is included with the preceding word.
+
-    Outputs can be joined with "" to recreate the original input.
+    Whitespace is included with the preceding word. Outputs can be joined with "" to
    recreate the original input.
    Parameters:
    -----------
        - data (str): The input string of text to be chunked into words and sentence
          endings.
    """
    current_chunk = ""
    i = 0
--- a/cognee/tasks/code/enrich_dependency_graph_checker.py
+++ b/cognee/tasks/code/enrich_dependency_graph_checker.py
@ -6,6 +6,14 @@ from cognee.tasks.repo_processor.enrich_dependency_graph import enrich_dependenc
 def main():
    """
    Execute the main logic of the dependency graph processor.
    This function sets up argument parsing to retrieve the repository path, checks the
    existence of the specified path, and processes the repository to produce a dependency
    graph. If the repository path does not exist, it logs an error message and terminates
    without further execution.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("repo_path", help="Path to the repository")
    args = parser.parse_args()
--- a/cognee/tasks/code/get_repo_dependency_graph_checker.py
+++ b/cognee/tasks/code/get_repo_dependency_graph_checker.py
@ -5,6 +5,14 @@ from cognee.tasks.repo_processor.get_repo_file_dependencies import get_repo_file
 def main():
    """
    Parse the command line arguments and print the repository file dependencies.
    This function sets up an argument parser to retrieve the path of a repository. It checks
    if the provided path exists and if it doesn’t, it prints an error message and exits. If
    the path is valid, it calls an asynchronous function to get the dependencies and prints
    the nodes and their relations in the dependency graph.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("repo_path", help="Path to the repository")
    args = parser.parse_args()
--- a/cognee/tasks/completion/exceptions/exceptions.py
+++ b/cognee/tasks/completion/exceptions/exceptions.py
@ -3,6 +3,21 @@ from fastapi import status
 class NoRelevantDataError(CogneeApiError):
    """
    Represents an error when no relevant data is found during a search. This class is a
    subclass of CogneeApiError.
    Public methods:
    - __init__
    Instance variables:
    - message
    - name
    - status_code
    """
    def __init__(
        self,
        message: str = "Search did not find any data.",
--- a/cognee/tasks/documents/classify_documents.py
+++ b/cognee/tasks/documents/classify_documents.py
@ -52,7 +52,21 @@ EXTENSION_TO_DOCUMENT_CLASS = {
 def update_node_set(document):
-    """Extracts node_set from document's external_metadata."""
+    """
    Extracts node_set from document's external_metadata.
    Parses the external_metadata of the given document and updates the document's
    belongs_to_set attribute with NodeSet objects generated from the node_set found in the
    external_metadata. If the external_metadata is not valid JSON, is not a dictionary, does
    not contain the 'node_set' key, or if node_set is not a list, the function has no effect
    and will return early.
    Parameters:
    -----------
        - document: The document object which contains external_metadata from which the
          node_set will be extracted.
    """
    try:
        external_metadata = json.loads(document.external_metadata)
    except json.JSONDecodeError:
@ -76,11 +90,26 @@ def update_node_set(document):
 async def classify_documents(data_documents: list[Data]) -> list[Document]:
    """
-    Classifies a list of data items into specific document types based on file extensions.
+    Classifies a list of data items into specific document types based on their file
    extensions.
-    Notes:
+    This function processes each item in the provided list of data documents, retrieves
-        - The function relies on `get_metadata` to retrieve metadata information for each data item.
+    relevant metadata, and creates instances of document classes mapped to their extensions.
-        - Ensure the `Data` objects and their attributes (e.g., `extension`, `id`) are valid before calling this function.
+    It ensures that the data items are valid before performing the classification and
    invokes `update_node_set` to extract and set relevant node information from the
    document's external metadata.
    Parameters:
    -----------
        - data_documents (list[Data]): A list of Data objects representing the documents to
          be classified.
    Returns:
    --------
        - list[Document]: A list of Document objects created based on the classified data
          documents.
    """
    documents = []
    for data_item in data_documents:
--- a/cognee/tasks/graph/infer_data_ontology.py
+++ b/cognee/tasks/graph/infer_data_ontology.py
@ -33,6 +33,25 @@ logger = get_logger("task:infer_data_ontology")
 async def extract_ontology(content: str, response_model: Type[BaseModel]):
    """
    Extracts structured ontology from the provided content using a pre-defined LLM client.
    This asynchronous function retrieves a system prompt from a file and utilizes an LLM
    client to create a structured output based on the input content and specified response
    model.
    Parameters:
    -----------
        - content (str): The content from which to extract the ontology.
        - response_model (Type[BaseModel]): The model that defines the structure of the
          output ontology.
    Returns:
    --------
        The structured ontology extracted from the content.
    """
    llm_client = get_llm_client()
    system_prompt = read_query_prompt("extract_ontology.txt")
@ -43,10 +62,38 @@ async def extract_ontology(content: str, response_model: Type[BaseModel]):
 class OntologyEngine:
    """
    Manage ontology data and operations for graph structures, providing methods for data
    loading, flattening models, and adding ontological relationships to a graph database.
    Public methods:
    - flatten_model
    - recursive_flatten
    - load_data
    - add_graph_ontology
    """
    async def flatten_model(
        self, model: NodeModel, parent_id: Optional[str] = None
    ) -> Dict[str, Any]:
-        """Flatten the model to a dictionary."""
+        """
        Flatten the model to a dictionary including optional parent ID and relationship details
        if available.
        Parameters:
        -----------
            - model (NodeModel): The NodeModel instance to flatten.
            - parent_id (Optional[str]): An optional ID of the parent node for hierarchical
              purposes. (default None)
        Returns:
        --------
            - Dict[str, Any]: A dictionary representation of the model with flattened
              attributes.
        """
        result = model.dict()
        result["parent_id"] = parent_id
        if model.default_relationship:
@ -62,7 +109,23 @@ class OntologyEngine:
    async def recursive_flatten(
        self, items: Union[List[Dict[str, Any]], Dict[str, Any]], parent_id: Optional[str] = None
    ) -> List[Dict[str, Any]]:
-        """Recursively flatten the items."""
+        """
        Recursively flatten a hierarchical structure of models into a flat list of dictionaries.
        Parameters:
        -----------
            - items (Union[List[Dict[str, Any]], Dict[str, Any]]): A list or dictionary
              containing models to flatten.
            - parent_id (Optional[str]): An optional ID of the parent node to maintain hierarchy
              during flattening. (default None)
        Returns:
        --------
            - List[Dict[str, Any]]: A flat list of dictionaries representing the hierarchical
              model structure.
        """
        flat_list = []
        if isinstance(items, list):
@ -76,7 +139,20 @@ class OntologyEngine:
        return flat_list
    async def load_data(self, file_path: str) -> Union[List[Dict[str, Any]], Dict[str, Any]]:
-        """Load data from a JSON or CSV file."""
+        """
        Load data from a specified JSON or CSV file and return it in a structured format.
        Parameters:
        -----------
            - file_path (str): The path to the file to load data from.
        Returns:
        --------
            - Union[List[Dict[str, Any]], Dict[str, Any]]: Parsed data from the file as either a
              list of dictionaries or a single dictionary depending on content type.
        """
        try:
            if file_path.endswith(".json"):
                async with aiofiles.open(file_path, mode="r") as f:
@ -96,7 +172,18 @@ class OntologyEngine:
            )
    async def add_graph_ontology(self, file_path: str = None, documents: list = None):
-        """Add graph ontology from a JSON or CSV file or infer from documents content."""
+        """
        Add graph ontology from a JSON or CSV file, or infer relationships from provided
        document content. Raise exceptions for invalid file types or missing entities.
        Parameters:
        -----------
            - file_path (str): Optional path to a file containing data to be loaded. (default
              None)
            - documents (list): Optional list of document objects for content extraction if no
              file path is provided. (default None)
        """
        if file_path is None:
            initial_chunks_and_ids = []
@ -202,6 +289,17 @@ class OntologyEngine:
 async def infer_data_ontology(documents, ontology_model=KnowledgeGraph, root_node_id=None):
    """
    Infer data ontology from provided documents and optionally add it to a graph.
    Parameters:
    -----------
        - documents: The documents from which to infer the ontology.
        - ontology_model: The ontology model to use for the inference. (default
          KnowledgeGraph)
        - root_node_id: An optional root node identifier for the ontology. (default None)
    """
    if ontology_model == KnowledgeGraph:
        ontology_engine = OntologyEngine()
        root_node_id = await ontology_engine.add_graph_ontology(documents=documents)
--- a/cognee/tasks/graph/models.py
+++ b/cognee/tasks/graph/models.py
@ -3,12 +3,40 @@ from pydantic import BaseModel, Field
 class RelationshipModel(BaseModel):
    """
    Represents a relationship between two entities in a model.
    This class holds the type of the relationship and the identifiers for the source and
    target entities. It includes the following public instance variables:
    - type: A string indicating the type of relationship.
    - source: A string representing the source entity of the relationship.
    - target: A string representing the target entity of the relationship.
    """
    type: str
    source: str
    target: str
 class NodeModel(BaseModel):
    """
    Represents a node in a hierarchical model structure with relationships to other nodes.
    Public methods:
    - __init__(self, node_id: str, name: str, default_relationship:
    Optional[RelationshipModel] = None, children: List[Union[Dict[str, Any], NodeModel]] =
    Field(default_factory=list))
    Instance variables:
    - node_id: Unique identifier for the node.
    - name: Name of the node.
    - default_relationship: Default relationship associated with the node, if any.
    - children: List of child nodes or dictionaries representing children for this node.
    """
    node_id: str
    name: str
    default_relationship: Optional[RelationshipModel] = None
@ -19,12 +47,28 @@ NodeModel.model_rebuild()
 class OntologyNode(BaseModel):
    """
    Represents a node in an ontology with a unique identifier, name, and description.
    """
    id: str = Field(..., description="Unique identifier made from node name.")
    name: str
    description: str
 class OntologyEdge(BaseModel):
    """
    Represent an edge in an ontology, connecting a source and target with a specific
    relationship type.
    The class includes the following instance variables:
    - id: A unique identifier for the edge.
    - source_id: The identifier of the source node.
    - target_id: The identifier of the target node.
    - relationship_type: The type of relationship represented by this edge, defining how the
    source and target are related.
    """
    id: str
    source_id: str
    target_id: str
@ -32,5 +76,14 @@ class OntologyEdge(BaseModel):
 class GraphOntology(BaseModel):
    """
    Represents a graph-based structure of ontology consisting of nodes and edges.
    The GraphOntology class contains a collection of OntologyNode instances representing the
    nodes of the graph and OntologyEdge instances representing the relationships between
    them. Instance variables:
    - nodes: A list of OntologyNode instances.
    - edges: A list of OntologyEdge instances.
    """
    nodes: list[OntologyNode]
    edges: list[OntologyEdge]
--- a/cognee/tasks/ingestion/get_dlt_destination.py
+++ b/cognee/tasks/ingestion/get_dlt_destination.py
@ -10,11 +10,19 @@ from cognee.infrastructure.databases.relational import get_relational_config
@lru_cache
 def get_dlt_destination() -> Union[type[dlt.destinations.sqlalchemy], None]:
    """
-    Handles propagation of the cognee database configuration to the dlt library
+    Handle the propagation of the cognee database configuration to the dlt library.
    This function determines the appropriate sqlalchemy destination based on the database
    provider specified in the relational configuration. It constructs the destination
    credentials for either sqlite or postgres databases accordingly. If the database
    provider is neither sqlite nor postgres, it returns None.
    Returns:
-        sqlachemy: sqlachemy destination used by the dlt library
+    --------
        - Union[type[dlt.destinations.sqlalchemy], None]: An instance of sqlalchemy
          destination used by the dlt library, or None if the database provider is
          unsupported.
    """
    relational_config = get_relational_config()
--- a/cognee/tasks/ingestion/transform_data.py
+++ b/cognee/tasks/ingestion/transform_data.py
@ -5,6 +5,26 @@ from typing import Union
 def get_data_from_llama_index(data_point: Union[Document, ImageDocument], dataset_name: str) -> str:
    """
    Retrieve the file path based on the data point type.
    Ensure the data point is an instance of either Document or ImageDocument. If the data
    point already carries a file path (in its metadata, or as an image path), return it;
    otherwise, save the data point's text to a file and return the newly created file path.
    Parameters:
    -----------
        - data_point (Union[Document, ImageDocument]): An instance of Document or
          ImageDocument to extract data from.
        - dataset_name (str): The name of the dataset associated with the data point.
    Returns:
    --------
        - str: The file path as a string where the data is stored or the existing path from
          the data point.
    """
    # Specific type checking is used to ensure it's not a child class from Document
    if isinstance(data_point, Document) and type(data_point) is Document:
        file_path = data_point.metadata.get("file_path")
--- a/cognee/tasks/repo_processor/get_local_dependencies.py
+++ b/cognee/tasks/repo_processor/get_local_dependencies.py
@ -19,10 +19,34 @@ logger = get_logger()
 class FileParser:
    """
    Handles the parsing of files into source code and an abstract syntax tree
    representation. Public methods include:
    - parse_file: Parses a file and returns its source code and syntax tree representation.
    """
    def __init__(self):
        self.parsed_files = {}
    async def parse_file(self, file_path: str) -> tuple[str, Tree]:
        """
        Parse a file and return its source code along with its syntax tree representation.
        If the file has already been parsed, retrieve the result from memory instead of reading
        the file again.
        Parameters:
        -----------
            - file_path (str): The path of the file to parse.
        Returns:
        --------
            - tuple[str, Tree]: A tuple containing the source code of the file and its
              corresponding syntax tree representation.
        """
        PY_LANGUAGE = Language(tspython.language())
        source_code_parser = Parser(PY_LANGUAGE)
@ -35,6 +59,24 @@ class FileParser:
 async def get_source_code(file_path: str):
    """
    Read source code from a file asynchronously.
    This function attempts to open a file specified by the given file path, read its
    contents, and return the source code. In case of any errors during the file reading
    process, it logs an error message and returns None.
    Parameters:
    -----------
        - file_path (str): The path to the file from which to read the source code.
    Returns:
    --------
        Returns the contents of the file as a string if successful, or None if an error
        occurs.
    """
    try:
        async with aiofiles.open(file_path, "r", encoding="utf-8") as f:
            source_code = await f.read()
@ -45,7 +87,22 @@ async def get_source_code(file_path: str):
 def resolve_module_path(module_name):
-    """Find the file path of a module."""
+    """
    Find the file path of a module.
    Return the file path of the specified module if found, or return None if the module does
    not exist or cannot be located.
    Parameters:
    -----------
        - module_name: The name of the module whose file path is to be resolved.
    Returns:
    --------
        The file path of the module as a string or None if the module is not found.
    """
    try:
        spec = importlib.util.find_spec(module_name)
        if spec and spec.origin:
@ -58,7 +115,23 @@ def resolve_module_path(module_name):
 def find_function_location(
    module_path: str, function_name: str, parser: FileParser
 ) -> Optional[tuple[str, str]]:
-    """Find the function definition in the module."""
+    """
    Find the location of a function definition in a specified module.
    Parameters:
    -----------
        - module_path (str): The path to the module where the function is defined.
        - function_name (str): The name of the function whose location is to be found.
        - parser (FileParser): An instance of FileParser used to parse the module's source
          code.
    Returns:
    --------
        - Optional[tuple[str, str]]: Returns a tuple containing the module path and the
          start point of the function if found; otherwise, returns None.
    """
    if not module_path or not os.path.exists(module_path):
        return None
@ -78,6 +151,24 @@ def find_function_location(
 async def get_local_script_dependencies(
    repo_path: str, script_path: str, detailed_extraction: bool = False
 ) -> CodeFile:
    """
    Retrieve local script dependencies and create a CodeFile object.
    Parameters:
    -----------
        - repo_path (str): The path to the repository that contains the script.
        - script_path (str): The path of the script for which dependencies are being
          extracted.
        - detailed_extraction (bool): A flag indicating whether to perform a detailed
          extraction of code components. (default False)
    Returns:
    --------
        - CodeFile: Returns a CodeFile object containing information about the script,
          including its dependencies and definitions.
    """
    code_file_parser = FileParser()
    source_code, source_code_tree = await code_file_parser.parse_file(script_path)
@ -113,6 +204,24 @@ async def get_local_script_dependencies(
 def find_node(nodes: list[Node], condition: callable) -> Node:
    """
    Find and return the first node that satisfies the given condition.
    Iterate through the provided list of nodes and return the first node for which the
    condition callable returns True. If no such node is found, return None.
    Parameters:
    -----------
        - nodes (list[Node]): A list of Node objects to search through.
        - condition (callable): A callable that takes a Node and returns a boolean
          indicating if the node meets specified criteria.
    Returns:
    --------
        - Node: The first Node that matches the condition, or None if no such node exists.
    """
    for node in nodes:
        if condition(node):
            return node
@ -123,6 +232,30 @@ def find_node(nodes: list[Node], condition: callable) -> Node:
 async def extract_code_parts(
    tree_root: Node, script_path: str, existing_nodes: list[DataPoint] = {}
 ) -> AsyncGenerator[DataPoint, None]:
    """
    Extract code parts from a given AST node tree asynchronously.
    Iteratively yields DataPoint nodes representing import statements, function definitions,
    and class definitions found in the children of the specified tree root. The function
    checks if nodes are already present in the existing_nodes dictionary to prevent
    duplicates. This function has to be used in an asynchronous context, and it requires a
    valid tree_root and proper initialization of existing_nodes.
    Parameters:
    -----------
        - tree_root (Node): The root node of the AST tree containing code parts to extract.
        - script_path (str): The file path of the script from which the AST was generated.
        - existing_nodes: A dictionary that holds already extracted DataPoint nodes to
          avoid duplicates. Note that the signature annotates this parameter as
          list[DataPoint] even though its default is a dict. (default {})
    Returns:
    --------
        Yields DataPoint nodes representing imported modules, functions, and classes.
    """
    for child_node in tree_root.children:
        if child_node.type == "import_statement" or child_node.type == "import_from_statement":
            parts = child_node.text.decode("utf-8").split()
--- a/cognee/tasks/repo_processor/get_non_code_files.py
+++ b/cognee/tasks/repo_processor/get_non_code_files.py
@ -2,7 +2,24 @@ import os
 async def get_non_py_files(repo_path):
-    """Get files that are not .py files and their contents"""
+    """
    Get files that are not .py files and their contents.
    Check if the specified repository path exists and if so, traverse the directory,
    collecting the paths of files that do not have a .py extension and meet the
    criteria set in the allowed and ignored patterns. Return a list of paths to
    those files.
    Parameters:
    -----------
        - repo_path: The file system path to the repository to scan for non-Python files.
    Returns:
    --------
        A list of file paths that are not Python files and meet the specified criteria.
    """
    if not os.path.exists(repo_path):
        return {}
@ -111,6 +128,22 @@ async def get_non_py_files(repo_path):
    }
    def should_process(path):
        """
        Determine if a file should be processed based on its extension and path patterns.
        This function checks if the file extension is in the allowed list and ensures that none
        of the ignored patterns are present in the provided file path.
        Parameters:
        -----------
            - path: The file path to check for processing eligibility.
        Returns:
        --------
            Returns True if the file should be processed; otherwise, False.
        """
        _, ext = os.path.splitext(path)
        return ext in ALLOWED_EXTENSIONS and not any(
            pattern in path for pattern in IGNORED_PATTERNS
--- a/cognee/tasks/repo_processor/get_repo_file_dependencies.py
+++ b/cognee/tasks/repo_processor/get_repo_file_dependencies.py
@ -11,7 +11,24 @@ from cognee.shared.CodeGraphEntities import CodeFile, Repository
 async def get_source_code_files(repo_path):
-    """Get .py files and their source code"""
+    """
    Retrieve Python source code files from the specified repository path.
    This function scans the given repository path for files that have the .py extension
    while excluding test files and files within a virtual environment. It returns a list of
    absolute paths to the source code files that are not empty.
    Parameters:
    -----------
        - repo_path: The file path to the repository to search for Python source files.
    Returns:
    --------
        A list of absolute paths to .py files that contain source code, excluding empty
        files, test files, and files from a virtual environment.
    """
    if not os.path.exists(repo_path):
        return {}
@ -40,6 +57,26 @@ async def get_source_code_files(repo_path):
 def run_coroutine(coroutine_func, *args, **kwargs):
    """
    Run a coroutine function until it completes.
    This function creates a new asyncio event loop, sets it as the current loop, and
    executes the given coroutine function with the provided arguments. Once the coroutine
    completes, the loop is closed. Intended for use in environments where an existing event
    loop is not available or desirable.
    Parameters:
    -----------
        - coroutine_func: The coroutine function to be run.
        - *args: Positional arguments to pass to the coroutine function.
        - **kwargs: Keyword arguments to pass to the coroutine function.
    Returns:
    --------
        The result returned by the coroutine after completion.
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    result = loop.run_until_complete(coroutine_func(*args, **kwargs))
@ -50,7 +87,21 @@ def run_coroutine(coroutine_func, *args, **kwargs):
 async def get_repo_file_dependencies(
    repo_path: str, detailed_extraction: bool = False
 ) -> AsyncGenerator[DataPoint, None]:
-    """Generate a dependency graph for Python files in the given repository path."""
+    """
    Generate a dependency graph for Python files in the given repository path.
    Check the validity of the repository path and yield a repository object followed by the
    dependencies of Python files within that repository. Raise a FileNotFoundError if the
    provided path does not exist. The extraction of detailed dependencies can be controlled
    via the `detailed_extraction` argument.
    Parameters:
    -----------
        - repo_path (str): The file path to the repository where Python files are located.
        - detailed_extraction (bool): A flag indicating whether to perform a detailed
          extraction of dependencies. (default False)
    """
    if not os.path.exists(repo_path):
        raise FileNotFoundError(f"Repository path {repo_path} does not exist.")
--- a/cognee/tasks/summarization/mock_summary.py
+++ b/cognee/tasks/summarization/mock_summary.py
@ -2,6 +2,19 @@ from cognee.shared.data_models import SummarizedCode, SummarizedClass, Summarize
 def get_mock_summarized_code() -> SummarizedCode:
    """
    Return a summarized representation of mock code.
    This function constructs and returns a `SummarizedCode` object that includes various
    components such as file name, high-level summary, key features, imports, constants,
    classes, and functions, all described with placeholders for mock data.
    Returns:
    --------
        - SummarizedCode: A `SummarizedCode` object containing mock data for file summary,
          features, imports, constants, classes, and functions.
    """
    return SummarizedCode(
        file_name="mock_file.py",
        high_level_summary="This is a mock high-level summary.",
--- a/cognee/tasks/summarization/models.py
+++ b/cognee/tasks/summarization/models.py
@ -6,6 +6,15 @@ from cognee.shared.CodeGraphEntities import CodeFile, CodePart
 class TextSummary(DataPoint):
    """
    Represent a text summary derived from a document chunk.
    This class encapsulates a text summary as well as its associated metadata. The public
    instance variables include 'text' for the summary content and 'made_from' which
    indicates the source document chunk. The 'metadata' instance variable contains
    additional information such as indexed fields.
    """
    text: str
    made_from: DocumentChunk
@ -13,6 +22,15 @@ class TextSummary(DataPoint):
 class CodeSummary(DataPoint):
    """
    Represent a summary of a code file or a part of one.
    This class inherits from DataPoint and contains a text representation alongside the
    summarized content, which can either be a full code file or a part of it. The metadata
    dictionary defines index fields for the class's instances, particularly focusing on the
    'text' attribute. Public attributes include 'text', 'summarizes', and 'metadata'.
    """
    text: str
    summarizes: Union[CodeFile, CodePart]
--- a/cognee/tasks/summarization/summarize_text.py
+++ b/cognee/tasks/summarization/summarize_text.py
@ -11,6 +11,29 @@ from .models import TextSummary
 async def summarize_text(
    data_chunks: list[DocumentChunk], summarization_model: Type[BaseModel] = None
 ):
    """
    Summarize the text contained in the provided data chunks.
    If no summarization model is provided, the function retrieves the default model from the
    configuration. It processes the data chunks asynchronously and returns summaries for
    each chunk. If the provided list of data chunks is empty, it simply returns the list as
    is.
    Parameters:
    -----------
        - data_chunks (list[DocumentChunk]): A list of DocumentChunk objects containing text
          to be summarized.
        - summarization_model (Type[BaseModel]): An optional model used for summarizing
          text. If not provided, the default is fetched from the configuration. (default
          None)
    Returns:
    --------
        A list of TextSummary objects, each containing the summary of a corresponding
        DocumentChunk.
    """
    if len(data_chunks) == 0:
        return data_chunks
--- a/cognee/tasks/temporal_awareness/graphiti_model.py
+++ b/cognee/tasks/temporal_awareness/graphiti_model.py
@ -3,6 +3,18 @@ from typing import Optional
 class GraphitiNode(DataPoint):
    """
    Represent a node in a graph with optional content, name, and summary attributes.
    This class extends DataPoint and includes a metadata dictionary that specifies the index
    fields for the node's data. The public instance variables are:
    - content: an optional string representing the content of the node.
    - name: an optional string representing the name of the node.
    - summary: an optional string providing a summary of the node.
    - metadata: a dictionary outlining the fields used for indexing.
    """
    content: Optional[str] = None
    name: Optional[str] = None
    summary: Optional[str] = None