Docstring tasks. (#878)
## Description

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.
This commit is contained in:
parent ec68e99438
commit bb68d6a0df

18 changed files with 610 additions and 37 deletions
@@ -10,14 +10,23 @@ def chunk_by_paragraph(
    batch_paragraphs: bool = True,
) -> Iterator[Dict[str, Any]]:
    """
    Chunks text by paragraph while preserving exact text reconstruction capability.
    When chunks are joined with empty string "", they reproduce the original text exactly.
    Chunk the input text by paragraph while enabling exact text reconstruction.

    Notes:
    - Tokenization is handled using our tokenization adapters, ensuring compatibility with the vector engine's embedding model.
    - If `batch_paragraphs` is False, each paragraph will be yielded as a separate chunk.
    - Handles cases where paragraphs exceed the specified token or word limits by splitting them as needed.
    - Remaining text at the end of the input will be yielded as a final chunk.
    This function divides the given text data into smaller chunks based on the specified
    maximum chunk size. It ensures that when the generated chunks are concatenated, they
    reproduce the original text accurately. The tokenization process is handled by adapters
    compatible with the vector engine's embedding model, and the function can operate in
    either batch mode or paragraph mode, based on the `batch_paragraphs` flag.

    Parameters:
    -----------

    - data (str): The input text to be chunked.
    - max_chunk_size: The maximum allowed size for each chunk, in terms of tokens or
      words.
    - batch_paragraphs (bool): Flag indicating whether to yield each paragraph as a
      separate chunk. If set to False, individual paragraphs are yielded as they are
      processed. (default True)
    """
    current_chunk = ""
    chunk_index = 0
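A minimal usage sketch of the reconstruction property the new docstring promises. The import path `cognee.tasks.chunks` and the `text` key on each yielded dict are assumptions, not confirmed by this diff.

```python
# Sketch only: module path and the "text" key are assumptions.
from cognee.tasks.chunks import chunk_by_paragraph  # assumed module path

sample = "First paragraph.\n\nSecond paragraph with a bit more text.\n"

chunks = list(chunk_by_paragraph(sample, max_chunk_size=128, batch_paragraphs=True))

# The documented guarantee: joining chunk texts with "" reproduces the input exactly.
reconstructed = "".join(chunk["text"] for chunk in chunks)
assert reconstructed == sample
```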
@@ -5,6 +5,23 @@ from cognee.infrastructure.databases.vector.embeddings import get_embedding_engi


def get_word_size(word: str) -> int:
    """
    Calculate the size of a given word in terms of tokens.

    If an embedding engine's tokenizer is available, count the tokens for the provided word.
    If the tokenizer is not available, assume the word counts as one token.

    Parameters:
    -----------

    - word (str): The word for which the token size is to be calculated.

    Returns:
    --------

    - int: The number of tokens representing the word, typically an integer, depending
      on the tokenizer's output.
    """
    embedding_engine = get_embedding_engine()
    if embedding_engine.tokenizer:
        return embedding_engine.tokenizer.count_tokens(word)
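The fallback behaviour described here (count with the embedding engine's tokenizer when present, otherwise treat every word as one token) is a small guard. A self-contained sketch of that pattern, with a stand-in tokenizer protocol rather than cognee's embedding engine:

```python
from typing import Optional, Protocol


class Tokenizer(Protocol):
    def count_tokens(self, text: str) -> int: ...


def word_size(word: str, tokenizer: Optional[Tokenizer] = None) -> int:
    """Mirror of the documented behaviour: tokenizer count if available, else 1."""
    if tokenizer is not None:
        return tokenizer.count_tokens(word)
    return 1


print(word_size("reconstruction"))  # 1 when no tokenizer is configured
```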
@@ -16,12 +33,22 @@ def chunk_by_sentence(
    data: str, maximum_size: Optional[int] = None
) -> Iterator[Tuple[UUID, str, int, Optional[str]]]:
    """
    Splits the input text into sentences based on word-level processing, with optional sentence length constraints.
    Splits text into sentences while preserving word and paragraph boundaries.

    Notes:
    - Relies on the `chunk_by_word` function for word-level tokenization and classification.
    - Ensures sentences within paragraphs are uniquely identifiable using UUIDs.
    - Handles cases where the text ends mid-sentence by appending a special "sentence_cut" type.
    This function processes the input string, dividing it into sentences based on word-level
    tokenization. Each sentence is identified with a unique UUID, and it handles scenarios
    where the text may end mid-sentence by tagging it with a specific type. If a maximum
    sentence length is specified, the function ensures that sentences do not exceed this
    length, raising a ValueError if an individual word surpasses it. The function utilizes
    an external word processing function `chunk_by_word` to determine the structure of the
    text.

    Parameters:
    -----------

    - data (str): The input text to be split into sentences.
    - maximum_size (Optional[int]): An optional limit on the maximum size of sentences
      generated. (default None)
    """
    sentence = ""
    paragraph_id = uuid4()
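Given the return annotation shown above, iterating the generator yields `(paragraph_id, sentence, size, end_type)` tuples. A hedged usage sketch; the import path is assumed, and the meaning of the fields beyond the documented `"sentence_cut"` marker is inferred from the annotation:

```python
# Sketch only: module path assumed; tuple fields inferred from the annotated return type.
from cognee.tasks.chunks import chunk_by_sentence  # assumed module path

text = "One sentence here. Another one follows! And a trailing fragment"

for paragraph_id, sentence, size, end_type in chunk_by_sentence(text, maximum_size=64):
    # end_type is documented to carry "sentence_cut" when the input ends mid-sentence.
    print(paragraph_id, repr(sentence), size, end_type)
```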
@@ -8,15 +8,23 @@ PARAGRAPH_ENDINGS = r"[\n\r]"

def is_real_paragraph_end(last_char: str, current_pos: int, text: str) -> bool:
    """
    Determines if the current position represents a real paragraph ending.
    Determine if the current position represents a valid paragraph end.

    Args:
        last_char: The last processed character
        current_pos: Current position in the text
        text: The input text
    The function checks if the last character indicates a possible sentence ending, then
    verifies if the subsequent characters lead to a valid paragraph end based on specific
    conditions.

    Parameters:
    -----------

    - last_char (str): The last processed character
    - current_pos (int): Current position in the text
    - text (str): The input text

    Returns:
        bool: True if this is a real paragraph end, False otherwise
    --------

    - bool: True if this is a real paragraph end, False otherwise
    """
    if re.match(SENTENCE_ENDINGS, last_char):
        return True
@@ -38,9 +46,16 @@ def is_real_paragraph_end(last_char: str, current_pos: int, text: str) -> bool:

def chunk_by_word(data: str) -> Iterator[Tuple[str, str]]:
    """
    Chunks text into words and endings while preserving whitespace.
    Whitespace is included with the preceding word.
    Outputs can be joined with "" to recreate the original input.
    Chunk text into words and sentence endings, preserving whitespace.

    Whitespace is included with the preceding word. Outputs can be joined with "" to
    recreate the original input.

    Parameters:
    -----------

    - data (str): The input string of text to be chunked into words and sentence
      endings.
    """
    current_chunk = ""
    i = 0
@@ -6,6 +6,14 @@ from cognee.tasks.repo_processor.enrich_dependency_graph import enrich_dependenc


def main():
    """
    Execute the main logic of the dependency graph processor.

    This function sets up argument parsing to retrieve the repository path, checks the
    existence of the specified path, and processes the repository to produce a dependency
    graph. If the repository path does not exist, it logs an error message and terminates
    without further execution.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("repo_path", help="Path to the repository")
    args = parser.parse_args()
@@ -5,6 +5,14 @@ from cognee.tasks.repo_processor.get_repo_file_dependencies import get_repo_file


def main():
    """
    Parse the command line arguments and print the repository file dependencies.

    This function sets up an argument parser to retrieve the path of a repository. It checks
    if the provided path exists and if it doesn't, it prints an error message and exits. If
    the path is valid, it calls an asynchronous function to get the dependencies and prints
    the nodes and their relations in the dependency graph.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("repo_path", help="Path to the repository")
    args = parser.parse_args()
@@ -3,6 +3,21 @@ from fastapi import status


class NoRelevantDataError(CogneeApiError):
    """
    Represents an error when no relevant data is found during a search. This class is a
    subclass of CogneeApiError.

    Public methods:

    - __init__

    Instance variables:

    - message
    - name
    - status_code
    """

    def __init__(
        self,
        message: str = "Search did not find any data.",
@@ -52,7 +52,21 @@ EXTENSION_TO_DOCUMENT_CLASS = {


def update_node_set(document):
    """Extracts node_set from document's external_metadata."""
    """
    Extracts node_set from document's external_metadata.

    Parses the external_metadata of the given document and updates the document's
    belongs_to_set attribute with NodeSet objects generated from the node_set found in the
    external_metadata. If the external_metadata is not valid JSON, is not a dictionary, does
    not contain the 'node_set' key, or if node_set is not a list, the function has no effect
    and will return early.

    Parameters:
    -----------

    - document: The document object which contains external_metadata from which the
      node_set will be extracted.
    """
    try:
        external_metadata = json.loads(document.external_metadata)
    except json.JSONDecodeError:
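The early-return conditions listed in this docstring amount to a chain of validation guards around `json.loads`. A self-contained sketch of that guard pattern; `NodeSetStub` is a placeholder, so this is not the actual cognee implementation:

```python
import json
from dataclasses import dataclass


@dataclass
class NodeSetStub:
    """Placeholder for cognee's NodeSet; illustration only."""
    name: str


def extract_node_sets(external_metadata: str) -> list[NodeSetStub]:
    """Return NodeSet stubs, or an empty list for any of the documented early-return cases."""
    try:
        metadata = json.loads(external_metadata)
    except json.JSONDecodeError:
        return []
    if not isinstance(metadata, dict):
        return []
    node_set = metadata.get("node_set")
    if not isinstance(node_set, list):
        return []
    return [NodeSetStub(name=str(item)) for item in node_set]


print(extract_node_sets('{"node_set": ["finance", "reports"]}'))
```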
@@ -76,11 +90,26 @@ def update_node_set(document):

async def classify_documents(data_documents: list[Data]) -> list[Document]:
    """
    Classifies a list of data items into specific document types based on file extensions.
    Classifies a list of data items into specific document types based on their file
    extensions.

    Notes:
    - The function relies on `get_metadata` to retrieve metadata information for each data item.
    - Ensure the `Data` objects and their attributes (e.g., `extension`, `id`) are valid before calling this function.
    This function processes each item in the provided list of data documents, retrieves
    relevant metadata, and creates instances of document classes mapped to their extensions.
    It ensures that the data items are valid before performing the classification and
    invokes `update_node_set` to extract and set relevant node information from the
    document's external metadata.

    Parameters:
    -----------

    - data_documents (list[Data]): A list of Data objects representing the documents to
      be classified.

    Returns:
    --------

    - list[Document]: A list of Document objects created based on the classified data
      documents.
    """
    documents = []
    for data_item in data_documents:
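The core of the classification step is a lookup from file extension to a document class (the hunk header references an `EXTENSION_TO_DOCUMENT_CLASS` mapping). A simplified, self-contained sketch of that dispatch, with stand-in classes instead of cognee's Document hierarchy:

```python
from dataclasses import dataclass


@dataclass
class PdfDocumentStub:
    name: str


@dataclass
class TextDocumentStub:
    name: str


# Stand-in for the EXTENSION_TO_DOCUMENT_CLASS mapping referenced in the hunk header.
EXTENSION_TO_CLASS = {"pdf": PdfDocumentStub, "txt": TextDocumentStub}


def classify(name: str, extension: str):
    """Return an instance of the document class registered for the extension."""
    document_class = EXTENSION_TO_CLASS.get(extension)
    if document_class is None:
        raise ValueError(f"Unsupported extension: {extension}")
    return document_class(name=name)


print(classify("report", "pdf"))
```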
@@ -33,6 +33,25 @@ logger = get_logger("task:infer_data_ontology")


async def extract_ontology(content: str, response_model: Type[BaseModel]):
    """
    Extracts structured ontology from the provided content using a pre-defined LLM client.

    This asynchronous function retrieves a system prompt from a file and utilizes an LLM
    client to create a structured output based on the input content and specified response
    model.

    Parameters:
    -----------

    - content (str): The content from which to extract the ontology.
    - response_model (Type[BaseModel]): The model that defines the structure of the
      output ontology.

    Returns:
    --------

    The structured ontology extracted from the content.
    """
    llm_client = get_llm_client()

    system_prompt = read_query_prompt("extract_ontology.txt")
@@ -43,10 +62,38 @@ async def extract_ontology(content: str, response_model: Type[BaseModel]):


class OntologyEngine:
    """
    Manage ontology data and operations for graph structures, providing methods for data
    loading, flattening models, and adding ontological relationships to a graph database.

    Public methods:

    - flatten_model
    - recursive_flatten
    - load_data
    - add_graph_ontology
    """

    async def flatten_model(
        self, model: NodeModel, parent_id: Optional[str] = None
    ) -> Dict[str, Any]:
        """Flatten the model to a dictionary."""
        """
        Flatten the model to a dictionary including optional parent ID and relationship details
        if available.

        Parameters:
        -----------

        - model (NodeModel): The NodeModel instance to flatten.
        - parent_id (Optional[str]): An optional ID of the parent node for hierarchical
          purposes. (default None)

        Returns:
        --------

        - Dict[str, Any]: A dictionary representation of the model with flattened
          attributes.
        """
        result = model.dict()
        result["parent_id"] = parent_id
        if model.default_relationship:
@@ -62,7 +109,23 @@ class OntologyEngine:
    async def recursive_flatten(
        self, items: Union[List[Dict[str, Any]], Dict[str, Any]], parent_id: Optional[str] = None
    ) -> List[Dict[str, Any]]:
        """Recursively flatten the items."""
        """
        Recursively flatten a hierarchical structure of models into a flat list of dictionaries.

        Parameters:
        -----------

        - items (Union[List[Dict[str, Any]], Dict[str, Any]]): A list or dictionary
          containing models to flatten.
        - parent_id (Optional[str]): An optional ID of the parent node to maintain hierarchy
          during flattening. (default None)

        Returns:
        --------

        - List[Dict[str, Any]]: A flat list of dictionaries representing the hierarchical
          model structure.
        """
        flat_list = []

        if isinstance(items, list):
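The flatten/recursive_flatten pair described above follows a standard depth-first flattening of a parent/children tree into rows that carry a `parent_id`. A self-contained sketch of the pattern, using plain dicts instead of `NodeModel` and a synchronous function for brevity:

```python
from typing import Any, Dict, List, Optional, Union


def recursive_flatten(
    items: Union[List[Dict[str, Any]], Dict[str, Any]],
    parent_id: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Depth-first flatten of {node_id, name, children} dicts into rows with parent_id."""
    flat: List[Dict[str, Any]] = []
    if isinstance(items, list):
        for item in items:
            flat.extend(recursive_flatten(item, parent_id))
        return flat
    flat.append({"node_id": items["node_id"], "name": items["name"], "parent_id": parent_id})
    for child in items.get("children", []):
        flat.extend(recursive_flatten(child, items["node_id"]))
    return flat


tree = {"node_id": "root", "name": "Root", "children": [{"node_id": "a", "name": "A", "children": []}]}
print(recursive_flatten(tree))
```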
@@ -76,7 +139,20 @@ class OntologyEngine:
        return flat_list

    async def load_data(self, file_path: str) -> Union[List[Dict[str, Any]], Dict[str, Any]]:
        """Load data from a JSON or CSV file."""
        """
        Load data from a specified JSON or CSV file and return it in a structured format.

        Parameters:
        -----------

        - file_path (str): The path to the file to load data from.

        Returns:
        --------

        - Union[List[Dict[str, Any]], Dict[str, Any]]: Parsed data from the file as either a
          list of dictionaries or a single dictionary depending on content type.
        """
        try:
            if file_path.endswith(".json"):
                async with aiofiles.open(file_path, mode="r") as f:
@@ -96,7 +172,18 @@ class OntologyEngine:
            )

    async def add_graph_ontology(self, file_path: str = None, documents: list = None):
        """Add graph ontology from a JSON or CSV file or infer from documents content."""
        """
        Add graph ontology from a JSON or CSV file, or infer relationships from provided
        document content. Raise exceptions for invalid file types or missing entities.

        Parameters:
        -----------

        - file_path (str): Optional path to a file containing data to be loaded. (default
          None)
        - documents (list): Optional list of document objects for content extraction if no
          file path is provided. (default None)
        """
        if file_path is None:
            initial_chunks_and_ids = []
@@ -202,6 +289,17 @@ class OntologyEngine:


async def infer_data_ontology(documents, ontology_model=KnowledgeGraph, root_node_id=None):
    """
    Infer data ontology from provided documents and optionally add it to a graph.

    Parameters:
    -----------

    - documents: The documents from which to infer the ontology.
    - ontology_model: The ontology model to use for the inference, defaults to
      KnowledgeGraph. (default KnowledgeGraph)
    - root_node_id: An optional root node identifier for the ontology. (default None)
    """
    if ontology_model == KnowledgeGraph:
        ontology_engine = OntologyEngine()
        root_node_id = await ontology_engine.add_graph_ontology(documents=documents)
@@ -3,12 +3,40 @@ from pydantic import BaseModel, Field


class RelationshipModel(BaseModel):
    """
    Represents a relationship between two entities in a model.

    This class holds the type of the relationship and the identifiers for the source and
    target entities. It includes the following public instance variables:

    - type: A string indicating the type of relationship.
    - source: A string representing the source entity of the relationship.
    - target: A string representing the target entity of the relationship.
    """

    type: str
    source: str
    target: str


class NodeModel(BaseModel):
    """
    Represents a node in a hierarchical model structure with relationships to other nodes.

    Public methods:

    - __init__(self, node_id: str, name: str, default_relationship:
      Optional[RelationshipModel] = None, children: List[Union[Dict[str, Any], NodeModel]] =
      Field(default_factory=list))

    Instance variables:

    - node_id: Unique identifier for the node.
    - name: Name of the node.
    - default_relationship: Default relationship associated with the node, if any.
    - children: List of child nodes or dictionaries representing children for this node.
    """

    node_id: str
    name: str
    default_relationship: Optional[RelationshipModel] = None
@@ -19,12 +47,28 @@ NodeModel.model_rebuild()


class OntologyNode(BaseModel):
    """
    Represents a node in an ontology with a unique identifier, name, and description.
    """

    id: str = Field(..., description="Unique identifier made from node name.")
    name: str
    description: str


class OntologyEdge(BaseModel):
    """
    Represent an edge in an ontology, connecting a source and target with a specific
    relationship type.

    The class includes the following instance variables:
    - id: A unique identifier for the edge.
    - source_id: The identifier of the source node.
    - target_id: The identifier of the target node.
    - relationship_type: The type of relationship represented by this edge, defining how the
      source and target are related.
    """

    id: str
    source_id: str
    target_id: str
@@ -32,5 +76,14 @@ class OntologyEdge(BaseModel):


class GraphOntology(BaseModel):
    """
    Represents a graph-based structure of ontology consisting of nodes and edges.

    The GraphOntology class contains a collection of OntologyNode instances representing the
    nodes of the graph and OntologyEdge instances representing the relationships between
    them. Public methods include the management of nodes and edges as well as any relevant
    graph operations. Instance variables include a list of nodes and a list of edges.
    """

    nodes: list[OntologyNode]
    edges: list[OntologyEdge]
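A small construction sketch for the node/edge/graph models above. The models are re-declared locally so the snippet is self-contained; the `relationship_type` field on the edge is taken from its docstring, since the diff cuts off before showing that declaration, so treat it as an assumption.

```python
from pydantic import BaseModel, Field


class OntologyNode(BaseModel):
    id: str = Field(..., description="Unique identifier made from node name.")
    name: str
    description: str


class OntologyEdge(BaseModel):
    id: str
    source_id: str
    target_id: str
    relationship_type: str  # assumed from the docstring; the hunk truncates before this field


class GraphOntology(BaseModel):
    nodes: list[OntologyNode]
    edges: list[OntologyEdge]


ontology = GraphOntology(
    nodes=[
        OntologyNode(id="city_paris", name="Paris", description="Capital of France."),
        OntologyNode(id="country_france", name="France", description="A country in Europe."),
    ],
    edges=[
        OntologyEdge(
            id="city_paris_country_france",
            source_id="city_paris",
            target_id="country_france",
            relationship_type="is_capital_of",
        )
    ],
)
print(ontology.model_dump())
```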
@@ -10,11 +10,19 @@ from cognee.infrastructure.databases.relational import get_relational_config
@lru_cache
def get_dlt_destination() -> Union[type[dlt.destinations.sqlalchemy], None]:
    """
    Handles propagation of the cognee database configuration to the dlt library
    Handle the propagation of the cognee database configuration to the dlt library.

    This function determines the appropriate sqlalchemy destination based on the database
    provider specified in the relational configuration. It constructs the destination
    credentials for either sqlite or postgres databases accordingly. If the database
    provider is neither sqlite nor postgres, it returns None.

    Returns:
        sqlachemy: sqlachemy destination used by the dlt library
    --------

    - Union[type[dlt.destinations.sqlalchemy], None]: An instance of sqlalchemy
      destination used by the dlt library, or None if the database provider is
      unsupported.
    """
    relational_config = get_relational_config()
@@ -5,6 +5,26 @@ from typing import Union


def get_data_from_llama_index(data_point: Union[Document, ImageDocument], dataset_name: str) -> str:
    """
    Retrieve the file path based on the data point type.

    Ensure the data point is an instance of either Document or ImageDocument. If the data
    point has a metadata or image path file path, return it; otherwise, save the data
    point's text to a file and return the newly created file path.

    Parameters:
    -----------

    - data_point (Union[Document, ImageDocument]): An instance of Document or
      ImageDocument to extract data from.
    - dataset_name (str): The name of the dataset associated with the data point.

    Returns:
    --------

    - str: The file path as a string where the data is stored or the existing path from
      the data point.
    """
    # Specific type checking is used to ensure it's not a child class from Document
    if isinstance(data_point, Document) and type(data_point) is Document:
        file_path = data_point.metadata.get("file_path")
@@ -19,10 +19,34 @@ logger = get_logger()


class FileParser:
    """
    Handles the parsing of files into source code and an abstract syntax tree
    representation. Public methods include:

    - parse_file: Parses a file and returns its source code and syntax tree representation.
    """

    def __init__(self):
        self.parsed_files = {}

    async def parse_file(self, file_path: str) -> tuple[str, Tree]:
        """
        Parse a file and return its source code along with its syntax tree representation.

        If the file has already been parsed, retrieve the result from memory instead of reading
        the file again.

        Parameters:
        -----------

        - file_path (str): The path of the file to parse.

        Returns:
        --------

        - tuple[str, Tree]: A tuple containing the source code of the file and its
          corresponding syntax tree representation.
        """
        PY_LANGUAGE = Language(tspython.language())
        source_code_parser = Parser(PY_LANGUAGE)
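The `parse_file` hunk shows the tree-sitter calls used (`Language(tspython.language())`, `Parser(PY_LANGUAGE)`). A stripped-down, self-contained sketch of the same parse-and-memoize idea, requiring the `tree-sitter` and `tree-sitter-python` packages; it drops cognee's async I/O and class wrapper:

```python
# Sketch: mirrors the tree-sitter calls visible in the hunk above, synchronous for brevity.
import tree_sitter_python as tspython
from tree_sitter import Language, Parser, Tree

_parsed_files: dict[str, tuple[str, Tree]] = {}


def parse_python_file(file_path: str) -> tuple[str, Tree]:
    """Return (source_code, syntax_tree), reusing a cached result when available."""
    if file_path in _parsed_files:
        return _parsed_files[file_path]

    with open(file_path, "r", encoding="utf-8") as f:
        source_code = f.read()

    parser = Parser(Language(tspython.language()))
    tree = parser.parse(source_code.encode("utf-8"))
    _parsed_files[file_path] = (source_code, tree)
    return source_code, tree
```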
@@ -35,6 +59,24 @@ class FileParser:


async def get_source_code(file_path: str):
    """
    Read source code from a file asynchronously.

    This function attempts to open a file specified by the given file path, read its
    contents, and return the source code. In case of any errors during the file reading
    process, it logs an error message and returns None.

    Parameters:
    -----------

    - file_path (str): The path to the file from which to read the source code.

    Returns:
    --------

    Returns the contents of the file as a string if successful, or None if an error
    occurs.
    """
    try:
        async with aiofiles.open(file_path, "r", encoding="utf-8") as f:
            source_code = await f.read()
@@ -45,7 +87,22 @@ async def get_source_code(file_path: str):


def resolve_module_path(module_name):
    """Find the file path of a module."""
    """
    Find the file path of a module.

    Return the file path of the specified module if found, or return None if the module does
    not exist or cannot be located.

    Parameters:
    -----------

    - module_name: The name of the module whose file path is to be resolved.

    Returns:
    --------

    The file path of the module as a string or None if the module is not found.
    """
    try:
        spec = importlib.util.find_spec(module_name)
        if spec and spec.origin:
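The lookup described above rests on `importlib.util.find_spec`, as the visible body lines confirm. A self-contained sketch of the documented behaviour (path string when the spec has an origin, `None` otherwise); the exact error handling in cognee's version is not shown here, so the caught exceptions are an assumption:

```python
import importlib.util
from typing import Optional


def module_path(module_name: str) -> Optional[str]:
    """Return the file path of a module, or None if it cannot be located."""
    try:
        spec = importlib.util.find_spec(module_name)
    except (ImportError, ValueError):  # assumed guard; the original's except clause is not shown
        return None
    if spec and spec.origin:
        return spec.origin
    return None


print(module_path("json"))            # e.g. .../lib/python3.x/json/__init__.py
print(module_path("no_such_module"))  # None
```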
@@ -58,7 +115,23 @@ def resolve_module_path(module_name):
def find_function_location(
    module_path: str, function_name: str, parser: FileParser
) -> Optional[tuple[str, str]]:
    """Find the function definition in the module."""
    """
    Find the location of a function definition in a specified module.

    Parameters:
    -----------

    - module_path (str): The path to the module where the function is defined.
    - function_name (str): The name of the function whose location is to be found.
    - parser (FileParser): An instance of FileParser used to parse the module's source
      code.

    Returns:
    --------

    - Optional[tuple[str, str]]: Returns a tuple containing the module path and the
      start point of the function if found; otherwise, returns None.
    """
    if not module_path or not os.path.exists(module_path):
        return None
@@ -78,6 +151,24 @@ def find_function_location(
async def get_local_script_dependencies(
    repo_path: str, script_path: str, detailed_extraction: bool = False
) -> CodeFile:
    """
    Retrieve local script dependencies and create a CodeFile object.

    Parameters:
    -----------

    - repo_path (str): The path to the repository that contains the script.
    - script_path (str): The path of the script for which dependencies are being
      extracted.
    - detailed_extraction (bool): A flag indicating whether to perform a detailed
      extraction of code components.

    Returns:
    --------

    - CodeFile: Returns a CodeFile object containing information about the script,
      including its dependencies and definitions.
    """
    code_file_parser = FileParser()
    source_code, source_code_tree = await code_file_parser.parse_file(script_path)
@@ -113,6 +204,24 @@ async def get_local_script_dependencies(


def find_node(nodes: list[Node], condition: callable) -> Node:
    """
    Find and return the first node that satisfies the given condition.

    Iterate through the provided list of nodes and return the first node for which the
    condition callable returns True. If no such node is found, return None.

    Parameters:
    -----------

    - nodes (list[Node]): A list of Node objects to search through.
    - condition (callable): A callable that takes a Node and returns a boolean
      indicating if the node meets specified criteria.

    Returns:
    --------

    - Node: The first Node that matches the condition, or None if no such node exists.
    """
    for node in nodes:
        if condition(node):
            return node
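The first-match semantics documented for `find_node` are the classic linear scan; an equivalent, self-contained formulation using `next` with a default. `NodeStub` is a placeholder for the tree-sitter `Node` type used in the diff:

```python
from dataclasses import dataclass
from typing import Callable, Optional


@dataclass
class NodeStub:
    """Placeholder for the tree-sitter Node type; illustration only."""
    type: str
    name: str


def first_node(nodes: list[NodeStub], condition: Callable[[NodeStub], bool]) -> Optional[NodeStub]:
    """Return the first node satisfying the condition, or None."""
    return next((node for node in nodes if condition(node)), None)


nodes = [NodeStub("import_statement", "os"), NodeStub("function_definition", "main")]
print(first_node(nodes, lambda n: n.type == "function_definition"))
```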
@@ -123,6 +232,30 @@ def find_node(nodes: list[Node], condition: callable) -> Node:
async def extract_code_parts(
    tree_root: Node, script_path: str, existing_nodes: list[DataPoint] = {}
) -> AsyncGenerator[DataPoint, None]:
    """
    Extract code parts from a given AST node tree asynchronously.

    Iteratively yields DataPoint nodes representing import statements, function definitions,
    and class definitions found in the children of the specified tree root. The function checks
    if nodes are already present in the existing_nodes dictionary to prevent duplicates.
    This function has to be used in an asynchronous context, and it requires a valid tree_root
    and proper initialization of existing_nodes.

    Parameters:
    -----------

    - tree_root (Node): The root node of the AST tree containing code parts to extract.
    - script_path (str): The file path of the script from which the AST was generated.
    - existing_nodes (list[DataPoint]): A dictionary that holds already extracted
      DataPoint nodes to avoid duplicates. (default {})

    Returns:
    --------

    Yields DataPoint nodes representing imported modules, functions, and classes.
    """
    for child_node in tree_root.children:
        if child_node.type == "import_statement" or child_node.type == "import_from_statement":
            parts = child_node.text.decode("utf-8").split()
@@ -2,7 +2,24 @@ import os


async def get_non_py_files(repo_path):
    """Get files that are not .py files and their contents"""
    """
    Get files that are not .py files and their contents.

    Check if the specified repository path exists and if so, traverse the directory,
    collecting the paths of files that do not have a .py extension and meet the
    criteria set in the allowed and ignored patterns. Return a list of paths to
    those files.

    Parameters:
    -----------

    - repo_path: The file system path to the repository to scan for non-Python files.

    Returns:
    --------

    A list of file paths that are not Python files and meet the specified criteria.
    """
    if not os.path.exists(repo_path):
        return {}
@@ -111,6 +128,22 @@ async def get_non_py_files(repo_path):
    }

    def should_process(path):
        """
        Determine if a file should be processed based on its extension and path patterns.

        This function checks if the file extension is in the allowed list and ensures that none
        of the ignored patterns are present in the provided file path.

        Parameters:
        -----------

        - path: The file path to check for processing eligibility.

        Returns:
        --------

        Returns True if the file should be processed; otherwise, False.
        """
        _, ext = os.path.splitext(path)
        return ext in ALLOWED_EXTENSIONS and not any(
            pattern in path for pattern in IGNORED_PATTERNS
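The eligibility check reduces to an extension allow-list plus a substring deny-list. A self-contained sketch with made-up `ALLOWED_EXTENSIONS` and `IGNORED_PATTERNS` values; the real lists live elsewhere in this file and are not shown in the diff:

```python
import os

# Illustrative values only; the actual allow/deny lists are defined in the module, not in this hunk.
ALLOWED_EXTENSIONS = {".md", ".toml", ".yaml", ".json"}
IGNORED_PATTERNS = {"node_modules", ".venv", "__pycache__"}


def should_process(path: str) -> bool:
    """True when the extension is allowed and no ignored pattern appears in the path."""
    _, ext = os.path.splitext(path)
    return ext in ALLOWED_EXTENSIONS and not any(pattern in path for pattern in IGNORED_PATTERNS)


print(should_process("docs/setup.md"))            # True
print(should_process(".venv/lib/settings.json"))  # False
```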
@@ -11,7 +11,24 @@ from cognee.shared.CodeGraphEntities import CodeFile, Repository


async def get_source_code_files(repo_path):
    """Get .py files and their source code"""
    """
    Retrieve Python source code files from the specified repository path.

    This function scans the given repository path for files that have the .py extension
    while excluding test files and files within a virtual environment. It returns a list of
    absolute paths to the source code files that are not empty.

    Parameters:
    -----------

    - repo_path: The file path to the repository to search for Python source files.

    Returns:
    --------

    A list of absolute paths to .py files that contain source code, excluding empty
    files, test files, and files from a virtual environment.
    """
    if not os.path.exists(repo_path):
        return {}
@@ -40,6 +57,26 @@ async def get_source_code_files(repo_path):


def run_coroutine(coroutine_func, *args, **kwargs):
    """
    Run a coroutine function until it completes.

    This function creates a new asyncio event loop, sets it as the current loop, and
    executes the given coroutine function with the provided arguments. Once the coroutine
    completes, the loop is closed. Intended for use in environments where an existing event
    loop is not available or desirable.

    Parameters:
    -----------

    - coroutine_func: The coroutine function to be run.
    - *args: Positional arguments to pass to the coroutine function.
    - **kwargs: Keyword arguments to pass to the coroutine function.

    Returns:
    --------

    The result returned by the coroutine after completion.
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    result = loop.run_until_complete(coroutine_func(*args, **kwargs))
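A self-contained sketch of the loop lifecycle this docstring describes (create, set, run to completion, close); the `try/finally` here is an addition for safety, not necessarily how the original is written. In code that is free to own the event loop, `asyncio.run` achieves the same effect more simply.

```python
import asyncio


def run_coroutine(coroutine_func, *args, **kwargs):
    """Run a coroutine function on a fresh event loop and return its result."""
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        return loop.run_until_complete(coroutine_func(*args, **kwargs))
    finally:
        loop.close()


async def add(a: int, b: int) -> int:
    await asyncio.sleep(0)
    return a + b


print(run_coroutine(add, 1, 2))  # 3
```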
@@ -50,7 +87,21 @@ def run_coroutine(coroutine_func, *args, **kwargs):
async def get_repo_file_dependencies(
    repo_path: str, detailed_extraction: bool = False
) -> AsyncGenerator[DataPoint, None]:
    """Generate a dependency graph for Python files in the given repository path."""
    """
    Generate a dependency graph for Python files in the given repository path.

    Check the validity of the repository path and yield a repository object followed by the
    dependencies of Python files within that repository. Raise a FileNotFoundError if the
    provided path does not exist. The extraction of detailed dependencies can be controlled
    via the `detailed_extraction` argument.

    Parameters:
    -----------

    - repo_path (str): The file path to the repository where Python files are located.
    - detailed_extraction (bool): A flag indicating whether to perform a detailed
      extraction of dependencies (default is False). (default False)
    """

    if not os.path.exists(repo_path):
        raise FileNotFoundError(f"Repository path {repo_path} does not exist.")
@@ -2,6 +2,19 @@ from cognee.shared.data_models import SummarizedCode, SummarizedClass, Summarize


def get_mock_summarized_code() -> SummarizedCode:
    """
    Return a summarized representation of mock code.

    This function constructs and returns a `SummarizedCode` object that includes various
    components such as file name, high-level summary, key features, imports, constants,
    classes, and functions, all described with placeholders for mock data.

    Returns:
    --------

    - SummarizedCode: A `SummarizedCode` object containing mock data for file summary,
      features, imports, constants, classes, and functions.
    """
    return SummarizedCode(
        file_name="mock_file.py",
        high_level_summary="This is a mock high-level summary.",
@@ -6,6 +6,15 @@ from cognee.shared.CodeGraphEntities import CodeFile, CodePart


class TextSummary(DataPoint):
    """
    Represent a text summary derived from a document chunk.

    This class encapsulates a text summary as well as its associated metadata. The public
    instance variables include 'text' for the summary content and 'made_from' which
    indicates the source document chunk. The 'metadata' instance variable contains
    additional information such as indexed fields.
    """

    text: str
    made_from: DocumentChunk
@@ -13,6 +22,15 @@ class TextSummary(DataPoint):


class CodeSummary(DataPoint):
    """
    Summarizes code and its components.

    This class inherits from DataPoint and contains a text representation alongside the
    summarized content, which can either be a full code file or a part of it. The metadata
    dictionary defines index fields for the class's instances, particularly focusing on the
    'text' attribute. Public attributes include 'text', 'summarizes', and 'metadata'.
    """

    text: str
    summarizes: Union[CodeFile, CodePart]
@@ -11,6 +11,29 @@ from .models import TextSummary
async def summarize_text(
    data_chunks: list[DocumentChunk], summarization_model: Type[BaseModel] = None
):
    """
    Summarize the text contained in the provided data chunks.

    If no summarization model is provided, the function retrieves the default model from the
    configuration. It processes the data chunks asynchronously and returns summaries for
    each chunk. If the provided list of data chunks is empty, it simply returns the list as
    is.

    Parameters:
    -----------

    - data_chunks (list[DocumentChunk]): A list of DocumentChunk objects containing text
      to be summarized.
    - summarization_model (Type[BaseModel]): An optional model used for summarizing
      text. If not provided, the default is fetched from the configuration. (default
      None)

    Returns:
    --------

    A list of TextSummary objects, each containing the summary of a corresponding
    DocumentChunk.
    """
    if len(data_chunks) == 0:
        return data_chunks
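The shape of the described flow, returning an empty input untouched and otherwise fanning the chunks out to an async summarizer, can be sketched without cognee's LLM stack. The stub summarizer below stands in for the configured summarization model and is an assumption for illustration only:

```python
import asyncio
from dataclasses import dataclass


@dataclass
class ChunkStub:
    """Placeholder for DocumentChunk; illustration only."""
    text: str


async def summarize_chunk(chunk: ChunkStub) -> str:
    # Stand-in for the LLM call; here we just truncate the text.
    await asyncio.sleep(0)
    return chunk.text[:40]


async def summarize_all(chunks: list[ChunkStub]) -> list:
    if len(chunks) == 0:
        return chunks  # mirror the documented early return for empty input
    return list(await asyncio.gather(*(summarize_chunk(chunk) for chunk in chunks)))


print(asyncio.run(summarize_all([ChunkStub("A long passage of text to be summarized for the example.")])))
```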
@@ -3,6 +3,18 @@ from typing import Optional


class GraphitiNode(DataPoint):
    """
    Represent a node in a graph with optional content, name, and summary attributes.

    This class extends DataPoint and includes a metadata dictionary that specifies the index
    fields for the node's data. The public instance variables are:

    - content: an optional string representing the content of the node.
    - name: an optional string representing the name of the node.
    - summary: an optional string providing a summary of the node.
    - metadata: a dictionary outlining the fields used for indexing.
    """

    content: Optional[str] = None
    name: Optional[str] = None
    summary: Optional[str] = None