From bb68d6a0dff8caedf12161dc1eca7dbe5cf8c559 Mon Sep 17 00:00:00 2001
From: Daniel Molnar
Date: Tue, 27 May 2025 21:33:16 +0200
Subject: [PATCH] Docstring tasks. (#878)

## Description

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.
---
 cognee/tasks/chunks/chunk_by_paragraph.py     |  23 ++-
 cognee/tasks/chunks/chunk_by_sentence.py      |  37 ++++-
 cognee/tasks/chunks/chunk_by_word.py          |  33 +++--
 .../code/enrich_dependency_graph_checker.py   |   8 +
 .../code/get_repo_dependency_graph_checker.py |   8 +
 .../tasks/completion/exceptions/exceptions.py |  15 ++
 cognee/tasks/documents/classify_documents.py  |  39 ++++-
 cognee/tasks/graph/infer_data_ontology.py     | 106 +++++++++++++-
 cognee/tasks/graph/models.py                  |  53 +++++++
 cognee/tasks/ingestion/get_dlt_destination.py |  12 +-
 cognee/tasks/ingestion/transform_data.py      |  20 +++
 .../repo_processor/get_local_dependencies.py  | 137 +++++++++++++++++-
 .../repo_processor/get_non_code_files.py      |  35 ++++-
 .../get_repo_file_dependencies.py             |  55 ++++++-
 cognee/tasks/summarization/mock_summary.py    |  13 ++
 cognee/tasks/summarization/models.py          |  18 +++
 cognee/tasks/summarization/summarize_text.py  |  23 +++
 .../temporal_awareness/graphiti_model.py      |  12 ++
 18 files changed, 610 insertions(+), 37 deletions(-)

diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py
index d4b434eee..1668276c5 100644
--- a/cognee/tasks/chunks/chunk_by_paragraph.py
+++ b/cognee/tasks/chunks/chunk_by_paragraph.py
@@ -10,14 +10,23 @@ def chunk_by_paragraph(
     batch_paragraphs: bool = True,
 ) -> Iterator[Dict[str, Any]]:
     """
-    Chunks text by paragraph while preserving exact text reconstruction capability.
-    When chunks are joined with empty string "", they reproduce the original text exactly.
+    Chunk the input text by paragraph while enabling exact text reconstruction.
 
-    Notes:
-    - Tokenization is handled using our tokenization adapters, ensuring compatibility with the vector engine's embedding model.
-    - If `batch_paragraphs` is False, each paragraph will be yielded as a separate chunk.
-    - Handles cases where paragraphs exceed the specified token or word limits by splitting them as needed.
-    - Remaining text at the end of the input will be yielded as a final chunk.
+    This function divides the given text data into smaller chunks based on the specified
+    maximum chunk size. It ensures that when the generated chunks are concatenated, they
+    reproduce the original text accurately. The tokenization process is handled by adapters
+    compatible with the vector engine's embedding model, and the function can operate in
+    either batched or per-paragraph mode, based on the `batch_paragraphs` flag.
+
+    Parameters:
+    -----------
+
+    - data (str): The input text to be chunked.
+    - max_chunk_size: The maximum allowed size for each chunk, in terms of tokens or
+      words.
+    - batch_paragraphs (bool): Flag indicating whether to batch multiple paragraphs
+      together into a single chunk. If set to False, each paragraph is yielded as its
+      own chunk. (default True)
     """
     current_chunk = ""
     chunk_index = 0
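A minimal usage sketch of the reconstruction guarantee documented above (editor's illustration, not part of this patch; it assumes each yielded dict exposes its chunk text under a "text" key):

import asyncio
from cognee.tasks.chunks.chunk_by_paragraph import chunk_by_paragraph

text = "First paragraph.\nSecond paragraph.\nThird one."
chunks = list(chunk_by_paragraph(text, max_chunk_size=128, batch_paragraphs=False))
# Joining the chunk texts with "" should reproduce the input exactly.
assert "".join(chunk["text"] for chunk in chunks) == text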
diff --git a/cognee/tasks/chunks/chunk_by_sentence.py b/cognee/tasks/chunks/chunk_by_sentence.py
index 75ffc39e8..d37aa2a66 100644
--- a/cognee/tasks/chunks/chunk_by_sentence.py
+++ b/cognee/tasks/chunks/chunk_by_sentence.py
@@ -5,6 +5,23 @@ from cognee.infrastructure.databases.vector.embeddings import get_embedding_engi
 
 def get_word_size(word: str) -> int:
+    """
+    Calculate the size of a given word in terms of tokens.
+
+    If an embedding engine's tokenizer is available, count the tokens for the provided word.
+    If the tokenizer is not available, assume the word counts as one token.
+
+    Parameters:
+    -----------
+
+    - word (str): The word for which the token size is to be calculated.
+
+    Returns:
+    --------
+
+    - int: The number of tokens for the word, or 1 when no tokenizer is available.
+    """
     embedding_engine = get_embedding_engine()
     if embedding_engine.tokenizer:
         return embedding_engine.tokenizer.count_tokens(word)
@@ -16,12 +33,22 @@ def chunk_by_sentence(
     data: str, maximum_size: Optional[int] = None
 ) -> Iterator[Tuple[UUID, str, int, Optional[str]]]:
     """
-    Splits the input text into sentences based on word-level processing, with optional sentence length constraints.
+    Splits text into sentences while preserving word and paragraph boundaries.
 
-    Notes:
-    - Relies on the `chunk_by_word` function for word-level tokenization and classification.
-    - Ensures sentences within paragraphs are uniquely identifiable using UUIDs.
-    - Handles cases where the text ends mid-sentence by appending a special "sentence_cut" type.
+    This function processes the input string, dividing it into sentences based on word-level
+    tokenization. Each sentence carries the UUID of its paragraph, and text that ends
+    mid-sentence is tagged with a special "sentence_cut" type. If a maximum sentence
+    length is specified, the function ensures that sentences do not exceed this length,
+    raising a ValueError if an individual word surpasses it. The function utilizes an
+    external word processing function `chunk_by_word` to determine the structure of the
+    text.
+
+    Parameters:
+    -----------
+
+    - data (str): The input text to be split into sentences.
+    - maximum_size (Optional[int]): An optional limit on the maximum size of sentences
+      generated. (default None)
     """
     sentence = ""
     paragraph_id = uuid4()
diff --git a/cognee/tasks/chunks/chunk_by_word.py b/cognee/tasks/chunks/chunk_by_word.py
index f0782cd9c..51ffd39d3 100644
--- a/cognee/tasks/chunks/chunk_by_word.py
+++ b/cognee/tasks/chunks/chunk_by_word.py
@@ -8,15 +8,23 @@ PARAGRAPH_ENDINGS = r"[\n\r]"
 
 def is_real_paragraph_end(last_char: str, current_pos: int, text: str) -> bool:
     """
-    Determines if the current position represents a real paragraph ending.
+    Determine if the current position represents a valid paragraph end.
 
-    Args:
-        last_char: The last processed character
-        current_pos: Current position in the text
-        text: The input text
+    The function checks if the last character indicates a possible sentence ending, then
+    verifies whether the characters that follow genuinely begin a new paragraph.
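A hedged sketch of how the sentence chunker above might be driven (editor's illustration; the tuple layout follows the annotated return type Iterator[Tuple[UUID, str, int, Optional[str]]], and the meaning of the third element is assumed to be the sentence's size):

from cognee.tasks.chunks.chunk_by_sentence import chunk_by_sentence

for paragraph_id, sentence, size, end_type in chunk_by_sentence(
    "One sentence. Another one.", maximum_size=64
):
    print(paragraph_id, repr(sentence), size, end_type)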
+ + Parameters: + ----------- + + - last_char (str): The last processed character + - current_pos (int): Current position in the text + - text (str): The input text Returns: - bool: True if this is a real paragraph end, False otherwise + -------- + + - bool: True if this is a real paragraph end, False otherwise """ if re.match(SENTENCE_ENDINGS, last_char): return True @@ -38,9 +46,16 @@ def is_real_paragraph_end(last_char: str, current_pos: int, text: str) -> bool: def chunk_by_word(data: str) -> Iterator[Tuple[str, str]]: """ - Chunks text into words and endings while preserving whitespace. - Whitespace is included with the preceding word. - Outputs can be joined with "" to recreate the original input. + Chunk text into words and sentence endings, preserving whitespace. + + Whitespace is included with the preceding word. Outputs can be joined with "" to + recreate the original input. + + Parameters: + ----------- + + - data (str): The input string of text to be chunked into words and sentence + endings. """ current_chunk = "" i = 0 diff --git a/cognee/tasks/code/enrich_dependency_graph_checker.py b/cognee/tasks/code/enrich_dependency_graph_checker.py index 7b04e0357..1b3a80210 100644 --- a/cognee/tasks/code/enrich_dependency_graph_checker.py +++ b/cognee/tasks/code/enrich_dependency_graph_checker.py @@ -6,6 +6,14 @@ from cognee.tasks.repo_processor.enrich_dependency_graph import enrich_dependenc def main(): + """ + Execute the main logic of the dependency graph processor. + + This function sets up argument parsing to retrieve the repository path, checks the + existence of the specified path, and processes the repository to produce a dependency + graph. If the repository path does not exist, it logs an error message and terminates + without further execution. + """ parser = argparse.ArgumentParser() parser.add_argument("repo_path", help="Path to the repository") args = parser.parse_args() diff --git a/cognee/tasks/code/get_repo_dependency_graph_checker.py b/cognee/tasks/code/get_repo_dependency_graph_checker.py index 3a393d3f3..0e68cf7fe 100644 --- a/cognee/tasks/code/get_repo_dependency_graph_checker.py +++ b/cognee/tasks/code/get_repo_dependency_graph_checker.py @@ -5,6 +5,14 @@ from cognee.tasks.repo_processor.get_repo_file_dependencies import get_repo_file def main(): + """ + Parse the command line arguments and print the repository file dependencies. + + This function sets up an argument parser to retrieve the path of a repository. It checks + if the provided path exists and if it doesn’t, it prints an error message and exits. If + the path is valid, it calls an asynchronous function to get the dependencies and prints + the nodes and their relations in the dependency graph. + """ parser = argparse.ArgumentParser() parser.add_argument("repo_path", help="Path to the repository") args = parser.parse_args() diff --git a/cognee/tasks/completion/exceptions/exceptions.py b/cognee/tasks/completion/exceptions/exceptions.py index aebece145..ac105a966 100644 --- a/cognee/tasks/completion/exceptions/exceptions.py +++ b/cognee/tasks/completion/exceptions/exceptions.py @@ -3,6 +3,21 @@ from fastapi import status class NoRelevantDataError(CogneeApiError): + """ + Represents an error when no relevant data is found during a search. This class is a + subclass of CogneeApiError. 
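For the word-level chunker documented above, a small illustrative check of its join-to-reconstruct property (editor's sketch, not part of the patch):

from cognee.tasks.chunks.chunk_by_word import chunk_by_word

data = "Hello there. Bye!"
words_and_types = list(chunk_by_word(data))
# Each item is a (word, type) tuple; whitespace travels with the preceding word,
# so joining the words with "" restores the original input.
assert "".join(word for word, word_type in words_and_types) == data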
+
+    Public methods:
+
+    - __init__
+
+    Instance variables:
+
+    - message
+    - name
+    - status_code
+    """
+
     def __init__(
         self,
         message: str = "Search did not find any data.",
diff --git a/cognee/tasks/documents/classify_documents.py b/cognee/tasks/documents/classify_documents.py
index 97ff3e483..673e17c75 100644
--- a/cognee/tasks/documents/classify_documents.py
+++ b/cognee/tasks/documents/classify_documents.py
@@ -52,7 +52,21 @@ EXTENSION_TO_DOCUMENT_CLASS = {
 
 def update_node_set(document):
-    """Extracts node_set from document's external_metadata."""
+    """
+    Extracts node_set from document's external_metadata.
+
+    Parses the external_metadata of the given document and updates the document's
+    belongs_to_set attribute with NodeSet objects generated from the node_set found in the
+    external_metadata. If the external_metadata is not valid JSON, is not a dictionary, does
+    not contain the 'node_set' key, or if node_set is not a list, the function has no effect
+    and will return early.
+
+    Parameters:
+    -----------
+
+    - document: The document object which contains external_metadata from which the
+      node_set will be extracted.
+    """
     try:
         external_metadata = json.loads(document.external_metadata)
     except json.JSONDecodeError:
@@ -76,11 +90,26 @@ def update_node_set(document):
 
 async def classify_documents(data_documents: list[Data]) -> list[Document]:
     """
-    Classifies a list of data items into specific document types based on file extensions.
+    Classifies a list of data items into specific document types based on their file
+    extensions.
 
-    Notes:
-    - The function relies on `get_metadata` to retrieve metadata information for each data item.
-    - Ensure the `Data` objects and their attributes (e.g., `extension`, `id`) are valid before calling this function.
+    This function processes each item in the provided list of data documents, retrieves
+    relevant metadata, and creates instances of document classes mapped to their extensions.
+    It expects the `Data` objects and their attributes (e.g., `extension`, `id`) to be
+    valid before classification, and it invokes `update_node_set` to extract and set
+    relevant node information from the document's external metadata.
+
+    Parameters:
+    -----------
+
+    - data_documents (list[Data]): A list of Data objects representing the documents to
+      be classified.
+
+    Returns:
+    --------
+
+    - list[Document]: A list of Document objects created based on the classified data
+      documents.
     """
     documents = []
     for data_item in data_documents:
diff --git a/cognee/tasks/graph/infer_data_ontology.py b/cognee/tasks/graph/infer_data_ontology.py
index bec7d6fb3..23ede6656 100644
--- a/cognee/tasks/graph/infer_data_ontology.py
+++ b/cognee/tasks/graph/infer_data_ontology.py
@@ -33,6 +33,25 @@ logger = get_logger("task:infer_data_ontology")
 
 async def extract_ontology(content: str, response_model: Type[BaseModel]):
+    """
+    Extracts structured ontology from the provided content using the configured LLM client.
+
+    This asynchronous function retrieves a system prompt from a file and utilizes an LLM
+    client to create a structured output based on the input content and specified response
+    model.
+
+    Parameters:
+    -----------
+
+    - content (str): The content from which to extract the ontology.
+    - response_model (Type[BaseModel]): The model that defines the structure of the
+      output ontology.
+
+    Returns:
+    --------
+
+    The structured ontology extracted from the content.
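A hypothetical end-to-end call for the classifier documented above (editor's sketch; it assumes Data rows were loaded elsewhere in the pipeline):

import asyncio
from cognee.tasks.documents.classify_documents import classify_documents

async def run(data_items):
    # data_items: list[Data], assumed to come from the ingestion layer
    documents = await classify_documents(data_items)
    for document in documents:
        print(type(document).__name__)

# asyncio.run(run(data_items))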
+ """ llm_client = get_llm_client() system_prompt = read_query_prompt("extract_ontology.txt") @@ -43,10 +62,38 @@ async def extract_ontology(content: str, response_model: Type[BaseModel]): class OntologyEngine: + """ + Manage ontology data and operations for graph structures, providing methods for data + loading, flattening models, and adding ontological relationships to a graph database. + + Public methods: + + - flatten_model + - recursive_flatten + - load_data + - add_graph_ontology + """ + async def flatten_model( self, model: NodeModel, parent_id: Optional[str] = None ) -> Dict[str, Any]: - """Flatten the model to a dictionary.""" + """ + Flatten the model to a dictionary including optional parent ID and relationship details + if available. + + Parameters: + ----------- + + - model (NodeModel): The NodeModel instance to flatten. + - parent_id (Optional[str]): An optional ID of the parent node for hierarchical + purposes. (default None) + + Returns: + -------- + + - Dict[str, Any]: A dictionary representation of the model with flattened + attributes. + """ result = model.dict() result["parent_id"] = parent_id if model.default_relationship: @@ -62,7 +109,23 @@ class OntologyEngine: async def recursive_flatten( self, items: Union[List[Dict[str, Any]], Dict[str, Any]], parent_id: Optional[str] = None ) -> List[Dict[str, Any]]: - """Recursively flatten the items.""" + """ + Recursively flatten a hierarchical structure of models into a flat list of dictionaries. + + Parameters: + ----------- + + - items (Union[List[Dict[str, Any]], Dict[str, Any]]): A list or dictionary + containing models to flatten. + - parent_id (Optional[str]): An optional ID of the parent node to maintain hierarchy + during flattening. (default None) + + Returns: + -------- + + - List[Dict[str, Any]]: A flat list of dictionaries representing the hierarchical + model structure. + """ flat_list = [] if isinstance(items, list): @@ -76,7 +139,20 @@ class OntologyEngine: return flat_list async def load_data(self, file_path: str) -> Union[List[Dict[str, Any]], Dict[str, Any]]: - """Load data from a JSON or CSV file.""" + """ + Load data from a specified JSON or CSV file and return it in a structured format. + + Parameters: + ----------- + + - file_path (str): The path to the file to load data from. + + Returns: + -------- + + - Union[List[Dict[str, Any]], Dict[str, Any]]: Parsed data from the file as either a + list of dictionaries or a single dictionary depending on content type. + """ try: if file_path.endswith(".json"): async with aiofiles.open(file_path, mode="r") as f: @@ -96,7 +172,18 @@ class OntologyEngine: ) async def add_graph_ontology(self, file_path: str = None, documents: list = None): - """Add graph ontology from a JSON or CSV file or infer from documents content.""" + """ + Add graph ontology from a JSON or CSV file, or infer relationships from provided + document content. Raise exceptions for invalid file types or missing entities. + + Parameters: + ----------- + + - file_path (str): Optional path to a file containing data to be loaded. (default + None) + - documents (list): Optional list of document objects for content extraction if no + file path is provided. (default None) + """ if file_path is None: initial_chunks_and_ids = [] @@ -202,6 +289,17 @@ class OntologyEngine: async def infer_data_ontology(documents, ontology_model=KnowledgeGraph, root_node_id=None): + """ + Infer data ontology from provided documents and optionally add it to a graph. 
+
+    Parameters:
+    -----------
+
+    - documents: The documents from which to infer the ontology.
+    - ontology_model: The ontology model to use for the inference. (default
+      KnowledgeGraph)
+    - root_node_id: An optional root node identifier for the ontology. (default None)
+    """
     if ontology_model == KnowledgeGraph:
         ontology_engine = OntologyEngine()
         root_node_id = await ontology_engine.add_graph_ontology(documents=documents)
diff --git a/cognee/tasks/graph/models.py b/cognee/tasks/graph/models.py
index bfffb0262..e7429824d 100644
--- a/cognee/tasks/graph/models.py
+++ b/cognee/tasks/graph/models.py
@@ -3,12 +3,40 @@ from pydantic import BaseModel, Field
 
 class RelationshipModel(BaseModel):
+    """
+    Represents a relationship between two entities in a model.
+
+    This class holds the type of the relationship and the identifiers for the source and
+    target entities. It includes the following public instance variables:
+
+    - type: A string indicating the type of relationship.
+    - source: A string representing the source entity of the relationship.
+    - target: A string representing the target entity of the relationship.
+    """
+
     type: str
     source: str
     target: str
 
 class NodeModel(BaseModel):
+    """
+    Represents a node in a hierarchical model structure with relationships to other nodes.
+
+    Instance variables:
+
+    - node_id: Unique identifier for the node.
+    - name: Name of the node.
+    - default_relationship: Default relationship associated with the node, if any.
+      (default None)
+    - children: List of child nodes or dictionaries representing children for this node.
+      (default empty list)
+    """
+
     node_id: str
     name: str
     default_relationship: Optional[RelationshipModel] = None
@@ -19,12 +47,28 @@ NodeModel.model_rebuild()
 
 class OntologyNode(BaseModel):
+    """
+    Represents a node in an ontology with a unique identifier, name, and description.
+    """
+
     id: str = Field(..., description="Unique identifier made from node name.")
     name: str
     description: str
 
 class OntologyEdge(BaseModel):
+    """
+    Represent an edge in an ontology, connecting a source and target with a specific
+    relationship type.
+
+    The class includes the following instance variables:
+    - id: A unique identifier for the edge.
+    - source_id: The identifier of the source node.
+    - target_id: The identifier of the target node.
+    - relationship_type: The type of relationship represented by this edge, defining how the
+      source and target are related.
+    """
+
     id: str
     source_id: str
     target_id: str
     relationship_type: str
@@ -32,5 +76,14 @@
 class GraphOntology(BaseModel):
+    """
+    Represents a graph-based structure of ontology consisting of nodes and edges.
+
+    The GraphOntology class contains a collection of OntologyNode instances representing the
+    nodes of the graph and OntologyEdge instances representing the relationships between
+    them. It defines no methods of its own; its instance variables are a list of nodes and
+    a list of edges.
+    """
+
     nodes: list[OntologyNode]
     edges: list[OntologyEdge]
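To make the models above concrete, a hypothetical construction (editor's sketch; field names are taken from this diff, values are invented):

from cognee.tasks.graph.models import GraphOntology, OntologyNode, OntologyEdge

ontology = GraphOntology(
    nodes=[
        OntologyNode(id="person", name="Person", description="A human being."),
        OntologyNode(id="city", name="City", description="A populated place."),
    ],
    edges=[
        OntologyEdge(
            id="person_lives_in_city",
            source_id="person",
            target_id="city",
            relationship_type="lives_in",
        )
    ],
)
print(len(ontology.nodes), len(ontology.edges))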
+ """ + nodes: list[OntologyNode] edges: list[OntologyEdge] diff --git a/cognee/tasks/ingestion/get_dlt_destination.py b/cognee/tasks/ingestion/get_dlt_destination.py index 2de0d7f0d..131834f0b 100644 --- a/cognee/tasks/ingestion/get_dlt_destination.py +++ b/cognee/tasks/ingestion/get_dlt_destination.py @@ -10,11 +10,19 @@ from cognee.infrastructure.databases.relational import get_relational_config @lru_cache def get_dlt_destination() -> Union[type[dlt.destinations.sqlalchemy], None]: """ - Handles propagation of the cognee database configuration to the dlt library + Handle the propagation of the cognee database configuration to the dlt library. + + This function determines the appropriate sqlalchemy destination based on the database + provider specified in the relational configuration. It constructs the destination + credentials for either sqlite or postgres databases accordingly. If the database + provider is neither sqlite nor postgres, it returns None. Returns: - sqlachemy: sqlachemy destination used by the dlt library + -------- + - Union[type[dlt.destinations.sqlalchemy], None]: An instance of sqlalchemy + destination used by the dlt library, or None if the database provider is + unsupported. """ relational_config = get_relational_config() diff --git a/cognee/tasks/ingestion/transform_data.py b/cognee/tasks/ingestion/transform_data.py index cc75c7a65..cd35364d7 100644 --- a/cognee/tasks/ingestion/transform_data.py +++ b/cognee/tasks/ingestion/transform_data.py @@ -5,6 +5,26 @@ from typing import Union def get_data_from_llama_index(data_point: Union[Document, ImageDocument], dataset_name: str) -> str: + """ + Retrieve the file path based on the data point type. + + Ensure the data point is an instance of either Document or ImageDocument. If the data + point has a metadata or image path file path, return it; otherwise, save the data + point's text to a file and return the newly created file path. + + Parameters: + ----------- + + - data_point (Union[Document, ImageDocument]): An instance of Document or + ImageDocument to extract data from. + - dataset_name (str): The name of the dataset associated with the data point. + + Returns: + -------- + + - str: The file path as a string where the data is stored or the existing path from + the data point. + """ # Specific type checking is used to ensure it's not a child class from Document if isinstance(data_point, Document) and type(data_point) is Document: file_path = data_point.metadata.get("file_path") diff --git a/cognee/tasks/repo_processor/get_local_dependencies.py b/cognee/tasks/repo_processor/get_local_dependencies.py index a406d6023..ed8e4e14b 100644 --- a/cognee/tasks/repo_processor/get_local_dependencies.py +++ b/cognee/tasks/repo_processor/get_local_dependencies.py @@ -19,10 +19,34 @@ logger = get_logger() class FileParser: + """ + Handles the parsing of files into source code and an abstract syntax tree + representation. Public methods include: + + - parse_file: Parses a file and returns its source code and syntax tree representation. + """ + def __init__(self): self.parsed_files = {} async def parse_file(self, file_path: str) -> tuple[str, Tree]: + """ + Parse a file and return its source code along with its syntax tree representation. + + If the file has already been parsed, retrieve the result from memory instead of reading + the file again. + + Parameters: + ----------- + + - file_path (str): The path of the file to parse. 
+ + Returns: + -------- + + - tuple[str, Tree]: A tuple containing the source code of the file and its + corresponding syntax tree representation. + """ PY_LANGUAGE = Language(tspython.language()) source_code_parser = Parser(PY_LANGUAGE) @@ -35,6 +59,24 @@ class FileParser: async def get_source_code(file_path: str): + """ + Read source code from a file asynchronously. + + This function attempts to open a file specified by the given file path, read its + contents, and return the source code. In case of any errors during the file reading + process, it logs an error message and returns None. + + Parameters: + ----------- + + - file_path (str): The path to the file from which to read the source code. + + Returns: + -------- + + Returns the contents of the file as a string if successful, or None if an error + occurs. + """ try: async with aiofiles.open(file_path, "r", encoding="utf-8") as f: source_code = await f.read() @@ -45,7 +87,22 @@ async def get_source_code(file_path: str): def resolve_module_path(module_name): - """Find the file path of a module.""" + """ + Find the file path of a module. + + Return the file path of the specified module if found, or return None if the module does + not exist or cannot be located. + + Parameters: + ----------- + + - module_name: The name of the module whose file path is to be resolved. + + Returns: + -------- + + The file path of the module as a string or None if the module is not found. + """ try: spec = importlib.util.find_spec(module_name) if spec and spec.origin: @@ -58,7 +115,23 @@ def resolve_module_path(module_name): def find_function_location( module_path: str, function_name: str, parser: FileParser ) -> Optional[tuple[str, str]]: - """Find the function definition in the module.""" + """ + Find the location of a function definition in a specified module. + + Parameters: + ----------- + + - module_path (str): The path to the module where the function is defined. + - function_name (str): The name of the function whose location is to be found. + - parser (FileParser): An instance of FileParser used to parse the module's source + code. + + Returns: + -------- + + - Optional[tuple[str, str]]: Returns a tuple containing the module path and the + start point of the function if found; otherwise, returns None. + """ if not module_path or not os.path.exists(module_path): return None @@ -78,6 +151,24 @@ def find_function_location( async def get_local_script_dependencies( repo_path: str, script_path: str, detailed_extraction: bool = False ) -> CodeFile: + """ + Retrieve local script dependencies and create a CodeFile object. + + Parameters: + ----------- + + - repo_path (str): The path to the repository that contains the script. + - script_path (str): The path of the script for which dependencies are being + extracted. + - detailed_extraction (bool): A flag indicating whether to perform a detailed + extraction of code components. + + Returns: + -------- + + - CodeFile: Returns a CodeFile object containing information about the script, + including its dependencies and definitions. + """ code_file_parser = FileParser() source_code, source_code_tree = await code_file_parser.parse_file(script_path) @@ -113,6 +204,24 @@ async def get_local_script_dependencies( def find_node(nodes: list[Node], condition: callable) -> Node: + """ + Find and return the first node that satisfies the given condition. + + Iterate through the provided list of nodes and return the first node for which the + condition callable returns True. If no such node is found, return None. 
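A hypothetical invocation of the dependency extractor documented above (editor's sketch; paths are placeholders):

import asyncio
from cognee.tasks.repo_processor.get_local_dependencies import (
    get_local_script_dependencies,
)

async def run():
    code_file = await get_local_script_dependencies(
        repo_path="/path/to/repo",
        script_path="/path/to/repo/pkg/module.py",
        detailed_extraction=True,
    )
    print(code_file)

asyncio.run(run())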
+ + Parameters: + ----------- + + - nodes (list[Node]): A list of Node objects to search through. + - condition (callable): A callable that takes a Node and returns a boolean + indicating if the node meets specified criteria. + + Returns: + -------- + + - Node: The first Node that matches the condition, or None if no such node exists. + """ for node in nodes: if condition(node): return node @@ -123,6 +232,30 @@ def find_node(nodes: list[Node], condition: callable) -> Node: async def extract_code_parts( tree_root: Node, script_path: str, existing_nodes: list[DataPoint] = {} ) -> AsyncGenerator[DataPoint, None]: + """ + Extract code parts from a given AST node tree asynchronously. + + Iteratively yields DataPoint nodes representing import statements, function definitions, + and class definitions found in the children of the specified tree root. The function + checks + if nodes are already present in the existing_nodes dictionary to prevent duplicates. + This function has to be used in an asynchronous context, and it requires a valid + tree_root + and proper initialization of existing_nodes. + + Parameters: + ----------- + + - tree_root (Node): The root node of the AST tree containing code parts to extract. + - script_path (str): The file path of the script from which the AST was generated. + - existing_nodes (list[DataPoint]): A dictionary that holds already extracted + DataPoint nodes to avoid duplicates. (default {}) + + Returns: + -------- + + Yields DataPoint nodes representing imported modules, functions, and classes. + """ for child_node in tree_root.children: if child_node.type == "import_statement" or child_node.type == "import_from_statement": parts = child_node.text.decode("utf-8").split() diff --git a/cognee/tasks/repo_processor/get_non_code_files.py b/cognee/tasks/repo_processor/get_non_code_files.py index 5117f261d..b9ab1d4c6 100644 --- a/cognee/tasks/repo_processor/get_non_code_files.py +++ b/cognee/tasks/repo_processor/get_non_code_files.py @@ -2,7 +2,24 @@ import os async def get_non_py_files(repo_path): - """Get files that are not .py files and their contents""" + """ + Get files that are not .py files and their contents. + + Check if the specified repository path exists and if so, traverse the directory, + collecting the paths of files that do not have a .py extension and meet the + criteria set in the allowed and ignored patterns. Return a list of paths to + those files. + + Parameters: + ----------- + + - repo_path: The file system path to the repository to scan for non-Python files. + + Returns: + -------- + + A list of file paths that are not Python files and meet the specified criteria. + """ if not os.path.exists(repo_path): return {} @@ -111,6 +128,22 @@ async def get_non_py_files(repo_path): } def should_process(path): + """ + Determine if a file should be processed based on its extension and path patterns. + + This function checks if the file extension is in the allowed list and ensures that none + of the ignored patterns are present in the provided file path. + + Parameters: + ----------- + + - path: The file path to check for processing eligibility. + + Returns: + -------- + + Returns True if the file should be processed; otherwise, False. 
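A minimal sketch of the non-code collector described above (editor's illustration; the repository path is a placeholder):

import asyncio
from cognee.tasks.repo_processor.get_non_code_files import get_non_py_files

non_py_paths = asyncio.run(get_non_py_files("/path/to/repo"))
print(len(non_py_paths), "non-Python files matched")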
+ """ _, ext = os.path.splitext(path) return ext in ALLOWED_EXTENSIONS and not any( pattern in path for pattern in IGNORED_PATTERNS diff --git a/cognee/tasks/repo_processor/get_repo_file_dependencies.py b/cognee/tasks/repo_processor/get_repo_file_dependencies.py index 92e2d7910..232850936 100644 --- a/cognee/tasks/repo_processor/get_repo_file_dependencies.py +++ b/cognee/tasks/repo_processor/get_repo_file_dependencies.py @@ -11,7 +11,24 @@ from cognee.shared.CodeGraphEntities import CodeFile, Repository async def get_source_code_files(repo_path): - """Get .py files and their source code""" + """ + Retrieve Python source code files from the specified repository path. + + This function scans the given repository path for files that have the .py extension + while excluding test files and files within a virtual environment. It returns a list of + absolute paths to the source code files that are not empty. + + Parameters: + ----------- + + - repo_path: The file path to the repository to search for Python source files. + + Returns: + -------- + + A list of absolute paths to .py files that contain source code, excluding empty + files, test files, and files from a virtual environment. + """ if not os.path.exists(repo_path): return {} @@ -40,6 +57,26 @@ async def get_source_code_files(repo_path): def run_coroutine(coroutine_func, *args, **kwargs): + """ + Run a coroutine function until it completes. + + This function creates a new asyncio event loop, sets it as the current loop, and + executes the given coroutine function with the provided arguments. Once the coroutine + completes, the loop is closed. Intended for use in environments where an existing event + loop is not available or desirable. + + Parameters: + ----------- + + - coroutine_func: The coroutine function to be run. + - *args: Positional arguments to pass to the coroutine function. + - **kwargs: Keyword arguments to pass to the coroutine function. + + Returns: + -------- + + The result returned by the coroutine after completion. + """ loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) result = loop.run_until_complete(coroutine_func(*args, **kwargs)) @@ -50,7 +87,21 @@ def run_coroutine(coroutine_func, *args, **kwargs): async def get_repo_file_dependencies( repo_path: str, detailed_extraction: bool = False ) -> AsyncGenerator[DataPoint, None]: - """Generate a dependency graph for Python files in the given repository path.""" + """ + Generate a dependency graph for Python files in the given repository path. + + Check the validity of the repository path and yield a repository object followed by the + dependencies of Python files within that repository. Raise a FileNotFoundError if the + provided path does not exist. The extraction of detailed dependencies can be controlled + via the `detailed_extraction` argument. + + Parameters: + ----------- + + - repo_path (str): The file path to the repository where Python files are located. + - detailed_extraction (bool): A flag indicating whether to perform a detailed + extraction of dependencies (default is False). 
    """
    if not os.path.exists(repo_path):
        raise FileNotFoundError(f"Repository path {repo_path} does not exist.")
diff --git a/cognee/tasks/summarization/mock_summary.py b/cognee/tasks/summarization/mock_summary.py
index f60ce2d82..6d288b927 100644
--- a/cognee/tasks/summarization/mock_summary.py
+++ b/cognee/tasks/summarization/mock_summary.py
@@ -2,6 +2,19 @@ from cognee.shared.data_models import SummarizedCode, SummarizedClass, Summarize
 
 def get_mock_summarized_code() -> SummarizedCode:
+    """
+    Return a summarized representation of mock code.
+
+    This function constructs and returns a `SummarizedCode` object that includes various
+    components such as file name, high-level summary, key features, imports, constants,
+    classes, and functions, all described with placeholders for mock data.
+
+    Returns:
+    --------
+
+    - SummarizedCode: A `SummarizedCode` object containing mock data for file summary,
+      features, imports, constants, classes, and functions.
+    """
     return SummarizedCode(
         file_name="mock_file.py",
         high_level_summary="This is a mock high-level summary.",
diff --git a/cognee/tasks/summarization/models.py b/cognee/tasks/summarization/models.py
index aac1d4178..75ed82d50 100644
--- a/cognee/tasks/summarization/models.py
+++ b/cognee/tasks/summarization/models.py
@@ -6,6 +6,15 @@ from cognee.shared.CodeGraphEntities import CodeFile, CodePart
 
 class TextSummary(DataPoint):
+    """
+    Represent a text summary derived from a document chunk.
+
+    This class encapsulates a text summary as well as its associated metadata. The public
+    instance variables include 'text' for the summary content and 'made_from' which
+    indicates the source document chunk. The 'metadata' instance variable contains
+    additional information such as indexed fields.
+    """
+
     text: str
     made_from: DocumentChunk
 
@@ -13,6 +22,15 @@
 class CodeSummary(DataPoint):
+    """
+    Summarizes code and its components.
+
+    This class inherits from DataPoint and contains a text representation alongside the
+    summarized content, which can either be a full code file or a part of it. The metadata
+    dictionary defines index fields for the class's instances, particularly focusing on the
+    'text' attribute. Public attributes include 'text', 'summarizes', and 'metadata'.
+    """
+
     text: str
     summarizes: Union[CodeFile, CodePart]
 
diff --git a/cognee/tasks/summarization/summarize_text.py b/cognee/tasks/summarization/summarize_text.py
index cca41ae88..9a8b7cbd7 100644
--- a/cognee/tasks/summarization/summarize_text.py
+++ b/cognee/tasks/summarization/summarize_text.py
@@ -11,6 +11,29 @@ from .models import TextSummary
 
 async def summarize_text(
     data_chunks: list[DocumentChunk], summarization_model: Type[BaseModel] = None
 ):
+    """
+    Summarize the text contained in the provided data chunks.
+
+    If no summarization model is provided, the function retrieves the default model from the
+    configuration. It processes the data chunks asynchronously and returns summaries for
+    each chunk. If the provided list of data chunks is empty, it simply returns the list as
+    is.
+
+    Parameters:
+    -----------
+
+    - data_chunks (list[DocumentChunk]): A list of DocumentChunk objects containing text
+      to be summarized.
+    - summarization_model (Type[BaseModel]): An optional model used for summarizing
+      text. If not provided, the default is fetched from the configuration. (default
+      None)
+
+    Returns:
+    --------
+
+    A list of TextSummary objects, each containing the summary of a corresponding
+    DocumentChunk.
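A hypothetical call for the summarizer above (editor's sketch; an empty input exercises the documented short-circuit, while real chunks would come from the chunking tasks earlier in this patch):

import asyncio
from cognee.tasks.summarization.summarize_text import summarize_text

document_chunks = []  # normally list[DocumentChunk] from the chunking pipeline
summaries = asyncio.run(summarize_text(document_chunks))
for summary in summaries:
    print(summary.text)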
+ """ if len(data_chunks) == 0: return data_chunks diff --git a/cognee/tasks/temporal_awareness/graphiti_model.py b/cognee/tasks/temporal_awareness/graphiti_model.py index 89aef540b..c2a2eeb69 100644 --- a/cognee/tasks/temporal_awareness/graphiti_model.py +++ b/cognee/tasks/temporal_awareness/graphiti_model.py @@ -3,6 +3,18 @@ from typing import Optional class GraphitiNode(DataPoint): + """ + Represent a node in a graph with optional content, name, and summary attributes. + + This class extends DataPoint and includes a metadata dictionary that specifies the index + fields for the node's data. The public instance variables are: + + - content: an optional string representing the content of the node. + - name: an optional string representing the name of the node. + - summary: an optional string providing a summary of the node. + - metadata: a dictionary outlining the fields used for indexing. + """ + content: Optional[str] = None name: Optional[str] = None summary: Optional[str] = None