Docstring tasks. (#878)

<!-- .github/pull_request_template.md -->

## Description
<!-- Provide a clear description of the changes in this PR -->

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.
Daniel Molnar committed on 2025-05-27 21:33:16 +02:00 (committed by GitHub)
parent ec68e99438
commit bb68d6a0df
18 changed files with 610 additions and 37 deletions


@ -10,14 +10,23 @@ def chunk_by_paragraph(
batch_paragraphs: bool = True,
) -> Iterator[Dict[str, Any]]:
"""
Chunk the input text by paragraph while enabling exact text reconstruction.

This function divides the given text data into smaller chunks based on the specified
maximum chunk size. It ensures that when the generated chunks are concatenated, they
reproduce the original text exactly. The tokenization process is handled by adapters
compatible with the vector engine's embedding model, and the function can operate in
either batch mode or per-paragraph mode, based on the `batch_paragraphs` flag. Any text
remaining at the end of the input is yielded as a final chunk.

Parameters:
-----------
- data (str): The input text to be chunked.
- max_chunk_size: The maximum allowed size for each chunk, in terms of tokens or
words.
- batch_paragraphs (bool): If True, multiple paragraphs may be batched into a single
chunk; if False, each paragraph is yielded as a separate chunk. (default True)
"""
current_chunk = ""
chunk_index = 0
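
A minimal usage sketch of the chunker documented above (not part of this diff). The import path and the `text` key of the yielded dictionaries are assumptions; the parameter names come from the docstring.

```python
# Illustrative sketch only; import path and the "text" key are assumptions.
from cognee.tasks.chunks import chunk_by_paragraph  # assumed import path

text = "First paragraph.\n\nSecond paragraph, a little longer than the first one."

chunks = list(chunk_by_paragraph(data=text, max_chunk_size=64, batch_paragraphs=False))
# Joining the chunk texts with "" should reproduce the original input exactly.
assert "".join(chunk["text"] for chunk in chunks) == text
```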


@ -5,6 +5,23 @@ from cognee.infrastructure.databases.vector.embeddings import get_embedding_engi
def get_word_size(word: str) -> int:
"""
Calculate the size of a given word in terms of tokens.
If an embedding engine's tokenizer is available, count the tokens for the provided word.
If the tokenizer is not available, assume the word counts as one token.
Parameters:
-----------
- word (str): The word for which the token size is to be calculated.
Returns:
--------
- int: The number of tokens representing the word, or 1 if no tokenizer is
available.
"""
embedding_engine = get_embedding_engine()
if embedding_engine.tokenizer:
return embedding_engine.tokenizer.count_tokens(word)
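
The fallback branch is cut off by this hunk; a self-contained sketch of the behaviour the docstring describes (this is not the module's actual code):

```python
# Count tokens with the tokenizer when one is configured, otherwise treat the word
# as a single token, as described in the docstring above.
def get_word_size_sketch(word: str, tokenizer=None) -> int:
    if tokenizer is not None:
        return tokenizer.count_tokens(word)
    return 1

assert get_word_size_sketch("hello") == 1  # no tokenizer configured
```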
@ -16,12 +33,22 @@ def chunk_by_sentence(
data: str, maximum_size: Optional[int] = None
) -> Iterator[Tuple[UUID, str, int, Optional[str]]]:
"""
Splits text into sentences while preserving word and paragraph boundaries.

This function processes the input string, dividing it into sentences based on the
word-level tokenization provided by `chunk_by_word`. Sentences within a paragraph are
uniquely identifiable via UUIDs, and text that ends mid-sentence is tagged with a
special "sentence_cut" type. If a maximum sentence length is specified, the function
ensures that sentences do not exceed this length, raising a ValueError if an individual
word surpasses it.

Parameters:
-----------
- data (str): The input text to be split into sentences.
- maximum_size (Optional[int]): An optional limit on the maximum size of sentences
generated. (default None)
"""
sentence = ""
paragraph_id = uuid4()
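
A hypothetical consumption sketch. The import path and the meaning of the tuple elements (paragraph id, sentence text, size, ending type) are inferred from the signature and docstring above and may not match the implementation exactly.

```python
from cognee.tasks.chunks import chunk_by_sentence  # assumed import path

for paragraph_id, sentence, size, ending_type in chunk_by_sentence(
    "One full sentence. A trailing fragment without punctuation", maximum_size=50
):
    # ending_type is expected to be "sentence_cut" for text that ends mid-sentence.
    print(paragraph_id, repr(sentence), size, ending_type)
```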


@ -8,15 +8,23 @@ PARAGRAPH_ENDINGS = r"[\n\r]"
def is_real_paragraph_end(last_char: str, current_pos: int, text: str) -> bool:
"""
Determine if the current position represents a valid paragraph end.

The function checks whether the last character indicates a possible sentence ending,
then verifies whether the subsequent characters lead to a valid paragraph end based on
specific conditions.

Parameters:
-----------
- last_char (str): The last processed character.
- current_pos (int): Current position in the text.
- text (str): The input text.

Returns:
--------
- bool: True if this is a real paragraph end, False otherwise.
"""
if re.match(SENTENCE_ENDINGS, last_char):
return True
@ -38,9 +46,16 @@ def is_real_paragraph_end(last_char: str, current_pos: int, text: str) -> bool:
def chunk_by_word(data: str) -> Iterator[Tuple[str, str]]:
"""
Chunk text into words and sentence endings, preserving whitespace.

Whitespace is included with the preceding word, so the yielded outputs can be joined
with "" to recreate the original input exactly.

Parameters:
-----------
- data (str): The input string of text to be chunked into words and sentence
endings.
"""
current_chunk = ""
i = 0
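
A minimal sketch of the join-to-reconstruct property stated above; the import path is an assumption.

```python
from cognee.tasks.chunks import chunk_by_word  # assumed import path

text = "Hello world. This is a test!"
pieces = list(chunk_by_word(text))                    # yields (chunk, type) tuples
assert "".join(chunk for chunk, _ in pieces) == text  # exact reconstruction
```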


@ -6,6 +6,14 @@ from cognee.tasks.repo_processor.enrich_dependency_graph import enrich_dependenc
def main():
"""
Execute the main logic of the dependency graph processor.
This function sets up argument parsing to retrieve the repository path, checks the
existence of the specified path, and processes the repository to produce a dependency
graph. If the repository path does not exist, it logs an error message and terminates
without further execution.
"""
parser = argparse.ArgumentParser()
parser.add_argument("repo_path", help="Path to the repository")
args = parser.parse_args()


@ -5,6 +5,14 @@ from cognee.tasks.repo_processor.get_repo_file_dependencies import get_repo_file
def main():
"""
Parse the command line arguments and print the repository file dependencies.
This function sets up an argument parser to retrieve the path of a repository. It checks
whether the provided path exists and, if it doesn't, prints an error message and exits.
If the path is valid, it calls an asynchronous function to get the dependencies and
prints the nodes and their relations in the dependency graph.
"""
parser = argparse.ArgumentParser()
parser.add_argument("repo_path", help="Path to the repository")
args = parser.parse_args()


@ -3,6 +3,21 @@ from fastapi import status
class NoRelevantDataError(CogneeApiError):
"""
Represents an error when no relevant data is found during a search. This class is a
subclass of CogneeApiError.
Public methods:
- __init__
Instance variables:
- message
- name
- status_code
"""
def __init__(
self,
message: str = "Search did not find any data.",


@ -52,7 +52,21 @@ EXTENSION_TO_DOCUMENT_CLASS = {
def update_node_set(document):
"""Extracts node_set from document's external_metadata."""
"""
Extracts node_set from document's external_metadata.
Parses the external_metadata of the given document and updates the document's
belongs_to_set attribute with NodeSet objects generated from the node_set found in the
external_metadata. If the external_metadata is not valid JSON, is not a dictionary, does
not contain the 'node_set' key, or if node_set is not a list, the function has no effect
and will return early.
Parameters:
-----------
- document: The document object which contains external_metadata from which the
node_set will be extracted.
"""
try:
external_metadata = json.loads(document.external_metadata)
except json.JSONDecodeError:
@ -76,11 +90,26 @@ def update_node_set(document):
async def classify_documents(data_documents: list[Data]) -> list[Document]:
"""
Classifies a list of data items into specific document types based on their file
extensions.

This function processes each item in the provided list of data documents, retrieves the
relevant metadata via `get_metadata`, and creates instances of the document classes
mapped to their extensions. It ensures that the data items are valid before performing
the classification and invokes `update_node_set` to extract and set relevant node
information from each document's external metadata.

Parameters:
-----------
- data_documents (list[Data]): A list of Data objects representing the documents to
be classified.

Returns:
--------
- list[Document]: A list of Document objects created based on the classified data
documents.
"""
documents = []
for data_item in data_documents:


@ -33,6 +33,25 @@ logger = get_logger("task:infer_data_ontology")
async def extract_ontology(content: str, response_model: Type[BaseModel]):
"""
Extracts structured ontology from the provided content using a pre-defined LLM client.
This asynchronous function retrieves a system prompt from a file and utilizes an LLM
client to create a structured output based on the input content and specified response
model.
Parameters:
-----------
- content (str): The content from which to extract the ontology.
- response_model (Type[BaseModel]): The model that defines the structure of the
output ontology.
Returns:
--------
The structured ontology extracted from the content.
"""
llm_client = get_llm_client()
system_prompt = read_query_prompt("extract_ontology.txt")
@ -43,10 +62,38 @@ async def extract_ontology(content: str, response_model: Type[BaseModel]):
class OntologyEngine:
"""
Manage ontology data and operations for graph structures, providing methods for data
loading, flattening models, and adding ontological relationships to a graph database.
Public methods:
- flatten_model
- recursive_flatten
- load_data
- add_graph_ontology
"""
async def flatten_model(
self, model: NodeModel, parent_id: Optional[str] = None
) -> Dict[str, Any]:
"""Flatten the model to a dictionary."""
"""
Flatten the model to a dictionary including optional parent ID and relationship details
if available.
Parameters:
-----------
- model (NodeModel): The NodeModel instance to flatten.
- parent_id (Optional[str]): An optional ID of the parent node for hierarchical
purposes. (default None)
Returns:
--------
- Dict[str, Any]: A dictionary representation of the model with flattened
attributes.
"""
result = model.dict()
result["parent_id"] = parent_id
if model.default_relationship:
@ -62,7 +109,23 @@ class OntologyEngine:
async def recursive_flatten(
self, items: Union[List[Dict[str, Any]], Dict[str, Any]], parent_id: Optional[str] = None
) -> List[Dict[str, Any]]:
"""Recursively flatten the items."""
"""
Recursively flatten a hierarchical structure of models into a flat list of dictionaries.
Parameters:
-----------
- items (Union[List[Dict[str, Any]], Dict[str, Any]]): A list or dictionary
containing models to flatten.
- parent_id (Optional[str]): An optional ID of the parent node to maintain hierarchy
during flattening. (default None)
Returns:
--------
- List[Dict[str, Any]]: A flat list of dictionaries representing the hierarchical
model structure.
"""
flat_list = []
if isinstance(items, list):
@ -76,7 +139,20 @@ class OntologyEngine:
return flat_list
async def load_data(self, file_path: str) -> Union[List[Dict[str, Any]], Dict[str, Any]]:
"""Load data from a JSON or CSV file."""
"""
Load data from a specified JSON or CSV file and return it in a structured format.
Parameters:
-----------
- file_path (str): The path to the file to load data from.
Returns:
--------
- Union[List[Dict[str, Any]], Dict[str, Any]]: Parsed data from the file as either a
list of dictionaries or a single dictionary depending on content type.
"""
try:
if file_path.endswith(".json"):
async with aiofiles.open(file_path, mode="r") as f:
@ -96,7 +172,18 @@ class OntologyEngine:
)
async def add_graph_ontology(self, file_path: str = None, documents: list = None):
"""Add graph ontology from a JSON or CSV file or infer from documents content."""
"""
Add graph ontology from a JSON or CSV file, or infer relationships from provided
document content. Raise exceptions for invalid file types or missing entities.
Parameters:
-----------
- file_path (str): Optional path to a file containing data to be loaded. (default
None)
- documents (list): Optional list of document objects for content extraction if no
file path is provided. (default None)
"""
if file_path is None:
initial_chunks_and_ids = []
@ -202,6 +289,17 @@ class OntologyEngine:
async def infer_data_ontology(documents, ontology_model=KnowledgeGraph, root_node_id=None):
"""
Infer data ontology from provided documents and optionally add it to a graph.
Parameters:
-----------
- documents: The documents from which to infer the ontology.
- ontology_model: The ontology model to use for the inference, defaults to
KnowledgeGraph. (default KnowledgeGraph)
- root_node_id: An optional root node identifier for the ontology. (default None)
"""
if ontology_model == KnowledgeGraph:
ontology_engine = OntologyEngine()
root_node_id = await ontology_engine.add_graph_ontology(documents=documents)


@ -3,12 +3,40 @@ from pydantic import BaseModel, Field
class RelationshipModel(BaseModel):
"""
Represents a relationship between two entities in a model.
This class holds the type of the relationship and the identifiers for the source and
target entities. It includes the following public instance variables:
- type: A string indicating the type of relationship.
- source: A string representing the source entity of the relationship.
- target: A string representing the target entity of the relationship.
"""
type: str
source: str
target: str
class NodeModel(BaseModel):
"""
Represents a node in a hierarchical model structure with relationships to other nodes.
Public methods:
- __init__(self, node_id: str, name: str, default_relationship:
Optional[RelationshipModel] = None, children: List[Union[Dict[str, Any], NodeModel]] =
Field(default_factory=list))
Instance variables:
- node_id: Unique identifier for the node.
- name: Name of the node.
- default_relationship: Default relationship associated with the node, if any.
- children: List of child nodes or dictionaries representing children for this node.
"""
node_id: str
name: str
default_relationship: Optional[RelationshipModel] = None
@ -19,12 +47,28 @@ NodeModel.model_rebuild()
class OntologyNode(BaseModel):
"""
Represents a node in an ontology with a unique identifier, name, and description.
"""
id: str = Field(..., description="Unique identifier made from node name.")
name: str
description: str
class OntologyEdge(BaseModel):
"""
Represent an edge in an ontology, connecting a source and target with a specific
relationship type.
The class includes the following instance variables:
- id: A unique identifier for the edge.
- source_id: The identifier of the source node.
- target_id: The identifier of the target node.
- relationship_type: The type of relationship represented by this edge, defining how the
source and target are related.
"""
id: str
source_id: str
target_id: str
@ -32,5 +76,14 @@ class OntologyEdge(BaseModel):
class GraphOntology(BaseModel):
"""
Represents a graph-based structure of ontology consisting of nodes and edges.
The GraphOntology class contains a collection of OntologyNode instances representing the
nodes of the graph and OntologyEdge instances representing the relationships between
them. Public methods include the management of nodes and edges as well as any relevant
graph operations. Instance variables include a list of nodes and a list of edges.
"""
nodes: list[OntologyNode]
edges: list[OntologyEdge]
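
Since the field sets of these models are visible in the hunks above, a small construction sketch follows; the values are illustrative only and the field names are taken from the class definitions and their docstrings.

```python
# Illustrative values only.
ml = OntologyNode(id="machine_learning", name="Machine Learning",
                  description="Study of algorithms that learn from data.")
dl = OntologyNode(id="deep_learning", name="Deep Learning",
                  description="Machine learning based on deep neural networks.")

edge = OntologyEdge(
    id="deep_learning->machine_learning",
    source_id=dl.id,
    target_id=ml.id,
    relationship_type="is_subfield_of",
)

ontology = GraphOntology(nodes=[ml, dl], edges=[edge])
```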


@ -10,11 +10,19 @@ from cognee.infrastructure.databases.relational import get_relational_config
@lru_cache
def get_dlt_destination() -> Union[type[dlt.destinations.sqlalchemy], None]:
"""
Handle the propagation of the cognee database configuration to the dlt library.

This function determines the appropriate sqlalchemy destination based on the database
provider specified in the relational configuration. It constructs the destination
credentials for either sqlite or postgres databases accordingly. If the database
provider is neither sqlite nor postgres, it returns None.

Returns:
--------
- Union[type[dlt.destinations.sqlalchemy], None]: The sqlalchemy destination used by
the dlt library, or None if the database provider is unsupported.
"""
relational_config = get_relational_config()
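
For reference, a hedged sketch of how a destination returned here might be wired into a dlt pipeline; the pipeline and dataset names are placeholders, not taken from this repository.

```python
import dlt

destination = get_dlt_destination()
if destination is not None:
    pipeline = dlt.pipeline(
        pipeline_name="cognee_ingest",   # placeholder name
        destination=destination,
        dataset_name="cognee_data",      # placeholder name
    )
```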


@ -5,6 +5,26 @@ from typing import Union
def get_data_from_llama_index(data_point: Union[Document, ImageDocument], dataset_name: str) -> str:
"""
Retrieve the file path based on the data point type.
Ensure the data point is an instance of either Document or ImageDocument. If the data
point already provides a file path in its metadata (or an image path), return it;
otherwise, save the data point's text to a file and return the newly created file path.
Parameters:
-----------
- data_point (Union[Document, ImageDocument]): An instance of Document or
ImageDocument to extract data from.
- dataset_name (str): The name of the dataset associated with the data point.
Returns:
--------
- str: The file path as a string where the data is stored or the existing path from
the data point.
"""
# Specific type checking is used to ensure it's not a child class from Document
if isinstance(data_point, Document) and type(data_point) is Document:
file_path = data_point.metadata.get("file_path")
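
A hypothetical call site; it assumes the llama-index package layout in which Document lives in llama_index.core (the module above imports Document and ImageDocument directly).

```python
from llama_index.core import Document  # assumed import path

doc = Document(text="Some text to ingest", metadata={"file_path": "/tmp/example.txt"})
path = get_data_from_llama_index(doc, dataset_name="my_dataset")
print(path)  # returns the existing path, since metadata already provides file_path
```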


@ -19,10 +19,34 @@ logger = get_logger()
class FileParser:
"""
Handles the parsing of files into source code and an abstract syntax tree
representation. Public methods include:
- parse_file: Parses a file and returns its source code and syntax tree representation.
"""
def __init__(self):
self.parsed_files = {}
async def parse_file(self, file_path: str) -> tuple[str, Tree]:
"""
Parse a file and return its source code along with its syntax tree representation.
If the file has already been parsed, retrieve the result from memory instead of reading
the file again.
Parameters:
-----------
- file_path (str): The path of the file to parse.
Returns:
--------
- tuple[str, Tree]: A tuple containing the source code of the file and its
corresponding syntax tree representation.
"""
PY_LANGUAGE = Language(tspython.language())
source_code_parser = Parser(PY_LANGUAGE)
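
A hypothetical usage sketch of the parser; the file path is a placeholder and the call must run inside an event loop.

```python
import asyncio

async def demo():
    parser = FileParser()
    source_code, tree = await parser.parse_file("example.py")  # placeholder path
    print(len(source_code), tree.root_node.type)  # tree_sitter Tree exposes root_node

asyncio.run(demo())
```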
@ -35,6 +59,24 @@ class FileParser:
async def get_source_code(file_path: str):
"""
Read source code from a file asynchronously.
This function attempts to open a file specified by the given file path, read its
contents, and return the source code. In case of any errors during the file reading
process, it logs an error message and returns None.
Parameters:
-----------
- file_path (str): The path to the file from which to read the source code.
Returns:
--------
Returns the contents of the file as a string if successful, or None if an error
occurs.
"""
try:
async with aiofiles.open(file_path, "r", encoding="utf-8") as f:
source_code = await f.read()
@ -45,7 +87,22 @@ async def get_source_code(file_path: str):
def resolve_module_path(module_name):
"""Find the file path of a module."""
"""
Find the file path of a module.
Return the file path of the specified module if found, or return None if the module does
not exist or cannot be located.
Parameters:
-----------
- module_name: The name of the module whose file path is to be resolved.
Returns:
--------
The file path of the module as a string or None if the module is not found.
"""
try:
spec = importlib.util.find_spec(module_name)
if spec and spec.origin:
@ -58,7 +115,23 @@ def resolve_module_path(module_name):
def find_function_location(
module_path: str, function_name: str, parser: FileParser
) -> Optional[tuple[str, str]]:
"""Find the function definition in the module."""
"""
Find the location of a function definition in a specified module.
Parameters:
-----------
- module_path (str): The path to the module where the function is defined.
- function_name (str): The name of the function whose location is to be found.
- parser (FileParser): An instance of FileParser used to parse the module's source
code.
Returns:
--------
- Optional[tuple[str, str]]: Returns a tuple containing the module path and the
start point of the function if found; otherwise, returns None.
"""
if not module_path or not os.path.exists(module_path):
return None
@ -78,6 +151,24 @@ def find_function_location(
async def get_local_script_dependencies(
repo_path: str, script_path: str, detailed_extraction: bool = False
) -> CodeFile:
"""
Retrieve local script dependencies and create a CodeFile object.
Parameters:
-----------
- repo_path (str): The path to the repository that contains the script.
- script_path (str): The path of the script for which dependencies are being
extracted.
- detailed_extraction (bool): A flag indicating whether to perform a detailed
extraction of code components.
Returns:
--------
- CodeFile: Returns a CodeFile object containing information about the script,
including its dependencies and definitions.
"""
code_file_parser = FileParser()
source_code, source_code_tree = await code_file_parser.parse_file(script_path)
@ -113,6 +204,24 @@ async def get_local_script_dependencies(
def find_node(nodes: list[Node], condition: callable) -> Node:
"""
Find and return the first node that satisfies the given condition.
Iterate through the provided list of nodes and return the first node for which the
condition callable returns True. If no such node is found, return None.
Parameters:
-----------
- nodes (list[Node]): A list of Node objects to search through.
- condition (callable): A callable that takes a Node and returns a boolean
indicating if the node meets specified criteria.
Returns:
--------
- Node: The first Node that matches the condition, or None if no such node exists.
"""
for node in nodes:
if condition(node):
return node
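
Continuing the parse_file sketch above, a hedged example of find_node: it assumes `tree` is the tree_sitter Tree returned there, and the condition is any callable returning a bool.

```python
# Locate the first function definition in the parsed tree (assumes `tree` exists).
function_node = find_node(
    tree.root_node.children,
    lambda node: node.type == "function_definition",
)
```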
@ -123,6 +232,30 @@ def find_node(nodes: list[Node], condition: callable) -> Node:
async def extract_code_parts(
tree_root: Node, script_path: str, existing_nodes: list[DataPoint] = {}
) -> AsyncGenerator[DataPoint, None]:
"""
Extract code parts from a given AST node tree asynchronously.
Iteratively yields DataPoint nodes representing import statements, function definitions,
and class definitions found in the children of the specified tree root. The function
checks if nodes are already present in the existing_nodes dictionary to prevent
duplicates. This function has to be used in an asynchronous context, and it requires a
valid tree_root and proper initialization of existing_nodes.
Parameters:
-----------
- tree_root (Node): The root node of the AST tree containing code parts to extract.
- script_path (str): The file path of the script from which the AST was generated.
- existing_nodes (list[DataPoint]): A dictionary that holds already extracted
DataPoint nodes to avoid duplicates. (default {})
Returns:
--------
Yields DataPoint nodes representing imported modules, functions, and classes.
"""
for child_node in tree_root.children:
if child_node.type == "import_statement" or child_node.type == "import_from_statement":
parts = child_node.text.decode("utf-8").split()


@ -2,7 +2,24 @@ import os
async def get_non_py_files(repo_path):
"""Get files that are not .py files and their contents"""
"""
Get files that are not .py files and their contents.
Check if the specified repository path exists and if so, traverse the directory,
collecting the paths of files that do not have a .py extension and meet the
criteria set in the allowed and ignored patterns. Return a list of paths to
those files.
Parameters:
-----------
- repo_path: The file system path to the repository to scan for non-Python files.
Returns:
--------
A list of file paths that are not Python files and meet the specified criteria.
"""
if not os.path.exists(repo_path):
return {}
@ -111,6 +128,22 @@ async def get_non_py_files(repo_path):
}
def should_process(path):
"""
Determine if a file should be processed based on its extension and path patterns.
This function checks if the file extension is in the allowed list and ensures that none
of the ignored patterns are present in the provided file path.
Parameters:
-----------
- path: The file path to check for processing eligibility.
Returns:
--------
Returns True if the file should be processed; otherwise, False.
"""
_, ext = os.path.splitext(path)
return ext in ALLOWED_EXTENSIONS and not any(
pattern in path for pattern in IGNORED_PATTERNS
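
A self-contained sketch of the filtering rule described above, with illustrative values standing in for the module's real ALLOWED_EXTENSIONS and IGNORED_PATTERNS.

```python
import os

ALLOWED_EXTENSIONS = {".md", ".toml", ".yaml"}   # illustrative subset
IGNORED_PATTERNS = {".venv/", "node_modules/"}   # illustrative subset

def should_process_sketch(path: str) -> bool:
    _, ext = os.path.splitext(path)
    return ext in ALLOWED_EXTENSIONS and not any(p in path for p in IGNORED_PATTERNS)

assert should_process_sketch("docs/readme.md") is True
assert should_process_sketch(".venv/settings.toml") is False
```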


@ -11,7 +11,24 @@ from cognee.shared.CodeGraphEntities import CodeFile, Repository
async def get_source_code_files(repo_path):
"""Get .py files and their source code"""
"""
Retrieve Python source code files from the specified repository path.
This function scans the given repository path for files that have the .py extension
while excluding test files and files within a virtual environment. It returns a list of
absolute paths to the source code files that are not empty.
Parameters:
-----------
- repo_path: The file path to the repository to search for Python source files.
Returns:
--------
A list of absolute paths to .py files that contain source code, excluding empty
files, test files, and files from a virtual environment.
"""
if not os.path.exists(repo_path):
return {}
@ -40,6 +57,26 @@ async def get_source_code_files(repo_path):
def run_coroutine(coroutine_func, *args, **kwargs):
"""
Run a coroutine function until it completes.
This function creates a new asyncio event loop, sets it as the current loop, and
executes the given coroutine function with the provided arguments. Once the coroutine
completes, the loop is closed. Intended for use in environments where an existing event
loop is not available or desirable.
Parameters:
-----------
- coroutine_func: The coroutine function to be run.
- *args: Positional arguments to pass to the coroutine function.
- **kwargs: Keyword arguments to pass to the coroutine function.
Returns:
--------
The result returned by the coroutine after completion.
"""
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
result = loop.run_until_complete(coroutine_func(*args, **kwargs))
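
A short usage sketch: drive an async helper from synchronous code (for example inside a process-pool worker) where no event loop is already running.

```python
async def add(a: int, b: int) -> int:
    return a + b

assert run_coroutine(add, 2, 3) == 5
```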
@ -50,7 +87,21 @@ def run_coroutine(coroutine_func, *args, **kwargs):
async def get_repo_file_dependencies(
repo_path: str, detailed_extraction: bool = False
) -> AsyncGenerator[DataPoint, None]:
"""Generate a dependency graph for Python files in the given repository path."""
"""
Generate a dependency graph for Python files in the given repository path.
Check the validity of the repository path and yield a repository object followed by the
dependencies of Python files within that repository. Raise a FileNotFoundError if the
provided path does not exist. The extraction of detailed dependencies can be controlled
via the `detailed_extraction` argument.
Parameters:
-----------
- repo_path (str): The file path to the repository where Python files are located.
- detailed_extraction (bool): A flag indicating whether to perform a detailed
extraction of dependencies. (default False)
"""
if not os.path.exists(repo_path):
raise FileNotFoundError(f"Repository path {repo_path} does not exist.")
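
A hypothetical consumption of the async generator described above; the repository path is a placeholder.

```python
import asyncio

async def walk_repo():
    async for data_point in get_repo_file_dependencies(
        "/path/to/repo", detailed_extraction=True
    ):
        print(type(data_point).__name__)  # a Repository object first, then CodeFiles

asyncio.run(walk_repo())
```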


@ -2,6 +2,19 @@ from cognee.shared.data_models import SummarizedCode, SummarizedClass, Summarize
def get_mock_summarized_code() -> SummarizedCode:
"""
Return a summarized representation of mock code.
This function constructs and returns a `SummarizedCode` object that includes various
components such as file name, high-level summary, key features, imports, constants,
classes, and functions, all described with placeholders for mock data.
Returns:
--------
- SummarizedCode: A `SummarizedCode` object containing mock data for file summary,
features, imports, constants, classes, and functions.
"""
return SummarizedCode(
file_name="mock_file.py",
high_level_summary="This is a mock high-level summary.",


@ -6,6 +6,15 @@ from cognee.shared.CodeGraphEntities import CodeFile, CodePart
class TextSummary(DataPoint):
"""
Represent a text summary derived from a document chunk.
This class encapsulates a text summary as well as its associated metadata. The public
instance variables include 'text' for the summary content and 'made_from' which
indicates the source document chunk. The 'metadata' instance variable contains
additional information such as indexed fields.
"""
text: str
made_from: DocumentChunk
@ -13,6 +22,15 @@ class TextSummary(DataPoint):
class CodeSummary(DataPoint):
"""
Summarizes code and its components.
This class inherits from DataPoint and contains a text representation alongside the
summarized content, which can either be a full code file or a part of it. The metadata
dictionary defines index fields for the class's instances, particularly focusing on the
'text' attribute. Public attributes include 'text', 'summarizes', and 'metadata'.
"""
text: str
summarizes: Union[CodeFile, CodePart]


@ -11,6 +11,29 @@ from .models import TextSummary
async def summarize_text(
data_chunks: list[DocumentChunk], summarization_model: Type[BaseModel] = None
):
"""
Summarize the text contained in the provided data chunks.
If no summarization model is provided, the function retrieves the default model from the
configuration. It processes the data chunks asynchronously and returns summaries for
each chunk. If the provided list of data chunks is empty, it simply returns the list as
is.
Parameters:
-----------
- data_chunks (list[DocumentChunk]): A list of DocumentChunk objects containing text
to be summarized.
- summarization_model (Type[BaseModel]): An optional model used for summarizing
text. If not provided, the default is fetched from the configuration. (default
None)
Returns:
--------
A list of TextSummary objects, each containing the summary of a corresponding
DocumentChunk.
"""
if len(data_chunks) == 0:
return data_chunks


@ -3,6 +3,18 @@ from typing import Optional
class GraphitiNode(DataPoint):
"""
Represent a node in a graph with optional content, name, and summary attributes.
This class extends DataPoint and includes a metadata dictionary that specifies the index
fields for the node's data. The public instance variables are:
- content: an optional string representing the content of the node.
- name: an optional string representing the name of the node.
- summary: an optional string providing a summary of the node.
- metadata: a dictionary outlining the fields used for indexing.
"""
content: Optional[str] = None
name: Optional[str] = None
summary: Optional[str] = None
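
An illustrative construction sketch; it assumes the DataPoint base class supplies defaults for any fields not shown here (such as an auto-generated id).

```python
# Values are illustrative only; only the fields shown above are set explicitly.
node = GraphitiNode(
    content="Alan Turing proposed the imitation game in 1950.",
    name="Alan Turing",
    summary="Pioneer of theoretical computer science.",
)
```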