Docstring tasks. (#878)
## Description

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.
This commit is contained in:
parent ec68e99438
commit bb68d6a0df

18 changed files with 610 additions and 37 deletions
@@ -10,14 +10,23 @@ def chunk_by_paragraph(
    batch_paragraphs: bool = True,
) -> Iterator[Dict[str, Any]]:
    """
    Chunks text by paragraph while preserving exact text reconstruction capability.
    When chunks are joined with empty string "", they reproduce the original text exactly.
    Chunk the input text by paragraph while enabling exact text reconstruction.

    Notes:
    - Tokenization is handled using our tokenization adapters, ensuring compatibility with the vector engine's embedding model.
    - If `batch_paragraphs` is False, each paragraph will be yielded as a separate chunk.
    - Handles cases where paragraphs exceed the specified token or word limits by splitting them as needed.
    - Remaining text at the end of the input will be yielded as a final chunk.
    This function divides the given text data into smaller chunks based on the specified
    maximum chunk size. It ensures that when the generated chunks are concatenated, they
    reproduce the original text accurately. The tokenization process is handled by adapters
    compatible with the vector engine's embedding model, and the function can operate in
    either batch mode or paragraph mode, based on the `batch_paragraphs` flag.

    Parameters:
    -----------

    - data (str): The input text to be chunked.
    - max_chunk_size: The maximum allowed size for each chunk, in terms of tokens or
      words.
    - batch_paragraphs (bool): Flag indicating whether to yield each paragraph as a
      separate chunk. If set to False, individual paragraphs are yielded as they are
      processed. (default True)
    """
    current_chunk = ""
    chunk_index = 0
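A minimal usage sketch of the reconstruction property the new docstring promises. The import path `cognee.tasks.chunks` and the `text` key on each yielded dict are assumptions, not confirmed by this diff.

```python
# Sketch only: module path and the "text" key are assumptions.
from cognee.tasks.chunks import chunk_by_paragraph  # assumed module path

sample = "First paragraph.\n\nSecond paragraph with a bit more text.\n"

chunks = list(chunk_by_paragraph(sample, max_chunk_size=128, batch_paragraphs=True))

# The documented guarantee: joining chunk texts with "" reproduces the input exactly.
reconstructed = "".join(chunk["text"] for chunk in chunks)
assert reconstructed == sample
```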
@@ -5,6 +5,23 @@ from cognee.infrastructure.databases.vector.embeddings import get_embedding_engi


def get_word_size(word: str) -> int:
    """
    Calculate the size of a given word in terms of tokens.

    If an embedding engine's tokenizer is available, count the tokens for the provided word.
    If the tokenizer is not available, assume the word counts as one token.

    Parameters:
    -----------

    - word (str): The word for which the token size is to be calculated.

    Returns:
    --------

    - int: The number of tokens representing the word, typically an integer, depending
      on the tokenizer's output.
    """
    embedding_engine = get_embedding_engine()
    if embedding_engine.tokenizer:
        return embedding_engine.tokenizer.count_tokens(word)
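The fallback behaviour described here (count with the embedding engine's tokenizer when present, otherwise treat every word as one token) is a small guard. A self-contained sketch of that pattern, with a stand-in tokenizer protocol rather than cognee's embedding engine:

```python
from typing import Optional, Protocol


class Tokenizer(Protocol):
    def count_tokens(self, text: str) -> int: ...


def word_size(word: str, tokenizer: Optional[Tokenizer] = None) -> int:
    """Mirror of the documented behaviour: tokenizer count if available, else 1."""
    if tokenizer is not None:
        return tokenizer.count_tokens(word)
    return 1


print(word_size("reconstruction"))  # 1 when no tokenizer is configured
```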
@@ -16,12 +33,22 @@ def chunk_by_sentence(
    data: str, maximum_size: Optional[int] = None
) -> Iterator[Tuple[UUID, str, int, Optional[str]]]:
    """
    Splits the input text into sentences based on word-level processing, with optional sentence length constraints.
    Splits text into sentences while preserving word and paragraph boundaries.

    Notes:
    - Relies on the `chunk_by_word` function for word-level tokenization and classification.
    - Ensures sentences within paragraphs are uniquely identifiable using UUIDs.
    - Handles cases where the text ends mid-sentence by appending a special "sentence_cut" type.
    This function processes the input string, dividing it into sentences based on word-level
    tokenization. Each sentence is identified with a unique UUID, and it handles scenarios
    where the text may end mid-sentence by tagging it with a specific type. If a maximum
    sentence length is specified, the function ensures that sentences do not exceed this
    length, raising a ValueError if an individual word surpasses it. The function utilizes
    an external word processing function `chunk_by_word` to determine the structure of the
    text.

    Parameters:
    -----------

    - data (str): The input text to be split into sentences.
    - maximum_size (Optional[int]): An optional limit on the maximum size of sentences
      generated. (default None)
    """
    sentence = ""
    paragraph_id = uuid4()
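Given the return annotation shown above, iterating the generator yields `(paragraph_id, sentence, size, end_type)` tuples. A hedged usage sketch; the import path is assumed, and the meaning of the fields beyond the documented `"sentence_cut"` marker is inferred from the annotation:

```python
# Sketch only: module path assumed; tuple fields inferred from the annotated return type.
from cognee.tasks.chunks import chunk_by_sentence  # assumed module path

text = "One sentence here. Another one follows! And a trailing fragment"

for paragraph_id, sentence, size, end_type in chunk_by_sentence(text, maximum_size=64):
    # end_type is documented to carry "sentence_cut" when the input ends mid-sentence.
    print(paragraph_id, repr(sentence), size, end_type)
```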
@@ -8,15 +8,23 @@ PARAGRAPH_ENDINGS = r"[\n\r]"

def is_real_paragraph_end(last_char: str, current_pos: int, text: str) -> bool:
    """
    Determines if the current position represents a real paragraph ending.
    Determine if the current position represents a valid paragraph end.

    Args:
        last_char: The last processed character
        current_pos: Current position in the text
        text: The input text
    The function checks if the last character indicates a possible sentence ending, then
    verifies if the subsequent characters lead to a valid paragraph end based on specific
    conditions.

    Parameters:
    -----------

    - last_char (str): The last processed character
    - current_pos (int): Current position in the text
    - text (str): The input text

    Returns:
        bool: True if this is a real paragraph end, False otherwise
    --------

    - bool: True if this is a real paragraph end, False otherwise
    """
    if re.match(SENTENCE_ENDINGS, last_char):
        return True
@@ -38,9 +46,16 @@ def is_real_paragraph_end(last_char: str, current_pos: int, text: str) -> bool:

def chunk_by_word(data: str) -> Iterator[Tuple[str, str]]:
    """
    Chunks text into words and endings while preserving whitespace.
    Whitespace is included with the preceding word.
    Outputs can be joined with "" to recreate the original input.
    Chunk text into words and sentence endings, preserving whitespace.

    Whitespace is included with the preceding word. Outputs can be joined with "" to
    recreate the original input.

    Parameters:
    -----------

    - data (str): The input string of text to be chunked into words and sentence
      endings.
    """
    current_chunk = ""
    i = 0
@@ -6,6 +6,14 @@ from cognee.tasks.repo_processor.enrich_dependency_graph import enrich_dependenc


def main():
    """
    Execute the main logic of the dependency graph processor.

    This function sets up argument parsing to retrieve the repository path, checks the
    existence of the specified path, and processes the repository to produce a dependency
    graph. If the repository path does not exist, it logs an error message and terminates
    without further execution.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("repo_path", help="Path to the repository")
    args = parser.parse_args()
@@ -5,6 +5,14 @@ from cognee.tasks.repo_processor.get_repo_file_dependencies import get_repo_file


def main():
    """
    Parse the command line arguments and print the repository file dependencies.

    This function sets up an argument parser to retrieve the path of a repository. It checks
    if the provided path exists and if it doesn't, it prints an error message and exits. If
    the path is valid, it calls an asynchronous function to get the dependencies and prints
    the nodes and their relations in the dependency graph.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("repo_path", help="Path to the repository")
    args = parser.parse_args()
@@ -3,6 +3,21 @@ from fastapi import status


class NoRelevantDataError(CogneeApiError):
    """
    Represents an error when no relevant data is found during a search. This class is a
    subclass of CogneeApiError.

    Public methods:

    - __init__

    Instance variables:

    - message
    - name
    - status_code
    """

    def __init__(
        self,
        message: str = "Search did not find any data.",
@@ -52,7 +52,21 @@ EXTENSION_TO_DOCUMENT_CLASS = {


def update_node_set(document):
    """Extracts node_set from document's external_metadata."""
    """
    Extracts node_set from document's external_metadata.

    Parses the external_metadata of the given document and updates the document's
    belongs_to_set attribute with NodeSet objects generated from the node_set found in the
    external_metadata. If the external_metadata is not valid JSON, is not a dictionary, does
    not contain the 'node_set' key, or if node_set is not a list, the function has no effect
    and will return early.

    Parameters:
    -----------

    - document: The document object which contains external_metadata from which the
      node_set will be extracted.
    """
    try:
        external_metadata = json.loads(document.external_metadata)
    except json.JSONDecodeError:
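The early-return conditions listed in this docstring amount to a chain of validation guards around `json.loads`. A self-contained sketch of that guard pattern; `NodeSetStub` is a placeholder, so this is not the actual cognee implementation:

```python
import json
from dataclasses import dataclass


@dataclass
class NodeSetStub:
    """Placeholder for cognee's NodeSet; illustration only."""
    name: str


def extract_node_sets(external_metadata: str) -> list[NodeSetStub]:
    """Return NodeSet stubs, or an empty list for any of the documented early-return cases."""
    try:
        metadata = json.loads(external_metadata)
    except json.JSONDecodeError:
        return []
    if not isinstance(metadata, dict):
        return []
    node_set = metadata.get("node_set")
    if not isinstance(node_set, list):
        return []
    return [NodeSetStub(name=str(item)) for item in node_set]


print(extract_node_sets('{"node_set": ["finance", "reports"]}'))
```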
@@ -76,11 +90,26 @@ def update_node_set(document):

async def classify_documents(data_documents: list[Data]) -> list[Document]:
    """
    Classifies a list of data items into specific document types based on file extensions.
    Classifies a list of data items into specific document types based on their file
    extensions.

    Notes:
    - The function relies on `get_metadata` to retrieve metadata information for each data item.
    - Ensure the `Data` objects and their attributes (e.g., `extension`, `id`) are valid before calling this function.
    This function processes each item in the provided list of data documents, retrieves
    relevant metadata, and creates instances of document classes mapped to their extensions.
    It ensures that the data items are valid before performing the classification and
    invokes `update_node_set` to extract and set relevant node information from the
    document's external metadata.

    Parameters:
    -----------

    - data_documents (list[Data]): A list of Data objects representing the documents to
      be classified.

    Returns:
    --------

    - list[Document]: A list of Document objects created based on the classified data
      documents.
    """
    documents = []
    for data_item in data_documents:
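The core of the classification step is a lookup from file extension to a document class (the hunk header references an `EXTENSION_TO_DOCUMENT_CLASS` mapping). A simplified, self-contained sketch of that dispatch, with stand-in classes instead of cognee's Document hierarchy:

```python
from dataclasses import dataclass


@dataclass
class PdfDocumentStub:
    name: str


@dataclass
class TextDocumentStub:
    name: str


# Stand-in for the EXTENSION_TO_DOCUMENT_CLASS mapping referenced in the hunk header.
EXTENSION_TO_CLASS = {"pdf": PdfDocumentStub, "txt": TextDocumentStub}


def classify(name: str, extension: str):
    """Return an instance of the document class registered for the extension."""
    document_class = EXTENSION_TO_CLASS.get(extension)
    if document_class is None:
        raise ValueError(f"Unsupported extension: {extension}")
    return document_class(name=name)


print(classify("report", "pdf"))
```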
@@ -33,6 +33,25 @@ logger = get_logger("task:infer_data_ontology")


async def extract_ontology(content: str, response_model: Type[BaseModel]):
    """
    Extracts structured ontology from the provided content using a pre-defined LLM client.

    This asynchronous function retrieves a system prompt from a file and utilizes an LLM
    client to create a structured output based on the input content and specified response
    model.

    Parameters:
    -----------

    - content (str): The content from which to extract the ontology.
    - response_model (Type[BaseModel]): The model that defines the structure of the
      output ontology.

    Returns:
    --------

    The structured ontology extracted from the content.
    """
    llm_client = get_llm_client()

    system_prompt = read_query_prompt("extract_ontology.txt")
@@ -43,10 +62,38 @@ async def extract_ontology(content: str, response_model: Type[BaseModel]):


class OntologyEngine:
    """
    Manage ontology data and operations for graph structures, providing methods for data
    loading, flattening models, and adding ontological relationships to a graph database.

    Public methods:

    - flatten_model
    - recursive_flatten
    - load_data
    - add_graph_ontology
    """

    async def flatten_model(
        self, model: NodeModel, parent_id: Optional[str] = None
    ) -> Dict[str, Any]:
        """Flatten the model to a dictionary."""
        """
        Flatten the model to a dictionary including optional parent ID and relationship details
        if available.

        Parameters:
        -----------

        - model (NodeModel): The NodeModel instance to flatten.
        - parent_id (Optional[str]): An optional ID of the parent node for hierarchical
          purposes. (default None)

        Returns:
        --------

        - Dict[str, Any]: A dictionary representation of the model with flattened
          attributes.
        """
        result = model.dict()
        result["parent_id"] = parent_id
        if model.default_relationship:
@@ -62,7 +109,23 @@ class OntologyEngine:
    async def recursive_flatten(
        self, items: Union[List[Dict[str, Any]], Dict[str, Any]], parent_id: Optional[str] = None
    ) -> List[Dict[str, Any]]:
        """Recursively flatten the items."""
        """
        Recursively flatten a hierarchical structure of models into a flat list of dictionaries.

        Parameters:
        -----------

        - items (Union[List[Dict[str, Any]], Dict[str, Any]]): A list or dictionary
          containing models to flatten.
        - parent_id (Optional[str]): An optional ID of the parent node to maintain hierarchy
          during flattening. (default None)

        Returns:
        --------

        - List[Dict[str, Any]]: A flat list of dictionaries representing the hierarchical
          model structure.
        """
        flat_list = []

        if isinstance(items, list):
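The flatten/recursive_flatten pair described above follows a standard depth-first flattening of a parent/children tree into rows that carry a `parent_id`. A self-contained sketch of the pattern, using plain dicts instead of `NodeModel` and a synchronous function for brevity:

```python
from typing import Any, Dict, List, Optional, Union


def recursive_flatten(
    items: Union[List[Dict[str, Any]], Dict[str, Any]],
    parent_id: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Depth-first flatten of {node_id, name, children} dicts into rows with parent_id."""
    flat: List[Dict[str, Any]] = []
    if isinstance(items, list):
        for item in items:
            flat.extend(recursive_flatten(item, parent_id))
        return flat
    flat.append({"node_id": items["node_id"], "name": items["name"], "parent_id": parent_id})
    for child in items.get("children", []):
        flat.extend(recursive_flatten(child, items["node_id"]))
    return flat


tree = {"node_id": "root", "name": "Root", "children": [{"node_id": "a", "name": "A", "children": []}]}
print(recursive_flatten(tree))
```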
@@ -76,7 +139,20 @@ class OntologyEngine:
        return flat_list

    async def load_data(self, file_path: str) -> Union[List[Dict[str, Any]], Dict[str, Any]]:
        """Load data from a JSON or CSV file."""
        """
        Load data from a specified JSON or CSV file and return it in a structured format.

        Parameters:
        -----------

        - file_path (str): The path to the file to load data from.

        Returns:
        --------

        - Union[List[Dict[str, Any]], Dict[str, Any]]: Parsed data from the file as either a
          list of dictionaries or a single dictionary depending on content type.
        """
        try:
            if file_path.endswith(".json"):
                async with aiofiles.open(file_path, mode="r") as f:
@@ -96,7 +172,18 @@ class OntologyEngine:
            )

    async def add_graph_ontology(self, file_path: str = None, documents: list = None):
        """Add graph ontology from a JSON or CSV file or infer from documents content."""
        """
        Add graph ontology from a JSON or CSV file, or infer relationships from provided
        document content. Raise exceptions for invalid file types or missing entities.

        Parameters:
        -----------

        - file_path (str): Optional path to a file containing data to be loaded. (default
          None)
        - documents (list): Optional list of document objects for content extraction if no
          file path is provided. (default None)
        """
        if file_path is None:
            initial_chunks_and_ids = []
@@ -202,6 +289,17 @@ class OntologyEngine:


async def infer_data_ontology(documents, ontology_model=KnowledgeGraph, root_node_id=None):
    """
    Infer data ontology from provided documents and optionally add it to a graph.

    Parameters:
    -----------

    - documents: The documents from which to infer the ontology.
    - ontology_model: The ontology model to use for the inference, defaults to
      KnowledgeGraph. (default KnowledgeGraph)
    - root_node_id: An optional root node identifier for the ontology. (default None)
    """
    if ontology_model == KnowledgeGraph:
        ontology_engine = OntologyEngine()
        root_node_id = await ontology_engine.add_graph_ontology(documents=documents)
@@ -3,12 +3,40 @@ from pydantic import BaseModel, Field


class RelationshipModel(BaseModel):
    """
    Represents a relationship between two entities in a model.

    This class holds the type of the relationship and the identifiers for the source and
    target entities. It includes the following public instance variables:

    - type: A string indicating the type of relationship.
    - source: A string representing the source entity of the relationship.
    - target: A string representing the target entity of the relationship.
    """

    type: str
    source: str
    target: str


class NodeModel(BaseModel):
    """
    Represents a node in a hierarchical model structure with relationships to other nodes.

    Public methods:

    - __init__(self, node_id: str, name: str, default_relationship:
      Optional[RelationshipModel] = None, children: List[Union[Dict[str, Any], NodeModel]] =
      Field(default_factory=list))

    Instance variables:

    - node_id: Unique identifier for the node.
    - name: Name of the node.
    - default_relationship: Default relationship associated with the node, if any.
    - children: List of child nodes or dictionaries representing children for this node.
    """

    node_id: str
    name: str
    default_relationship: Optional[RelationshipModel] = None
@@ -19,12 +47,28 @@ NodeModel.model_rebuild()


class OntologyNode(BaseModel):
    """
    Represents a node in an ontology with a unique identifier, name, and description.
    """

    id: str = Field(..., description="Unique identifier made from node name.")
    name: str
    description: str


class OntologyEdge(BaseModel):
    """
    Represent an edge in an ontology, connecting a source and target with a specific
    relationship type.

    The class includes the following instance variables:
    - id: A unique identifier for the edge.
    - source_id: The identifier of the source node.
    - target_id: The identifier of the target node.
    - relationship_type: The type of relationship represented by this edge, defining how the
      source and target are related.
    """

    id: str
    source_id: str
    target_id: str
@@ -32,5 +76,14 @@ class OntologyEdge(BaseModel):


class GraphOntology(BaseModel):
    """
    Represents a graph-based structure of ontology consisting of nodes and edges.

    The GraphOntology class contains a collection of OntologyNode instances representing the
    nodes of the graph and OntologyEdge instances representing the relationships between
    them. Public methods include the management of nodes and edges as well as any relevant
    graph operations. Instance variables include a list of nodes and a list of edges.
    """

    nodes: list[OntologyNode]
    edges: list[OntologyEdge]
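A small construction sketch for the node/edge/graph models above. The models are re-declared locally so the snippet is self-contained; the `relationship_type` field on the edge is taken from its docstring, since the diff cuts off before showing that declaration, so treat it as an assumption.

```python
from pydantic import BaseModel, Field


class OntologyNode(BaseModel):
    id: str = Field(..., description="Unique identifier made from node name.")
    name: str
    description: str


class OntologyEdge(BaseModel):
    id: str
    source_id: str
    target_id: str
    relationship_type: str  # assumed from the docstring; the hunk truncates before this field


class GraphOntology(BaseModel):
    nodes: list[OntologyNode]
    edges: list[OntologyEdge]


ontology = GraphOntology(
    nodes=[
        OntologyNode(id="city_paris", name="Paris", description="Capital of France."),
        OntologyNode(id="country_france", name="France", description="A country in Europe."),
    ],
    edges=[
        OntologyEdge(
            id="city_paris_country_france",
            source_id="city_paris",
            target_id="country_france",
            relationship_type="is_capital_of",
        )
    ],
)
print(ontology.model_dump())
```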
@@ -10,11 +10,19 @@ from cognee.infrastructure.databases.relational import get_relational_config
@lru_cache
def get_dlt_destination() -> Union[type[dlt.destinations.sqlalchemy], None]:
    """
    Handles propagation of the cognee database configuration to the dlt library
    Handle the propagation of the cognee database configuration to the dlt library.

    This function determines the appropriate sqlalchemy destination based on the database
    provider specified in the relational configuration. It constructs the destination
    credentials for either sqlite or postgres databases accordingly. If the database
    provider is neither sqlite nor postgres, it returns None.

    Returns:
        sqlachemy: sqlachemy destination used by the dlt library
    --------

    - Union[type[dlt.destinations.sqlalchemy], None]: An instance of sqlalchemy
      destination used by the dlt library, or None if the database provider is
      unsupported.
    """
    relational_config = get_relational_config()
@@ -5,6 +5,26 @@ from typing import Union


def get_data_from_llama_index(data_point: Union[Document, ImageDocument], dataset_name: str) -> str:
    """
    Retrieve the file path based on the data point type.

    Ensure the data point is an instance of either Document or ImageDocument. If the data
    point has a metadata or image path file path, return it; otherwise, save the data
    point's text to a file and return the newly created file path.

    Parameters:
    -----------

    - data_point (Union[Document, ImageDocument]): An instance of Document or
      ImageDocument to extract data from.
    - dataset_name (str): The name of the dataset associated with the data point.

    Returns:
    --------

    - str: The file path as a string where the data is stored or the existing path from
      the data point.
    """
    # Specific type checking is used to ensure it's not a child class from Document
    if isinstance(data_point, Document) and type(data_point) is Document:
        file_path = data_point.metadata.get("file_path")
@@ -19,10 +19,34 @@ logger = get_logger()


class FileParser:
    """
    Handles the parsing of files into source code and an abstract syntax tree
    representation. Public methods include:

    - parse_file: Parses a file and returns its source code and syntax tree representation.
    """

    def __init__(self):
        self.parsed_files = {}

    async def parse_file(self, file_path: str) -> tuple[str, Tree]:
        """
        Parse a file and return its source code along with its syntax tree representation.

        If the file has already been parsed, retrieve the result from memory instead of reading
        the file again.

        Parameters:
        -----------

        - file_path (str): The path of the file to parse.

        Returns:
        --------

        - tuple[str, Tree]: A tuple containing the source code of the file and its
          corresponding syntax tree representation.
        """
        PY_LANGUAGE = Language(tspython.language())
        source_code_parser = Parser(PY_LANGUAGE)
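The `parse_file` hunk shows the tree-sitter calls used (`Language(tspython.language())`, `Parser(PY_LANGUAGE)`). A stripped-down, self-contained sketch of the same parse-and-memoize idea, requiring the `tree-sitter` and `tree-sitter-python` packages; it drops cognee's async I/O and class wrapper:

```python
# Sketch: mirrors the tree-sitter calls visible in the hunk above, synchronous for brevity.
import tree_sitter_python as tspython
from tree_sitter import Language, Parser, Tree

_parsed_files: dict[str, tuple[str, Tree]] = {}


def parse_python_file(file_path: str) -> tuple[str, Tree]:
    """Return (source_code, syntax_tree), reusing a cached result when available."""
    if file_path in _parsed_files:
        return _parsed_files[file_path]

    with open(file_path, "r", encoding="utf-8") as f:
        source_code = f.read()

    parser = Parser(Language(tspython.language()))
    tree = parser.parse(source_code.encode("utf-8"))
    _parsed_files[file_path] = (source_code, tree)
    return source_code, tree
```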
@@ -35,6 +59,24 @@ class FileParser:


async def get_source_code(file_path: str):
    """
    Read source code from a file asynchronously.

    This function attempts to open a file specified by the given file path, read its
    contents, and return the source code. In case of any errors during the file reading
    process, it logs an error message and returns None.

    Parameters:
    -----------

    - file_path (str): The path to the file from which to read the source code.

    Returns:
    --------

    Returns the contents of the file as a string if successful, or None if an error
    occurs.
    """
    try:
        async with aiofiles.open(file_path, "r", encoding="utf-8") as f:
            source_code = await f.read()
@@ -45,7 +87,22 @@ async def get_source_code(file_path: str):


def resolve_module_path(module_name):
    """Find the file path of a module."""
    """
    Find the file path of a module.

    Return the file path of the specified module if found, or return None if the module does
    not exist or cannot be located.

    Parameters:
    -----------

    - module_name: The name of the module whose file path is to be resolved.

    Returns:
    --------

    The file path of the module as a string or None if the module is not found.
    """
    try:
        spec = importlib.util.find_spec(module_name)
        if spec and spec.origin:
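The lookup described above rests on `importlib.util.find_spec`, as the visible body lines confirm. A self-contained sketch of the documented behaviour (path string when the spec has an origin, `None` otherwise); the exact error handling in cognee's version is not shown here, so the caught exceptions are an assumption:

```python
import importlib.util
from typing import Optional


def module_path(module_name: str) -> Optional[str]:
    """Return the file path of a module, or None if it cannot be located."""
    try:
        spec = importlib.util.find_spec(module_name)
    except (ImportError, ValueError):  # assumed guard; the original's except clause is not shown
        return None
    if spec and spec.origin:
        return spec.origin
    return None


print(module_path("json"))            # e.g. .../lib/python3.x/json/__init__.py
print(module_path("no_such_module"))  # None
```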
@@ -58,7 +115,23 @@ def resolve_module_path(module_name):
def find_function_location(
    module_path: str, function_name: str, parser: FileParser
) -> Optional[tuple[str, str]]:
    """Find the function definition in the module."""
    """
    Find the location of a function definition in a specified module.

    Parameters:
    -----------

    - module_path (str): The path to the module where the function is defined.
    - function_name (str): The name of the function whose location is to be found.
    - parser (FileParser): An instance of FileParser used to parse the module's source
      code.

    Returns:
    --------

    - Optional[tuple[str, str]]: Returns a tuple containing the module path and the
      start point of the function if found; otherwise, returns None.
    """
    if not module_path or not os.path.exists(module_path):
        return None
@@ -78,6 +151,24 @@ def find_function_location(
async def get_local_script_dependencies(
    repo_path: str, script_path: str, detailed_extraction: bool = False
) -> CodeFile:
    """
    Retrieve local script dependencies and create a CodeFile object.

    Parameters:
    -----------

    - repo_path (str): The path to the repository that contains the script.
    - script_path (str): The path of the script for which dependencies are being
      extracted.
    - detailed_extraction (bool): A flag indicating whether to perform a detailed
      extraction of code components.

    Returns:
    --------

    - CodeFile: Returns a CodeFile object containing information about the script,
      including its dependencies and definitions.
    """
    code_file_parser = FileParser()
    source_code, source_code_tree = await code_file_parser.parse_file(script_path)
@@ -113,6 +204,24 @@ async def get_local_script_dependencies(


def find_node(nodes: list[Node], condition: callable) -> Node:
    """
    Find and return the first node that satisfies the given condition.

    Iterate through the provided list of nodes and return the first node for which the
    condition callable returns True. If no such node is found, return None.

    Parameters:
    -----------

    - nodes (list[Node]): A list of Node objects to search through.
    - condition (callable): A callable that takes a Node and returns a boolean
      indicating if the node meets specified criteria.

    Returns:
    --------

    - Node: The first Node that matches the condition, or None if no such node exists.
    """
    for node in nodes:
        if condition(node):
            return node
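The first-match semantics documented for `find_node` are the classic linear scan; an equivalent, self-contained formulation using `next` with a default. `NodeStub` is a placeholder for the tree-sitter `Node` type used in the diff:

```python
from dataclasses import dataclass
from typing import Callable, Optional


@dataclass
class NodeStub:
    """Placeholder for the tree-sitter Node type; illustration only."""
    type: str
    name: str


def first_node(nodes: list[NodeStub], condition: Callable[[NodeStub], bool]) -> Optional[NodeStub]:
    """Return the first node satisfying the condition, or None."""
    return next((node for node in nodes if condition(node)), None)


nodes = [NodeStub("import_statement", "os"), NodeStub("function_definition", "main")]
print(first_node(nodes, lambda n: n.type == "function_definition"))
```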
@@ -123,6 +232,30 @@ def find_node(nodes: list[Node], condition: callable) -> Node:
async def extract_code_parts(
    tree_root: Node, script_path: str, existing_nodes: list[DataPoint] = {}
) -> AsyncGenerator[DataPoint, None]:
    """
    Extract code parts from a given AST node tree asynchronously.

    Iteratively yields DataPoint nodes representing import statements, function definitions,
    and class definitions found in the children of the specified tree root. The function checks
    if nodes are already present in the existing_nodes dictionary to prevent duplicates.
    This function has to be used in an asynchronous context, and it requires a valid tree_root
    and proper initialization of existing_nodes.

    Parameters:
    -----------

    - tree_root (Node): The root node of the AST tree containing code parts to extract.
    - script_path (str): The file path of the script from which the AST was generated.
    - existing_nodes (list[DataPoint]): A dictionary that holds already extracted
      DataPoint nodes to avoid duplicates. (default {})

    Returns:
    --------

    Yields DataPoint nodes representing imported modules, functions, and classes.
    """
    for child_node in tree_root.children:
        if child_node.type == "import_statement" or child_node.type == "import_from_statement":
            parts = child_node.text.decode("utf-8").split()
@@ -2,7 +2,24 @@ import os


async def get_non_py_files(repo_path):
    """Get files that are not .py files and their contents"""
    """
    Get files that are not .py files and their contents.

    Check if the specified repository path exists and if so, traverse the directory,
    collecting the paths of files that do not have a .py extension and meet the
    criteria set in the allowed and ignored patterns. Return a list of paths to
    those files.

    Parameters:
    -----------

    - repo_path: The file system path to the repository to scan for non-Python files.

    Returns:
    --------

    A list of file paths that are not Python files and meet the specified criteria.
    """
    if not os.path.exists(repo_path):
        return {}
@@ -111,6 +128,22 @@ async def get_non_py_files(repo_path):
    }

    def should_process(path):
        """
        Determine if a file should be processed based on its extension and path patterns.

        This function checks if the file extension is in the allowed list and ensures that none
        of the ignored patterns are present in the provided file path.

        Parameters:
        -----------

        - path: The file path to check for processing eligibility.

        Returns:
        --------

        Returns True if the file should be processed; otherwise, False.
        """
        _, ext = os.path.splitext(path)
        return ext in ALLOWED_EXTENSIONS and not any(
            pattern in path for pattern in IGNORED_PATTERNS
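The eligibility check reduces to an extension allow-list plus a substring deny-list. A self-contained sketch with made-up `ALLOWED_EXTENSIONS` and `IGNORED_PATTERNS` values; the real lists live elsewhere in this file and are not shown in the diff:

```python
import os

# Illustrative values only; the actual allow/deny lists are defined in the module, not in this hunk.
ALLOWED_EXTENSIONS = {".md", ".toml", ".yaml", ".json"}
IGNORED_PATTERNS = {"node_modules", ".venv", "__pycache__"}


def should_process(path: str) -> bool:
    """True when the extension is allowed and no ignored pattern appears in the path."""
    _, ext = os.path.splitext(path)
    return ext in ALLOWED_EXTENSIONS and not any(pattern in path for pattern in IGNORED_PATTERNS)


print(should_process("docs/setup.md"))            # True
print(should_process(".venv/lib/settings.json"))  # False
```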
@@ -11,7 +11,24 @@ from cognee.shared.CodeGraphEntities import CodeFile, Repository


async def get_source_code_files(repo_path):
    """Get .py files and their source code"""
    """
    Retrieve Python source code files from the specified repository path.

    This function scans the given repository path for files that have the .py extension
    while excluding test files and files within a virtual environment. It returns a list of
    absolute paths to the source code files that are not empty.

    Parameters:
    -----------

    - repo_path: The file path to the repository to search for Python source files.

    Returns:
    --------

    A list of absolute paths to .py files that contain source code, excluding empty
    files, test files, and files from a virtual environment.
    """
    if not os.path.exists(repo_path):
        return {}
@@ -40,6 +57,26 @@ async def get_source_code_files(repo_path):


def run_coroutine(coroutine_func, *args, **kwargs):
    """
    Run a coroutine function until it completes.

    This function creates a new asyncio event loop, sets it as the current loop, and
    executes the given coroutine function with the provided arguments. Once the coroutine
    completes, the loop is closed. Intended for use in environments where an existing event
    loop is not available or desirable.

    Parameters:
    -----------

    - coroutine_func: The coroutine function to be run.
    - *args: Positional arguments to pass to the coroutine function.
    - **kwargs: Keyword arguments to pass to the coroutine function.

    Returns:
    --------

    The result returned by the coroutine after completion.
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    result = loop.run_until_complete(coroutine_func(*args, **kwargs))
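A self-contained sketch of the loop lifecycle this docstring describes (create, set, run to completion, close); the `try/finally` here is an addition for safety, not necessarily how the original is written. In code that is free to own the event loop, `asyncio.run` achieves the same effect more simply.

```python
import asyncio


def run_coroutine(coroutine_func, *args, **kwargs):
    """Run a coroutine function on a fresh event loop and return its result."""
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        return loop.run_until_complete(coroutine_func(*args, **kwargs))
    finally:
        loop.close()


async def add(a: int, b: int) -> int:
    await asyncio.sleep(0)
    return a + b


print(run_coroutine(add, 1, 2))  # 3
```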
@@ -50,7 +87,21 @@ def run_coroutine(coroutine_func, *args, **kwargs):
async def get_repo_file_dependencies(
    repo_path: str, detailed_extraction: bool = False
) -> AsyncGenerator[DataPoint, None]:
    """Generate a dependency graph for Python files in the given repository path."""
    """
    Generate a dependency graph for Python files in the given repository path.

    Check the validity of the repository path and yield a repository object followed by the
    dependencies of Python files within that repository. Raise a FileNotFoundError if the
    provided path does not exist. The extraction of detailed dependencies can be controlled
    via the `detailed_extraction` argument.

    Parameters:
    -----------

    - repo_path (str): The file path to the repository where Python files are located.
    - detailed_extraction (bool): A flag indicating whether to perform a detailed
      extraction of dependencies (default is False). (default False)
    """

    if not os.path.exists(repo_path):
        raise FileNotFoundError(f"Repository path {repo_path} does not exist.")
@@ -2,6 +2,19 @@ from cognee.shared.data_models import SummarizedCode, SummarizedClass, Summarize


def get_mock_summarized_code() -> SummarizedCode:
    """
    Return a summarized representation of mock code.

    This function constructs and returns a `SummarizedCode` object that includes various
    components such as file name, high-level summary, key features, imports, constants,
    classes, and functions, all described with placeholders for mock data.

    Returns:
    --------

    - SummarizedCode: A `SummarizedCode` object containing mock data for file summary,
      features, imports, constants, classes, and functions.
    """
    return SummarizedCode(
        file_name="mock_file.py",
        high_level_summary="This is a mock high-level summary.",
@@ -6,6 +6,15 @@ from cognee.shared.CodeGraphEntities import CodeFile, CodePart


class TextSummary(DataPoint):
    """
    Represent a text summary derived from a document chunk.

    This class encapsulates a text summary as well as its associated metadata. The public
    instance variables include 'text' for the summary content and 'made_from' which
    indicates the source document chunk. The 'metadata' instance variable contains
    additional information such as indexed fields.
    """

    text: str
    made_from: DocumentChunk
@@ -13,6 +22,15 @@ class TextSummary(DataPoint):


class CodeSummary(DataPoint):
    """
    Summarizes code and its components.

    This class inherits from DataPoint and contains a text representation alongside the
    summarized content, which can either be a full code file or a part of it. The metadata
    dictionary defines index fields for the class's instances, particularly focusing on the
    'text' attribute. Public attributes include 'text', 'summarizes', and 'metadata'.
    """

    text: str
    summarizes: Union[CodeFile, CodePart]
@@ -11,6 +11,29 @@ from .models import TextSummary
async def summarize_text(
    data_chunks: list[DocumentChunk], summarization_model: Type[BaseModel] = None
):
    """
    Summarize the text contained in the provided data chunks.

    If no summarization model is provided, the function retrieves the default model from the
    configuration. It processes the data chunks asynchronously and returns summaries for
    each chunk. If the provided list of data chunks is empty, it simply returns the list as
    is.

    Parameters:
    -----------

    - data_chunks (list[DocumentChunk]): A list of DocumentChunk objects containing text
      to be summarized.
    - summarization_model (Type[BaseModel]): An optional model used for summarizing
      text. If not provided, the default is fetched from the configuration. (default
      None)

    Returns:
    --------

    A list of TextSummary objects, each containing the summary of a corresponding
    DocumentChunk.
    """
    if len(data_chunks) == 0:
        return data_chunks
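The shape of the described flow, returning an empty input untouched and otherwise fanning the chunks out to an async summarizer, can be sketched without cognee's LLM stack. The stub summarizer below stands in for the configured summarization model and is an assumption for illustration only:

```python
import asyncio
from dataclasses import dataclass


@dataclass
class ChunkStub:
    """Placeholder for DocumentChunk; illustration only."""
    text: str


async def summarize_chunk(chunk: ChunkStub) -> str:
    # Stand-in for the LLM call; here we just truncate the text.
    await asyncio.sleep(0)
    return chunk.text[:40]


async def summarize_all(chunks: list[ChunkStub]) -> list:
    if len(chunks) == 0:
        return chunks  # mirror the documented early return for empty input
    return list(await asyncio.gather(*(summarize_chunk(chunk) for chunk in chunks)))


print(asyncio.run(summarize_all([ChunkStub("A long passage of text to be summarized for the example.")])))
```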
@@ -3,6 +3,18 @@ from typing import Optional


class GraphitiNode(DataPoint):
    """
    Represent a node in a graph with optional content, name, and summary attributes.

    This class extends DataPoint and includes a metadata dictionary that specifies the index
    fields for the node's data. The public instance variables are:

    - content: an optional string representing the content of the node.
    - name: an optional string representing the name of the node.
    - summary: an optional string providing a summary of the node.
    - metadata: a dictionary outlining the fields used for indexing.
    """

    content: Optional[str] = None
    name: Optional[str] = None
    summary: Optional[str] = None