From 2c351c499d8dcf458463905629b2c5c13b755fdb Mon Sep 17 00:00:00 2001 From: hande-k Date: Fri, 17 Jan 2025 10:30:34 +0100 Subject: [PATCH 1/6] add docstrings any typing to cognee tasks --- .../chunk_naive_llm_classifier.py | 24 ++++++++++++++++++- cognee/tasks/chunks/chunk_by_paragraph.py | 6 +++++ cognee/tasks/chunks/chunk_by_sentence.py | 16 ++++++++++--- cognee/tasks/chunks/chunk_by_word.py | 4 +++- cognee/tasks/chunks/query_chunks.py | 8 +++++++ .../chunks/remove_disconnected_chunks.py | 8 +++++++ .../completion/graph_query_completion.py | 12 ++++++++++ cognee/tasks/completion/query_completion.py | 9 +++++++ .../check_permissions_on_documents.py | 13 +++++++++- cognee/tasks/documents/classify_documents.py | 7 ++++++ .../extract_chunks_from_documents.py | 11 +++++++-- cognee/tasks/graph/extract_graph_from_code.py | 13 ++++++++-- cognee/tasks/graph/extract_graph_from_data.py | 11 +++++++-- 13 files changed, 130 insertions(+), 12 deletions(-) diff --git a/cognee/tasks/chunk_naive_llm_classifier/chunk_naive_llm_classifier.py b/cognee/tasks/chunk_naive_llm_classifier/chunk_naive_llm_classifier.py index 4c058de18..7cab3f73d 100644 --- a/cognee/tasks/chunk_naive_llm_classifier/chunk_naive_llm_classifier.py +++ b/cognee/tasks/chunk_naive_llm_classifier/chunk_naive_llm_classifier.py @@ -10,7 +10,29 @@ from cognee.modules.chunking.models.DocumentChunk import DocumentChunk async def chunk_naive_llm_classifier( data_chunks: list[DocumentChunk], classification_model: Type[BaseModel] -): +) -> list[DocumentChunk]: + """ + Classifies a list of document chunks using a specified classification model and updates vector and graph databases with the classification results. + + Vector Database Structure: + - Collection Name: `classification` + - Payload Schema: + - uuid (str): Unique identifier for the classification. + - text (str): Text label of the classification. + - chunk_id (str): Identifier of the chunk associated with this classification. + - document_id (str): Identifier of the document associated with this classification. + + Graph Database Structure: + - Nodes: + - Represent document chunks, classification types, and classification subtypes. + - Edges: + - `is_media_type`: Links document chunks to their classification type. + - `is_subtype_of`: Links classification subtypes to their parent type. + - `is_classified_as`: Links document chunks to their classification subtypes. + Notes: + - The function assumes that vector and graph database engines (`get_vector_engine` and `get_graph_engine`) are properly initialized and accessible. + - Classification labels are processed to ensure uniqueness using UUIDs based on their values. + """ if len(data_chunks) == 0: return data_chunks diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py index 24d566074..9cb1f5b54 100644 --- a/cognee/tasks/chunks/chunk_by_paragraph.py +++ b/cognee/tasks/chunks/chunk_by_paragraph.py @@ -17,6 +17,12 @@ def chunk_by_paragraph( """ Chunks text by paragraph while preserving exact text reconstruction capability. When chunks are joined with empty string "", they reproduce the original text exactly. + + Notes: + - Tokenization is handled using the `tiktoken` library, ensuring compatibility with the vector engine's embedding model. + - If `batch_paragraphs` is False, each paragraph will be yielded as a separate chunk. + - Handles cases where paragraphs exceed the specified token or word limits by splitting them as needed. + - Remaining text at the end of the input will be yielded as a final chunk. """ current_chunk = "" current_word_count = 0 diff --git a/cognee/tasks/chunks/chunk_by_sentence.py b/cognee/tasks/chunks/chunk_by_sentence.py index 128ed4047..7c8798927 100644 --- a/cognee/tasks/chunks/chunk_by_sentence.py +++ b/cognee/tasks/chunks/chunk_by_sentence.py @@ -1,9 +1,19 @@ -from uuid import uuid4 -from typing import Optional +from uuid import uuid4, UUID +from typing import Optional, Iterator, Tuple from .chunk_by_word import chunk_by_word -def chunk_by_sentence(data: str, maximum_length: Optional[int] = None): +def chunk_by_sentence( + data: str, maximum_length: Optional[int] = None +) -> Iterator[Tuple[UUID, str, int, Optional[str]]]: + """ + Splits the input text into sentences based on word-level processing, with optional sentence length constraints. + + Notes: + - Relies on the `chunk_by_word` function for word-level tokenization and classification. + - Ensures sentences within paragraphs are uniquely identifiable using UUIDs. + - Handles cases where the text ends mid-sentence by appending a special "sentence_cut" type. + """ sentence = "" paragraph_id = uuid4() word_count = 0 diff --git a/cognee/tasks/chunks/chunk_by_word.py b/cognee/tasks/chunks/chunk_by_word.py index c42d0cfa1..3a0d6911a 100644 --- a/cognee/tasks/chunks/chunk_by_word.py +++ b/cognee/tasks/chunks/chunk_by_word.py @@ -1,4 +1,6 @@ import re +from typing import Iterator, Tuple + SENTENCE_ENDINGS = r"[.;!?…]" PARAGRAPH_ENDINGS = r"[\n\r]" @@ -34,7 +36,7 @@ def is_real_paragraph_end(last_char: str, current_pos: int, text: str) -> bool: return False -def chunk_by_word(data: str): +def chunk_by_word(data: str) -> Iterator[Tuple[str, str]]: """ Chunks text into words and endings while preserving whitespace. Whitespace is included with the preceding word. diff --git a/cognee/tasks/chunks/query_chunks.py b/cognee/tasks/chunks/query_chunks.py index 5a6d4f666..c0023dbbd 100644 --- a/cognee/tasks/chunks/query_chunks.py +++ b/cognee/tasks/chunks/query_chunks.py @@ -3,11 +3,19 @@ from cognee.infrastructure.databases.vector import get_vector_engine async def query_chunks(query: str) -> list[dict]: """ + + Queries the vector database to retrieve chunks related to the given query string. + Parameters: - query (str): The query string to filter nodes by. Returns: - list(dict): A list of objects providing information about the chunks related to query. + + Notes: + - The function uses the `search` method of the vector engine to find matches. + - Limits the results to the top 5 matching chunks to balance performance and relevance. + - Ensure that the vector database is properly initialized and contains the "document_chunk_text" collection. """ vector_engine = get_vector_engine() diff --git a/cognee/tasks/chunks/remove_disconnected_chunks.py b/cognee/tasks/chunks/remove_disconnected_chunks.py index 60443ecfc..723e4ca63 100644 --- a/cognee/tasks/chunks/remove_disconnected_chunks.py +++ b/cognee/tasks/chunks/remove_disconnected_chunks.py @@ -3,6 +3,14 @@ from cognee.modules.chunking.models.DocumentChunk import DocumentChunk async def remove_disconnected_chunks(data_chunks: list[DocumentChunk]) -> list[DocumentChunk]: + """ + Removes disconnected or obsolete chunks from the graph database. + + Notes: + - Obsolete chunks are defined as chunks with no "next_chunk" predecessor. + - Fully disconnected nodes are identified and deleted separately. + - This function assumes that the graph database is properly initialized and accessible. + """ graph_engine = await get_graph_engine() document_ids = set((data_chunk.document_id for data_chunk in data_chunks)) diff --git a/cognee/tasks/completion/graph_query_completion.py b/cognee/tasks/completion/graph_query_completion.py index b130d4f7b..16d27c721 100644 --- a/cognee/tasks/completion/graph_query_completion.py +++ b/cognee/tasks/completion/graph_query_completion.py @@ -6,6 +6,10 @@ from cognee.modules.retrieval.brute_force_triplet_search import brute_force_trip def retrieved_edges_to_string(retrieved_edges: list) -> str: + """ + Converts a list of retrieved graph edges into a human-readable string format. + + """ edge_strings = [] for edge in retrieved_edges: node1_string = edge.node1.attributes.get("text") or edge.node1.attributes.get("name") @@ -18,11 +22,19 @@ def retrieved_edges_to_string(retrieved_edges: list) -> str: async def graph_query_completion(query: str) -> list: """ + Executes a query on the graph database and retrieves a relevant completion based on the found data. + Parameters: - query (str): The query string to compute. Returns: - list: Answer to the query. + + Notes: + - The `brute_force_triplet_search` is used to retrieve relevant graph data. + - Prompts are dynamically rendered and provided to the LLM for contextual understanding. + - Ensure that the LLM client and graph database are properly configured and accessible. + """ found_triplets = await brute_force_triplet_search(query, top_k=5) diff --git a/cognee/tasks/completion/query_completion.py b/cognee/tasks/completion/query_completion.py index 12e5168b0..51599ad9c 100644 --- a/cognee/tasks/completion/query_completion.py +++ b/cognee/tasks/completion/query_completion.py @@ -6,11 +6,20 @@ from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt async def query_completion(query: str) -> list: """ + + Executes a query against a vector database and computes a relevant response using an LLM. + Parameters: - query (str): The query string to compute. Returns: - list: Answer to the query. + + Notes: + - Limits the search to the top 1 matching chunk for simplicity and relevance. + - Ensure that the vector database and LLM client are properly configured and accessible. + - The response model used for the LLM output is expected to be a string. + """ vector_engine = get_vector_engine() diff --git a/cognee/tasks/documents/check_permissions_on_documents.py b/cognee/tasks/documents/check_permissions_on_documents.py index 7d18cbf3a..41666f2b2 100644 --- a/cognee/tasks/documents/check_permissions_on_documents.py +++ b/cognee/tasks/documents/check_permissions_on_documents.py @@ -1,8 +1,19 @@ from cognee.modules.data.processing.document_types import Document from cognee.modules.users.permissions.methods import check_permission_on_documents +from typing import List -async def check_permissions_on_documents(documents: list[Document], user, permissions): +async def check_permissions_on_documents( + documents: list[Document], user, permissions +) -> List[Document]: + """ + Validates a user's permissions on a list of documents. + + Notes: + - This function assumes that `check_permission_on_documents` raises an exception if the permission check fails. + - It is designed to validate multiple permissions in a sequential manner for the same set of documents. + - Ensure that the `Document` and `user` objects conform to the expected structure and interfaces. + """ document_ids = [document.id for document in documents] for permission in permissions: diff --git a/cognee/tasks/documents/classify_documents.py b/cognee/tasks/documents/classify_documents.py index 118da5738..5c84afc7e 100644 --- a/cognee/tasks/documents/classify_documents.py +++ b/cognee/tasks/documents/classify_documents.py @@ -50,6 +50,13 @@ EXTENSION_TO_DOCUMENT_CLASS = { async def classify_documents(data_documents: list[Data]) -> list[Document]: + """ + Classifies a list of data items into specific document types based on file extensions. + + Notes: + - The function relies on `get_metadata` to retrieve metadata information for each data item. + - Ensure the `Data` objects and their attributes (e.g., `extension`, `id`) are valid before calling this function. + """ documents = [] for data_item in data_documents: metadata = await get_metadata(data_item.id) diff --git a/cognee/tasks/documents/extract_chunks_from_documents.py b/cognee/tasks/documents/extract_chunks_from_documents.py index 5ce224002..ecdd6817d 100644 --- a/cognee/tasks/documents/extract_chunks_from_documents.py +++ b/cognee/tasks/documents/extract_chunks_from_documents.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, AsyncGenerator from cognee.modules.data.processing.document_types.Document import Document @@ -8,7 +8,14 @@ async def extract_chunks_from_documents( chunk_size: int = 1024, chunker="text_chunker", max_tokens: Optional[int] = None, -): +) -> AsyncGenerator: + """ + Extracts chunks of data from a list of documents based on the specified chunking parameters. + + Notes: + - The `read` method of the `Document` class must be implemented to support the chunking operation. + - The `chunker` parameter determines the chunking logic and should align with the document type. + """ for document in documents: for document_chunk in document.read( chunk_size=chunk_size, chunker=chunker, max_tokens=max_tokens diff --git a/cognee/tasks/graph/extract_graph_from_code.py b/cognee/tasks/graph/extract_graph_from_code.py index 8688b0af5..42375a683 100644 --- a/cognee/tasks/graph/extract_graph_from_code.py +++ b/cognee/tasks/graph/extract_graph_from_code.py @@ -1,12 +1,21 @@ import asyncio -from typing import Type +from typing import Type, List from pydantic import BaseModel from cognee.modules.data.extraction.knowledge_graph import extract_content_graph from cognee.modules.chunking.models.DocumentChunk import DocumentChunk from cognee.tasks.storage import add_data_points -async def extract_graph_from_code(data_chunks: list[DocumentChunk], graph_model: Type[BaseModel]): +async def extract_graph_from_code( + data_chunks: list[DocumentChunk], graph_model: Type[BaseModel] +) -> List[DocumentChunk]: + """ + Extracts a knowledge graph from the text content of document chunks using a specified graph model. + + Notes: + - The `extract_content_graph` function processes each chunk's text to extract graph information. + - Graph nodes are stored using the `add_data_points` function for later retrieval or analysis. + """ chunk_graphs = await asyncio.gather( *[extract_content_graph(chunk.text, graph_model) for chunk in data_chunks] ) diff --git a/cognee/tasks/graph/extract_graph_from_data.py b/cognee/tasks/graph/extract_graph_from_data.py index b05c55a9e..4f025a914 100644 --- a/cognee/tasks/graph/extract_graph_from_data.py +++ b/cognee/tasks/graph/extract_graph_from_data.py @@ -1,5 +1,5 @@ import asyncio -from typing import Type +from typing import Type, List from pydantic import BaseModel @@ -13,7 +13,14 @@ from cognee.modules.graph.utils import ( from cognee.tasks.storage import add_data_points -async def extract_graph_from_data(data_chunks: list[DocumentChunk], graph_model: Type[BaseModel]): +async def extract_graph_from_data( + data_chunks: list[DocumentChunk], graph_model: Type[BaseModel] +) -> List[DocumentChunk]: + """ + Extracts and integrates a knowledge graph from the text content of document chunks using a specified graph model. + + """ + chunk_graphs = await asyncio.gather( *[extract_content_graph(chunk.text, graph_model) for chunk in data_chunks] ) From 1e4f71dacb620b629d9caa60e2fb05ee24985ceb Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Mon, 20 Jan 2025 09:32:09 +0100 Subject: [PATCH 2/6] feat: adds windows test for dynamic_steps_example --- .../test_dynamic_steps_example_windows.yml | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 .github/workflows/test_dynamic_steps_example_windows.yml diff --git a/.github/workflows/test_dynamic_steps_example_windows.yml b/.github/workflows/test_dynamic_steps_example_windows.yml new file mode 100644 index 000000000..a97db9a5c --- /dev/null +++ b/.github/workflows/test_dynamic_steps_example_windows.yml @@ -0,0 +1,44 @@ +name: test-example-windows + +on: + workflow_dispatch: + pull_request: + types: [labeled, synchronize] + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + run_notebook_test_windows: + name: test-windows + runs-on: windows-latest + defaults: + run: + shell: bash + steps: + - name: Check out + uses: actions/checkout@master + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.11.x' + + - name: Install Poetry + run: | + python -m pip install --upgrade pip + pip install poetry + + - name: Install dependencies + run: | + poetry install --no-interaction --all-extras + + - name: Execute Python Example + env: + ENV: 'dev' + PYTHONFAULTHANDLER: 1 + LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} + GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }} + GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }} + run: poetry run python ./examples/python/dynamic_steps_example.py From b949b29fa7ffa90a417e2acdde572aa42240ab2a Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Mon, 20 Jan 2025 09:48:20 +0100 Subject: [PATCH 3/6] fix: changes graph DB to neo4j in windows test --- .github/workflows/test_dynamic_steps_example_windows.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_dynamic_steps_example_windows.yml b/.github/workflows/test_dynamic_steps_example_windows.yml index a97db9a5c..a3ee45da8 100644 --- a/.github/workflows/test_dynamic_steps_example_windows.yml +++ b/.github/workflows/test_dynamic_steps_example_windows.yml @@ -39,6 +39,7 @@ jobs: ENV: 'dev' PYTHONFAULTHANDLER: 1 LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} - GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }} - GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }} + GRAPH_DATABASE_URL: ${{ secrets.NEO4J_API_URL }} + GRAPH_DATABASE_PASSWORD: ${{ secrets.NEO4J_API_KEY }} + GRAPH_DATABASE_USERNAME: "neo4j" run: poetry run python ./examples/python/dynamic_steps_example.py From 7932cf71593a6c81a644ccb0b6b07eab7a3494b6 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Mon, 20 Jan 2025 10:23:38 +0100 Subject: [PATCH 4/6] fix: sets graphdb back to networkx --- .github/workflows/test_dynamic_steps_example_windows.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/test_dynamic_steps_example_windows.yml b/.github/workflows/test_dynamic_steps_example_windows.yml index a3ee45da8..391f714da 100644 --- a/.github/workflows/test_dynamic_steps_example_windows.yml +++ b/.github/workflows/test_dynamic_steps_example_windows.yml @@ -39,7 +39,4 @@ jobs: ENV: 'dev' PYTHONFAULTHANDLER: 1 LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} - GRAPH_DATABASE_URL: ${{ secrets.NEO4J_API_URL }} - GRAPH_DATABASE_PASSWORD: ${{ secrets.NEO4J_API_KEY }} - GRAPH_DATABASE_USERNAME: "neo4j" run: poetry run python ./examples/python/dynamic_steps_example.py From bf70705ed045fcb299fc06982e53620a4dd03565 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Mon, 20 Jan 2025 12:19:34 +0100 Subject: [PATCH 5/6] Fix: fixes networkx failed to load graph from file error --- .../databases/graph/networkx/adapter.py | 30 +++++++++++++++---- 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/cognee/infrastructure/databases/graph/networkx/adapter.py b/cognee/infrastructure/databases/graph/networkx/adapter.py index 75969c798..2cb6c2729 100644 --- a/cognee/infrastructure/databases/graph/networkx/adapter.py +++ b/cognee/infrastructure/databases/graph/networkx/adapter.py @@ -250,7 +250,8 @@ class NetworkXAdapter(GraphDBInterface): graph_data = nx.readwrite.json_graph.node_link_data(self.graph) async with aiofiles.open(file_path, "w") as file: - await file.write(json.dumps(graph_data, cls=JSONEncoder)) + json_data = json.dumps(graph_data, cls=JSONEncoder) + await file.write(json_data) async def load_graph_from_file(self, file_path: str = None): """Asynchronously load the graph from a file in JSON format.""" @@ -265,19 +266,32 @@ class NetworkXAdapter(GraphDBInterface): graph_data = json.loads(await file.read()) for node in graph_data["nodes"]: try: - node["id"] = UUID(node["id"]) + if not isinstance(node["id"], UUID): + node["id"] = UUID(node["id"]) except Exception as e: print(e) pass - if "updated_at" in node: + + if isinstance(node.get("updated_at"), int): + node["updated_at"] = datetime.fromtimestamp( + node["updated_at"] / 1000, tz=timezone.utc + ) + elif isinstance(node.get("updated_at"), str): node["updated_at"] = datetime.strptime( node["updated_at"], "%Y-%m-%dT%H:%M:%S.%f%z" ) for edge in graph_data["links"]: try: - source_id = UUID(edge["source"]) - target_id = UUID(edge["target"]) + if not isinstance(edge["source"], UUID): + source_id = UUID(edge["source"]) + else: + source_id = edge["source"] + + if not isinstance(edge["target"], UUID): + target_id = UUID(edge["target"]) + else: + target_id = edge["target"] edge["source"] = source_id edge["target"] = target_id @@ -287,7 +301,11 @@ class NetworkXAdapter(GraphDBInterface): print(e) pass - if "updated_at" in edge: + if isinstance(edge["updated_at"], int): # Handle timestamp in milliseconds + edge["updated_at"] = datetime.fromtimestamp( + edge["updated_at"] / 1000, tz=timezone.utc + ) + elif isinstance(edge["updated_at"], str): edge["updated_at"] = datetime.strptime( edge["updated_at"], "%Y-%m-%dT%H:%M:%S.%f%z" ) From 957ab81879528c34ac231f78c6692abad1a62e29 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Mon, 20 Jan 2025 12:27:22 +0100 Subject: [PATCH 6/6] chore: renaming test --- .github/workflows/test_dynamic_steps_example_windows.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_dynamic_steps_example_windows.yml b/.github/workflows/test_dynamic_steps_example_windows.yml index 391f714da..d0a9c93e3 100644 --- a/.github/workflows/test_dynamic_steps_example_windows.yml +++ b/.github/workflows/test_dynamic_steps_example_windows.yml @@ -1,4 +1,4 @@ -name: test-example-windows +name: test on: workflow_dispatch: @@ -11,7 +11,7 @@ concurrency: jobs: run_notebook_test_windows: - name: test-windows + name: windows-latest runs-on: windows-latest defaults: run: