fix: batch datapoints on save to limit bandwidth size

This commit is contained in:
Boris Arzentar 2025-05-12 11:28:13 +02:00
parent f999232b8e
commit a1e605ca97
22 changed files with 4251 additions and 3889 deletions

View file

@ -45,42 +45,48 @@ def record_graph_changes(func):
async with db_engine.get_async_session() as session:
if func.__name__ == "add_nodes":
nodes: List[DataPoint] = args[0]
relationship_ledgers = []
for node in nodes:
try:
node_id = UUID(str(node.id))
relationship = GraphRelationshipLedger(
id=uuid5(NAMESPACE_OID, f"{datetime.now(timezone.utc).timestamp()}"),
source_node_id=node_id,
destination_node_id=node_id,
creator_function=f"{creator}.node",
node_label=getattr(node, "name", None) or str(node.id),
)
session.add(relationship)
await session.flush()
except Exception as e:
logger.debug(f"Error adding relationship: {e}")
await session.rollback()
continue
node_id = UUID(str(node.id))
relationship_ledgers.append(GraphRelationshipLedger(
id=uuid5(NAMESPACE_OID, f"{datetime.now(timezone.utc).timestamp()}"),
source_node_id=node_id,
destination_node_id=node_id,
creator_function=f"{creator}.node",
node_label=getattr(node, "name", None) or str(node.id),
))
try:
session.add_all(relationship_ledgers)
await session.flush()
except Exception as e:
logger.debug(f"Error adding relationship: {e}")
await session.rollback()
elif func.__name__ == "add_edges":
edges = args[0]
relationship_ledgers = []
for edge in edges:
try:
source_id = UUID(str(edge[0]))
target_id = UUID(str(edge[1]))
rel_type = str(edge[2])
relationship = GraphRelationshipLedger(
id=uuid5(NAMESPACE_OID, f"{datetime.now(timezone.utc).timestamp()}"),
source_node_id=source_id,
destination_node_id=target_id,
creator_function=f"{creator}.{rel_type}",
)
session.add(relationship)
await session.flush()
except Exception as e:
logger.debug(f"Error adding relationship: {e}")
await session.rollback()
continue
source_id = UUID(str(edge[0]))
target_id = UUID(str(edge[1]))
rel_type = str(edge[2])
relationship_ledgers.append(GraphRelationshipLedger(
id=uuid5(NAMESPACE_OID, f"{datetime.now(timezone.utc).timestamp()}"),
source_node_id=source_id,
destination_node_id=target_id,
creator_function=f"{creator}.{rel_type}",
))
try:
session.add_all(relationship_ledgers)
await session.flush()
except Exception as e:
logger.debug(f"Error adding relationship: {e}")
await session.rollback()
try:
await session.commit()

View file

@ -1,5 +1,4 @@
from typing import BinaryIO, TypedDict
import hashlib
from .guess_file_type import guess_file_type
from cognee.shared.utils import get_file_content_hash

View file

@ -4,6 +4,9 @@ import litellm
import instructor
from typing import Type
from pydantic import BaseModel
from openai import ContentFilterFinishReasonError
from litellm.exceptions import ContentPolicyViolationError
from instructor.exceptions import InstructorRetryException
from cognee.infrastructure.llm.exceptions import ContentPolicyFilterError
from cognee.infrastructure.llm.llm_interface import LLMInterface
@ -67,7 +70,17 @@ class GenericAPIAdapter(LLMInterface):
api_base=self.endpoint,
response_model=response_model,
)
except litellm.exceptions.ContentPolicyViolationError:
except (
ContentFilterFinishReasonError,
ContentPolicyViolationError,
InstructorRetryException,
) as error:
if (
isinstance(error, InstructorRetryException)
and not "content management policy" in str(error).lower()
):
raise error
if not (self.fallback_model and self.fallback_api_key and self.fallback_endpoint):
raise ContentPolicyFilterError(
f"The provided input contains content that is not aligned with our content policy: {text_input}"
@ -92,7 +105,17 @@ class GenericAPIAdapter(LLMInterface):
api_base=self.fallback_endpoint,
response_model=response_model,
)
except litellm.exceptions.ContentPolicyViolationError:
raise ContentPolicyFilterError(
f"The provided input contains content that is not aligned with our content policy: {text_input}"
)
except (
ContentFilterFinishReasonError,
ContentPolicyViolationError,
InstructorRetryException,
) as error:
if (
isinstance(error, InstructorRetryException)
and not "content management policy" in str(error).lower()
):
raise error
else:
raise ContentPolicyFilterError(
f"The provided input contains content that is not aligned with our content policy: {text_input}"
)

View file

@ -5,6 +5,8 @@ import instructor
from typing import Type
from pydantic import BaseModel
from openai import ContentFilterFinishReasonError
from litellm.exceptions import ContentPolicyViolationError
from instructor.exceptions import InstructorRetryException
from cognee.exceptions import InvalidValueError
from cognee.infrastructure.llm.prompts import read_query_prompt
@ -87,7 +89,17 @@ class OpenAIAdapter(LLMInterface):
response_model=response_model,
max_retries=self.MAX_RETRIES,
)
except ContentFilterFinishReasonError:
except (
ContentFilterFinishReasonError,
ContentPolicyViolationError,
InstructorRetryException,
) as error:
if (
isinstance(error, InstructorRetryException)
and not "content management policy" in str(error).lower()
):
raise error
if not (self.fallback_model and self.fallback_api_key):
raise ContentPolicyFilterError(
f"The provided input contains content that is not aligned with our content policy: {text_input}"
@ -112,10 +124,20 @@ class OpenAIAdapter(LLMInterface):
response_model=response_model,
max_retries=self.MAX_RETRIES,
)
except ContentFilterFinishReasonError:
raise ContentPolicyFilterError(
f"The provided input contains content that is not aligned with our content policy: {text_input}"
)
except (
ContentFilterFinishReasonError,
ContentPolicyViolationError,
InstructorRetryException,
) as error:
if (
isinstance(error, InstructorRetryException)
and not "content management policy" in str(error).lower()
):
raise error
else:
raise ContentPolicyFilterError(
f"The provided input contains content that is not aligned with our content policy: {text_input}"
)
@observe
@sleep_and_retry_sync()

View file

@ -5,6 +5,7 @@ from .open_data_file import open_data_file
class TextDocument(Document):
type: str = "text"
mime_type: str = "text/plain"
def read(self, chunker_cls: Chunker, max_chunk_size: int):
def get_text():

View file

@ -45,11 +45,18 @@ def expand_with_nodes_and_edges(
type_node_key = f"{type_node_id}_type"
if type_node_key not in added_nodes_map and type_node_key not in key_mapping:
(
ontology_entity_type_nodes,
ontology_entity_type_edges,
ontology_closest_class_node,
) = ontology_resolver.get_subgraph(node_name=type_node_name, node_type="classes")
if ontology_resolver:
(
ontology_entity_type_nodes,
ontology_entity_type_edges,
ontology_closest_class_node,
) = ontology_resolver.get_subgraph(
node_name=type_node_name, node_type="classes"
)
else:
ontology_entity_type_nodes = []
ontology_entity_type_edges = []
ontology_closest_class_node = None
if ontology_closest_class_node:
name_mapping[type_node_name] = ontology_closest_class_node.name
@ -125,9 +132,14 @@ def expand_with_nodes_and_edges(
entity_node_key = f"{node_id}_entity"
if entity_node_key not in added_nodes_map and entity_node_key not in key_mapping:
ontology_entity_nodes, ontology_entity_edges, start_ent_ont = (
ontology_resolver.get_subgraph(node_name=node_name, node_type="individuals")
)
if ontology_resolver:
ontology_entity_nodes, ontology_entity_edges, start_ent_ont = (
ontology_resolver.get_subgraph(node_name=node_name, node_type="individuals")
)
else:
ontology_entity_nodes = []
ontology_entity_edges = []
start_ent_ont = None
if start_ent_ont:
name_mapping[node_name] = start_ent_ont.name
@ -234,7 +246,7 @@ def expand_with_nodes_and_edges(
)
existing_edges_map[edge_key] = True
graph_nodes = data_chunks + list(added_ontology_nodes_map.values())
graph_nodes = list(added_ontology_nodes_map.values())
graph_edges = relationships + ontology_relationships
return graph_nodes, graph_edges

View file

@ -0,0 +1,6 @@
import hashlib
def get_text_content_hash(text: str) -> str:
    """Return the hex MD5 digest of *text* encoded as UTF-8.

    Used as a cheap, deterministic content fingerprint (not for security).
    """
    return hashlib.md5(text.encode("utf-8")).hexdigest()

View file

@ -1,6 +1,5 @@
import asyncio
from typing import Type, List, Optional
from typing import Type, List
from pydantic import BaseModel
from cognee.infrastructure.databases.graph import get_graph_engine
@ -12,7 +11,6 @@ from cognee.modules.graph.utils import (
retrieve_existing_edges,
)
from cognee.shared.data_models import KnowledgeGraph
from cognee.tasks.storage import add_data_points
async def integrate_chunk_graphs(
@ -28,7 +26,6 @@ async def integrate_chunk_graphs(
for chunk_index, chunk_graph in enumerate(chunk_graphs):
data_chunks[chunk_index].contains = chunk_graph
await add_data_points(chunk_graphs)
return data_chunks
existing_edges_map = await retrieve_existing_edges(
@ -41,7 +38,7 @@ async def integrate_chunk_graphs(
)
if len(graph_nodes) > 0:
await add_data_points(graph_nodes)
await graph_engine.add_nodes(graph_nodes)
if len(graph_edges) > 0:
await graph_engine.add_edges(graph_edges)

View file

@ -0,0 +1,76 @@
import asyncio
from typing import List
from uuid import NAMESPACE_OID, uuid5
from cognee.infrastructure.engine import DataPoint
from cognee.modules.graph.utils import get_graph_from_model
class Document(DataPoint):
    """Minimal document node used only by this test."""

    path: str
    metadata: dict = {"index_fields": []}


class DocumentChunk(DataPoint):
    """Chunk of a Document; `contains` holds the entities extracted from it."""

    part_of: Document
    text: str
    contains: List["Entity"] = None  # forward reference, resolved by model_rebuild() below
    metadata: dict = {"index_fields": ["text"]}


class EntityType(DataPoint):
    """Type/category node an Entity points at via `is_type`."""

    name: str
    metadata: dict = {"index_fields": ["name"]}


class Entity(DataPoint):
    """Named entity extracted from a chunk."""

    name: str
    is_type: EntityType
    metadata: dict = {"index_fields": ["name"]}


# Resolve the "Entity" forward reference in DocumentChunk.contains.
DocumentChunk.model_rebuild()
async def get_graph_from_model_test():
    """Smoke-test get_graph_from_model on a tiny document graph.

    Builds Document -> DocumentChunk -> Entity -> EntityType, flattens it,
    and checks the expected node/edge counts and that the `part_of`
    relationship was lifted out of the chunk node's properties.
    """
    document = Document(path="file_path")
    document_chunk = DocumentChunk(
        id=uuid5(NAMESPACE_OID, "file_name"),
        text="some text",
        part_of=document,
        contains=[],
    )

    document_chunk.contains.append(
        Entity(
            name="Entity",
            is_type=EntityType(
                name="Type 1",
            ),
        )
    )

    added_nodes = {}
    added_edges = {}
    visited_properties = {}

    result = await get_graph_from_model(
        document_chunk,
        added_nodes=added_nodes,
        added_edges=added_edges,
        visited_properties=visited_properties,
    )

    nodes = result[0]
    edges = result[1]

    # Document, DocumentChunk, Entity, EntityType / part_of, contains, is_type
    assert len(nodes) == 4
    assert len(edges) == 3

    # Fix: compare strings with `==`, not `is` — identity comparison on a
    # string literal only worked by CPython interning accident.
    document_chunk_node = next(filter(lambda node: node.type == "DocumentChunk", nodes))

    assert not hasattr(document_chunk_node, "part_of"), "Expected part_of attribute to be removed"


if __name__ == "__main__":
    asyncio.run(get_graph_from_model_test())

View file

@ -24,7 +24,7 @@ RUN pip install poetry
RUN poetry config virtualenvs.create false
RUN poetry install --extras neo4j --extras qdrant --no-root --without dev
RUN poetry install --extras neo4j --extras qdrant --no-root
COPY cognee/ /app/cognee
COPY distributed/ /app/distributed

View file

@ -1,24 +1,28 @@
import pathlib
from os import path
from cognee.api.v1.add import add
# from cognee.api.v1.add import add
from cognee.api.v1.prune import prune
from cognee.infrastructure.databases.relational import get_relational_engine
from cognee.infrastructure.llm.utils import get_max_chunk_tokens
from cognee.modules.chunking.TextChunker import TextChunker
from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
from cognee.modules.data.models import Data
from cognee.modules.data.processing.document_types import Document
from cognee.modules.engine.operations.setup import setup
from cognee.modules.ingestion.get_text_content_hash import get_text_content_hash
from cognee.modules.pipelines.operations.run_tasks import run_tasks
from cognee.modules.pipelines.tasks.task import Task
from cognee.modules.users.methods.get_default_user import get_default_user
from cognee.modules.data.methods.get_dataset_data import get_dataset_data
from cognee.modules.data.methods.get_datasets_by_name import get_datasets_by_name
# from cognee.modules.data.methods.get_dataset_data import get_dataset_data
# from cognee.modules.data.methods.get_datasets_by_name import get_datasets_by_name
from cognee.shared.logging_utils import get_logger
from cognee.tasks.documents.classify_documents import classify_documents
from cognee.tasks.documents.extract_chunks_from_documents import extract_chunks_from_documents
from distributed.app import app
from distributed.queues import finished_jobs_queue, save_data_points_queue
from distributed.models.TextDocument import TextDocument
from distributed.queues import save_data_points_queue
from distributed.workers.data_point_saver_worker import data_point_saver_worker
from distributed.workers.graph_extraction_worker import graph_extraction_worker
@ -28,10 +32,9 @@ logger = get_logger()
@app.local_entrypoint()
async def main():
# Clear queues
finished_jobs_queue.clear()
save_data_points_queue.clear()
dataset_name = "main"
# dataset_name = "main"
data_directory_name = ".data"
data_directory_path = path.join(pathlib.Path(__file__).parent, data_directory_name)
@ -45,18 +48,65 @@ async def main():
await prune.prune_data()
await prune.prune_system(metadata=True)
await setup()
# Add files to the metastore
await add(data=data_directory_path, dataset_name=dataset_name)
# await add(data=data_directory_path, dataset_name=dataset_name)
user = await get_default_user()
datasets = await get_datasets_by_name(dataset_name, user.id)
documents = await get_dataset_data(dataset_id=datasets[0].id)
# datasets = await get_datasets_by_name(dataset_name, user.id)
# documents = await get_dataset_data(dataset_id=datasets[0].id)
import duckdb
connection = duckdb.connect()
dataset_file_name = "de-00000-of-00003-f8e581c008ccc7f2.parquet"
dataset_file_path = path.join(data_directory_path, dataset_file_name)
df = connection.execute(f"SELECT * FROM '{dataset_file_path}'").fetchdf()
documents = []
for _, row in df.iterrows():
file_id = str(row["id"])
content = row["text"]
documents.append(
TextDocument(
name=file_id,
content=content,
raw_data_location=f"{dataset_file_name}_{file_id}",
external_metadata="",
)
)
documents: list[TextDocument] = documents[0:100]
print(f"We have {len(documents)} documents in the dataset.")
data_documents = [
Data(
id=document.id,
name=document.name,
raw_data_location=document.raw_data_location,
extension="txt",
mime_type=document.mime_type,
owner_id=user.id,
content_hash=get_text_content_hash(document.content),
external_metadata=document.external_metadata,
node_set=None,
token_count=-1,
)
for document in documents
]
db_engine = get_relational_engine()
async with db_engine.get_async_session() as session:
session.add_all(data_documents)
await session.commit()
# Start data_point_saver_worker functions
for _ in range(number_of_data_saving_workers):
worker_future = data_point_saver_worker.spawn(total_number_of_workers=len(documents))
worker_future = data_point_saver_worker.spawn()
consumer_futures.append(worker_future)
producer_futures = []
@ -75,10 +125,9 @@ async def main():
for item in batch:
async for worker_feature in run_tasks(
[
Task(classify_documents),
Task(
extract_chunks_from_documents,
max_chunk_size=get_max_chunk_tokens(),
max_chunk_size=2000,
chunker=TextChunker,
),
Task(
@ -94,21 +143,25 @@ async def main():
pass
batch_results = []
for producer_future in producer_futures:
try:
result = producer_future.get()
except Exception as e:
result = e
batch_results.append(result)
print(f"Number of documents processed: {len(results)}")
results.extend(batch_results)
finished_jobs_queue.put(len(results))
save_data_points_queue.put(())
for consumer_future in consumer_futures:
try:
print("Finished but waiting")
print("Finished but waiting for saving worker to finish.")
consumer_final = consumer_future.get()
print(f"We got all futures {consumer_final}")
print(f"All workers are done: {consumer_final}")
except Exception as e:
logger.error(e)

View file

@ -16,9 +16,7 @@ image = (
Image.from_dockerfile(
path=pathlib.Path(path.join(path.dirname(__file__), "Dockerfile")).resolve(),
force_build=False,
).env(local_env_vars)
# .pip_install_from_pyproject(pyproject_toml=pathlib.Path(path.join(path.dirname(__file__), "../pyproject.toml")).resolve())
# .poetry_install_from_file(poetry_pyproject_toml=pathlib.Path(path.join(path.dirname(__file__), "../pyproject.toml")).resolve())
# .add_local_dir(pathlib.Path("./venv/bin").resolve(), remote_path="/app/.venv")
# .add_local_python_source(pathlib.Path("./cognee").resolve())
)
.env(local_env_vars)
.add_local_python_source("cognee", "entrypoint")
)

View file

@ -0,0 +1,15 @@
from cognee.modules.chunking.Chunker import Chunker
from cognee.modules.data.processing.document_types import Document
class TextDocument(Document):
    """In-memory text document for the distributed pipeline.

    Unlike the storage-backed TextDocument in cognee, the raw text is kept
    directly on the model in `content`, so no file access happens on read.
    """

    type: str = "text"
    mime_type: str = "text/plain"
    # Full raw text of the document; streamed to the chunker by read().
    content: str

    def read(self, chunker_cls: Chunker, max_chunk_size: int):
        """Yield chunks of `content` produced by `chunker_cls`."""

        # The chunker expects a zero-arg generator that yields text pieces.
        def get_text():
            yield self.content

        chunker: Chunker = chunker_cls(self, max_chunk_size=max_chunk_size, get_text=get_text)

        yield from chunker.read()

View file

@ -1,10 +1,7 @@
from modal import Queue
# Create (or get) two queues:
# Create (or get) queues:
# - save_data_points_queue: Stores messages produced by the producer functions.
# - finished_jobs_queue: Keeps track of the number of finished producer jobs.
save_data_points_queue = Queue.from_name("save_data_points_queue", create_if_missing=True)
finished_jobs_queue = Queue.from_name("finished_jobs_queue", create_if_missing=True)

View file

@ -38,4 +38,4 @@ async def extract_graph_from_data(
data_chunks, chunk_graphs, ontology_adapter, existing_edges_map
)
return graph_nodes, graph_edges
return data_chunks, graph_nodes, graph_edges

View file

@ -1,38 +1,106 @@
import asyncio
from cognee.modules.graph.utils import deduplicate_nodes_and_edges, get_graph_from_model
# import json
# import asyncio
from pympler import asizeof
# from cognee.modules.storage.utils import JSONEncoder
from distributed.queues import save_data_points_queue
# from cognee.modules.graph.utils import get_graph_from_model
async def save_data_points(data_points_and_relationships: tuple[list, list]):
data_points = data_points_and_relationships[0]
data_point_connections = data_points_and_relationships[1]
# data_points = data_points_and_relationships[0]
# data_point_connections = data_points_and_relationships[1]
nodes = []
edges = []
# added_nodes = {}
# added_edges = {}
# visited_properties = {}
added_nodes = {}
added_edges = {}
visited_properties = {}
# nodes_and_edges: list[tuple] = await asyncio.gather(
# *[
# get_graph_from_model(
# data_point,
# added_nodes=added_nodes,
# added_edges=added_edges,
# visited_properties=visited_properties,
# )
# for data_point in data_points
# ]
# )
results = await asyncio.gather(
*[
get_graph_from_model(
data_point,
added_nodes=added_nodes,
added_edges=added_edges,
visited_properties=visited_properties,
)
for data_point in data_points
]
)
# graph_data_deduplication = GraphDataDeduplication()
# deduplicated_nodes_and_edges = [graph_data_deduplication.deduplicate_nodes_and_edges(nodes, edges + data_point_connections) for nodes, edges in nodes_and_edges]
for result_nodes, result_edges in results:
nodes.extend(result_nodes)
edges.extend(result_edges)
node_batch = []
edge_batch = []
nodes, edges = deduplicate_nodes_and_edges(nodes, edges + data_point_connections)
for nodes, edges in data_points_and_relationships:
for node in nodes:
if asizeof.asizeof(node) >= 500000:
print(f"Node too large:\n{node.id}\n")
# await index_data_points(nodes)
node_batch.append(node)
save_data_points_queue.put((nodes, edges))
if asizeof.asizeof(node_batch) >= 500000:
try_pushing_nodes_to_queue(node_batch)
node_batch = []
if len(node_batch) > 0:
try_pushing_nodes_to_queue(node_batch)
node_batch = []
for edge in edges:
edge_batch.append(edge)
if asizeof.asizeof(edge_batch) >= 500000:
try_pushing_edges_to_queue(edge_batch)
edge_batch = []
if len(edge_batch) > 0:
try_pushing_edges_to_queue(edge_batch)
edge_batch = []
# graph_data_deduplication.reset()
class GraphDataDeduplication:
    """Stateful de-duplicator for graph nodes and edges.

    A single shared map records every node id and edge key already seen, so
    repeated calls to deduplicate_nodes_and_edges filter across batches
    until reset() is called.
    """

    nodes_and_edges_map: dict

    def __init__(self):
        self.reset()

    def reset(self):
        """Forget everything seen so far."""
        self.nodes_and_edges_map = {}

    def deduplicate_nodes_and_edges(self, nodes: list, edges: list):
        """Return (unique_nodes, unique_edges), skipping anything seen before."""
        seen = self.nodes_and_edges_map

        unique_nodes = []
        for candidate in nodes:
            key = str(candidate.id)
            if key in seen:
                continue
            seen[key] = True
            unique_nodes.append(candidate)

        unique_edges = []
        for candidate in edges:
            # Edge tuples are (source_id, target_id, relationship_name, ...).
            key = f"{candidate[0]}{candidate[2]}{candidate[1]}"
            if key in seen:
                continue
            seen[key] = True
            unique_edges.append(candidate)

        return unique_nodes, unique_edges
def try_pushing_nodes_to_queue(node_batch):
    """Push node_batch onto the save queue, halving recursively on failure.

    Queue messages have a size limit; a failed put (presumably an oversized
    payload — TODO confirm the exception type) triggers a split-and-retry.
    Fix: the original split only once and pushed the halves unguarded, so a
    half that was still too large raised out of this function. Recursing
    keeps halving until the pieces fit; a single unsplittable node re-raises.
    """
    try:
        save_data_points_queue.put((node_batch, []))
    except Exception:
        if len(node_batch) <= 1:
            # Cannot split further — surface the real failure instead of
            # recursing forever.
            raise
        middle = len(node_batch) // 2
        try_pushing_nodes_to_queue(node_batch[:middle])
        try_pushing_nodes_to_queue(node_batch[middle:])
def try_pushing_edges_to_queue(edge_batch):
    """Push edge_batch onto the save queue, halving recursively on failure.

    Mirrors try_pushing_nodes_to_queue: a failed put (presumably an
    oversized payload — TODO confirm the exception type) splits the batch
    and retries each half. Fix: the original split only once and pushed the
    halves unguarded, so a still-too-large half raised out of this function.
    A single unsplittable edge re-raises the real error.
    """
    try:
        save_data_points_queue.put(([], edge_batch))
    except Exception:
        if len(edge_batch) <= 1:
            # Cannot split further — surface the real failure instead of
            # recursing forever.
            raise
        middle = len(edge_batch) // 2
        try_pushing_edges_to_queue(edge_batch[:middle])
        try_pushing_edges_to_queue(edge_batch[middle:])

View file

@ -2,23 +2,26 @@ import asyncio
from typing import Type
from uuid import uuid5
from pydantic import BaseModel
from cognee.modules.graph.utils import get_graph_from_model
from cognee.tasks.summarization.models import TextSummary
from cognee.modules.data.extraction.extract_summary import extract_summary
from cognee.infrastructure.engine.models.DataPoint import DataPoint
from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
from cognee.modules.data.extraction.extract_summary import extract_summary
async def summarize_text(
data_points_and_relationships: tuple[list[DocumentChunk], list],
data_points_and_relationships: tuple[list[DocumentChunk], list[DataPoint], list],
summarization_model: Type[BaseModel],
):
data_chunks = data_points_and_relationships[0]
edges = data_points_and_relationships[1]
document_chunks = data_points_and_relationships[0]
nodes = data_points_and_relationships[1]
relationships = data_points_and_relationships[2]
if len(data_chunks) == 0:
return data_chunks
if len(document_chunks) == 0:
return document_chunks
chunk_summaries = await asyncio.gather(
*[extract_summary(chunk.text, summarization_model) for chunk in data_chunks]
*[extract_summary(chunk.text, summarization_model) for chunk in document_chunks]
)
summaries = [
@ -27,7 +30,56 @@ async def summarize_text(
made_from=chunk,
text=chunk_summaries[chunk_index].summary,
)
for (chunk_index, chunk) in enumerate(data_chunks)
for (chunk_index, chunk) in enumerate(document_chunks)
]
return summaries, edges
data_points = summaries + nodes
added_nodes = {}
added_edges = {}
visited_properties = {}
nodes_and_edges: list[tuple] = await asyncio.gather(
*[
get_graph_from_model(
data_point,
added_nodes=added_nodes,
added_edges=added_edges,
visited_properties=visited_properties,
)
for data_point in data_points
]
)
graph_data_deduplication = GraphDataDeduplication()
deduplicated_nodes_and_edges = [graph_data_deduplication.deduplicate_nodes_and_edges(nodes, edges + relationships) for nodes, edges in nodes_and_edges]
return deduplicated_nodes_and_edges
class GraphDataDeduplication:
    """Stateful de-duplicator for graph nodes and edges.

    NOTE(review): this class is duplicated verbatim in the
    save_data_points task module — consider extracting it to a shared
    module so the two copies cannot drift apart.
    """

    # Maps str(node.id) and concatenated edge keys to True once seen.
    nodes_and_edges_map: dict

    def __init__(self):
        self.reset()

    def reset(self):
        """Forget everything seen so far."""
        self.nodes_and_edges_map = {}

    def deduplicate_nodes_and_edges(self, nodes: list, edges: list):
        """Return (unique_nodes, unique_edges), skipping anything seen before."""
        final_nodes = []
        final_edges = []

        for node in nodes:
            node_key = str(node.id)

            if node_key not in self.nodes_and_edges_map:
                self.nodes_and_edges_map[node_key] = True
                final_nodes.append(node)

        for edge in edges:
            # Edge tuples are (source_id, relationship_name at index 2, target_id).
            edge_key = str(edge[0]) + str(edge[2]) + str(edge[1])

            if edge_key not in self.nodes_and_edges_map:
                self.nodes_and_edges_map[edge_key] = True
                final_edges.append(edge)

        return final_nodes, final_edges

View file

@ -1,32 +1,37 @@
import asyncio
from cognee.infrastructure.databases.graph.get_graph_engine import get_graph_engine
from distributed.app import app
from distributed.modal_image import image
from distributed.queues import finished_jobs_queue, save_data_points_queue
from distributed.queues import save_data_points_queue
from cognee.infrastructure.databases.graph import get_graph_engine
@app.function(image=image, timeout=86400, max_containers=100)
async def data_point_saver_worker(total_number_of_workers: int):
@app.function(image=image, timeout=7200, max_containers=100)
async def data_point_saver_worker():
print("Started processing of nodes and edges; starting graph engine queue.")
graph_engine = await get_graph_engine()
while True:
if save_data_points_queue.len() != 0:
nodes_and_edges = save_data_points_queue.get(block=False)
if nodes_and_edges and len(nodes_and_edges) == 2:
await graph_engine.add_nodes(nodes_and_edges[0])
await graph_engine.add_edges(nodes_and_edges[1])
else:
print(f"Nodes and edges are: {nodes_and_edges}")
else:
await asyncio.sleep(5)
number_of_finished_jobs = finished_jobs_queue.get(block=False)
if number_of_finished_jobs == total_number_of_workers:
# We put it back for the other consumers to see that we finished
finished_jobs_queue.put(number_of_finished_jobs)
if len(nodes_and_edges) == 0:
print("Finished processing all nodes and edges; stopping graph engine queue.")
return True
if len(nodes_and_edges) == 2:
print(f"Processing {len(nodes_and_edges[0])} nodes and {len(nodes_and_edges[1])} edges.")
nodes = nodes_and_edges[0]
edges = nodes_and_edges[1]
if nodes:
await graph_engine.add_nodes(nodes)
if edges:
await graph_engine.add_edges(edges)
print(f"Finished processing nodes and edges.")
else:
print(f"No jobs, go to sleep.")
await asyncio.sleep(5)

View file

@ -13,7 +13,7 @@ from distributed.tasks.extract_graph_from_data import extract_graph_from_data
from distributed.tasks.save_data_points import save_data_points
@app.function(image=image, timeout=86400, max_containers=100)
@app.function(image=image, timeout=7200, max_containers=100)
async def graph_extraction_worker(user, document_name: str, document_chunks: list):
cognee_config = get_cognify_config()

43
poetry.lock generated
View file

@ -1225,7 +1225,7 @@ description = "Cross-platform colored terminal text."
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
groups = ["main"]
markers = "(sys_platform == \"win32\" or platform_system == \"Windows\" or extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\") and (sys_platform == \"win32\" or platform_system == \"Windows\" or extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"chromadb\") and (extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"chromadb\" or extra == \"codegraph\" or platform_system == \"Windows\") and (sys_platform == \"win32\" or platform_system == \"Windows\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dev\") and (sys_platform == \"win32\" or platform_system == \"Windows\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dev\" or extra == \"chromadb\") and (sys_platform == \"win32\" or platform_system == \"Windows\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dev\" or extra == \"chromadb\" or extra == \"codegraph\") and (python_version < \"3.13\" or platform_system == \"Windows\" or extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"chromadb\")"
markers = "(sys_platform == \"win32\" or platform_system == \"Windows\" or extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"chromadb\") and (platform_system == \"Windows\" or extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"chromadb\" or extra == \"codegraph\") and (sys_platform == \"win32\" or platform_system == \"Windows\" or extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\") and (sys_platform == \"win32\" or platform_system == \"Windows\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dev\" or extra == \"chromadb\") and (sys_platform == \"win32\" or platform_system == \"Windows\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dev\" or extra == \"chromadb\" or extra == \"codegraph\") and (sys_platform == \"win32\" or platform_system == \"Windows\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dev\") and (python_version < \"3.13\" or platform_system == \"Windows\" or extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"chromadb\")"
files = [
{file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
@ -1238,7 +1238,7 @@ description = "Colored terminal output for Python's logging module"
optional = true
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
groups = ["main"]
markers = "(extra == \"codegraph\" or extra == \"chromadb\") and python_version < \"3.13\" or extra == \"chromadb\""
markers = "python_version == \"3.10\" and (extra == \"chromadb\" or extra == \"codegraph\") or extra == \"chromadb\" or python_version == \"3.12\" and (extra == \"chromadb\" or extra == \"codegraph\") or python_version == \"3.11\" and (extra == \"chromadb\" or extra == \"codegraph\")"
files = [
{file = "coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934"},
{file = "coloredlogs-15.0.1.tar.gz", hash = "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0"},
@ -2298,7 +2298,7 @@ description = "The FlatBuffers serialization format for Python"
optional = true
python-versions = "*"
groups = ["main"]
markers = "(extra == \"codegraph\" or extra == \"chromadb\") and python_version < \"3.13\" or extra == \"chromadb\""
markers = "python_version == \"3.10\" and (extra == \"chromadb\" or extra == \"codegraph\") or extra == \"chromadb\" or python_version == \"3.12\" and (extra == \"chromadb\" or extra == \"codegraph\") or python_version == \"3.11\" and (extra == \"chromadb\" or extra == \"codegraph\")"
files = [
{file = "flatbuffers-25.2.10-py2.py3-none-any.whl", hash = "sha256:ebba5f4d5ea615af3f7fd70fc310636fbb2bbd1f566ac0a23d98dd412de50051"},
{file = "flatbuffers-25.2.10.tar.gz", hash = "sha256:97e451377a41262f8d9bd4295cc836133415cc03d8cb966410a4af92eb00d26e"},
@ -2975,7 +2975,7 @@ description = "HTTP/2-based RPC framework"
optional = true
python-versions = ">=3.8"
groups = ["main"]
markers = "extra == \"gemini\" or extra == \"deepeval\" or extra == \"weaviate\" or extra == \"qdrant\" or extra == \"milvus\""
markers = "extra == \"gemini\" or extra == \"deepeval\" or extra == \"weaviate\" or extra == \"qdrant\" or extra == \"milvus\" or python_version < \"3.11\" and (extra == \"deepeval\" or extra == \"weaviate\" or extra == \"qdrant\" or extra == \"gemini\" or extra == \"milvus\")"
files = [
{file = "grpcio-1.67.1-cp310-cp310-linux_armv7l.whl", hash = "sha256:8b0341d66a57f8a3119b77ab32207072be60c9bf79760fa609c5609f2deb1f3f"},
{file = "grpcio-1.67.1-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:f5a27dddefe0e2357d3e617b9079b4bfdc91341a91565111a21ed6ebbc51b22d"},
@ -3470,7 +3470,7 @@ description = "Human friendly output for text interfaces using Python"
optional = true
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
groups = ["main"]
markers = "(extra == \"codegraph\" or extra == \"chromadb\") and python_version < \"3.13\" or extra == \"chromadb\""
markers = "python_version == \"3.10\" and (extra == \"chromadb\" or extra == \"codegraph\") or extra == \"chromadb\" or python_version == \"3.12\" and (extra == \"chromadb\" or extra == \"codegraph\") or python_version == \"3.11\" and (extra == \"chromadb\" or extra == \"codegraph\")"
files = [
{file = "humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477"},
{file = "humanfriendly-10.0.tar.gz", hash = "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc"},
@ -6003,7 +6003,7 @@ description = "Python library for arbitrary-precision floating-point arithmetic"
optional = true
python-versions = "*"
groups = ["main"]
markers = "(extra == \"codegraph\" or extra == \"chromadb\") and python_version < \"3.13\" or extra == \"chromadb\""
markers = "python_version == \"3.10\" and (extra == \"chromadb\" or extra == \"codegraph\") or extra == \"chromadb\" or python_version == \"3.12\" and (extra == \"chromadb\" or extra == \"codegraph\") or python_version == \"3.11\" and (extra == \"chromadb\" or extra == \"codegraph\")"
files = [
{file = "mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c"},
{file = "mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f"},
@ -6585,7 +6585,7 @@ description = "ONNX Runtime is a runtime accelerator for Machine Learning models
optional = true
python-versions = ">=3.10"
groups = ["main"]
markers = "(extra == \"codegraph\" or extra == \"chromadb\") and python_version < \"3.13\" or extra == \"chromadb\""
markers = "python_version == \"3.10\" and (extra == \"chromadb\" or extra == \"codegraph\") or extra == \"chromadb\" or python_version == \"3.12\" and (extra == \"chromadb\" or extra == \"codegraph\") or python_version == \"3.11\" and (extra == \"chromadb\" or extra == \"codegraph\")"
files = [
{file = "onnxruntime-1.21.1-cp310-cp310-macosx_13_0_universal2.whl", hash = "sha256:daedb5d33d8963062a25f4a3c788262074587f685a19478ef759a911b4b12c25"},
{file = "onnxruntime-1.21.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a402f9bda0b1cc791d9cf31d23c471e8189a55369b49ef2b9d0854eb11d22c4"},
@ -6985,8 +6985,8 @@ files = [
[package.dependencies]
numpy = [
{version = ">=1.22.4", markers = "python_version < \"3.11\""},
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
]
python-dateutil = ">=2.8.2"
pytz = ">=2020.1"
@ -7605,7 +7605,7 @@ description = ""
optional = true
python-versions = ">=3.8"
groups = ["main"]
markers = "(extra == \"chromadb\" or extra == \"distributed\" or extra == \"weaviate\" or extra == \"qdrant\" or extra == \"gemini\" or extra == \"deepeval\" or extra == \"milvus\" or extra == \"codegraph\") and python_version < \"3.13\" or (extra == \"chromadb\" or extra == \"distributed\" or extra == \"weaviate\" or extra == \"qdrant\" or extra == \"gemini\" or extra == \"deepeval\" or extra == \"milvus\" or extra == \"codegraph\") and (extra == \"chromadb\" or extra == \"distributed\" or extra == \"weaviate\" or extra == \"qdrant\" or extra == \"gemini\" or extra == \"deepeval\" or extra == \"milvus\")"
markers = "python_version == \"3.10\" and extra == \"codegraph\" or (extra == \"chromadb\" or extra == \"distributed\" or extra == \"weaviate\" or extra == \"qdrant\" or extra == \"gemini\" or extra == \"deepeval\" or extra == \"milvus\") and python_version < \"3.11\" or (python_version == \"3.12\" or extra == \"gemini\" or extra == \"distributed\" or extra == \"weaviate\" or extra == \"qdrant\" or extra == \"deepeval\" or extra == \"milvus\" or extra == \"chromadb\") and (extra == \"codegraph\" or extra == \"chromadb\" or extra == \"gemini\" or extra == \"distributed\" or extra == \"weaviate\" or extra == \"qdrant\" or extra == \"deepeval\" or extra == \"milvus\") and python_version >= \"3.12\" or python_version == \"3.11\" and (extra == \"codegraph\" or extra == \"chromadb\" or extra == \"gemini\" or extra == \"distributed\" or extra == \"weaviate\" or extra == \"qdrant\" or extra == \"deepeval\" or extra == \"milvus\")"
files = [
{file = "protobuf-5.29.4-cp310-abi3-win32.whl", hash = "sha256:13eb236f8eb9ec34e63fc8b1d6efd2777d062fa6aaa68268fb67cf77f6839ad7"},
{file = "protobuf-5.29.4-cp310-abi3-win_amd64.whl", hash = "sha256:bcefcdf3976233f8a502d265eb65ea740c989bacc6c30a58290ed0e519eb4b8d"},
@ -8189,8 +8189,8 @@ astroid = ">=3.3.8,<=3.4.0.dev0"
colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""}
dill = [
{version = ">=0.2", markers = "python_version < \"3.11\""},
{version = ">=0.3.6", markers = "python_version >= \"3.11\""},
{version = ">=0.3.7", markers = "python_version >= \"3.12\""},
{version = ">=0.3.6", markers = "python_version == \"3.11\""},
]
isort = ">=4.2.5,<5.13 || >5.13,<7"
mccabe = ">=0.6,<0.8"
@ -8249,6 +8249,21 @@ bulk-writer = ["azure-storage-blob", "minio (>=7.0.0)", "pyarrow (>=12.0.0)", "r
dev = ["black", "grpcio (==1.62.2)", "grpcio-testing (==1.62.2)", "grpcio-tools (==1.62.2)", "pytest (>=5.3.4)", "pytest-cov (>=2.8.1)", "pytest-timeout (>=1.3.4)", "ruff (>0.4.0)"]
model = ["pymilvus.model (>=0.3.0)"]
[[package]]
name = "pympler"
version = "1.1"
description = "A development tool to measure, monitor and analyze the memory behavior of Python objects."
optional = false
python-versions = ">=3.6"
groups = ["main"]
files = [
{file = "Pympler-1.1-py3-none-any.whl", hash = "sha256:5b223d6027d0619584116a0cbc28e8d2e378f7a79c1e5e024f9ff3b673c58506"},
{file = "pympler-1.1.tar.gz", hash = "sha256:1eaa867cb8992c218430f1708fdaccda53df064144d1c5656b1e6f1ee6000424"},
]
[package.dependencies]
pywin32 = {version = ">=226", markers = "platform_system == \"Windows\""}
[[package]]
name = "pypandoc"
version = "1.15"
@ -8319,7 +8334,7 @@ description = "A python implementation of GNU readline."
optional = true
python-versions = ">=3.8"
groups = ["main"]
markers = "sys_platform == \"win32\" and (extra == \"codegraph\" or extra == \"chromadb\") and python_version < \"3.13\" or sys_platform == \"win32\" and extra == \"chromadb\""
markers = "sys_platform == \"win32\" and (python_version == \"3.10\" or extra == \"chromadb\" or extra == \"codegraph\") and (extra == \"chromadb\" or python_version == \"3.12\" or python_version == \"3.10\" or python_version == \"3.11\") and (extra == \"codegraph\" or extra == \"chromadb\")"
files = [
{file = "pyreadline3-3.5.4-py3-none-any.whl", hash = "sha256:eaf8e6cc3c49bcccf145fc6067ba8643d1df34d604a1ec0eccbf7a18e6d3fae6"},
{file = "pyreadline3-3.5.4.tar.gz", hash = "sha256:8d57d53039a1c75adba8e50dd3d992b28143480816187ea5efbd5c78e6c885b7"},
@ -8681,7 +8696,7 @@ description = "Python for Window Extensions"
optional = false
python-versions = "*"
groups = ["main"]
markers = "(extra == \"qdrant\" or extra == \"deepeval\") and (extra == \"qdrant\" or extra == \"deepeval\" or extra == \"notebook\" or extra == \"dev\") and platform_system == \"Windows\" or sys_platform == \"win32\""
markers = "sys_platform == \"win32\" or platform_system == \"Windows\""
files = [
{file = "pywin32-310-cp310-cp310-win32.whl", hash = "sha256:6dd97011efc8bf51d6793a82292419eba2c71cf8e7250cfac03bba284454abc1"},
{file = "pywin32-310-cp310-cp310-win_amd64.whl", hash = "sha256:c3e78706e4229b915a0821941a84e7ef420bf2b77e08c9dae3c76fd03fd2ae3d"},
@ -10388,7 +10403,7 @@ description = "Computer algebra system (CAS) in Python"
optional = true
python-versions = ">=3.9"
groups = ["main"]
markers = "(extra == \"codegraph\" or extra == \"chromadb\") and python_version < \"3.13\" or extra == \"chromadb\""
markers = "python_version == \"3.10\" and (extra == \"chromadb\" or extra == \"codegraph\") or extra == \"chromadb\" or python_version == \"3.12\" and (extra == \"chromadb\" or extra == \"codegraph\") or python_version == \"3.11\" and (extra == \"chromadb\" or extra == \"codegraph\")"
files = [
{file = "sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5"},
{file = "sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517"},
@ -12138,4 +12153,4 @@ weaviate = ["weaviate-client"]
[metadata]
lock-version = "2.1"
python-versions = ">=3.10,<=3.13"
content-hash = "7936d855b80d15eba11e15ebeb7b8c50c5550cd9908aae006f42e554136b656d"
content-hash = "b957125d983288f42fea2de62a5dcd14fedd474063968e34bb685460ec73a658"

View file

@ -58,6 +58,7 @@ dependencies = [
"dlt[sqlalchemy]>=1.9.0,<2",
"sentry-sdk[fastapi]>=2.9.0,<3",
"structlog>=25.2.0,<26",
"pympler>=1.1",
]
[project.optional-dependencies]

7492
uv.lock generated

File diff suppressed because it is too large Load diff