fix: decrease batch size for chunks

This commit is contained in:
Boris Arzentar 2025-05-19 10:48:38 +02:00
parent 6c42346359
commit dda3460715
5 changed files with 8 additions and 5 deletions

View file

@@ -24,7 +24,7 @@ RUN pip install poetry
RUN poetry config virtualenvs.create false
RUN poetry install --extras neo4j --extras qdrant --no-root
RUN poetry install --extras neo4j --extras postgres --no-root
COPY cognee/ /app/cognee
COPY distributed/ /app/distributed

View file

@@ -133,7 +133,7 @@ async def main():
Task(
process_chunks_remotely,
document=item,
task_config={"batch_size": 50},
task_config={"batch_size": 10},
),
],
data=[item],
@@ -155,6 +155,7 @@ async def main():
print(f"Number of documents processed: {len(results)}")
results.extend(batch_results)
# Push empty tuple into the queue to signal the end of data.
save_data_points_queue.put(())
for consumer_future in consumer_futures:

View file

@@ -35,7 +35,9 @@ async def save_data_points(data_points_and_relationships: tuple[list, list]):
for nodes, edges in data_points_and_relationships:
for node in nodes:
if asizeof.asizeof(node) >= 500000:
print(f"Node too large:\n{node.id}\n")
try_pushing_nodes_to_queue([node])
continue
# print(f"Node too large:\n{node.id}\n")
node_batch.append(node)

View file

@@ -7,7 +7,7 @@ from distributed.queues import save_data_points_queue
from cognee.infrastructure.databases.graph import get_graph_engine
@app.function(image=image, timeout=7200, max_containers=100)
@app.function(image=image, timeout=86400, max_containers=100)
async def data_point_saver_worker():
print("Started processing of nodes and edges; starting graph engine queue.")
graph_engine = await get_graph_engine()

View file

@@ -13,7 +13,7 @@ from distributed.tasks.extract_graph_from_data import extract_graph_from_data
from distributed.tasks.save_data_points import save_data_points
@app.function(image=image, timeout=7200, max_containers=100)
@app.function(image=image, timeout=86400, max_containers=100)
async def graph_extraction_worker(user, document_name: str, document_chunks: list):
cognee_config = get_cognify_config()