fix: decrease batch size for chunks

This commit is contained in:
Boris Arzentar 2025-05-19 10:48:38 +02:00
parent 6c42346359
commit dda3460715
5 changed files with 8 additions and 5 deletions

View file

@@ -24,7 +24,7 @@ RUN pip install poetry
RUN poetry config virtualenvs.create false
RUN poetry install --extras neo4j --extras qdrant --no-root
RUN poetry install --extras neo4j --extras postgres --no-root
COPY cognee/ /app/cognee
COPY distributed/ /app/distributed

View file

@@ -133,7 +133,7 @@ async def main():
Task(
process_chunks_remotely,
document=item,
task_config={"batch_size": 50},
task_config={"batch_size": 10},
),
],
data=[item],
@@ -155,6 +155,7 @@ async def main():
print(f"Number of documents processed: {len(results)}")
results.extend(batch_results)
# Push empty tuple into the queue to signal the end of data.
save_data_points_queue.put(())
for consumer_future in consumer_futures:

View file

@@ -35,7 +35,9 @@ async def save_data_points(data_points_and_relationships: tuple[list, list]):
for nodes, edges in data_points_and_relationships:
for node in nodes:
if asizeof.asizeof(node) >= 500000:
print(f"Node too large:\n{node.id}\n")
try_pushing_nodes_to_queue([node])
continue
# print(f"Node too large:\n{node.id}\n")
node_batch.append(node)

View file

@@ -7,7 +7,7 @@ from distributed.queues import save_data_points_queue
from cognee.infrastructure.databases.graph import get_graph_engine
@app.function(image=image, timeout=7200, max_containers=100)
@app.function(image=image, timeout=86400, max_containers=100)
async def data_point_saver_worker():
print("Started processing of nodes and edges; starting graph engine queue.")
graph_engine = await get_graph_engine()

View file

@@ -13,7 +13,7 @@ from distributed.tasks.extract_graph_from_data import extract_graph_from_data
from distributed.tasks.save_data_points import save_data_points
@app.function(image=image, timeout=7200, max_containers=100)
@app.function(image=image, timeout=86400, max_containers=100)
async def graph_extraction_worker(user, document_name: str, document_chunks: list):
cognee_config = get_cognify_config()