fix: decrease batch size for chunks
This commit is contained in:
parent
6c42346359
commit
dda3460715
5 changed files with 8 additions and 5 deletions
|
|
@@ -24,7 +24,7 @@ RUN pip install poetry
|
||||||
|
|
||||||
RUN poetry config virtualenvs.create false
|
RUN poetry config virtualenvs.create false
|
||||||
|
|
||||||
RUN poetry install --extras neo4j --extras qdrant --no-root
|
RUN poetry install --extras neo4j --extras postgres --no-root
|
||||||
|
|
||||||
COPY cognee/ /app/cognee
|
COPY cognee/ /app/cognee
|
||||||
COPY distributed/ /app/distributed
|
COPY distributed/ /app/distributed
|
||||||
|
|
|
||||||
|
|
@@ -133,7 +133,7 @@ async def main():
|
||||||
Task(
|
Task(
|
||||||
process_chunks_remotely,
|
process_chunks_remotely,
|
||||||
document=item,
|
document=item,
|
||||||
task_config={"batch_size": 50},
|
task_config={"batch_size": 10},
|
||||||
),
|
),
|
||||||
],
|
],
|
||||||
data=[item],
|
data=[item],
|
||||||
|
|
@@ -155,6 +155,7 @@ async def main():
|
||||||
print(f"Number of documents processed: {len(results)}")
|
print(f"Number of documents processed: {len(results)}")
|
||||||
results.extend(batch_results)
|
results.extend(batch_results)
|
||||||
|
|
||||||
|
# Push empty tuple into the queue to signal the end of data.
|
||||||
save_data_points_queue.put(())
|
save_data_points_queue.put(())
|
||||||
|
|
||||||
for consumer_future in consumer_futures:
|
for consumer_future in consumer_futures:
|
||||||
|
|
|
||||||
|
|
@@ -35,7 +35,9 @@ async def save_data_points(data_points_and_relationships: tuple[list, list]):
|
||||||
for nodes, edges in data_points_and_relationships:
|
for nodes, edges in data_points_and_relationships:
|
||||||
for node in nodes:
|
for node in nodes:
|
||||||
if asizeof.asizeof(node) >= 500000:
|
if asizeof.asizeof(node) >= 500000:
|
||||||
print(f"Node too large:\n{node.id}\n")
|
try_pushing_nodes_to_queue([node])
|
||||||
|
continue
|
||||||
|
# print(f"Node too large:\n{node.id}\n")
|
||||||
|
|
||||||
node_batch.append(node)
|
node_batch.append(node)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -7,7 +7,7 @@ from distributed.queues import save_data_points_queue
|
||||||
from cognee.infrastructure.databases.graph import get_graph_engine
|
from cognee.infrastructure.databases.graph import get_graph_engine
|
||||||
|
|
||||||
|
|
||||||
@app.function(image=image, timeout=7200, max_containers=100)
|
@app.function(image=image, timeout=86400, max_containers=100)
|
||||||
async def data_point_saver_worker():
|
async def data_point_saver_worker():
|
||||||
print("Started processing of nodes and edges; starting graph engine queue.")
|
print("Started processing of nodes and edges; starting graph engine queue.")
|
||||||
graph_engine = await get_graph_engine()
|
graph_engine = await get_graph_engine()
|
||||||
|
|
|
||||||
|
|
@@ -13,7 +13,7 @@ from distributed.tasks.extract_graph_from_data import extract_graph_from_data
|
||||||
from distributed.tasks.save_data_points import save_data_points
|
from distributed.tasks.save_data_points import save_data_points
|
||||||
|
|
||||||
|
|
||||||
@app.function(image=image, timeout=7200, max_containers=100)
|
@app.function(image=image, timeout=86400, max_containers=100)
|
||||||
async def graph_extraction_worker(user, document_name: str, document_chunks: list):
|
async def graph_extraction_worker(user, document_name: str, document_chunks: list):
|
||||||
cognee_config = get_cognify_config()
|
cognee_config = get_cognify_config()
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue