fix: batch data points on save to limit per-message bandwidth usage
This commit is contained in:
parent
f999232b8e
commit
a1e605ca97
22 changed files with 4251 additions and 3889 deletions
|
|
@ -45,42 +45,48 @@ def record_graph_changes(func):
|
|||
async with db_engine.get_async_session() as session:
|
||||
if func.__name__ == "add_nodes":
|
||||
nodes: List[DataPoint] = args[0]
|
||||
|
||||
relationship_ledgers = []
|
||||
|
||||
for node in nodes:
|
||||
try:
|
||||
node_id = UUID(str(node.id))
|
||||
relationship = GraphRelationshipLedger(
|
||||
id=uuid5(NAMESPACE_OID, f"{datetime.now(timezone.utc).timestamp()}"),
|
||||
source_node_id=node_id,
|
||||
destination_node_id=node_id,
|
||||
creator_function=f"{creator}.node",
|
||||
node_label=getattr(node, "name", None) or str(node.id),
|
||||
)
|
||||
session.add(relationship)
|
||||
await session.flush()
|
||||
except Exception as e:
|
||||
logger.debug(f"Error adding relationship: {e}")
|
||||
await session.rollback()
|
||||
continue
|
||||
node_id = UUID(str(node.id))
|
||||
relationship_ledgers.append(GraphRelationshipLedger(
|
||||
id=uuid5(NAMESPACE_OID, f"{datetime.now(timezone.utc).timestamp()}"),
|
||||
source_node_id=node_id,
|
||||
destination_node_id=node_id,
|
||||
creator_function=f"{creator}.node",
|
||||
node_label=getattr(node, "name", None) or str(node.id),
|
||||
))
|
||||
|
||||
try:
|
||||
session.add_all(relationship_ledgers)
|
||||
await session.flush()
|
||||
except Exception as e:
|
||||
logger.debug(f"Error adding relationship: {e}")
|
||||
await session.rollback()
|
||||
|
||||
elif func.__name__ == "add_edges":
|
||||
edges = args[0]
|
||||
|
||||
relationship_ledgers = []
|
||||
|
||||
for edge in edges:
|
||||
try:
|
||||
source_id = UUID(str(edge[0]))
|
||||
target_id = UUID(str(edge[1]))
|
||||
rel_type = str(edge[2])
|
||||
relationship = GraphRelationshipLedger(
|
||||
id=uuid5(NAMESPACE_OID, f"{datetime.now(timezone.utc).timestamp()}"),
|
||||
source_node_id=source_id,
|
||||
destination_node_id=target_id,
|
||||
creator_function=f"{creator}.{rel_type}",
|
||||
)
|
||||
session.add(relationship)
|
||||
await session.flush()
|
||||
except Exception as e:
|
||||
logger.debug(f"Error adding relationship: {e}")
|
||||
await session.rollback()
|
||||
continue
|
||||
source_id = UUID(str(edge[0]))
|
||||
target_id = UUID(str(edge[1]))
|
||||
rel_type = str(edge[2])
|
||||
relationship_ledgers.append(GraphRelationshipLedger(
|
||||
id=uuid5(NAMESPACE_OID, f"{datetime.now(timezone.utc).timestamp()}"),
|
||||
source_node_id=source_id,
|
||||
destination_node_id=target_id,
|
||||
creator_function=f"{creator}.{rel_type}",
|
||||
))
|
||||
|
||||
try:
|
||||
session.add_all(relationship_ledgers)
|
||||
await session.flush()
|
||||
except Exception as e:
|
||||
logger.debug(f"Error adding relationship: {e}")
|
||||
await session.rollback()
|
||||
|
||||
try:
|
||||
await session.commit()
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
from typing import BinaryIO, TypedDict
|
||||
import hashlib
|
||||
from .guess_file_type import guess_file_type
|
||||
from cognee.shared.utils import get_file_content_hash
|
||||
|
||||
|
|
|
|||
|
|
@ -4,6 +4,9 @@ import litellm
|
|||
import instructor
|
||||
from typing import Type
|
||||
from pydantic import BaseModel
|
||||
from openai import ContentFilterFinishReasonError
|
||||
from litellm.exceptions import ContentPolicyViolationError
|
||||
from instructor.exceptions import InstructorRetryException
|
||||
|
||||
from cognee.infrastructure.llm.exceptions import ContentPolicyFilterError
|
||||
from cognee.infrastructure.llm.llm_interface import LLMInterface
|
||||
|
|
@ -67,7 +70,17 @@ class GenericAPIAdapter(LLMInterface):
|
|||
api_base=self.endpoint,
|
||||
response_model=response_model,
|
||||
)
|
||||
except litellm.exceptions.ContentPolicyViolationError:
|
||||
except (
|
||||
ContentFilterFinishReasonError,
|
||||
ContentPolicyViolationError,
|
||||
InstructorRetryException,
|
||||
) as error:
|
||||
if (
|
||||
isinstance(error, InstructorRetryException)
|
||||
and not "content management policy" in str(error).lower()
|
||||
):
|
||||
raise error
|
||||
|
||||
if not (self.fallback_model and self.fallback_api_key and self.fallback_endpoint):
|
||||
raise ContentPolicyFilterError(
|
||||
f"The provided input contains content that is not aligned with our content policy: {text_input}"
|
||||
|
|
@ -92,7 +105,17 @@ class GenericAPIAdapter(LLMInterface):
|
|||
api_base=self.fallback_endpoint,
|
||||
response_model=response_model,
|
||||
)
|
||||
except litellm.exceptions.ContentPolicyViolationError:
|
||||
raise ContentPolicyFilterError(
|
||||
f"The provided input contains content that is not aligned with our content policy: {text_input}"
|
||||
)
|
||||
except (
|
||||
ContentFilterFinishReasonError,
|
||||
ContentPolicyViolationError,
|
||||
InstructorRetryException,
|
||||
) as error:
|
||||
if (
|
||||
isinstance(error, InstructorRetryException)
|
||||
and not "content management policy" in str(error).lower()
|
||||
):
|
||||
raise error
|
||||
else:
|
||||
raise ContentPolicyFilterError(
|
||||
f"The provided input contains content that is not aligned with our content policy: {text_input}"
|
||||
)
|
||||
|
|
|
|||
|
|
@ -5,6 +5,8 @@ import instructor
|
|||
from typing import Type
|
||||
from pydantic import BaseModel
|
||||
from openai import ContentFilterFinishReasonError
|
||||
from litellm.exceptions import ContentPolicyViolationError
|
||||
from instructor.exceptions import InstructorRetryException
|
||||
|
||||
from cognee.exceptions import InvalidValueError
|
||||
from cognee.infrastructure.llm.prompts import read_query_prompt
|
||||
|
|
@ -87,7 +89,17 @@ class OpenAIAdapter(LLMInterface):
|
|||
response_model=response_model,
|
||||
max_retries=self.MAX_RETRIES,
|
||||
)
|
||||
except ContentFilterFinishReasonError:
|
||||
except (
|
||||
ContentFilterFinishReasonError,
|
||||
ContentPolicyViolationError,
|
||||
InstructorRetryException,
|
||||
) as error:
|
||||
if (
|
||||
isinstance(error, InstructorRetryException)
|
||||
and not "content management policy" in str(error).lower()
|
||||
):
|
||||
raise error
|
||||
|
||||
if not (self.fallback_model and self.fallback_api_key):
|
||||
raise ContentPolicyFilterError(
|
||||
f"The provided input contains content that is not aligned with our content policy: {text_input}"
|
||||
|
|
@ -112,10 +124,20 @@ class OpenAIAdapter(LLMInterface):
|
|||
response_model=response_model,
|
||||
max_retries=self.MAX_RETRIES,
|
||||
)
|
||||
except ContentFilterFinishReasonError:
|
||||
raise ContentPolicyFilterError(
|
||||
f"The provided input contains content that is not aligned with our content policy: {text_input}"
|
||||
)
|
||||
except (
|
||||
ContentFilterFinishReasonError,
|
||||
ContentPolicyViolationError,
|
||||
InstructorRetryException,
|
||||
) as error:
|
||||
if (
|
||||
isinstance(error, InstructorRetryException)
|
||||
and not "content management policy" in str(error).lower()
|
||||
):
|
||||
raise error
|
||||
else:
|
||||
raise ContentPolicyFilterError(
|
||||
f"The provided input contains content that is not aligned with our content policy: {text_input}"
|
||||
)
|
||||
|
||||
@observe
|
||||
@sleep_and_retry_sync()
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ from .open_data_file import open_data_file
|
|||
|
||||
class TextDocument(Document):
|
||||
type: str = "text"
|
||||
mime_type: str = "text/plain"
|
||||
|
||||
def read(self, chunker_cls: Chunker, max_chunk_size: int):
|
||||
def get_text():
|
||||
|
|
|
|||
|
|
@ -45,11 +45,18 @@ def expand_with_nodes_and_edges(
|
|||
type_node_key = f"{type_node_id}_type"
|
||||
|
||||
if type_node_key not in added_nodes_map and type_node_key not in key_mapping:
|
||||
(
|
||||
ontology_entity_type_nodes,
|
||||
ontology_entity_type_edges,
|
||||
ontology_closest_class_node,
|
||||
) = ontology_resolver.get_subgraph(node_name=type_node_name, node_type="classes")
|
||||
if ontology_resolver:
|
||||
(
|
||||
ontology_entity_type_nodes,
|
||||
ontology_entity_type_edges,
|
||||
ontology_closest_class_node,
|
||||
) = ontology_resolver.get_subgraph(
|
||||
node_name=type_node_name, node_type="classes"
|
||||
)
|
||||
else:
|
||||
ontology_entity_type_nodes = []
|
||||
ontology_entity_type_edges = []
|
||||
ontology_closest_class_node = None
|
||||
|
||||
if ontology_closest_class_node:
|
||||
name_mapping[type_node_name] = ontology_closest_class_node.name
|
||||
|
|
@ -125,9 +132,14 @@ def expand_with_nodes_and_edges(
|
|||
entity_node_key = f"{node_id}_entity"
|
||||
|
||||
if entity_node_key not in added_nodes_map and entity_node_key not in key_mapping:
|
||||
ontology_entity_nodes, ontology_entity_edges, start_ent_ont = (
|
||||
ontology_resolver.get_subgraph(node_name=node_name, node_type="individuals")
|
||||
)
|
||||
if ontology_resolver:
|
||||
ontology_entity_nodes, ontology_entity_edges, start_ent_ont = (
|
||||
ontology_resolver.get_subgraph(node_name=node_name, node_type="individuals")
|
||||
)
|
||||
else:
|
||||
ontology_entity_nodes = []
|
||||
ontology_entity_edges = []
|
||||
start_ent_ont = None
|
||||
|
||||
if start_ent_ont:
|
||||
name_mapping[node_name] = start_ent_ont.name
|
||||
|
|
@ -234,7 +246,7 @@ def expand_with_nodes_and_edges(
|
|||
)
|
||||
existing_edges_map[edge_key] = True
|
||||
|
||||
graph_nodes = data_chunks + list(added_ontology_nodes_map.values())
|
||||
graph_nodes = list(added_ontology_nodes_map.values())
|
||||
graph_edges = relationships + ontology_relationships
|
||||
|
||||
return graph_nodes, graph_edges
|
||||
|
|
|
|||
6
cognee/modules/ingestion/get_text_content_hash.py
Normal file
6
cognee/modules/ingestion/get_text_content_hash.py
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
import hashlib
|
||||
|
||||
|
||||
def get_text_content_hash(text: str) -> str:
    """Return the hex MD5 digest of *text* encoded as UTF-8.

    Used as a cheap, deterministic content fingerprint for ingested text,
    not for any security purpose.
    """
    return hashlib.md5(text.encode("utf-8")).hexdigest()
|
||||
|
|
@ -1,6 +1,5 @@
|
|||
import asyncio
|
||||
from typing import Type, List, Optional
|
||||
|
||||
from typing import Type, List
|
||||
from pydantic import BaseModel
|
||||
|
||||
from cognee.infrastructure.databases.graph import get_graph_engine
|
||||
|
|
@ -12,7 +11,6 @@ from cognee.modules.graph.utils import (
|
|||
retrieve_existing_edges,
|
||||
)
|
||||
from cognee.shared.data_models import KnowledgeGraph
|
||||
from cognee.tasks.storage import add_data_points
|
||||
|
||||
|
||||
async def integrate_chunk_graphs(
|
||||
|
|
@ -28,7 +26,6 @@ async def integrate_chunk_graphs(
|
|||
for chunk_index, chunk_graph in enumerate(chunk_graphs):
|
||||
data_chunks[chunk_index].contains = chunk_graph
|
||||
|
||||
await add_data_points(chunk_graphs)
|
||||
return data_chunks
|
||||
|
||||
existing_edges_map = await retrieve_existing_edges(
|
||||
|
|
@ -41,7 +38,7 @@ async def integrate_chunk_graphs(
|
|||
)
|
||||
|
||||
if len(graph_nodes) > 0:
|
||||
await add_data_points(graph_nodes)
|
||||
await graph_engine.add_nodes(graph_nodes)
|
||||
|
||||
if len(graph_edges) > 0:
|
||||
await graph_engine.add_edges(graph_edges)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,76 @@
|
|||
import asyncio
|
||||
from typing import List
|
||||
from uuid import NAMESPACE_OID, uuid5
|
||||
|
||||
from cognee.infrastructure.engine import DataPoint
|
||||
from cognee.modules.graph.utils import get_graph_from_model
|
||||
|
||||
|
||||
class Document(DataPoint):
    """Minimal document node used by this test module."""

    path: str
    metadata: dict = {"index_fields": []}


class DocumentChunk(DataPoint):
    """A chunk of a Document's text, linking back to its parent document."""

    part_of: Document
    text: str
    # NOTE(review): annotated as List but defaults to None — presumably the
    # DataPoint base (pydantic) tolerates this; confirm whether
    # Optional[List["Entity"]] is the intended contract.
    contains: List["Entity"] = None
    metadata: dict = {"index_fields": ["text"]}


class EntityType(DataPoint):
    """Type/category node that an Entity references via `is_type`."""

    name: str
    metadata: dict = {"index_fields": ["name"]}


class Entity(DataPoint):
    """Named entity extracted from a chunk, typed by an EntityType."""

    name: str
    is_type: EntityType
    metadata: dict = {"index_fields": ["name"]}


# Resolve the forward reference "Entity" used in DocumentChunk.contains.
DocumentChunk.model_rebuild()
|
||||
|
||||
|
||||
async def get_graph_from_model_test():
    """Smoke-test get_graph_from_model on a tiny chunk -> entity graph.

    Builds one Document, one DocumentChunk containing a single Entity with
    an EntityType, then asserts the extracted graph has exactly 4 nodes and
    3 edges and that the serialized chunk node no longer carries `part_of`.
    """
    document = Document(path="file_path")

    document_chunk = DocumentChunk(
        id=uuid5(NAMESPACE_OID, "file_name"),
        text="some text",
        part_of=document,
        contains=[],
    )

    document_chunk.contains.append(
        Entity(
            name="Entity",
            is_type=EntityType(
                name="Type 1",
            ),
        )
    )

    added_nodes = {}
    added_edges = {}
    visited_properties = {}

    result = await get_graph_from_model(
        document_chunk,
        added_nodes=added_nodes,
        added_edges=added_edges,
        visited_properties=visited_properties,
    )

    nodes = result[0]
    edges = result[1]

    # Document + DocumentChunk + Entity + EntityType = 4 nodes, 3 edges.
    assert len(nodes) == 4
    assert len(edges) == 3

    # BUG FIX: the original used `node.type is "DocumentChunk"`, comparing
    # string *identity*, which only passes by accident of CPython interning.
    # Use `==` for value equality.
    document_chunk_node = next(filter(lambda node: node.type == "DocumentChunk", nodes))
    assert not hasattr(document_chunk_node, "part_of"), "Expected part_of attribute to be removed"


if __name__ == "__main__":
    asyncio.run(get_graph_from_model_test())
|
||||
|
|
@ -24,7 +24,7 @@ RUN pip install poetry
|
|||
|
||||
RUN poetry config virtualenvs.create false
|
||||
|
||||
RUN poetry install --extras neo4j --extras qdrant --no-root --without dev
|
||||
RUN poetry install --extras neo4j --extras qdrant --no-root
|
||||
|
||||
COPY cognee/ /app/cognee
|
||||
COPY distributed/ /app/distributed
|
||||
|
|
|
|||
|
|
@ -1,24 +1,28 @@
|
|||
import pathlib
|
||||
from os import path
|
||||
|
||||
from cognee.api.v1.add import add
|
||||
# from cognee.api.v1.add import add
|
||||
from cognee.api.v1.prune import prune
|
||||
from cognee.infrastructure.databases.relational import get_relational_engine
|
||||
from cognee.infrastructure.llm.utils import get_max_chunk_tokens
|
||||
from cognee.modules.chunking.TextChunker import TextChunker
|
||||
from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
|
||||
from cognee.modules.data.models import Data
|
||||
from cognee.modules.data.processing.document_types import Document
|
||||
from cognee.modules.engine.operations.setup import setup
|
||||
from cognee.modules.ingestion.get_text_content_hash import get_text_content_hash
|
||||
from cognee.modules.pipelines.operations.run_tasks import run_tasks
|
||||
from cognee.modules.pipelines.tasks.task import Task
|
||||
from cognee.modules.users.methods.get_default_user import get_default_user
|
||||
from cognee.modules.data.methods.get_dataset_data import get_dataset_data
|
||||
from cognee.modules.data.methods.get_datasets_by_name import get_datasets_by_name
|
||||
# from cognee.modules.data.methods.get_dataset_data import get_dataset_data
|
||||
# from cognee.modules.data.methods.get_datasets_by_name import get_datasets_by_name
|
||||
|
||||
from cognee.shared.logging_utils import get_logger
|
||||
from cognee.tasks.documents.classify_documents import classify_documents
|
||||
from cognee.tasks.documents.extract_chunks_from_documents import extract_chunks_from_documents
|
||||
|
||||
from distributed.app import app
|
||||
from distributed.queues import finished_jobs_queue, save_data_points_queue
|
||||
from distributed.models.TextDocument import TextDocument
|
||||
from distributed.queues import save_data_points_queue
|
||||
from distributed.workers.data_point_saver_worker import data_point_saver_worker
|
||||
from distributed.workers.graph_extraction_worker import graph_extraction_worker
|
||||
|
||||
|
|
@ -28,10 +32,9 @@ logger = get_logger()
|
|||
@app.local_entrypoint()
|
||||
async def main():
|
||||
# Clear queues
|
||||
finished_jobs_queue.clear()
|
||||
save_data_points_queue.clear()
|
||||
|
||||
dataset_name = "main"
|
||||
# dataset_name = "main"
|
||||
data_directory_name = ".data"
|
||||
data_directory_path = path.join(pathlib.Path(__file__).parent, data_directory_name)
|
||||
|
||||
|
|
@ -45,18 +48,65 @@ async def main():
|
|||
await prune.prune_data()
|
||||
await prune.prune_system(metadata=True)
|
||||
|
||||
await setup()
|
||||
|
||||
# Add files to the metastore
|
||||
await add(data=data_directory_path, dataset_name=dataset_name)
|
||||
# await add(data=data_directory_path, dataset_name=dataset_name)
|
||||
|
||||
user = await get_default_user()
|
||||
datasets = await get_datasets_by_name(dataset_name, user.id)
|
||||
documents = await get_dataset_data(dataset_id=datasets[0].id)
|
||||
# datasets = await get_datasets_by_name(dataset_name, user.id)
|
||||
# documents = await get_dataset_data(dataset_id=datasets[0].id)
|
||||
|
||||
import duckdb
|
||||
|
||||
connection = duckdb.connect()
|
||||
dataset_file_name = "de-00000-of-00003-f8e581c008ccc7f2.parquet"
|
||||
dataset_file_path = path.join(data_directory_path, dataset_file_name)
|
||||
df = connection.execute(f"SELECT * FROM '{dataset_file_path}'").fetchdf()
|
||||
|
||||
documents = []
|
||||
|
||||
for _, row in df.iterrows():
|
||||
file_id = str(row["id"])
|
||||
content = row["text"]
|
||||
|
||||
documents.append(
|
||||
TextDocument(
|
||||
name=file_id,
|
||||
content=content,
|
||||
raw_data_location=f"{dataset_file_name}_{file_id}",
|
||||
external_metadata="",
|
||||
)
|
||||
)
|
||||
|
||||
documents: list[TextDocument] = documents[0:100]
|
||||
print(f"We have {len(documents)} documents in the dataset.")
|
||||
|
||||
data_documents = [
|
||||
Data(
|
||||
id=document.id,
|
||||
name=document.name,
|
||||
raw_data_location=document.raw_data_location,
|
||||
extension="txt",
|
||||
mime_type=document.mime_type,
|
||||
owner_id=user.id,
|
||||
content_hash=get_text_content_hash(document.content),
|
||||
external_metadata=document.external_metadata,
|
||||
node_set=None,
|
||||
token_count=-1,
|
||||
)
|
||||
for document in documents
|
||||
]
|
||||
|
||||
db_engine = get_relational_engine()
|
||||
|
||||
async with db_engine.get_async_session() as session:
|
||||
session.add_all(data_documents)
|
||||
await session.commit()
|
||||
|
||||
# Start data_point_saver_worker functions
|
||||
for _ in range(number_of_data_saving_workers):
|
||||
worker_future = data_point_saver_worker.spawn(total_number_of_workers=len(documents))
|
||||
worker_future = data_point_saver_worker.spawn()
|
||||
consumer_futures.append(worker_future)
|
||||
|
||||
producer_futures = []
|
||||
|
|
@ -75,10 +125,9 @@ async def main():
|
|||
for item in batch:
|
||||
async for worker_feature in run_tasks(
|
||||
[
|
||||
Task(classify_documents),
|
||||
Task(
|
||||
extract_chunks_from_documents,
|
||||
max_chunk_size=get_max_chunk_tokens(),
|
||||
max_chunk_size=2000,
|
||||
chunker=TextChunker,
|
||||
),
|
||||
Task(
|
||||
|
|
@ -94,21 +143,25 @@ async def main():
|
|||
pass
|
||||
|
||||
batch_results = []
|
||||
|
||||
for producer_future in producer_futures:
|
||||
try:
|
||||
result = producer_future.get()
|
||||
except Exception as e:
|
||||
result = e
|
||||
|
||||
batch_results.append(result)
|
||||
|
||||
print(f"Number of documents processed: {len(results)}")
|
||||
results.extend(batch_results)
|
||||
finished_jobs_queue.put(len(results))
|
||||
|
||||
save_data_points_queue.put(())
|
||||
|
||||
for consumer_future in consumer_futures:
|
||||
try:
|
||||
print("Finished but waiting")
|
||||
print("Finished but waiting for saving worker to finish.")
|
||||
consumer_final = consumer_future.get()
|
||||
print(f"We got all futures {consumer_final}")
|
||||
print(f"All workers are done: {consumer_final}")
|
||||
except Exception as e:
|
||||
logger.error(e)
|
||||
|
||||
|
|
|
|||
|
|
@ -16,9 +16,7 @@ image = (
|
|||
Image.from_dockerfile(
|
||||
path=pathlib.Path(path.join(path.dirname(__file__), "Dockerfile")).resolve(),
|
||||
force_build=False,
|
||||
).env(local_env_vars)
|
||||
# .pip_install_from_pyproject(pyproject_toml=pathlib.Path(path.join(path.dirname(__file__), "../pyproject.toml")).resolve())
|
||||
# .poetry_install_from_file(poetry_pyproject_toml=pathlib.Path(path.join(path.dirname(__file__), "../pyproject.toml")).resolve())
|
||||
# .add_local_dir(pathlib.Path("./venv/bin").resolve(), remote_path="/app/.venv")
|
||||
# .add_local_python_source(pathlib.Path("./cognee").resolve())
|
||||
)
|
||||
.env(local_env_vars)
|
||||
.add_local_python_source("cognee", "entrypoint")
|
||||
)
|
||||
|
|
|
|||
15
distributed/models/TextDocument.py
Normal file
15
distributed/models/TextDocument.py
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
from cognee.modules.chunking.Chunker import Chunker
|
||||
from cognee.modules.data.processing.document_types import Document
|
||||
|
||||
|
||||
class TextDocument(Document):
    """A plain-text Document whose full text is held in memory in `content`."""

    type: str = "text"
    mime_type: str = "text/plain"
    content: str

    def read(self, chunker_cls: Chunker, max_chunk_size: int):
        """Yield chunks of `content` produced by an instance of `chunker_cls`.

        The chunker pulls text lazily through the `get_text` generator, so
        chunks are streamed rather than materialized all at once.
        """

        def get_text():
            yield self.content

        text_chunker: Chunker = chunker_cls(
            self, max_chunk_size=max_chunk_size, get_text=get_text
        )
        yield from text_chunker.read()
|
||||
|
|
@ -1,10 +1,7 @@
|
|||
from modal import Queue
|
||||
|
||||
|
||||
# Create (or get) two queues:
|
||||
# Create (or get) queues:
|
||||
# - save_data_points_queue: Stores messages produced by the producer functions.
|
||||
# - finished_jobs_queue: Keeps track of the number of finished producer jobs.
|
||||
|
||||
save_data_points_queue = Queue.from_name("save_data_points_queue", create_if_missing=True)
|
||||
|
||||
finished_jobs_queue = Queue.from_name("finished_jobs_queue", create_if_missing=True)
|
||||
|
|
|
|||
|
|
@ -38,4 +38,4 @@ async def extract_graph_from_data(
|
|||
data_chunks, chunk_graphs, ontology_adapter, existing_edges_map
|
||||
)
|
||||
|
||||
return graph_nodes, graph_edges
|
||||
return data_chunks, graph_nodes, graph_edges
|
||||
|
|
|
|||
|
|
@ -1,38 +1,106 @@
|
|||
import asyncio
|
||||
|
||||
from cognee.modules.graph.utils import deduplicate_nodes_and_edges, get_graph_from_model
|
||||
# import json
|
||||
# import asyncio
|
||||
from pympler import asizeof
|
||||
# from cognee.modules.storage.utils import JSONEncoder
|
||||
from distributed.queues import save_data_points_queue
|
||||
# from cognee.modules.graph.utils import get_graph_from_model
|
||||
|
||||
|
||||
async def save_data_points(data_points_and_relationships: tuple[list, list]):
|
||||
data_points = data_points_and_relationships[0]
|
||||
data_point_connections = data_points_and_relationships[1]
|
||||
# data_points = data_points_and_relationships[0]
|
||||
# data_point_connections = data_points_and_relationships[1]
|
||||
|
||||
nodes = []
|
||||
edges = []
|
||||
# added_nodes = {}
|
||||
# added_edges = {}
|
||||
# visited_properties = {}
|
||||
|
||||
added_nodes = {}
|
||||
added_edges = {}
|
||||
visited_properties = {}
|
||||
# nodes_and_edges: list[tuple] = await asyncio.gather(
|
||||
# *[
|
||||
# get_graph_from_model(
|
||||
# data_point,
|
||||
# added_nodes=added_nodes,
|
||||
# added_edges=added_edges,
|
||||
# visited_properties=visited_properties,
|
||||
# )
|
||||
# for data_point in data_points
|
||||
# ]
|
||||
# )
|
||||
|
||||
results = await asyncio.gather(
|
||||
*[
|
||||
get_graph_from_model(
|
||||
data_point,
|
||||
added_nodes=added_nodes,
|
||||
added_edges=added_edges,
|
||||
visited_properties=visited_properties,
|
||||
)
|
||||
for data_point in data_points
|
||||
]
|
||||
)
|
||||
# graph_data_deduplication = GraphDataDeduplication()
|
||||
# deduplicated_nodes_and_edges = [graph_data_deduplication.deduplicate_nodes_and_edges(nodes, edges + data_point_connections) for nodes, edges in nodes_and_edges]
|
||||
|
||||
for result_nodes, result_edges in results:
|
||||
nodes.extend(result_nodes)
|
||||
edges.extend(result_edges)
|
||||
node_batch = []
|
||||
edge_batch = []
|
||||
|
||||
nodes, edges = deduplicate_nodes_and_edges(nodes, edges + data_point_connections)
|
||||
for nodes, edges in data_points_and_relationships:
|
||||
for node in nodes:
|
||||
if asizeof.asizeof(node) >= 500000:
|
||||
print(f"Node too large:\n{node.id}\n")
|
||||
|
||||
# await index_data_points(nodes)
|
||||
node_batch.append(node)
|
||||
|
||||
save_data_points_queue.put((nodes, edges))
|
||||
if asizeof.asizeof(node_batch) >= 500000:
|
||||
try_pushing_nodes_to_queue(node_batch)
|
||||
node_batch = []
|
||||
|
||||
if len(node_batch) > 0:
|
||||
try_pushing_nodes_to_queue(node_batch)
|
||||
node_batch = []
|
||||
|
||||
for edge in edges:
|
||||
edge_batch.append(edge)
|
||||
|
||||
if asizeof.asizeof(edge_batch) >= 500000:
|
||||
try_pushing_edges_to_queue(edge_batch)
|
||||
edge_batch = []
|
||||
|
||||
if len(edge_batch) > 0:
|
||||
try_pushing_edges_to_queue(edge_batch)
|
||||
edge_batch = []
|
||||
|
||||
# graph_data_deduplication.reset()
|
||||
|
||||
class GraphDataDeduplication:
    """Stateful de-duplicator for graph nodes and edges.

    Remembers every node id and edge key it has seen across calls, so
    repeated invocations over successive batches let each node/edge through
    only once. Call `reset()` to forget the history.
    """

    nodes_and_edges_map: dict

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all remembered node ids and edge keys."""
        self.nodes_and_edges_map = {}

    def deduplicate_nodes_and_edges(self, nodes: list, edges: list):
        """Return (unique_nodes, unique_edges) not seen in this or earlier calls.

        Node identity is `str(node.id)`; edge identity is the concatenation
        of source id, relationship name, and target id (edge[0], edge[2],
        edge[1]).
        """
        unique_nodes = [node for node in nodes if self._first_time(str(node.id))]
        unique_edges = [
            edge
            for edge in edges
            if self._first_time(str(edge[0]) + str(edge[2]) + str(edge[1]))
        ]
        return unique_nodes, unique_edges

    def _first_time(self, key: str) -> bool:
        """Record *key*; return True only the first time it is seen."""
        if key in self.nodes_and_edges_map:
            return False
        self.nodes_and_edges_map[key] = True
        return True
|
||||
|
||||
|
||||
def try_pushing_nodes_to_queue(node_batch):
    """Push (node_batch, []) onto the save queue, splitting on failure.

    A queue put can fail when the payload exceeds the broker's message size
    limit. The original code split the batch in half only once, so a half
    that was still oversized failed unrecoverably (and the error was
    silently dropped). Instead, recurse on each half until every piece
    fits; a single node that still cannot be pushed is re-raised, since it
    cannot be split any further.
    """
    if not node_batch:
        return
    try:
        save_data_points_queue.put((node_batch, []))
    except Exception as error:
        if len(node_batch) <= 1:
            # Cannot split a single node any further; surface the failure.
            raise error
        middle = len(node_batch) // 2
        try_pushing_nodes_to_queue(node_batch[:middle])
        try_pushing_nodes_to_queue(node_batch[middle:])
|
||||
|
||||
|
||||
def try_pushing_edges_to_queue(edge_batch):
    """Push ([], edge_batch) onto the save queue, splitting on failure.

    Mirrors try_pushing_nodes_to_queue: a put can fail on oversized
    payloads, and a single split was not guaranteed to fit (and the error
    was silently dropped). Recurse on each half until every piece fits; a
    single edge that still cannot be pushed is re-raised, since it cannot
    be split any further.
    """
    if not edge_batch:
        return
    try:
        save_data_points_queue.put(([], edge_batch))
    except Exception as error:
        if len(edge_batch) <= 1:
            # Cannot split a single edge any further; surface the failure.
            raise error
        middle = len(edge_batch) // 2
        try_pushing_edges_to_queue(edge_batch[:middle])
        try_pushing_edges_to_queue(edge_batch[middle:])
|
||||
|
|
|
|||
|
|
@ -2,23 +2,26 @@ import asyncio
|
|||
from typing import Type
|
||||
from uuid import uuid5
|
||||
from pydantic import BaseModel
|
||||
from cognee.modules.graph.utils import get_graph_from_model
|
||||
from cognee.tasks.summarization.models import TextSummary
|
||||
from cognee.modules.data.extraction.extract_summary import extract_summary
|
||||
from cognee.infrastructure.engine.models.DataPoint import DataPoint
|
||||
from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
|
||||
from cognee.modules.data.extraction.extract_summary import extract_summary
|
||||
|
||||
|
||||
async def summarize_text(
|
||||
data_points_and_relationships: tuple[list[DocumentChunk], list],
|
||||
data_points_and_relationships: tuple[list[DocumentChunk], list[DataPoint], list],
|
||||
summarization_model: Type[BaseModel],
|
||||
):
|
||||
data_chunks = data_points_and_relationships[0]
|
||||
edges = data_points_and_relationships[1]
|
||||
document_chunks = data_points_and_relationships[0]
|
||||
nodes = data_points_and_relationships[1]
|
||||
relationships = data_points_and_relationships[2]
|
||||
|
||||
if len(data_chunks) == 0:
|
||||
return data_chunks
|
||||
if len(document_chunks) == 0:
|
||||
return document_chunks
|
||||
|
||||
chunk_summaries = await asyncio.gather(
|
||||
*[extract_summary(chunk.text, summarization_model) for chunk in data_chunks]
|
||||
*[extract_summary(chunk.text, summarization_model) for chunk in document_chunks]
|
||||
)
|
||||
|
||||
summaries = [
|
||||
|
|
@ -27,7 +30,56 @@ async def summarize_text(
|
|||
made_from=chunk,
|
||||
text=chunk_summaries[chunk_index].summary,
|
||||
)
|
||||
for (chunk_index, chunk) in enumerate(data_chunks)
|
||||
for (chunk_index, chunk) in enumerate(document_chunks)
|
||||
]
|
||||
|
||||
return summaries, edges
|
||||
data_points = summaries + nodes
|
||||
|
||||
added_nodes = {}
|
||||
added_edges = {}
|
||||
visited_properties = {}
|
||||
|
||||
nodes_and_edges: list[tuple] = await asyncio.gather(
|
||||
*[
|
||||
get_graph_from_model(
|
||||
data_point,
|
||||
added_nodes=added_nodes,
|
||||
added_edges=added_edges,
|
||||
visited_properties=visited_properties,
|
||||
)
|
||||
for data_point in data_points
|
||||
]
|
||||
)
|
||||
|
||||
graph_data_deduplication = GraphDataDeduplication()
|
||||
deduplicated_nodes_and_edges = [graph_data_deduplication.deduplicate_nodes_and_edges(nodes, edges + relationships) for nodes, edges in nodes_and_edges]
|
||||
|
||||
return deduplicated_nodes_and_edges
|
||||
|
||||
|
||||
class GraphDataDeduplication:
    """Filters out nodes and edges that were already emitted earlier.

    A single lookup map is shared by node ids and edge keys; entries
    persist across calls until `reset()` is invoked.
    """

    nodes_and_edges_map: dict

    def __init__(self):
        self.reset()

    def reset(self):
        """Forget everything seen so far."""
        self.nodes_and_edges_map = {}

    def deduplicate_nodes_and_edges(self, nodes: list, edges: list):
        """Return the subset of `nodes` and `edges` not seen before."""
        final_nodes = []
        final_edges = []
        seen = self.nodes_and_edges_map

        for node in nodes:
            node_key = str(node.id)
            if node_key in seen:
                continue
            seen[node_key] = True
            final_nodes.append(node)

        for edge in edges:
            # Edge identity: source id + relationship name + target id.
            edge_key = str(edge[0]) + str(edge[2]) + str(edge[1])
            if edge_key in seen:
                continue
            seen[edge_key] = True
            final_edges.append(edge)

        return final_nodes, final_edges
|
||||
|
|
|
|||
|
|
@ -1,32 +1,37 @@
|
|||
import asyncio
|
||||
|
||||
from cognee.infrastructure.databases.graph.get_graph_engine import get_graph_engine
|
||||
|
||||
from distributed.app import app
|
||||
from distributed.modal_image import image
|
||||
from distributed.queues import finished_jobs_queue, save_data_points_queue
|
||||
from distributed.queues import save_data_points_queue
|
||||
from cognee.infrastructure.databases.graph import get_graph_engine
|
||||
|
||||
|
||||
@app.function(image=image, timeout=86400, max_containers=100)
|
||||
async def data_point_saver_worker(total_number_of_workers: int):
|
||||
@app.function(image=image, timeout=7200, max_containers=100)
|
||||
async def data_point_saver_worker():
|
||||
print("Started processing of nodes and edges; starting graph engine queue.")
|
||||
graph_engine = await get_graph_engine()
|
||||
|
||||
while True:
|
||||
if save_data_points_queue.len() != 0:
|
||||
nodes_and_edges = save_data_points_queue.get(block=False)
|
||||
if nodes_and_edges and len(nodes_and_edges) == 2:
|
||||
await graph_engine.add_nodes(nodes_and_edges[0])
|
||||
await graph_engine.add_edges(nodes_and_edges[1])
|
||||
else:
|
||||
print(f"Nodes and edges are: {nodes_and_edges}")
|
||||
else:
|
||||
await asyncio.sleep(5)
|
||||
|
||||
number_of_finished_jobs = finished_jobs_queue.get(block=False)
|
||||
|
||||
if number_of_finished_jobs == total_number_of_workers:
|
||||
# We put it back for the other consumers to see that we finished
|
||||
finished_jobs_queue.put(number_of_finished_jobs)
|
||||
|
||||
if len(nodes_and_edges) == 0:
|
||||
print("Finished processing all nodes and edges; stopping graph engine queue.")
|
||||
return True
|
||||
|
||||
if len(nodes_and_edges) == 2:
|
||||
print(f"Processing {len(nodes_and_edges[0])} nodes and {len(nodes_and_edges[1])} edges.")
|
||||
nodes = nodes_and_edges[0]
|
||||
edges = nodes_and_edges[1]
|
||||
|
||||
if nodes:
|
||||
await graph_engine.add_nodes(nodes)
|
||||
|
||||
if edges:
|
||||
await graph_engine.add_edges(edges)
|
||||
print(f"Finished processing nodes and edges.")
|
||||
|
||||
else:
|
||||
print(f"No jobs, go to sleep.")
|
||||
await asyncio.sleep(5)
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ from distributed.tasks.extract_graph_from_data import extract_graph_from_data
|
|||
from distributed.tasks.save_data_points import save_data_points
|
||||
|
||||
|
||||
@app.function(image=image, timeout=86400, max_containers=100)
|
||||
@app.function(image=image, timeout=7200, max_containers=100)
|
||||
async def graph_extraction_worker(user, document_name: str, document_chunks: list):
|
||||
cognee_config = get_cognify_config()
|
||||
|
||||
|
|
|
|||
43
poetry.lock
generated
43
poetry.lock
generated
|
|
@ -1225,7 +1225,7 @@ description = "Cross-platform colored terminal text."
|
|||
optional = false
|
||||
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
|
||||
groups = ["main"]
|
||||
markers = "(sys_platform == \"win32\" or platform_system == \"Windows\" or extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\") and (sys_platform == \"win32\" or platform_system == \"Windows\" or extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"chromadb\") and (extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"chromadb\" or extra == \"codegraph\" or platform_system == \"Windows\") and (sys_platform == \"win32\" or platform_system == \"Windows\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dev\") and (sys_platform == \"win32\" or platform_system == \"Windows\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dev\" or extra == \"chromadb\") and (sys_platform == \"win32\" or platform_system == \"Windows\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dev\" or extra == \"chromadb\" or extra == \"codegraph\") and (python_version < \"3.13\" or platform_system == \"Windows\" or extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"chromadb\")"
|
||||
markers = "(sys_platform == \"win32\" or platform_system == \"Windows\" or extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"chromadb\") and (platform_system == \"Windows\" or extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"chromadb\" or extra == \"codegraph\") and (sys_platform == \"win32\" or platform_system == \"Windows\" or extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\") and (sys_platform == \"win32\" or platform_system == \"Windows\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dev\" or extra == \"chromadb\") and (sys_platform == \"win32\" or platform_system == \"Windows\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dev\" or extra == \"chromadb\" or extra == \"codegraph\") and (sys_platform == \"win32\" or platform_system == \"Windows\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dev\") and (python_version < \"3.13\" or platform_system == \"Windows\" or extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"chromadb\")"
|
||||
files = [
|
||||
{file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
|
||||
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
|
||||
|
|
@ -1238,7 +1238,7 @@ description = "Colored terminal output for Python's logging module"
|
|||
optional = true
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
|
||||
groups = ["main"]
|
||||
markers = "(extra == \"codegraph\" or extra == \"chromadb\") and python_version < \"3.13\" or extra == \"chromadb\""
|
||||
markers = "python_version == \"3.10\" and (extra == \"chromadb\" or extra == \"codegraph\") or extra == \"chromadb\" or python_version == \"3.12\" and (extra == \"chromadb\" or extra == \"codegraph\") or python_version == \"3.11\" and (extra == \"chromadb\" or extra == \"codegraph\")"
|
||||
files = [
|
||||
{file = "coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934"},
|
||||
{file = "coloredlogs-15.0.1.tar.gz", hash = "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0"},
|
||||
|
|
@ -2298,7 +2298,7 @@ description = "The FlatBuffers serialization format for Python"
|
|||
optional = true
|
||||
python-versions = "*"
|
||||
groups = ["main"]
|
||||
markers = "(extra == \"codegraph\" or extra == \"chromadb\") and python_version < \"3.13\" or extra == \"chromadb\""
|
||||
markers = "python_version == \"3.10\" and (extra == \"chromadb\" or extra == \"codegraph\") or extra == \"chromadb\" or python_version == \"3.12\" and (extra == \"chromadb\" or extra == \"codegraph\") or python_version == \"3.11\" and (extra == \"chromadb\" or extra == \"codegraph\")"
|
||||
files = [
|
||||
{file = "flatbuffers-25.2.10-py2.py3-none-any.whl", hash = "sha256:ebba5f4d5ea615af3f7fd70fc310636fbb2bbd1f566ac0a23d98dd412de50051"},
|
||||
{file = "flatbuffers-25.2.10.tar.gz", hash = "sha256:97e451377a41262f8d9bd4295cc836133415cc03d8cb966410a4af92eb00d26e"},
|
||||
|
|
@ -2975,7 +2975,7 @@ description = "HTTP/2-based RPC framework"
|
|||
optional = true
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
markers = "extra == \"gemini\" or extra == \"deepeval\" or extra == \"weaviate\" or extra == \"qdrant\" or extra == \"milvus\""
|
||||
markers = "extra == \"gemini\" or extra == \"deepeval\" or extra == \"weaviate\" or extra == \"qdrant\" or extra == \"milvus\" or python_version < \"3.11\" and (extra == \"deepeval\" or extra == \"weaviate\" or extra == \"qdrant\" or extra == \"gemini\" or extra == \"milvus\")"
|
||||
files = [
|
||||
{file = "grpcio-1.67.1-cp310-cp310-linux_armv7l.whl", hash = "sha256:8b0341d66a57f8a3119b77ab32207072be60c9bf79760fa609c5609f2deb1f3f"},
|
||||
{file = "grpcio-1.67.1-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:f5a27dddefe0e2357d3e617b9079b4bfdc91341a91565111a21ed6ebbc51b22d"},
|
||||
|
|
@ -3470,7 +3470,7 @@ description = "Human friendly output for text interfaces using Python"
|
|||
optional = true
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
|
||||
groups = ["main"]
|
||||
markers = "(extra == \"codegraph\" or extra == \"chromadb\") and python_version < \"3.13\" or extra == \"chromadb\""
|
||||
markers = "python_version == \"3.10\" and (extra == \"chromadb\" or extra == \"codegraph\") or extra == \"chromadb\" or python_version == \"3.12\" and (extra == \"chromadb\" or extra == \"codegraph\") or python_version == \"3.11\" and (extra == \"chromadb\" or extra == \"codegraph\")"
|
||||
files = [
|
||||
{file = "humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477"},
|
||||
{file = "humanfriendly-10.0.tar.gz", hash = "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc"},
|
||||
|
|
@ -6003,7 +6003,7 @@ description = "Python library for arbitrary-precision floating-point arithmetic"
|
|||
optional = true
|
||||
python-versions = "*"
|
||||
groups = ["main"]
|
||||
markers = "(extra == \"codegraph\" or extra == \"chromadb\") and python_version < \"3.13\" or extra == \"chromadb\""
|
||||
markers = "python_version == \"3.10\" and (extra == \"chromadb\" or extra == \"codegraph\") or extra == \"chromadb\" or python_version == \"3.12\" and (extra == \"chromadb\" or extra == \"codegraph\") or python_version == \"3.11\" and (extra == \"chromadb\" or extra == \"codegraph\")"
|
||||
files = [
|
||||
{file = "mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c"},
|
||||
{file = "mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f"},
|
||||
|
|
@ -6585,7 +6585,7 @@ description = "ONNX Runtime is a runtime accelerator for Machine Learning models
|
|||
optional = true
|
||||
python-versions = ">=3.10"
|
||||
groups = ["main"]
|
||||
markers = "(extra == \"codegraph\" or extra == \"chromadb\") and python_version < \"3.13\" or extra == \"chromadb\""
|
||||
markers = "python_version == \"3.10\" and (extra == \"chromadb\" or extra == \"codegraph\") or extra == \"chromadb\" or python_version == \"3.12\" and (extra == \"chromadb\" or extra == \"codegraph\") or python_version == \"3.11\" and (extra == \"chromadb\" or extra == \"codegraph\")"
|
||||
files = [
|
||||
{file = "onnxruntime-1.21.1-cp310-cp310-macosx_13_0_universal2.whl", hash = "sha256:daedb5d33d8963062a25f4a3c788262074587f685a19478ef759a911b4b12c25"},
|
||||
{file = "onnxruntime-1.21.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a402f9bda0b1cc791d9cf31d23c471e8189a55369b49ef2b9d0854eb11d22c4"},
|
||||
|
|
@ -6985,8 +6985,8 @@ files = [
|
|||
[package.dependencies]
|
||||
numpy = [
|
||||
{version = ">=1.22.4", markers = "python_version < \"3.11\""},
|
||||
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
|
||||
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
||||
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
|
||||
]
|
||||
python-dateutil = ">=2.8.2"
|
||||
pytz = ">=2020.1"
|
||||
|
|
@ -7605,7 +7605,7 @@ description = ""
|
|||
optional = true
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
markers = "(extra == \"chromadb\" or extra == \"distributed\" or extra == \"weaviate\" or extra == \"qdrant\" or extra == \"gemini\" or extra == \"deepeval\" or extra == \"milvus\" or extra == \"codegraph\") and python_version < \"3.13\" or (extra == \"chromadb\" or extra == \"distributed\" or extra == \"weaviate\" or extra == \"qdrant\" or extra == \"gemini\" or extra == \"deepeval\" or extra == \"milvus\" or extra == \"codegraph\") and (extra == \"chromadb\" or extra == \"distributed\" or extra == \"weaviate\" or extra == \"qdrant\" or extra == \"gemini\" or extra == \"deepeval\" or extra == \"milvus\")"
|
||||
markers = "python_version == \"3.10\" and extra == \"codegraph\" or (extra == \"chromadb\" or extra == \"distributed\" or extra == \"weaviate\" or extra == \"qdrant\" or extra == \"gemini\" or extra == \"deepeval\" or extra == \"milvus\") and python_version < \"3.11\" or (python_version == \"3.12\" or extra == \"gemini\" or extra == \"distributed\" or extra == \"weaviate\" or extra == \"qdrant\" or extra == \"deepeval\" or extra == \"milvus\" or extra == \"chromadb\") and (extra == \"codegraph\" or extra == \"chromadb\" or extra == \"gemini\" or extra == \"distributed\" or extra == \"weaviate\" or extra == \"qdrant\" or extra == \"deepeval\" or extra == \"milvus\") and python_version >= \"3.12\" or python_version == \"3.11\" and (extra == \"codegraph\" or extra == \"chromadb\" or extra == \"gemini\" or extra == \"distributed\" or extra == \"weaviate\" or extra == \"qdrant\" or extra == \"deepeval\" or extra == \"milvus\")"
|
||||
files = [
|
||||
{file = "protobuf-5.29.4-cp310-abi3-win32.whl", hash = "sha256:13eb236f8eb9ec34e63fc8b1d6efd2777d062fa6aaa68268fb67cf77f6839ad7"},
|
||||
{file = "protobuf-5.29.4-cp310-abi3-win_amd64.whl", hash = "sha256:bcefcdf3976233f8a502d265eb65ea740c989bacc6c30a58290ed0e519eb4b8d"},
|
||||
|
|
@ -8189,8 +8189,8 @@ astroid = ">=3.3.8,<=3.4.0.dev0"
|
|||
colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""}
|
||||
dill = [
|
||||
{version = ">=0.2", markers = "python_version < \"3.11\""},
|
||||
{version = ">=0.3.6", markers = "python_version >= \"3.11\""},
|
||||
{version = ">=0.3.7", markers = "python_version >= \"3.12\""},
|
||||
{version = ">=0.3.6", markers = "python_version == \"3.11\""},
|
||||
]
|
||||
isort = ">=4.2.5,<5.13 || >5.13,<7"
|
||||
mccabe = ">=0.6,<0.8"
|
||||
|
|
@ -8249,6 +8249,21 @@ bulk-writer = ["azure-storage-blob", "minio (>=7.0.0)", "pyarrow (>=12.0.0)", "r
|
|||
dev = ["black", "grpcio (==1.62.2)", "grpcio-testing (==1.62.2)", "grpcio-tools (==1.62.2)", "pytest (>=5.3.4)", "pytest-cov (>=2.8.1)", "pytest-timeout (>=1.3.4)", "ruff (>0.4.0)"]
|
||||
model = ["pymilvus.model (>=0.3.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "pympler"
|
||||
version = "1.1"
|
||||
description = "A development tool to measure, monitor and analyze the memory behavior of Python objects."
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "Pympler-1.1-py3-none-any.whl", hash = "sha256:5b223d6027d0619584116a0cbc28e8d2e378f7a79c1e5e024f9ff3b673c58506"},
|
||||
{file = "pympler-1.1.tar.gz", hash = "sha256:1eaa867cb8992c218430f1708fdaccda53df064144d1c5656b1e6f1ee6000424"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
pywin32 = {version = ">=226", markers = "platform_system == \"Windows\""}
|
||||
|
||||
[[package]]
|
||||
name = "pypandoc"
|
||||
version = "1.15"
|
||||
|
|
@ -8319,7 +8334,7 @@ description = "A python implementation of GNU readline."
|
|||
optional = true
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
markers = "sys_platform == \"win32\" and (extra == \"codegraph\" or extra == \"chromadb\") and python_version < \"3.13\" or sys_platform == \"win32\" and extra == \"chromadb\""
|
||||
markers = "sys_platform == \"win32\" and (python_version == \"3.10\" or extra == \"chromadb\" or extra == \"codegraph\") and (extra == \"chromadb\" or python_version == \"3.12\" or python_version == \"3.10\" or python_version == \"3.11\") and (extra == \"codegraph\" or extra == \"chromadb\")"
|
||||
files = [
|
||||
{file = "pyreadline3-3.5.4-py3-none-any.whl", hash = "sha256:eaf8e6cc3c49bcccf145fc6067ba8643d1df34d604a1ec0eccbf7a18e6d3fae6"},
|
||||
{file = "pyreadline3-3.5.4.tar.gz", hash = "sha256:8d57d53039a1c75adba8e50dd3d992b28143480816187ea5efbd5c78e6c885b7"},
|
||||
|
|
@ -8681,7 +8696,7 @@ description = "Python for Window Extensions"
|
|||
optional = false
|
||||
python-versions = "*"
|
||||
groups = ["main"]
|
||||
markers = "(extra == \"qdrant\" or extra == \"deepeval\") and (extra == \"qdrant\" or extra == \"deepeval\" or extra == \"notebook\" or extra == \"dev\") and platform_system == \"Windows\" or sys_platform == \"win32\""
|
||||
markers = "sys_platform == \"win32\" or platform_system == \"Windows\""
|
||||
files = [
|
||||
{file = "pywin32-310-cp310-cp310-win32.whl", hash = "sha256:6dd97011efc8bf51d6793a82292419eba2c71cf8e7250cfac03bba284454abc1"},
|
||||
{file = "pywin32-310-cp310-cp310-win_amd64.whl", hash = "sha256:c3e78706e4229b915a0821941a84e7ef420bf2b77e08c9dae3c76fd03fd2ae3d"},
|
||||
|
|
@ -10388,7 +10403,7 @@ description = "Computer algebra system (CAS) in Python"
|
|||
optional = true
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
markers = "(extra == \"codegraph\" or extra == \"chromadb\") and python_version < \"3.13\" or extra == \"chromadb\""
|
||||
markers = "python_version == \"3.10\" and (extra == \"chromadb\" or extra == \"codegraph\") or extra == \"chromadb\" or python_version == \"3.12\" and (extra == \"chromadb\" or extra == \"codegraph\") or python_version == \"3.11\" and (extra == \"chromadb\" or extra == \"codegraph\")"
|
||||
files = [
|
||||
{file = "sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5"},
|
||||
{file = "sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517"},
|
||||
|
|
@ -12138,4 +12153,4 @@ weaviate = ["weaviate-client"]
|
|||
[metadata]
|
||||
lock-version = "2.1"
|
||||
python-versions = ">=3.10,<=3.13"
|
||||
content-hash = "7936d855b80d15eba11e15ebeb7b8c50c5550cd9908aae006f42e554136b656d"
|
||||
content-hash = "b957125d983288f42fea2de62a5dcd14fedd474063968e34bb685460ec73a658"
|
||||
|
|
|
|||
|
|
@ -58,6 +58,7 @@ dependencies = [
|
|||
"dlt[sqlalchemy]>=1.9.0,<2",
|
||||
"sentry-sdk[fastapi]>=2.9.0,<3",
|
||||
"structlog>=25.2.0,<26",
|
||||
"pympler>=1.1",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue