cognee/cognee/modules/cognify/graph/add_data_chunks.py
Vasilije bb679c2dd7
Improve processing; update the networkx client, Neo4j, and dspy (#69)
* Update cognify and the networkx client to prepare for running in Neo4j

* Fix for openai model

* Add the fix to the infra so that the models can be passed to the library. Enable llm_provider to be passed.

* Auto graph generation now works with neo4j

* Added fixes for both neo4j and networkx

* Explicitly name semantic node connections

* Added updated docs, readme, chunkers and updates to cognify

* Make docs build trigger only when changes on it happen

* Update docs, test git actions

* Separate cognify logic into tasks

* Introduce dspy knowledge graph extraction

---------
Co-authored-by: Boris Arzentar <borisarzentar@gmail.com>
2024-04-20 19:05:40 +02:00

43 lines
1.2 KiB
Python

from typing import TypedDict
from uuid import uuid4
from cognee.infrastructure import infrastructure_config
from cognee.infrastructure.databases.vector import DataPoint
class TextChunk(TypedDict):
    """A chunk of text extracted from a source document, paired with that document's metadata."""
    text: str  # raw text content of the chunk; embedded downstream by the vector engine
    file_metadata: dict  # metadata of the originating file; schema is defined by the caller — TODO confirm
async def add_data_chunks(dataset_data_chunks: dict[str, list[TextChunk]]) -> list[dict]:
    """Store text chunks in the vector database, one collection per dataset.

    For every dataset: ensures a vector collection named after the dataset
    exists, assigns each chunk a fresh UUID, and indexes the chunk text as a
    ``DataPoint`` in that collection.

    Args:
        dataset_data_chunks: Mapping of dataset name to the chunks belonging
            to that dataset.

    Returns:
        A flat list of chunk dicts (``id``, ``collection``, ``text``,
        ``file_metadata``) across all datasets, carrying the newly assigned ids.
    """
    vector_client = infrastructure_config.get_config("vector_engine")

    identified_chunks: list[dict] = []

    for dataset_name, chunks in dataset_data_chunks.items():
        # Best-effort collection creation: the engine may raise if the
        # collection already exists, so creation failures are deliberately
        # ignored to keep re-ingestion into an existing dataset working.
        # NOTE(review): this also hides genuine connection/config errors —
        # consider narrowing to the engine's "already exists" exception.
        try:
            await vector_client.create_collection(dataset_name)
        except Exception:
            pass

        dataset_chunks = [
            {
                "id": str(uuid4()),
                "collection": dataset_name,
                "text": chunk["text"],
                "file_metadata": chunk["file_metadata"],
            }
            for chunk in chunks
        ]

        identified_chunks.extend(dataset_chunks)

        await vector_client.create_data_points(
            dataset_name,
            [
                DataPoint(
                    id = chunk["id"],
                    payload = dict(text = chunk["text"]),
                    embed_field = "text",
                )
                for chunk in dataset_chunks
            ],
        )

    return identified_chunks