chore: increase the lib version (#138)
This commit is contained in:
parent
a9433e9283
commit
58db1ac2c8
8 changed files with 13 additions and 13 deletions
|
|
@ -15,12 +15,12 @@ class AudioDocument(Document):
|
||||||
self.raw_data_location = raw_data_location
|
self.raw_data_location = raw_data_location
|
||||||
self.chunking_strategy = chunking_strategy
|
self.chunking_strategy = chunking_strategy
|
||||||
|
|
||||||
def read(self):
|
def read(self, chunk_size: int):
|
||||||
# Transcribe the audio file
|
# Transcribe the audio file
|
||||||
result = get_llm_client().create_transcript(self.raw_data_location)
|
result = get_llm_client().create_transcript(self.raw_data_location)
|
||||||
text = result.text
|
text = result.text
|
||||||
|
|
||||||
chunker = TextChunker(self.id, get_text = lambda: text)
|
chunker = TextChunker(self.id, chunk_size = chunk_size, get_text = lambda: text)
|
||||||
|
|
||||||
yield from chunker.read()
|
yield from chunker.read()
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -7,5 +7,5 @@ class Document(Protocol):
|
||||||
title: str
|
title: str
|
||||||
raw_data_location: str
|
raw_data_location: str
|
||||||
|
|
||||||
def read(self) -> str:
|
def read(self, chunk_size: int) -> str:
|
||||||
pass
|
pass
|
||||||
|
|
|
||||||
|
|
@ -14,12 +14,12 @@ class ImageDocument(Document):
|
||||||
self.title = title
|
self.title = title
|
||||||
self.raw_data_location = raw_data_location
|
self.raw_data_location = raw_data_location
|
||||||
|
|
||||||
def read(self):
|
def read(self, chunk_size: int):
|
||||||
# Transcribe the image file
|
# Transcribe the image file
|
||||||
result = get_llm_client().transcribe_image(self.raw_data_location)
|
result = get_llm_client().transcribe_image(self.raw_data_location)
|
||||||
text = result.choices[0].message.content
|
text = result.choices[0].message.content
|
||||||
|
|
||||||
chunker = TextChunker(self.id, get_text = lambda: text)
|
chunker = TextChunker(self.id, chunk_size = chunk_size, get_text = lambda: text)
|
||||||
|
|
||||||
yield from chunker.read()
|
yield from chunker.read()
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -13,7 +13,7 @@ class PdfDocument(Document):
|
||||||
self.title = title
|
self.title = title
|
||||||
self.raw_data_location = raw_data_location
|
self.raw_data_location = raw_data_location
|
||||||
|
|
||||||
def read(self) -> PdfReader:
|
def read(self, chunk_size: int) -> PdfReader:
|
||||||
file = PdfReader(self.raw_data_location)
|
file = PdfReader(self.raw_data_location)
|
||||||
|
|
||||||
def get_text():
|
def get_text():
|
||||||
|
|
@ -21,7 +21,7 @@ class PdfDocument(Document):
|
||||||
page_text = page.extract_text()
|
page_text = page.extract_text()
|
||||||
yield page_text
|
yield page_text
|
||||||
|
|
||||||
chunker = TextChunker(self.id, get_text = get_text)
|
chunker = TextChunker(self.id, chunk_size = chunk_size, get_text = get_text)
|
||||||
|
|
||||||
yield from chunker.read()
|
yield from chunker.read()
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -12,7 +12,7 @@ class TextDocument(Document):
|
||||||
self.title = title
|
self.title = title
|
||||||
self.raw_data_location = raw_data_location
|
self.raw_data_location = raw_data_location
|
||||||
|
|
||||||
def read(self):
|
def read(self, chunk_size: int):
|
||||||
def get_text():
|
def get_text():
|
||||||
with open(self.raw_data_location, mode = "r", encoding = "utf-8") as file:
|
with open(self.raw_data_location, mode = "r", encoding = "utf-8") as file:
|
||||||
while True:
|
while True:
|
||||||
|
|
@ -24,7 +24,7 @@ class TextDocument(Document):
|
||||||
yield text
|
yield text
|
||||||
|
|
||||||
|
|
||||||
chunker = TextChunker(self.id, get_text = get_text)
|
chunker = TextChunker(self.id,chunk_size = chunk_size, get_text = get_text)
|
||||||
|
|
||||||
yield from chunker.read()
|
yield from chunker.read()
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@ from cognee.infrastructure.databases.graph import get_graph_engine
|
||||||
from cognee.modules.data.processing.document_types.Document import Document
|
from cognee.modules.data.processing.document_types.Document import Document
|
||||||
|
|
||||||
|
|
||||||
async def source_documents_to_chunks(documents: list[Document], parent_node_id: str = None):
|
async def source_documents_to_chunks(documents: list[Document], chunk_size: int = 1024, parent_node_id: str = None):
|
||||||
graph_engine = await get_graph_engine()
|
graph_engine = await get_graph_engine()
|
||||||
|
|
||||||
if parent_node_id is None:
|
if parent_node_id is None:
|
||||||
|
|
@ -40,5 +40,5 @@ async def source_documents_to_chunks(documents: list[Document], parent_node_id:
|
||||||
await graph_engine.add_edges(edges)
|
await graph_engine.add_edges(edges)
|
||||||
|
|
||||||
for document in documents:
|
for document in documents:
|
||||||
for document_chunk in document.read():
|
for document_chunk in document.read(chunk_size = chunk_size):
|
||||||
yield document_chunk
|
yield document_chunk
|
||||||
|
|
|
||||||
|
|
@ -391,7 +391,7 @@
|
||||||
" Task(classify_documents),\n",
|
" Task(classify_documents),\n",
|
||||||
" Task(check_permissions_on_documents, user = user, permissions = [\"write\"]),\n",
|
" Task(check_permissions_on_documents, user = user, permissions = [\"write\"]),\n",
|
||||||
" Task(infer_data_ontology, root_node_id = root_node_id, ontology_model = KnowledgeGraph),\n",
|
" Task(infer_data_ontology, root_node_id = root_node_id, ontology_model = KnowledgeGraph),\n",
|
||||||
" Task(source_documents_to_chunks, parent_node_id = root_node_id), # Classify documents and save them as a nodes in graph db, extract text chunks based on the document type\n",
|
" Task(source_documents_to_chunks, chunk_size = 800, parent_node_id = root_node_id), # Classify documents and save them as a nodes in graph db, extract text chunks based on the document type\n",
|
||||||
" Task(chunks_into_graph, graph_model = KnowledgeGraph, collection_name = \"entities\", task_config = { \"batch_size\": 10 }), # Generate knowledge graphs from the document chunks and attach it to chunk nodes\n",
|
" Task(chunks_into_graph, graph_model = KnowledgeGraph, collection_name = \"entities\", task_config = { \"batch_size\": 10 }), # Generate knowledge graphs from the document chunks and attach it to chunk nodes\n",
|
||||||
" Task(chunk_update_check, collection_name = \"chunks\"), # Find all affected chunks, so we don't process unchanged chunks\n",
|
" Task(chunk_update_check, collection_name = \"chunks\"), # Find all affected chunks, so we don't process unchanged chunks\n",
|
||||||
" Task(\n",
|
" Task(\n",
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "cognee"
|
name = "cognee"
|
||||||
version = "0.1.15"
|
version = "0.1.16"
|
||||||
description = "Cognee - is a library for enriching LLM context with a semantic layer for better understanding and reasoning."
|
description = "Cognee - is a library for enriching LLM context with a semantic layer for better understanding and reasoning."
|
||||||
authors = ["Vasilije Markovic", "Boris Arzentar"]
|
authors = ["Vasilije Markovic", "Boris Arzentar"]
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue