Compare commits

...
Sign in to create a new pull request.

4 commits

Author SHA1 Message Date
hajdul88
6a7f5be561 fix: Changes exception handling to catch 'em all 2025-07-03 07:54:19 +02:00
hajdul88
278ede9dbe Fix: fixes damaged pdf file errors 2025-07-02 19:29:23 +02:00
hajdul88
e68b19ec83 changes modal entry + cognify batching 2025-07-02 18:56:11 +02:00
hajdul88
7429f3253c adds_entry 2025-07-02 13:16:30 +02:00
3 changed files with 71 additions and 7 deletions

47
1gb_nonparallel_cognee.py Normal file
View file

@ -0,0 +1,47 @@
import modal
import os
import asyncio
import cognee
from cognee.shared.logging_utils import get_logger, setup_logging, INFO
# Module-level logger obtained from cognee's shared logging helpers.
logger = get_logger()

# Modal application that the function and entrypoint below are registered on.
app = modal.App("1gb_nonparallel_cognee")

# Extra packages pip-installed on top of the Dockerfile-built image.
_EXTRA_PIP_PACKAGES = (
    "protobuf",
    "h2",
    "deepeval",
    "gdown",
    "plotly",
    "psycopg2-binary==2.9.10",
    "asyncpg==0.30.0",
)

# Container image, built in three steps:
#   1. build from the local "Dockerfile_modal" (no forced rebuild),
#   2. copy the dependency manifests into the image,
#   3. pip-install the extra packages listed above.
# NOTE(review): newer Modal releases appear to deprecate `copy_local_file` in
# favour of `add_local_file(..., copy=True)` — confirm against the pinned
# Modal version before changing.
_dockerfile_image = modal.Image.from_dockerfile(path="Dockerfile_modal", force_build=False)
_image_with_manifests = _dockerfile_image.copy_local_file(
    "pyproject.toml", "pyproject.toml"
).copy_local_file("poetry.lock", "poetry.lock")
image = _image_with_manifests.pip_install(*_EXTRA_PIP_PACKAGES)
@app.function(
    image=image,
    max_containers=1,
    timeout=86400,
    secrets=[modal.Secret.from_name("1gb_nonparallel_cognee")],
)
async def run_cognee_1gb(data_location: str = "s3://s3-test-laszlo") -> bool:
    """Prune cognee, ingest ``data_location`` and run ``cognify`` on it.

    The function is registered on the Modal app with ``max_containers=1`` and
    a ``timeout`` of 86400, using the ``1gb_nonparallel_cognee`` secret.

    Args:
        data_location: Location handed to ``cognee.add``. Defaults to the
            bucket that was previously hard-coded, so existing callers that
            pass no argument behave exactly as before.

    Returns:
        ``True`` once every step has completed without raising.
    """
    # Wipe previously stored data and system state (including metadata) so
    # the run starts from an empty store.
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    # NOTE(review): logging is configured only after the prune calls, so their
    # output uses whatever logging setup was active before — confirm this
    # ordering is intentional.
    setup_logging(log_level=INFO)

    await cognee.add(data_location)
    await cognee.cognify()
    return True
@app.local_entrypoint()
async def main():
    """Local entrypoint: start the remote cognee run and wait for it to finish."""
    # A single remote invocation, still awaited through gather as before.
    await asyncio.gather(run_cognee_1gb.remote.aio())

View file

@ -151,13 +151,13 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's
extract_graph_from_data,
graph_model=graph_model,
ontology_adapter=OntologyResolver(ontology_file=ontology_file_path),
task_config={"batch_size": 10},
task_config={"batch_size": 50},
), # Generate knowledge graphs from the document chunks.
Task(
summarize_text,
task_config={"batch_size": 10},
task_config={"batch_size": 50},
),
Task(add_data_points, task_config={"batch_size": 10}),
Task(add_data_points, task_config={"batch_size": 50}),
]
return default_tasks

View file

@ -1,7 +1,11 @@
from pypdf import PdfReader
from pypdf.errors import PdfReadError
from cognee.modules.chunking.Chunker import Chunker
from .open_data_file import open_data_file
from .Document import Document
from cognee.shared.logging_utils import get_logger
logger = get_logger("PDFDocument")
class PdfDocument(Document):
@ -9,12 +13,25 @@ class PdfDocument(Document):
def read(self, chunker_cls: Chunker, max_chunk_size: int):
with open_data_file(self.raw_data_location, mode="rb") as stream:
file = PdfReader(stream)
logger.info(f"Reading PDF:{self.raw_data_location}")
try:
file = PdfReader(stream, strict=False)
except Exception as e:
logger.warning(
f"PyPDF couldnt open PDF—skipping: {self.raw_data_location} with error: {e}"
)
return
def get_text():
for page in file.pages:
page_text = page.extract_text()
yield page_text
try:
for page in file.pages:
page_text = page.extract_text()
yield page_text
except Exception as e:
logger.warning(
f"PyPDF couldnt open PDF—skipping: {self.raw_data_location} with error: {e}"
)
return
chunker = chunker_cls(self, get_text=get_text, max_chunk_size=max_chunk_size)