Compare commits
4 commits
main
...
1gb_data_t
| Author | SHA1 | Date |
|---|---|---|
| | 6a7f5be561 | |
| | 278ede9dbe | |
| | e68b19ec83 | |
| | 7429f3253c | |
3 changed files with 71 additions and 7 deletions
47
1gb_nonparallel_cognee.py
Normal file
47
1gb_nonparallel_cognee.py
Normal file
|
|
@ -0,0 +1,47 @@
|
||||||
|
import modal
|
||||||
|
import os
|
||||||
|
import asyncio
|
||||||
|
import cognee
|
||||||
|
from cognee.shared.logging_utils import get_logger, setup_logging, INFO
|
||||||
|
|
||||||
|
# Module-level logger for this script (cognee's logging helper).
logger = get_logger()

# Modal application object; the name doubles as the deployed app's identifier.
app = modal.App("1gb_nonparallel_cognee")

# Container image for the remote function: built from the local Dockerfile,
# with the project's dependency manifests copied in and a handful of extra
# packages pip-installed on top.
# NOTE(review): `copy_local_file` is deprecated in newer Modal releases in
# favour of `add_local_file` — confirm against the pinned modal version.
image = (
    modal.Image.from_dockerfile(path="Dockerfile_modal", force_build=False)
    .copy_local_file("pyproject.toml", "pyproject.toml")
    .copy_local_file("poetry.lock", "poetry.lock")
    .pip_install(
        "protobuf",
        "h2",
        "deepeval",
        "gdown",
        "plotly",
        # Pinned DB drivers: presumably matched to the Postgres backend used
        # by cognee in this benchmark — verify against the Dockerfile.
        "psycopg2-binary==2.9.10",
        "asyncpg==0.30.0",
    )
)
|
||||||
|
|
||||||
|
|
||||||
|
@app.function(
    image=image,
    max_containers=1,  # force a single container: this is the non-parallel benchmark
    timeout=86400,  # 24 h — a 1 GB cognify run can be very long
    secrets=[modal.Secret.from_name("1gb_nonparallel_cognee")],
)
async def run_cognee_1gb():
    """Run the full 1 GB cognee ingestion benchmark in a single container.

    Wipes all existing cognee data and system metadata, then ingests the
    benchmark S3 bucket and builds the knowledge graph with ``cognify``.

    Returns:
        True on completion (the prune/add/cognify calls raise on failure).
    """
    # Configure logging FIRST so the prune calls below also emit logs.
    # (Previously this ran after pruning, losing those log records.)
    setup_logging(log_level=INFO)

    # Start from a clean slate: drop ingested data and system metadata.
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    # Ingest the 1 GB test bucket, then build the knowledge graph.
    await cognee.add("s3://s3-test-laszlo")
    await cognee.cognify()
    return True
|
||||||
|
|
||||||
|
|
||||||
|
@app.local_entrypoint()
async def main():
    """Local entrypoint: kick off the remote benchmark run and wait for it."""
    # A single remote invocation; gather keeps the shape ready for fan-out.
    await asyncio.gather(run_cognee_1gb.remote.aio())
|
||||||
|
|
@ -151,13 +151,13 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's
|
||||||
extract_graph_from_data,
|
extract_graph_from_data,
|
||||||
graph_model=graph_model,
|
graph_model=graph_model,
|
||||||
ontology_adapter=OntologyResolver(ontology_file=ontology_file_path),
|
ontology_adapter=OntologyResolver(ontology_file=ontology_file_path),
|
||||||
task_config={"batch_size": 10},
|
task_config={"batch_size": 50},
|
||||||
), # Generate knowledge graphs from the document chunks.
|
), # Generate knowledge graphs from the document chunks.
|
||||||
Task(
|
Task(
|
||||||
summarize_text,
|
summarize_text,
|
||||||
task_config={"batch_size": 10},
|
task_config={"batch_size": 50},
|
||||||
),
|
),
|
||||||
Task(add_data_points, task_config={"batch_size": 10}),
|
Task(add_data_points, task_config={"batch_size": 50}),
|
||||||
]
|
]
|
||||||
|
|
||||||
return default_tasks
|
return default_tasks
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,11 @@
|
||||||
from pypdf import PdfReader
|
from pypdf import PdfReader
|
||||||
|
from pypdf.errors import PdfReadError
|
||||||
from cognee.modules.chunking.Chunker import Chunker
|
from cognee.modules.chunking.Chunker import Chunker
|
||||||
from .open_data_file import open_data_file
|
from .open_data_file import open_data_file
|
||||||
from .Document import Document
|
from .Document import Document
|
||||||
|
from cognee.shared.logging_utils import get_logger
|
||||||
|
|
||||||
|
logger = get_logger("PDFDocument")
|
||||||
|
|
||||||
|
|
||||||
class PdfDocument(Document):
|
class PdfDocument(Document):
|
||||||
|
|
@ -9,12 +13,25 @@ class PdfDocument(Document):
|
||||||
|
|
||||||
def read(self, chunker_cls: Chunker, max_chunk_size: int):
|
def read(self, chunker_cls: Chunker, max_chunk_size: int):
|
||||||
with open_data_file(self.raw_data_location, mode="rb") as stream:
|
with open_data_file(self.raw_data_location, mode="rb") as stream:
|
||||||
file = PdfReader(stream)
|
logger.info(f"Reading PDF:{self.raw_data_location}")
|
||||||
|
try:
|
||||||
|
file = PdfReader(stream, strict=False)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(
|
||||||
|
f"PyPDF couldn’t open PDF—skipping: {self.raw_data_location} with error: {e}"
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
def get_text():
|
def get_text():
|
||||||
for page in file.pages:
|
try:
|
||||||
page_text = page.extract_text()
|
for page in file.pages:
|
||||||
yield page_text
|
page_text = page.extract_text()
|
||||||
|
yield page_text
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(
|
||||||
|
f"PyPDF couldn’t open PDF—skipping: {self.raw_data_location} with error: {e}"
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
chunker = chunker_cls(self, get_text=get_text, max_chunk_size=max_chunk_size)
|
chunker = chunker_cls(self, get_text=get_text, max_chunk_size=max_chunk_size)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue