Compare commits
4 commits
main
...
1gb_data_t
| Author | SHA1 | Date | |
|---|---|---|---|
| | 6a7f5be561 | | |
| | 278ede9dbe | | |
| | e68b19ec83 | | |
| | 7429f3253c | | |
3 changed files with 71 additions and 7 deletions
47
1gb_nonparallel_cognee.py
Normal file
47
1gb_nonparallel_cognee.py
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
import modal
|
||||
import os
|
||||
import asyncio
|
||||
import cognee
|
||||
from cognee.shared.logging_utils import get_logger, setup_logging, INFO
|
||||
|
||||
# Module-wide logger obtained from cognee's logging helper.
logger = get_logger()

# Modal application that the remote function and local entrypoint register with.
app = modal.App("1gb_nonparallel_cognee")

# Extra packages pip-installed on top of the Dockerfile-built image.
_EXTRA_PIP_PACKAGES = (
    "protobuf",
    "h2",
    "deepeval",
    "gdown",
    "plotly",
    "psycopg2-binary==2.9.10",
    "asyncpg==0.30.0",
)

# Container image, built in three steps: start from Dockerfile_modal, copy the
# dependency manifests into the image, then install the extra packages.
# NOTE(review): recent Modal releases appear to favor `add_local_file(..., copy=True)`
# over `copy_local_file` — confirm against the Modal version this project pins.
_base_image = modal.Image.from_dockerfile(path="Dockerfile_modal", force_build=False)
_image_with_manifests = _base_image.copy_local_file(
    "pyproject.toml", "pyproject.toml"
).copy_local_file("poetry.lock", "poetry.lock")
image = _image_with_manifests.pip_install(*_EXTRA_PIP_PACKAGES)
|
||||
|
||||
|
||||
@app.function(
    image=image,
    max_containers=1,  # keep the run on a single container (non-parallel benchmark)
    timeout=86400,  # 24 hours, expressed in seconds
    secrets=[modal.Secret.from_name("1gb_nonparallel_cognee")],
)
async def run_cognee_1gb(data_location: str = "s3://s3-test-laszlo") -> bool:
    """Run the cognee add + cognify pipeline over one dataset inside Modal.

    The function first calls cognee's prune helpers (presumably to start from
    a clean data/system state — confirm against cognee's docs), configures
    logging at INFO level, ingests ``data_location`` with ``cognee.add`` and
    then runs ``cognee.cognify``.

    Args:
        data_location: Location handed to ``cognee.add``. Defaults to the
            bucket that was previously hard-coded, so existing zero-argument
            callers behave exactly as before.

    Returns:
        ``True`` once both ``cognee.add`` and ``cognee.cognify`` have
        completed without raising.
    """
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    setup_logging(log_level=INFO)

    await cognee.add(data_location)
    await cognee.cognify()
    return True
|
||||
|
||||
|
||||
@app.local_entrypoint()
async def main():
    """Local entrypoint: start the single remote run and wait for it to finish."""
    remote_run = run_cognee_1gb.remote.aio()
    # gather is kept (rather than a bare await) so more remote calls can be
    # added alongside this one later without changing the awaiting logic.
    await asyncio.gather(remote_run)
|
||||
|
|
@ -151,13 +151,13 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's
|
|||
extract_graph_from_data,
|
||||
graph_model=graph_model,
|
||||
ontology_adapter=OntologyResolver(ontology_file=ontology_file_path),
|
||||
task_config={"batch_size": 10},
|
||||
task_config={"batch_size": 50},
|
||||
), # Generate knowledge graphs from the document chunks.
|
||||
Task(
|
||||
summarize_text,
|
||||
task_config={"batch_size": 10},
|
||||
task_config={"batch_size": 50},
|
||||
),
|
||||
Task(add_data_points, task_config={"batch_size": 10}),
|
||||
Task(add_data_points, task_config={"batch_size": 50}),
|
||||
]
|
||||
|
||||
return default_tasks
|
||||
|
|
|
|||
|
|
@ -1,7 +1,11 @@
|
|||
from pypdf import PdfReader
|
||||
from pypdf.errors import PdfReadError
|
||||
from cognee.modules.chunking.Chunker import Chunker
|
||||
from .open_data_file import open_data_file
|
||||
from .Document import Document
|
||||
from cognee.shared.logging_utils import get_logger
|
||||
|
||||
logger = get_logger("PDFDocument")
|
||||
|
||||
|
||||
class PdfDocument(Document):
|
||||
|
|
@ -9,12 +13,25 @@ class PdfDocument(Document):
|
|||
|
||||
def read(self, chunker_cls: Chunker, max_chunk_size: int):
|
||||
with open_data_file(self.raw_data_location, mode="rb") as stream:
|
||||
file = PdfReader(stream)
|
||||
logger.info(f"Reading PDF:{self.raw_data_location}")
|
||||
try:
|
||||
file = PdfReader(stream, strict=False)
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"PyPDF couldn’t open PDF—skipping: {self.raw_data_location} with error: {e}"
|
||||
)
|
||||
return
|
||||
|
||||
def get_text():
|
||||
for page in file.pages:
|
||||
page_text = page.extract_text()
|
||||
yield page_text
|
||||
try:
|
||||
for page in file.pages:
|
||||
page_text = page.extract_text()
|
||||
yield page_text
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"PyPDF couldn’t open PDF—skipping: {self.raw_data_location} with error: {e}"
|
||||
)
|
||||
return
|
||||
|
||||
chunker = chunker_cls(self, get_text=get_text, max_chunk_size=max_chunk_size)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue